# Explore and automatically select theme subset

In [2]:
from tqdm import tqdm
import pandas as pd
import chardet
import time
import os

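# get extras headers (the MAIN cell below uses gkg_header, so the import on the next line must be re-enabled before running it)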
#from extras_csv_headers import gkg_header, relevant_gkg_header

In [None]:
#------------------------------------------------------------------------------
# FUNCTIONS & SETUP
#------------------------------------------------------------------------------

def split_on_semicol(s):
    """Turn a semicolon-delimited cell into a list of its non-empty parts.

    Floats (which includes NaN), anything whose string form is "nan", and
    falsy values such as None or "" all yield an empty list.
    """
    is_missing = isinstance(s, float) or str(s) == "nan" or not s
    if is_missing:
        return []
    # filter(None, ...) drops the empty strings left by leading, trailing or
    # doubled semicolons
    return list(filter(None, s.split(";")))

## Obtain List of Themes from Data (very slow)

In [None]:
#------------------------------------------------------------------------------
# MAIN
#------------------------------------------------------------------------------

# set working directory
csv_folder = "/home/insert_user/GDELT_GKG/gkg_csvs"
os.chdir(csv_folder)

# Encodings to try, in order. None stands for "ask chardet"; it is kept as the
# last resort because it has to read the whole file into memory first.
_GKG_ENCODINGS = ("unicode_escape", "utf-8", None)


def _detect_encoding(path):
    """Return chardet's encoding guess for the raw bytes of *path*."""
    # the context manager closes the handle even if reading fails
    with open(path, "rb") as raw_file:
        return chardet.detect(raw_file.read())["encoding"]


def _read_gkg(path, columns):
    """Read one tab-separated GKG file, trying each candidate encoding in turn.

    Returns the DataFrame produced by the first encoding that parses.
    Raises the last error seen if every encoding fails.
    """
    last_error = None
    for encoding in _GKG_ENCODINGS:
        try:
            if encoding is None:
                encoding = _detect_encoding(path)
            return pd.read_csv(path, sep='\t', names=columns, header=0,
                               encoding=encoding, engine='python',
                               on_bad_lines='skip')
        except Exception as err:
            # `except Exception` (not a bare except) so that Ctrl-C still
            # interrupts this very slow loop instead of moving on to the
            # next encoding
            last_error = err
    raise last_error


# Look the column names up once, outside any try block: gkg_header is not
# defined in this cell, and a missing name should fail loudly here rather than
# be reported as a broken file for every single csv.
gkg_columns = gkg_header

all_themes = []
for item in tqdm(os.listdir(csv_folder)):
    if not item.endswith(".gkg.csv"):  # exclude all files that are not csvs
        continue

    # GET DATAFRAME GKG - the files do not share a single encoding
    try:
        gkg = _read_gkg(item, gkg_columns)
    except Exception as err:
        # there's a problem with some gkg files: report it and keep going
        print("Something went wrong with this file: {}".format(item))
        print("    last error: {!r}".format(err))
        continue

    #--------------------------------------------------------------------------
    # THEMES COLUMN CLEANING
    #--------------------------------------------------------------------------

    # subselect columns
    gkg_lite = pd.DataFrame(gkg, columns=["V2SOURCECOMMONNAME", "THEMES"])
    # fix THEMES formatting from string to list
    gkg_lite["THEMES"] = gkg_lite["THEMES"].apply(split_on_semicol)
    # one list of themes per row is appended; flattening happens later
    all_themes.extend(gkg_lite["THEMES"].values)

In [None]:
# get list of all themes found here
import itertools
import numpy as np
# Flatten the per-row theme lists into one Series of strings.
# chain.from_iterable walks all_themes directly instead of unpacking every
# inner list into the argument tuple of a single call, and one pd.Series
# wrapper is enough (the second one added nothing).
themes_series = pd.Series(list(itertools.chain.from_iterable(all_themes))).astype(str)

In [None]:
# themes_series now holds the flattened themes, so the nested lists can go
del all_themes # release the memory held by the per-row theme lists

In [None]:
# count how often each distinct theme occurs; the theme names become the index
# (taxonomy themes are not removed here, that happens in a later cell)
theme_counts = themes_series.value_counts()

In [None]:
# save the counts so the slow scan above does not have to be repeated;
# to_csv writes the index (the theme names) as the first column
theme_counts.to_csv("/home/insert_user/GDELT_GKG/all-themes-and-counts")

In [None]:
# preview the first five entries of the counts
theme_counts.head()

## Modifying saved list of themes to automatically select subset

In [None]:
# read in themes if already saved and want to modify
# The file was written from a Series, so its first column holds the theme
# names. Without index_col=0, read_csv returns a DataFrame with a fresh 0..n-1
# index and the names end up in an ordinary column, which breaks the
# index-based filtering in the next cell. squeeze("columns") turns the
# remaining single count column back into a Series indexed by theme name.
theme_counts = pd.read_csv("/home/insert_user/GDELT_GKG/all-themes-and-counts",
                           index_col=0).squeeze("columns")
theme_counts.head()

In [None]:
# discard taxonomy themes
# prefixes of the theme families that are left out of the subset
taxonomy_prefixes = (
    'TAX_',
    'WB_',
    'SOC_POINTSOFINTEREST',
    'CRISISLEX',
    'ECON_WORLDCURRENCIES',
    'ECON_DEVELOPMENTORGS',
)
# True for every index label that starts with any of the prefixes above
is_taxonomy = theme_counts.index.str.startswith(taxonomy_prefixes)
# NOTE(review): the earlier comment here said this step "takes care of NaNs";
# that is not evident from the code -- confirm how NaN labels behave
theme_counts = theme_counts[~is_taxonomy]

In [None]:
# work on log counts and keep themes within one standard deviation of the mean
log_theme_counts = np.log(theme_counts)
count_mean = log_theme_counts.mean()
count_std = log_theme_counts.std()

# limits of the one-standard-deviation band around the mean log count
band_low = count_mean - count_std
band_high = count_mean + count_std

inside_band = (log_theme_counts > band_low) & (log_theme_counts < band_high)
# additionally require more than 1000 raw occurrences
frequent_enough = theme_counts > 1000

subset_themes = theme_counts[inside_band & frequent_enough]

# earlier notes recorded both 408 and 519 as the result; re-check on your data
print(len(subset_themes))

In [None]:
!pip3 install matplotlib

In [None]:
from matplotlib import pyplot as plt

# Load the saved subset and index it by theme name.
# NOTE(review): names= supplies the column labels, so the first line of the
# file is read as data; if the file was saved with a header row, that row
# shows up as an extra record -- confirm the file format.
subset_themes = pd.read_csv(
    "/home/insert_user/GDELT_GKG/auto-theme-subset",
    names=["theme", "amount"],
).set_index("theme")

# plot the table, then turn the x tick labels vertical
subset_themes.plot()
plt.xticks(rotation=90)

In [None]:
# show the last five rows of the subset
subset_themes.tail()

In [None]:
# write the current subset_themes to disk
# NOTE(review): this is the same path the plotting cell reads from, and that
# cell comes earlier in the notebook -- confirm the intended execution order
subset_themes.to_csv("/home/insert_user/GDELT_GKG/auto-theme-subset")

## Get Stats later on

In [3]:
# load the saved theme counts as a DataFrame (read_csv defaults, so the rows
# get a plain integer index); this rebinds the name all_themes, which earlier
# in the notebook held the raw per-row theme lists
all_themes = pd.read_csv("/home/insert_user/GDELT_GKG/extras/GDELT Descriptive Stats/all-themes-and-counts")

In [None]:
# (rows, columns) of the loaded counts table
all_themes.shape

In [5]:
# load the saved theme subset with read_csv defaults (first line taken as the
# header), unlike the plotting cell, which passes explicit column names
subset_themes = pd.read_csv("/home/insert_user/GDELT_GKG/extras/GDELT Descriptive Stats/auto-theme-subset")

In [None]:
# (rows, columns) of the loaded subset table
subset_themes.shape

In [None]:
# display the loaded subset table
subset_themes