# Split data into train, test and validation sets

We need to have the same datasets for training, testing and validation for all models. Thus, we'll do all preprocessing here and save the preprocessed datasets in their own directory.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import joblib

# DEFINE EXPERIMENT NAME
"""
_wordbias experiments involve all features related to tone, polarity, pronoun use etc. from GDELT
_otherbias experiments involve article, word count and IMGorEMBED features only
_allbias experiments involve both of the above features

So, we run RFE for each of these separately.

For MBFC, do we really want to do these analyses twice, both for with and without categorical data?
--> Can't hurt.

"""

# Suffix appended to every file this notebook saves.
experiment_name = "_mbfc_allbias_extrafeatures"

# Data-source switches checked by the loading cells below
# (Robertson scores vs. MBFC ratings).
is_robertson, is_mbfc = False, True

# Substrings used to pick feature columns: `select_manual_features` keeps every
# column whose name contains one of these entries.
# Every entry must be followed by a comma. Python concatenates adjacent string
# literals, so a missing comma silently merges two entries into one string that
# matches no column.
theme_list = [
    # GDELT features -- wordbias experiments
    'THEMES_SUBSET', 'PosScore', 'NegScore', 'Polarity', 'ActRefDens', 'SelfRefDens',

    # get the eMFD features -- wordbias experiments
    #'care-harm','fairness-cheating', 'loyalty-betrayal', 'authority-subversion','sanctity-degradation',

    # MBFC variables
    "Factuality", "PressFreedom", "MediaType", "Traffic", "Credibility",
    "Longitude", "Latitude", "tot_art",

    # article counts etc -- otherbias experiments
    "article_count", "word_count", "IMGorEMBED",

    # target variable
    "lean",
]

## Get Dataset with Known Outlets & Ground Truth

In [None]:
import os

if is_robertson:
    # Collect every pre-computed batch and concatenate them into one frame.
    # os.listdir returns names in arbitrary order, so sort them: the seeded
    # train/val/test splits further down depend on the row order of `data`.
    data_parts = [
        pd.read_csv("GDELT_GKG/extras/{}".format(file))
        for file in tqdm(sorted(os.listdir("GDELT_GKG/extras/")))
        if file.startswith("robertson_outlets_part")
    ]

    data = pd.concat(data_parts)

    # the batches carry a duplicated outlet column, remove it
    data.drop(["outlet.1"], axis=1, inplace=True)

    print(data.head())

    # the batches do not contain Robertson's score yet, so load it separately
    robertson = pd.read_csv("GDELT_GKG/extras/Bias Ratings/robertson.csv", usecols=["domain", "score"])
    robertson = robertson.rename(columns={"domain": "outlet"})

    # Recode the continuous score into five ordinal classes:
    #   score <= -0.6        -> 0 (left)
    #   -0.6 < score <= -0.2 -> 1 (left lean)
    #   -0.2 < score <=  0.2 -> 2
    #    0.2 < score <   0.6 -> 3
    #   score >= 0.6         -> 4
    # Anything matching no condition (e.g. a missing score) becomes -1.
    robertson_score = robertson["score"]
    robertson["lean"] = np.select(
        [
            robertson_score <= -0.6,
            (robertson_score > -0.6) & (robertson_score <= -0.2),
            (robertson_score > -0.2) & (robertson_score <= 0.2),
            (robertson_score > 0.2) & (robertson_score < 0.6),
            robertson_score >= 0.6,
        ],
        [0, 1, 2, 3, 4],
        default=-1,
    )

    # attach the recoded label to our data
    data = data.merge(robertson, on="outlet", how='left')

    # the raw score is no longer needed once `lean` exists
    data.drop(["score"], axis=1, inplace=True)

    # a bare `data.head()` inside an if-block displays nothing, so print it
    print(data.head())

In [None]:
""" if using 3-way classification
# looking at the data, we forgot to add the score from Robertson, let's do that now
robertson = pd.read_csv("GDELT_GKG/extras/Bias Ratings/robertson.csv",usecols=["domain","score"])
robertson = robertson.rename(columns={"domain":"outlet"})
# let's recode robertson's scores to either 0:"left",1:"center" or 2:"right"
robertson["lean"] = np.where(robertson["score"] <= -0.33,0, 
                                  np.where(robertson["score"]>=0.33,2,1))

# combine the score from robertson to our data
data = data.merge(robertson, on="outlet", how='left')

# we also have two outlet columns and the original score, let's remove these
data.drop(["outlet.1","score"],axis=1,inplace=True)

data.head()
"""

In [None]:
# Load the MBFC outlet sentiment CSV when the MBFC switch is on.
# NOTE(review): if both switches are on, this assignment replaces the Robertson
# frame; if neither is on, `data` is never assigned and the next cells fail.
if is_mbfc:
    data = pd.read_csv("GDELT_GKG/extras/mbfc_outlet_sentiments.csv")

In [None]:
data.head()

In [None]:
# Index the rows by outlet name.
data = data.set_index("outlet")
print(data.shape)

# Drop every row containing a NaN (author's note: the Press Freedom column has
# some, which causes errors for the SVC).
data = data.dropna()
print(data.shape)

# Keep only outlets with at least 100 articles in total.
# Author's note: outlets with fewer than 2 articles per day (fewer than 730 in
# total) were already removed upstream, so this is an extra safeguard.
has_enough_articles = data["tot_art"] >= 100
data = data[has_enough_articles]
print(data.shape)  # rows remaining

In [None]:
data.head()

## Split into Train and Test Sets

In [None]:
# Column roles: the target, the integer-coded categoricals, and everything else.
label_col = "lean"
categor_cols = ["Factuality","PressFreedom","MediaType","Traffic","Credibility"]
# Every column that is neither categorical nor the label counts as numeric and
# is min-max scaled later on.
# NOTE(review): "Longitude"/"Latitude" are NOT excluded by this expression, so
# if they are present in `data` they get scaled too -- confirm this is intended.
is_numeric = ~data.columns.isin([*categor_cols, label_col])
numeric_cols = data.columns[is_numeric]

In [None]:
# First split: 70 % of the outlets go to train; the remaining 30 % are parked
# in `val` for now and divided into validation and test in the next cell.
# Stratified by lean (another variable could be used instead).
train, val = train_test_split(data, stratify=data["lean"], test_size=0.3, random_state=42)

In [None]:
# Second split: divide the hold-out currently stored in `val` 50/50 into the
# final validation and test sets, again stratified by lean.
val, test = train_test_split(val, stratify=val["lean"], test_size=0.5, random_state=42)

for caption, split in (
    ('Train set shape: ', train),
    ('Validation set shape: ', val),
    ('Test set shape: ', test),
):
    print(caption, split.shape)

In [None]:
# Fit the min-max scaler on the training split only, so that no information
# from validation/test leaks into the scaling parameters.
scaler = MinMaxScaler()
scaler.fit(train[numeric_cols])

for split in (train, val, test):
    # scale the numeric columns and store them as float32
    split[numeric_cols] = scaler.transform(split[numeric_cols]).astype(np.float32)
    # categorical columns are stored as small ints (present for MBFC data)
    split[categor_cols] = split[categor_cols].astype(np.int8)

In [None]:
# save scaler in case we need it later again
# (the file name carries the experiment_name suffix; the GDELT section further
# down reuses the in-memory `scaler` object rather than reloading this file)
joblib.dump(scaler, 'GDELT_GKG/extras/Supporting Files/MinMaxScaler{}.save'.format(experiment_name))

In [None]:
train.head()

In [None]:
# Snapshot the three splits so the experiments in the next cells can be undone.
train_copy, val_copy, test_copy = (
    split.copy(deep=True) for split in (train, val, test)
)

In [None]:
# Roll the three splits back to the snapshots if something went wrong.
train, val, test = (
    snapshot.copy(deep=True) for snapshot in (train_copy, val_copy, test_copy)
)

In [None]:
print(train.shape, val.shape, test.shape)

## Alternative Preprocessing

In [None]:
def select_manual_features(X, theme_list):
    """
    Return a deep copy of X restricted to the columns whose name contains
    at least one of the strings in theme_list (themes or feature names).

    Matching is by substring. Each matching column is selected exactly once
    and the original column order of X is kept, even when a column name
    contains several of the given strings.

    Parameters:
        X: DataFrame with string column names.
        theme_list: iterable of substrings to look for in the column names.

    Returns:
        New DataFrame holding only the matching columns.
    """
    # any() stops at the first hit, so a column matching two themes is not
    # listed twice (which would duplicate it in the result)
    selected_cols = [
        col for col in X.columns
        if any(theme in col for theme in theme_list)
    ]
    return X.loc[:, selected_cols].copy(deep=True)

def remove_high_corr_features(X, threshold=0.95):
    """
    Find features that are highly correlated with another feature.

    Only the upper triangle of the absolute correlation matrix is inspected,
    so of every correlated pair the column that comes later in X is the one
    marked for removal. The target column "lean" is never marked.

    The resulting list is also written to
    "GDELT_GKG/extras/Supporting Files/high_corr_features_to_drop<experiment_name>.csv"
    (experiment_name is read from module scope) so the same columns can be
    dropped from other datasets later.

    Parameters:
        X: DataFrame; non-numeric columns are ignored by the correlation.
        threshold: absolute correlation above which a feature is dropped
            (default 0.95).

    Returns:
        List of column names to drop. X itself is not modified.
    """
    # Absolute pairwise correlations between the numeric columns
    corr_matrix = X.corr(numeric_only=True).abs()
    # Keep only the entries strictly above the diagonal, so each pair is
    # considered once and a column is never compared with itself
    above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(above_diagonal)
    # Mark columns correlated above the threshold with an earlier column;
    # the target variable must never be deleted
    to_drop = [
        column for column in upper.columns
        if column != "lean" and (upper[column] > threshold).any()
    ]

    # save list of features to drop
    pd.Series(to_drop).to_csv("GDELT_GKG/extras/Supporting Files/high_corr_features_to_drop{}.csv".format(experiment_name))

    return to_drop

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV

def run_feature_elimination(data):
    """
    Select features with cross-validated recursive feature elimination.

    Background reading:
    -  https://machinelearningmastery.com/rfe-feature-selection-in-python/
    -  https://scikit-learn.org/stable/modules/feature_selection.html

    A random forest is fitted repeatedly and its feature importances decide
    which features are eliminated. Cross-validation picks the number of
    features to keep, since we have no fixed target number in mind.

    `data` must contain the target column "lean"; all other columns are
    treated as candidate features. The resulting mask is also written to
    "GDELT_GKG/extras/Supporting Files/RFE_selected_features<experiment_name>.csv"
    (experiment_name is read from module scope).

    Returns a bool mask over the feature columns (label excluded), True for
    every column that was selected.
    """
    features = data.drop(columns="lean")
    target = data["lean"]

    # step=100 eliminates features in batches of 100; a ranking of 1 in
    # selector.ranking_ would denote the features RFE judged best
    rfecv = RFECV(
        estimator=RandomForestClassifier(random_state=42),
        step=100,
        cv=10,
        min_features_to_select=10,
        verbose=0,
    )

    print("fitting, this will take a while...")
    rfecv.fit(features, target)
    print("finished fitting!")

    # persist the mask of columns chosen by RFE
    support_mask = rfecv.support_
    pd.Series(support_mask).to_csv("GDELT_GKG/extras/Supporting Files/RFE_selected_features{}.csv".format(experiment_name))
    return support_mask



In [None]:
# Switches for the optional preprocessing steps in the cells below.
manual_features = remove_high_corr = feature_elimination = True
exclude_MBFC = False

In [None]:
if exclude_MBFC:
    # Longitude and latitude also come from the MBFC data, so drop them as well.
    # Build a separate list instead of extending `categor_cols` in place:
    # mutating the shared list would make a re-run of the int8 cast cell above
    # also cast Longitude/Latitude, and each re-run of this cell would grow it.
    mbfc_cols = categor_cols + ["Longitude", "Latitude"]
    # errors="ignore": columns that are not present are skipped silently
    train = train.drop(columns=mbfc_cols, errors="ignore")
    val = val.drop(columns=mbfc_cols, errors="ignore")
    test = test.drop(columns=mbfc_cols, errors="ignore")

    print(train.shape, val.shape, test.shape)
    

In [None]:
if manual_features:
    # Keep only the columns matching theme_list (defined in the first cell),
    # applying the same selection to all three splits.
    train, val, test = (
        select_manual_features(split, theme_list=theme_list)
        for split in (train, val, test)
    )

    print(train.shape, val.shape, test.shape)

In [None]:
if remove_high_corr:
    # The columns to drop are determined on the training split only and then
    # removed from every split.
    # Author's note: 588 columns; Polarity and ActRefDens often correlate
    # with Pos/Neg.
    to_drop = remove_high_corr_features(train)
    train = train.drop(columns=to_drop)
    val = val.drop(columns=to_drop)
    test = test.drop(columns=to_drop)

    print(train.shape, val.shape, test.shape)

In [None]:
if feature_elimination:
    # Boolean mask over the feature columns (label excluded), fitted on train.
    selected_features = run_feature_elimination(train)

    # Set the labels aside, subset the feature columns with the mask, then
    # append the label again as the last column -- same for every split.
    y_train, y_val, y_test = train["lean"], val["lean"], test["lean"]
    train = train.drop(columns="lean").loc[:, selected_features].assign(lean=y_train)
    val = val.drop(columns="lean").loc[:, selected_features].assign(lean=y_val)
    test = test.drop(columns="lean").loc[:, selected_features].assign(lean=y_test)

    print(train.shape, val.shape, test.shape)
    

## Save files

In [None]:
# Record which optional steps were switched on for this dataset.
conditions_dict = {
    "exclude_MBFC": exclude_MBFC,
    "manual_features": manual_features,
    "feature_elimination": feature_elimination,
    "remove_high_corr": remove_high_corr,
}

# One "_<name>" piece per enabled step, in the order listed above.
# NOTE(review): this string is only printed; the file names written below are
# built from experiment_name alone.
extra_save_string = "".join(
    "_" + name for name, cond in conditions_dict.items() if cond
)

print(extra_save_string)

In [None]:
# Write each split to its own CSV; the file name ends in experiment_name.
for split, path_template in (
    (train, "GDELT_GKG/data/train{}.csv"),
    (val, "GDELT_GKG/data/val{}.csv"),
    (test, "GDELT_GKG/data/test{}.csv"),
):
    split.to_csv(path_template.format(experiment_name))

In [None]:
experiment_name

# GDELT Datasets

The GDELT outlets that have no ground truth from MBFC also need to be preprocessed so they can be predicted on.

In [None]:
import os
# Collect every pre-computed GDELT batch and concatenate them into one frame.
# os.listdir gives no ordering guarantee, so sort the file names to make the
# row order of `gdelt` reproducible across runs and machines.
gdelt_parts = [
    pd.read_csv("GDELT_GKG/extras/{}".format(file))
    for file in tqdm(sorted(os.listdir("GDELT_GKG/extras/")))
    if file.startswith("gdelt_outlets_2_part")
]

gdelt = pd.concat(gdelt_parts)

In [None]:
#gdelt = pd.read_csv("GDELT_GKG/extras/gdelt_outlets_part_00.csv")
# Index the rows by outlet name.
gdelt = gdelt.set_index("outlet")
print(gdelt.shape)

# Drop every row containing a NaN.
gdelt = gdelt.dropna()
print(gdelt.shape)

# Keep only outlets with at least 100 articles in total.
# Author's note: outlets with fewer than 2 articles per day (fewer than 730 in
# total) were already removed upstream, so this is an extra safeguard.
has_enough_articles = gdelt["tot_art"] >= 100
gdelt = gdelt[has_enough_articles]
print(gdelt.shape)  # rows remaining

In [None]:
gdelt.head()

In [None]:
# Scale the GDELT features with the scaler that was fitted on the training split.
# First put the columns in the order of the original numeric columns; these are
# taken from `data` rather than `train`, because `train` has been modified since.
gdelt = gdelt.loc[:, data[numeric_cols].columns]
# Build a fresh DataFrame from the scaled array instead of assigning through
# .loc: writing floats into existing (possibly integer) columns in place relies
# on silent upcasting, which recent pandas versions deprecate.
gdelt = pd.DataFrame(scaler.transform(gdelt), index=gdelt.index, columns=gdelt.columns)

In [None]:
# Optional-step switches for the GDELT data.
manual_features = remove_high_corr = feature_elimination = True
exclude_MBFC = False  # not needed, as they're not available anyway

### Alternative Processing

In [None]:
theme_list

In [None]:
if manual_features:
    # Reuse the theme_list defined at the top of the notebook, so the GDELT
    # data gets the same column selection as the labelled splits.
    gdelt_subset = select_manual_features(gdelt, theme_list=theme_list)
else:
    # Without manual selection, carry on with all GDELT columns so that the
    # following cells still have a `gdelt_subset` to work on.
    gdelt_subset = gdelt.copy(deep=True)

print(gdelt_subset.shape)

In [None]:
# Load the list of highly correlated features saved for this experiment.
# The file holds an index column plus a column named "0" with the names.
to_drop = pd.read_csv(
    "GDELT_GKG/extras/Supporting Files/high_corr_features_to_drop{}.csv".format(experiment_name),
    index_col=False,
)["0"].to_list()

In [None]:
if remove_high_corr:
    # Drop the features that were already picked for dropping on the train and
    # test sets (author's note: 588 columns; Polarity and ActRefDens often
    # correlate with Pos/Neg).
    gdelt_subset = gdelt_subset.drop(columns=to_drop)
    print(gdelt_subset.shape)

In [None]:
# Load the RFE selection mask saved for this experiment: after moving the
# stored index into place, the first remaining column holds the values.
rfe_table = pd.read_csv("GDELT_GKG/extras/Supporting Files/RFE_selected_features{}.csv".format(experiment_name))
selected_features = rfe_table.set_index("Unnamed: 0").iloc[:, 0].to_list()

In [None]:
if feature_elimination:
    # Apply the previously saved RFE selection to the GDELT columns.
    # NOTE(review): the selection is applied by position, so `gdelt_subset` must
    # have the same columns in the same order as the frame RFE was fitted on.
    rfe_columns = gdelt_subset.loc[:, selected_features]
    gdelt_subset = rfe_columns
    print(gdelt_subset.shape)

### Save File

In [None]:
# Record which optional steps were switched on for the GDELT dataset.
conditions_dict = {
    "exclude_MBFC": exclude_MBFC,
    "manual_features": manual_features,
    "feature_elimination": feature_elimination,
    "remove_high_corr": remove_high_corr,
}

# One "_<name>" piece per enabled step, in the order listed above.
# NOTE(review): this string is only printed; the file name written below is
# built from experiment_name alone.
extra_save_string = "".join(
    "_" + name for name, cond in conditions_dict.items() if cond
)

print(extra_save_string)

In [None]:
experiment_name

In [None]:
# save the dataset in csv files
# (same experiment_name suffix as the train/val/test files, so the prediction
# set can be matched to the experiment it was preprocessed for)
gdelt_subset.to_csv("GDELT_GKG/data/gdelt{}.csv".format(experiment_name))

# Add Categorical variables to MBFC

In [17]:
# Reload the previously saved "_mbfc_allbias" splits, indexed by outlet.
train, val, test = (
    pd.read_csv(csv_path).set_index("outlet")
    for csv_path in (
        "GDELT_GKG/data/train_mbfc_allbias.csv",
        "GDELT_GKG/data/val_mbfc_allbias.csv",
        "GDELT_GKG/data/test_mbfc_allbias.csv",
    )
)

In [18]:
# MBFC feature table: name the URL column "outlet" so it lines up with the
# splits, discard the bias rating column, and index by outlet.
mbfc = (
    pd.read_csv("GDELT_GKG/extras/Bias Ratings/MBFC_features.csv")
    .rename(columns={"URL": "outlet"})
    .drop(columns="Bias Rating")
    .set_index("outlet")
)
mbfc.head()

Unnamed: 0_level_0,Factuality,PressFreedom,MediaType,Traffic,Credibility,Longitude,Latutude
outlet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9news.com,4,2.0,6,2,2,-100.445882,39.78373
nbc11news.com,4,2.0,6,1,2,-100.445882,39.78373
12news.com,4,2.0,6,1,2,-100.445882,39.78373
wibw.com,4,2.0,6,1,2,-100.445882,39.78373
wifr.com,4,2.0,6,1,2,-100.445882,39.78373


In [19]:
# Left-join the MBFC features onto each split via the shared "outlet" key,
# so every row of the split is kept.
train_categ = train.merge(mbfc, how="left", on="outlet")
val_categ = val.merge(mbfc, how="left", on="outlet")
test_categ = test.merge(mbfc, how="left", on="outlet")

In [21]:
categor_cols = ["Factuality","PressFreedom","MediaType","Traffic","Credibility"]
# Store the categorical MBFC columns as small ints in every merged split.
# NOTE(review): the frames come from left merges, so an outlet missing from
# `mbfc` would carry NaN here and the integer cast would fail -- confirm all
# outlets are covered.
for merged in (train_categ, val_categ, test_categ):
    merged[categor_cols] = merged[categor_cols].astype(np.int8)

In [23]:

# Write the splits with the added categorical columns to CSV.
# NOTE(review): these are the same paths the earlier "Save files" cell writes
# to (train/val/test + experiment_name), so those files get overwritten.
for merged, path_template in (
    (train_categ, "GDELT_GKG/data/train{}.csv"),
    (val_categ, "GDELT_GKG/data/val{}.csv"),
    (test_categ, "GDELT_GKG/data/test{}.csv"),
):
    merged.to_csv(path_template.format(experiment_name))