# Creating Feature Subsets

In [None]:
#imports 
import pandas as pd
import numpy as np
import os
import pickle as pkl
import networkx as nx
import sys
sys.path.append('/Users/sinclaireschuetze/Documents/GitHub/Trade-GNN-Thesis/src')
from utils.CreateFeatures import CreateFeatures
from tqdm import trange

## Econometric Feature Creation
This creates the standard networks with the initial node features, including centrality and centrality within each product group.

In [None]:
# One CreateFeatures object per year, keyed by year.
data_dict = dict()

# GDP data ends in 2018; the range end is exclusive, so 2019 stops at 2018.
for yr in trange(1962, 2019):
    features = CreateFeatures(year=yr)
    features.prepare_econ_features()
    features.prepare_network_features()
    # combine_features() is used here; the alternative
    # features.combine_normalize_features() is intentionally left disabled.
    features.combine_features()
    data_dict[yr] = features

In [None]:
# To save the freshly built dictionary instead of loading it:
# with open("../feature_dicts/mis_norm.pkl", "wb") as pickle_file:
#     pkl.dump(data_dict, pickle_file)

# Load a previously saved dictionary; this replaces data_dict.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open("../feature_dicts/mis_norm.pkl", "rb") as pickle_file:
    data_dict = pkl.load(pickle_file)

In [None]:
# Track the combined_features frame with the most rows, and its year.
max_rows = 0
df_with_max_rows = None

for year_key, trade in data_dict.items():
    n_rows = len(trade.combined_features)
    # Only a strictly larger frame replaces the current best, so the
    # earliest year wins ties.
    if n_rows <= max_rows:
        continue
    max_rows = n_rows
    df_with_max_rows = trade.combined_features
    max_year = year_key

In [None]:
# Per-column variance of everything except the country identifier.
numeric_columns = df_with_max_rows.drop(columns=['country_code'])
variances = numeric_columns.var()

# Columns whose variance is below the 0.1 threshold are removed
# (adjust the threshold as needed).
low_variance = variances.lt(0.1)
columns_to_drop = variances.loc[low_variance].index
filtered_df = df_with_max_rows.drop(columns=columns_to_drop)

In [None]:
# Remove the low-variance columns from every year's feature frame.
for trade in data_dict.values():
    trade.combined_features = trade.combined_features.drop(columns=columns_to_drop)

## Mutual Information Selection

One method of feature selection used mutual information selection (MIS) to remove highly correlated features. Features were scored by their estimated mutual information with GDP growth (`current_gdp_growth`), and the top 2 percent of features were selected.

In [None]:
# Target: GDP growth. Predictors: everything except the identifier and target.
Y_train = filtered_df['current_gdp_growth']
X_train = filtered_df.drop(columns=['country_code', 'current_gdp_growth'])

In [None]:
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectPercentile

# Score every candidate feature by its estimated mutual information with
# the target and keep the scores ranked from most to least informative.
# (sort_values returns a new Series, so the result must be assigned.)
mutual_info = pd.Series(
    mutual_info_regression(X_train, Y_train),
    index=X_train.columns,
).sort_values(ascending=False)

# Keep the top 2 percent of features by mutual information.
# NOTE: SelectPercentile re-runs mutual_info_regression internally; since no
# random_state is fixed, its scores may differ slightly from `mutual_info`.
selected_top_columns = SelectPercentile(mutual_info_regression, percentile=2)
selected_top_columns.fit(X_train, Y_train)
selected_top_columns.get_support()

In [None]:
# Boolean mask of the features retained by the fitted selector.
support_mask = selected_top_columns.get_support()
columns = X_train.columns[support_mask]
X_train = X_train[columns]

In [None]:
# Selected features followed by the target and the country identifier.
column_list = [*columns, 'current_gdp_growth', 'country_code']

In [None]:
# Restrict every year's feature frame to the selected columns.
for trade in data_dict.values():
    trade.combined_features = trade.combined_features[column_list]

In [None]:
# Collect, in first-seen order and without duplicates, every column that is
# entirely zero in at least one year's feature frame.
columns_all_zeros = []
for trade in data_dict.values():
    frame = trade.combined_features
    for name in frame.columns:
        if (frame[name] == 0).all() and name not in columns_all_zeros:
            columns_all_zeros.append(name)


In [None]:
# Drop the all-zero columns from every year's feature frame.
for trade in data_dict.values():
    trade.combined_features = trade.combined_features.drop(columns=columns_all_zeros)

In [None]:
# Hard-coded indicator codes used as the MIS feature subset.
mis_columns = [
    "IT.MLT.MAIN.P2",
    "NE.CON.PRVT.KD.ZG",
    "NE.CON.TOTL.KD.ZG",
    "NV.IND.TOTL.KD.ZG",
    "NV.SRV.TOTL.KD.ZG",
    "NY.GDP.MKTP.KD.ZG",
    "NY.GDP.PCAP.KD.ZG",
    "NY.GNP.MKTP.KD.ZG",
    "NY.GNP.PCAP.KD.ZG",
    "SP.ADO.TFRT",
    "SP.POP.2024.FE.5Y",
    "SP.POP.2024.MA.5Y",
    "SP.POP.6569.FE.5Y",
    "SP.POP.6569.MA.5Y",
    "SP.POP.65UP.MA.ZS",
]

In [None]:
import copy

# Build mis_norm as a separate dictionary. Plain assignment
# (mis_norm = data_dict) would only alias data_dict, so narrowing the columns
# would also overwrite the source dictionary. A shallow copy of each yearly
# object is enough because only its combined_features attribute is rebound.
mis_norm = {}
for year, trade in data_dict.items():
    subset = copy.copy(trade)
    subset.combined_features = trade.combined_features[mis_columns]
    mis_norm[year] = subset

In [None]:
# Persist the MIS-selected feature dictionary.
with open("../feature_dicts/mis_norm.pkl", "wb") as pickle_file:
    pkl.dump(mis_norm, pickle_file)

## Random Subset Selection

As a baseline for understanding how predictive the MIS features really are, another method of feature selection simply chose a random subset of features.

In [None]:
# Candidate pool for random selection: drop the identifier and the target,
# then skip the first column and the last 23 columns by position.
# NOTE(review): the meaning of the skipped positions is not visible here --
# confirm the offsets still match the column layout of filtered_df.
candidate_features = filtered_df.drop(columns=['country_code', 'current_gdp_growth'])
X_train = candidate_features.iloc[:, 1:-23]

In [None]:
# Draw 15 columns at random (no seed is set, so each run picks a new subset).
sampled_features = X_train.sample(n=15, axis="columns")
random_columns = sampled_features.columns

In [None]:
# Hard-coded random subset: 15 indicator codes followed by the country
# identifier and the target column.
random_columns = [
    'GC.XPN.TOTL.CN',
    'DC.DAC.DNKL.CD',
    'ST.INT.DPRT',
    'SP.POP.6064.FE.5Y',
    'SL.IND.EMPL.MA.ZS',
    'DT.NFL.PRVT.CD',
    'SE.XPD.CSEC.ZS',
    'SP.URB.TOTL.IN.ZS',
    'NY.GNP.MKTP.PP.KD',
    'DC.DAC.POLL.CD',
    'TX.VAL.MRCH.R5.ZS',
    'SE.PRM.OENR.FE.ZS',
    'NE.CON.GOVT.CN',
    'SL.TLF.BASC.ZS',
    'EN.ATM.PM25.MC.M3',
    'country_code',
    'current_gdp_growth',
]

In [None]:
# Identifier and target first, then the random features.
# random_columns may be a pandas Index (from the sampling cell) or the
# hard-coded list, which has no .tolist() and already contains
# 'country_code' and 'current_gdp_growth'. Unpacking handles both, and
# dict.fromkeys removes duplicates while preserving order so no column is
# selected twice.
all_cols = list(dict.fromkeys(['country_code', 'current_gdp_growth', *random_columns]))

# Restrict every year's feature frame to those columns.
for trade in data_dict.values():
    trade.combined_features = trade.combined_features[all_cols]

In [None]:
# Persist the random-feature dictionary.
with open("../feature_dicts/random_features_dict.pkl", "wb") as pickle_file:
    pkl.dump(data_dict, pickle_file)

In [None]:
# Load a saved feature dictionary; this replaces data_dict.
# NOTE(review): the following cell saves a subset of this dictionary as
# random_dict_not_norm.pkl although this file name says "logged" -- confirm
# this is the intended source file.
with open("../feature_dicts/feature_dict_logged.pkl", "rb") as pickle_file:
    data_dict = pkl.load(pickle_file)

In [None]:
import copy

# Build random_not_norm as a separate dictionary. Plain assignment
# (random_not_norm = data_dict) would only alias data_dict, so selecting the
# random columns would also narrow the source dictionary. A shallow copy of
# each yearly object suffices because only combined_features is rebound.
random_not_norm = {}
for year, trade in data_dict.items():
    subset = copy.copy(trade)
    subset.combined_features = trade.combined_features[random_columns]
    random_not_norm[year] = subset

In [None]:
# Persist the random-feature dictionary.
with open("../feature_dicts/random_dict_not_norm.pkl", "wb") as pickle_file:
    pkl.dump(random_not_norm, pickle_file)

## Logging of Data

The models were eventually trained on non-normalized, normalized, and logged data to test which method worked best.

In [None]:
import copy

# Build the logged dictionary from copies. Plain assignment
# (data_dict_logged = data_dict) would only alias data_dict, and logging the
# columns in place would overwrite the un-logged source frames.
# NOTE(review): np.log yields NaN for negative values and -inf for zeros; this
# applies to every column except 'country_code', including
# 'current_gdp_growth' if present -- confirm that is intended.
data_dict_logged = {}
for year, trade in data_dict.items():
    logged = trade.combined_features.copy()
    value_columns = [column for column in logged.columns if column != 'country_code']
    if value_columns:
        logged[value_columns] = np.log(logged[value_columns])

    logged_trade = copy.copy(trade)
    logged_trade.combined_features = logged
    data_dict_logged[year] = logged_trade

In [None]:
# Replace +/-inf and NaN with 0, in place, in every logged frame.
for trade in data_dict_logged.values():
    logged_frame = trade.combined_features
    logged_frame.replace([np.inf, -np.inf, np.nan], 0, inplace=True)

In [None]:
# Persist the logged feature dictionary.
with open("../feature_dicts/feature_dict_logged.pkl", "wb") as pickle_file:
    pkl.dump(data_dict_logged, pickle_file)

In [None]:
# Reload the logged feature dictionary from disk.
with open("../feature_dicts/feature_dict_logged.pkl", "rb") as pickle_file:
    data_dict_logged = pkl.load(pickle_file)

In [None]:
# Load the non-normalized MIS dictionary and reuse its column set.
with open("../feature_dicts/mis_features_not_norm.pkl", "rb") as pickle_file:
    original_mis = pkl.load(pickle_file)

# The 1962 frame's columns define the MIS column list.
first_year_features = original_mis[1962].combined_features
mis_columns = first_year_features.columns.to_list()

In [None]:
import copy

# Build the logged MIS dictionary from copies. Plain assignment
# (mis_logged = data_dict) would only alias data_dict and overwrite its
# frames, and writing into a column subset without .copy() is chained
# assignment.
# NOTE(review): np.log yields NaN for negative values and -inf for zeros;
# both are replaced by 0 below -- confirm that is intended for every column.
mis_logged = {}
for year, trade in data_dict.items():
    filtered = trade.combined_features[mis_columns].copy()

    value_columns = [column for column in filtered.columns if column != 'country_code']
    if value_columns:
        filtered[value_columns] = np.log(filtered[value_columns])

    filtered = filtered.replace([np.inf, -np.inf, np.nan], 0)

    logged_trade = copy.copy(trade)
    logged_trade.combined_features = filtered
    mis_logged[year] = logged_trade

In [None]:
# Persist the logged dictionary built in the previous cell.
# NOTE(review): mis_logged is written to "random_logged.pkl"; the file name
# suggests the random subset rather than the MIS subset -- confirm the
# intended destination before relying on this file.
with open("../feature_dicts/random_logged.pkl", "wb") as pickle_file:
    pkl.dump(mis_logged, pickle_file)