# Create CSV files for gephi visualizations

In [1]:
import pickle as pkl
import networkx as nx
import pandas as pd
import numpy as np
import geopandas as gpd

In [2]:
with open('../graphs/exports_agg.pkl', 'rb') as file:
    export_agg = pkl.load(file)

## Standardizing Export Values

In [4]:
export_partners = []
for i in export_agg:
    df = i.pivot(index='location_code', columns='partner_code', values='export_value')
    df = df.fillna(0)
    export_partners.append(df)

max_index = []
max_len = len(export_partners[0].index)
for i in export_partners:
    if len(i.index) > max_len:
        max_index = i.index

for i in export_partners:
    for j in max_index:
        if j not in i.columns:
            i[j] = 0
        if j not in i.index:
            i.loc[j] = 0

max_columns = export_partners[0].columns
max_len = len(export_partners[0].columns)
for i in export_partners:
    if len(i.columns) > max_len:
        max_columns = i.columns

for i in export_partners:
    for j in max_columns:
        if j not in i.columns:
            i[j] = 0
        if j not in i.index:
            i.loc[j] = 0

for i in export_partners:
    if len(i) != 240:
        missing_cols = export_partners[38].columns.difference(i.columns)
        for j in missing_cols:
            i[j] = 0
            i.loc[j] = 0

for i in export_partners:
    if len(i.index)!= 240:
        missing_index = export_partners[38].index.difference(i.index)
        for j in missing_index:
            i.loc[j] = 0

export_graphs = []

for y in export_partners:
    G = nx.from_pandas_adjacency(y, create_using=nx.DiGraph())
    export_graphs.append(G)

In [27]:
with open('../csv_files/exports_graphs_raw.pkl', 'wb') as file:
    pkl.dump(export_graphs, file)

In [31]:
len(export_graphs[0].nodes())

240

## Format for Gephi

In [6]:
years = range(1962,2022)
for i, g in enumerate(export_graphs):
    nx.set_edge_attributes(g, f'<[{years[i]}, {years[i]+1}]>', 'timeset')

graph_df = pd.DataFrame()
for i, g in enumerate(export_graphs):
    graph_df = pd.concat([graph_df,nx.to_pandas_edgelist(g)], ignore_index=True)

graph_df = graph_df.rename(columns = {'source':'Source','target':'Target'})

df_sorted = graph_df.sort_values(by=['Source','timeset','weight'], ascending=[True, True,False])

## Get regions

In [7]:
locations = pd.read_stata("../location_classifications/location.dta")
locations['location_id'] = locations['location_id'].astype(int)
locations.drop(columns = ['location_id','location_name_short_en','level'], inplace = True)

In [8]:
regions = locations['parent_id'].unique()
regions = regions[np.logical_and(~np.isnan(regions), regions != 358)]

In [9]:
region_mappings = {354:'Oceania',
                   353:'Asia',
                   352:'Africa',
                   355:'Europe',
                   356:'North America',
                   357:'South America'}

## Get Latitude and Longitude

In [10]:
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Get the ISO codes that are present in the world shapefile
valid_iso_codes = world['iso_a3'].unique()

# Remove the rows with ISO codes that are not present in the world shapefile
countries = df_sorted[df_sorted['Source'].isin(valid_iso_codes)]

# Define a function to get the latitude and longitude of the centroid of a country
def get_centroid(iso_code):
    country_geometry = world.loc[world['iso_a3'] == iso_code, 'geometry'].values[0]
    centroid = country_geometry.centroid
    return pd.Series({'latitude': centroid.y, 'longitude': centroid.x})

# Apply the function to each ISO code in the DataFrame
countries[['latitude_source', 'longitude_source']] = countries['Source'].apply(get_centroid)

  world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  countries[['latitude_source', 'longitude_source']] = countries['Source'].apply(get_centroid)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  countries[['latitude_source', 'longitude_source']] = countries['Source'].apply(get_centroid)


In [11]:
countries = countries[countries['Target'].isin(valid_iso_codes)]
countries[['latitude_target', 'longitude_target']] = countries['Target'].apply(get_centroid)

In [12]:
countries

Unnamed: 0,Source,Target,weight,timeset,latitude_source,longitude_source,latitude_target,longitude_target
12,AFG,RUS,51561720.0,"<[1962, 1963]>",33.856399,66.086690,61.980841,96.875223
14,AFG,USA,25773514.0,"<[1962, 1963]>",33.856399,66.086690,45.705628,-112.599436
2,AFG,DEU,25624158.0,"<[1962, 1963]>",33.856399,66.086690,51.133723,10.288485
5,AFG,GBR,21661120.0,"<[1962, 1963]>",33.856399,66.086690,53.914773,-2.853135
6,AFG,IND,14894652.0,"<[1962, 1963]>",33.856399,66.086690,22.925006,79.593704
...,...,...,...,...,...,...,...,...
921868,ZWE,EST,350250.0,"<[2021, 2022]>",-18.906988,29.788548,58.643695,25.824726
921895,ZWE,MDA,338042.0,"<[2021, 2022]>",-18.906988,29.788548,47.203676,28.410483
921863,ZWE,DNK,278840.0,"<[2021, 2022]>",-18.906988,29.788548,56.063934,9.876373
921890,ZWE,LAO,264000.0,"<[2021, 2022]>",-18.906988,29.788548,18.444978,103.750260


In [13]:
unique_values = pd.concat([countries['Source'], countries['Target']]).unique()

# Create a new DataFrame with these unique values
unique_countries_df = pd.DataFrame(unique_values, columns=['Source'])

In [14]:
targets_df = countries[['Target', 'latitude_target', 'longitude_target']].drop_duplicates()
targets_df.rename(columns = {'Target':'Source', 'latitude_target':'latitude', 'longitude_target':'longitude'}, inplace = True)

In [15]:
node_info = unique_countries_df.merge(targets_df, on='Source')
node_info.drop_duplicates(inplace = True)

In [16]:
node_info['id'] = node_info.index
node_info.rename(columns = {'Source':'Label'}, inplace = True)

In [17]:
df_with_regions = node_info.merge(locations, left_on = "Label", right_on = "location_code")
df_with_regions['region'] = df_with_regions['parent_id'].map(region_mappings)

In [18]:
df_with_regions

Unnamed: 0,Label,latitude,longitude,id,location_code,parent_id,region
0,AFG,33.856399,66.086690,0,AFG,353.0,Asia
1,AGO,-12.245869,17.470573,1,AGO,352.0,Africa
2,ALB,41.141353,20.032426,2,ALB,355.0,Europe
3,ARE,23.868634,54.206715,3,ARE,353.0,Asia
4,ARG,-35.446821,-65.175361,4,ARG,357.0,South America
...,...,...,...,...,...,...,...
168,VUT,-15.542677,167.073751,168,VUT,354.0,Oceania
169,YEM,15.913232,47.535045,169,YEM,353.0,Asia
170,ZAF,-28.947033,25.048014,170,ZAF,352.0,Africa
171,ZMB,-13.395068,27.727592,171,ZMB,352.0,Africa


In [32]:
with open('../csv_files/train_clusters.pkl', 'rb') as file:
    clusters = pkl.load(file)

In [19]:
iso_to_id = df_with_regions.set_index('Label')['id'].to_dict()

countries['Source'] = countries['Source'].map(iso_to_id)
countries['Target'] = countries['Target'].map(iso_to_id)

In [20]:
countries

Unnamed: 0,Source,Target,weight,timeset,latitude_source,longitude_source,latitude_target,longitude_target
12,0,133,51561720.0,"<[1962, 1963]>",33.856399,66.086690,61.980841,96.875223
14,0,164,25773514.0,"<[1962, 1963]>",33.856399,66.086690,45.705628,-112.599436
2,0,40,25624158.0,"<[1962, 1963]>",33.856399,66.086690,51.133723,10.288485
5,0,57,21661120.0,"<[1962, 1963]>",33.856399,66.086690,53.914773,-2.853135
6,0,73,14894652.0,"<[1962, 1963]>",33.856399,66.086690,22.925006,79.593704
...,...,...,...,...,...,...,...,...
921868,172,50,350250.0,"<[2021, 2022]>",-18.906988,29.788548,58.643695,25.824726
921895,172,99,338042.0,"<[2021, 2022]>",-18.906988,29.788548,47.203676,28.410483
921863,172,42,278840.0,"<[2021, 2022]>",-18.906988,29.788548,56.063934,9.876373
921890,172,89,264000.0,"<[2021, 2022]>",-18.906988,29.788548,18.444978,103.750260


In [22]:
df_with_regions.to_csv('../csv_files/country_attributes.csv', index = False)

In [24]:
countries.drop(columns = ['latitude_source','longitude_source','latitude_target','longitude_target'], inplace = True)

In [26]:
countries.to_csv('../csv_files/source_targets_years.csv', index = False)