In [2]:
import pandas as pd
import country_converter as coco
from functools import reduce
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from torch_geometric.utils import from_networkx
import torch

In [3]:
# Polity IV Index
polity_pd=pd.read_csv("../../../data/polity/democracy_index_polity.csv")
# Correlates of War Alliances
cow_pd=pd.read_csv("../../../data/cow_alliances/alliance_v4.1_by_dyad.csv")
# Worldwide Governance Indicators
wgi_pd=pd.read_csv("../../../data/world_bank_gi/wgidataset.csv", encoding="utf-8", sep=';')
# Varieties of Democracy
vdem_pd=pd.read_csv("../../../data/v-dem/V-Dem-CY-Core-v15.csv")
# Gini
gini_pd=pd.read_csv("../../../data/gini/API_SI.POV.GINI_DS2_en_csv_v2_38260.csv", skiprows=4 )

In [4]:
gini_pd["Country Name"]=coco.convert(names=gini_pd['Country Name'], to='ISO3')

Africa Eastern and Southern not found in regex
Africa Western and Central not found in regex
Arab World not found in regex
Central Europe and the Baltics not found in regex
Channel Islands not found in regex
Caribbean small states not found in regex
East Asia & Pacific ( not found in regex
Early-demographic dividend not found in regex
East Asia & Pacific not found in regex
Europe & Central Asia ( not found in regex
Europe & Central Asia not found in regex
Euro area not found in regex
European Union not found in regex
Fragile and conflict affected situations not found in regex
High income not found in regex
Heavily indebted poor countries (HIPC) not found in regex
IBRD only not found in regex
IDA & IBRD total not found in regex
IDA total not found in regex
IDA blend not found in regex
IDA only not found in regex
Not classified not found in regex
Latin America & Caribbean ( not found in regex
Latin America & Caribbean not found in regex
Least developed countries: UN classification not fo

In [5]:
gini_pd=gini_pd[
    (gini_pd['Country Name'] != "not found") &
    (~gini_pd['Country Name'].apply(lambda x: isinstance(x, list)))]

In [6]:
gini_pd = gini_pd.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
    var_name="year",
    value_name="gini"
)

In [7]:
gini_pd=gini_pd[gini_pd["year"]!="Unnamed: 69"]
gini_pd.rename(columns={"Country Name":"country"}, inplace=True)
gini_pd['year'] = pd.to_numeric(gini_pd['year'], errors='coerce')
gini_pd=gini_pd[["country", "year", "gini"]]

In [8]:
gini_pd.to_parquet("data/gini.parquet")

### Prepping the Polity Dataframe

In [9]:
polity_pd

Unnamed: 0,Entity,Code,Year,Democracy
0,Afghanistan,AFG,1800,-6.0
1,Afghanistan,AFG,1801,-6.0
2,Afghanistan,AFG,1802,-6.0
3,Afghanistan,AFG,1803,-6.0
4,Afghanistan,AFG,1804,-6.0
...,...,...,...,...
22735,Zimbabwe,ZWE,2014,4.0
22736,Zimbabwe,ZWE,2015,4.0
22737,Zimbabwe,ZWE,2016,4.0
22738,Zimbabwe,ZWE,2017,4.0


In [13]:

custom_map = {
    "West Germany":"Germany",
    'USSR': 'Russia',
    "Democratic Republic of Vietnam": "Vietnam"
}
polity_pd['Entity'] = polity_pd['Entity'].replace(custom_map)
polity_pd["entity_convert"]=coco.convert(names=polity_pd['Entity'], to='name_short')
polity_pd=polity_pd[polity_pd["entity_convert"]!="not found"]

Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not f

In [70]:
polity_pd

Unnamed: 0,country,year,Democracy
0,Afghanistan,1800,-6.0
1,Afghanistan,1801,-6.0
2,Afghanistan,1802,-6.0
3,Afghanistan,1803,-6.0
4,Afghanistan,1804,-6.0
...,...,...,...
22735,Zimbabwe,2014,4.0
22736,Zimbabwe,2015,4.0
22737,Zimbabwe,2016,4.0
22738,Zimbabwe,2017,4.0


In [72]:
polity_pd[polity_pd.duplicated(subset=["country", "year"], keep=False)].sort_values(by=["country", "year"])

Unnamed: 0,country,year,Democracy
4388,Colombia,1832,2.0
8259,Colombia,1832,-5.0
5192,Czechia,1918,7.0
5286,Czechia,1918,7.0
5193,Czechia,1919,7.0
...,...,...,...
22549,Yemen,1989,-5.0
22573,Yemen,1989,-7.0
22449,Yemen,1990,0.0
22550,Yemen,1990,-5.0


In [74]:
# This handles duplicates for countries with two entries for territory/year combination
polity_pd = (
    polity_pd
    .groupby(['country', 'year'], as_index=False)
    .mean(numeric_only=True)
)

In [75]:
polity_pd[polity_pd.duplicated(subset=["country", "year"], keep=False)]

Unnamed: 0,country,year,Democracy


### Prepping COW dataframe

In [31]:
cow_pd

Unnamed: 0,version4id,ccode1,state_name1,ccode2,state_name2,dyad_st_day,dyad_st_month,dyad_st_year,dyad_end_day,dyad_end_month,dyad_end_year,left_censor,right_censor,defense,neutrality,nonaggression,entente,asymmetric,version
0,1,200,United Kingdom,235,Portugal,1,1,1816,,,,1,1,1,0,1,0.0,0,4.1
1,2,200,United Kingdom,380,Sweden,1,1,1816,15.0,2.0,1911.0,1,0,0,0,0,1.0,0,4.1
2,3,240,Hanover,245,Bavaria,1,1,1838,15.0,3.0,1848.0,0,0,1,0,1,1.0,0,4.1
3,3,240,Hanover,245,Bavaria,29,11,1850,15.0,6.0,1866.0,0,0,1,0,1,1.0,0,4.1
4,3,240,Hanover,255,Germany,1,1,1838,15.0,3.0,1848.0,0,0,1,0,1,1.0,0,4.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3217,410,365,Russia,370,Belarus,3,2,2009,,,,0,1,1,0,0,0.0,0,4.1
3218,411,365,Russia,371,Armenia,20,8,2010,,,,0,1,1,0,0,1.0,1,4.1
3219,412,625,Sudan,626,South Sudan,10,2,2012,27.0,3.0,2012.0,0,0,0,0,1,0.0,0,4.1
3220,413,651,Egypt,666,Israel,26,3,1979,,,,0,1,0,0,1,0.0,0,4.1


In [32]:
cow_pd=cow_pd[(cow_pd["dyad_st_year"]>=1945)]

In [33]:
cow_pd.loc[cow_pd['state_name1'] == 'German Federal Republic', 'state_name1'] = "Germany"
cow_pd.loc[cow_pd['state_name2'] == 'German Federal Republic', 'state_name2'] = "Germany"
cow_pd["state1_convert"]=coco.convert(names=cow_pd["state_name1"], to='name_short')
cow_pd["state2_convert"]=coco.convert(names=cow_pd["state_name2"], to='name_short')
cow_pd['dyad_end_year'] = cow_pd['dyad_end_year'].astype('Int64')

Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Dem

In [34]:
cow_pd.dropna(subset=['state1_convert', "state2_convert"], inplace=True)
cow_pd=cow_pd[(cow_pd["state1_convert"]!="not found") & (cow_pd["state2_convert"]!="not found")]
cow_pd["dyad_end_year"]=cow_pd["dyad_end_year"].fillna(2025)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cow_pd.dropna(subset=['state1_convert', "state2_convert"], inplace=True)


In [35]:
cow_pd=cow_pd[["state1_convert", "state2_convert", "dyad_st_year","dyad_end_year", "left_censor", "right_censor", "defense", "neutrality", "nonaggression", "entente", "asymmetric"]]

In [38]:
cow_pd.drop_duplicates(inplace=True)

### Prepping WGI Governance Indicator

In [67]:
wgi_pd["country_convert"]=coco.convert(names=wgi_pd['countryname'], to='name_short')
wgi_pd=wgi_pd[wgi_pd["country_convert"]!="not found"]
wgi_pd.sort_values("country_convert",ascending=True)

KeyError: 'countryname'

In [68]:
wgi_pd

Unnamed: 0,country,year,estimate
0,Afghanistan,1996,-1.946667
1,Afghanistan,1998,-1.923333
2,Afghanistan,2000,-1.961667
3,Afghanistan,2002,-1.631667
4,Afghanistan,2003,-1.486667
...,...,...,...
5231,Zimbabwe,2019,-1.250000
5232,Zimbabwe,2020,-1.263333
5233,Zimbabwe,2021,-1.221667
5234,Zimbabwe,2022,-1.196667


In [69]:
wgi_pd[wgi_pd.duplicated(subset=["country", "year"])]

Unnamed: 0,country,year,estimate


### Prepping V-Dem Democracy Indices

In [41]:
vdem_pd=vdem_pd[["country_name", "country_text_id", "country_id", "year", 
                 "v2x_polyarchy", #electoral democracy index
                 "v2x_libdem", #liberal democracy index
                 "v2x_partipdem", #participatory democracy index
                 "v2x_delibdem", #deliberative democracy index
                 "v2x_egaldem" #egalitarian democracy index
                 ]]

In [42]:
vdem_pd["country_name_converted"]=coco.convert(names=vdem_pd['country_name'], to='name_short')
vdem_pd=vdem_pd[vdem_pd["country_name_converted"]!="not found"]

Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not f

In [None]:
# This handles duplicates for countries with two entries for territory/year combination
vdem_pd = (
    vdem_pd
    .groupby(['country', 'year'], as_index=False)
    .mean(numeric_only=True)
)

### Joining node features

In [44]:
polity_pd=polity_pd[["entity_convert", "Year","Democracy"]]
polity_pd.rename(columns={"entity_convert":"country", "Year":"year"}, inplace=True)
polity_pd

Unnamed: 0,country,year,Democracy
0,Afghanistan,1800,-6.0
1,Afghanistan,1801,-6.0
2,Afghanistan,1802,-6.0
3,Afghanistan,1803,-6.0
4,Afghanistan,1804,-6.0
...,...,...,...
22735,Zimbabwe,2014,4.0
22736,Zimbabwe,2015,4.0
22737,Zimbabwe,2016,4.0
22738,Zimbabwe,2017,4.0


In [45]:
vdem_pd

Unnamed: 0,country_name,country_text_id,country_id,year,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem,country_name_converted
0,Mexico,MEX,3,1789,0.028,0.044,0.006,,,Mexico
1,Mexico,MEX,3,1790,0.028,0.044,0.006,,,Mexico
2,Mexico,MEX,3,1791,0.028,0.044,0.006,,,Mexico
3,Mexico,MEX,3,1792,0.028,0.044,0.006,,,Mexico
4,Mexico,MEX,3,1793,0.028,0.044,0.006,,,Mexico
...,...,...,...,...,...,...,...,...,...,...
27485,Papal States,PPS,361,1866,0.026,0.026,0.011,,,Vatican
27486,Papal States,PPS,361,1867,0.026,0.026,0.011,,,Vatican
27487,Papal States,PPS,361,1868,0.026,0.026,0.011,,,Vatican
27488,Papal States,PPS,361,1869,0.026,0.026,0.011,,,Vatican


In [46]:
vdem_pd=vdem_pd[["country_name_converted","year", "v2x_polyarchy", "v2x_libdem","v2x_partipdem","v2x_delibdem","v2x_egaldem"]]
vdem_pd.rename(columns={"country_name_converted":"country"}, inplace=True)
vdem_pd


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vdem_pd.rename(columns={"country_name_converted":"country"}, inplace=True)


Unnamed: 0,country,year,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem
0,Mexico,1789,0.028,0.044,0.006,,
1,Mexico,1790,0.028,0.044,0.006,,
2,Mexico,1791,0.028,0.044,0.006,,
3,Mexico,1792,0.028,0.044,0.006,,
4,Mexico,1793,0.028,0.044,0.006,,
...,...,...,...,...,...,...,...
27485,Vatican,1866,0.026,0.026,0.011,,
27486,Vatican,1867,0.026,0.026,0.011,,
27487,Vatican,1868,0.026,0.026,0.011,,
27488,Vatican,1869,0.026,0.026,0.011,,


In [47]:
wgi_pd=wgi_pd[["country_convert","year", "estimate"]]
wgi_pd.rename(columns={"country_convert":"country"}, inplace=True)
wgi_pd["estimate"]=wgi_pd["estimate"].replace("..", np.nan).str.replace(",", ".", regex=False) 
wgi_pd["estimate"]=pd.to_numeric(wgi_pd["estimate"], errors="coerce")

wgi_pd=wgi_pd.groupby(['country', 'year'])['estimate'].mean().dropna().reset_index()


In [49]:
wgi_pd

Unnamed: 0,country,year,estimate
0,Afghanistan,1996,-1.946667
1,Afghanistan,1998,-1.923333
2,Afghanistan,2000,-1.961667
3,Afghanistan,2002,-1.631667
4,Afghanistan,2003,-1.486667
...,...,...,...
5231,Zimbabwe,2019,-1.250000
5232,Zimbabwe,2020,-1.263333
5233,Zimbabwe,2021,-1.221667
5234,Zimbabwe,2022,-1.196667


In [76]:

dfs = [vdem_pd, polity_pd, wgi_pd]

feature_nodes = reduce(
    lambda left, right: pd.merge(left, right, on=["country", "year"], how="outer"),
    dfs
)

In [77]:
feature_nodes[feature_nodes.duplicated()]

Unnamed: 0,country,year,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem,Democracy,estimate


In [78]:
feature_nodes["country"]=coco.convert(names=feature_nodes["country"], to='ISO3')

In [79]:
# Interpolation of countries along time dimension

columns_to_interpolate = ['v2x_polyarchy', 'v2x_libdem', 'v2x_partipdem', 'v2x_delibdem', 'v2x_egaldem', 'Democracy', 'estimate']

for col in columns_to_interpolate:
    print(col,":",feature_nodes[col].isna().sum())

v2x_polyarchy : 3953
v2x_libdem : 4799
v2x_partipdem : 4158
v2x_delibdem : 10037
v2x_egaldem : 10037
Democracy : 10207
estimate : 23860


In [80]:

# Make sure data is sorted by group and time
feature_nodes = feature_nodes.sort_values(['country', 'year'])

# Apply interpolation to each column
for col in columns_to_interpolate:
    feature_nodes[col] = feature_nodes.groupby('country')[col].transform(
        lambda g: g.interpolate(method='linear', limit_direction='both')
    )

In [81]:
for col in columns_to_interpolate:
    print(col,":",feature_nodes[col].isna().sum())

v2x_polyarchy : 848
v2x_libdem : 848
v2x_partipdem : 848
v2x_delibdem : 926
v2x_egaldem : 926
Democracy : 1952
estimate : 78


In [82]:
feature_nodes = feature_nodes[(feature_nodes["year"] >= 2000) & (feature_nodes["year"] <= 2022)]
feature_nodes.dropna(inplace=True)

In [83]:
feature_nodes.shape

(3831, 9)

In [84]:
feature_nodes[feature_nodes.duplicated(subset=["country", "year"])]

Unnamed: 0,country,year,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem,Democracy,estimate


### Edge Features 

In [85]:
cow_pd["state1_convert"]=coco.convert(names=cow_pd["state1_convert"], to='ISO3')
cow_pd["state2_convert"]=coco.convert(names=cow_pd["state2_convert"], to="ISO3")

In [86]:
cow_pd.sort_values(["state1_convert", "state2_convert","dyad_st_year"])

Unnamed: 0,state1_convert,state2_convert,dyad_st_year,dyad_end_year,left_censor,right_censor,defense,neutrality,nonaggression,entente,asymmetric
2326,AFG,CHN,1960,1979,0,0,0,0,1,0.0,0
3144,AFG,CHN,2002,2025,0,1,0,0,1,0.0,0
2848,AFG,PAK,1988,1989,0,0,0,0,1,0.0,0
3145,AFG,PAK,2002,2025,0,1,0,0,1,0.0,0
3142,AFG,TJK,2002,2025,0,1,0,0,1,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
1138,YEM,YEM,1990,1990,0,0,0,0,1,1.0,0
2181,YEM,YEM,1990,1990,0,0,1,0,1,1.0,0
2834,ZAF,SWZ,1982,2025,0,1,0,0,1,0.0,0
3214,ZMB,SDN,2006,2025,0,1,1,0,1,1.0,0


### Constructing the graph

In [87]:
cow_pd['active_years'] = cow_pd.apply(lambda row: list(range(row['dyad_st_year'], row['dyad_end_year'] + 1)), axis=1)


In [88]:
edges_expanded=cow_pd.explode("active_years")
edges_expanded = edges_expanded.rename(columns={'active_years': 'year',
                                                "state1_convert":"state1",
                                                "state2_convert":"state2"})

# Step 4: Drop start and end years if you no longer need them
edges_expanded = edges_expanded.drop(columns=["dyad_st_year","dyad_end_year","left_censor", "right_censor"])


new_position = 2  # zero-based index

cols = list(edges_expanded.columns)
cols.insert(new_position, cols.pop(cols.index("year")))
edges_expanded=edges_expanded[cols]

edges_expanded=edges_expanded[(edges_expanded["year"]<=2022) & (edges_expanded["year"]>=2000)]

In [94]:
edges_merged = edges_expanded.groupby(
    ["state1", "state2", "year"], as_index=False
)[["defense", "neutrality", "nonaggression", "entente", "asymmetric"]].max()

In [96]:
edges_merged[edges_merged.duplicated(subset=["state1", "state2", "year"], keep=False)].sort_values(by=["state1", "state2", "year"])

Unnamed: 0,state1,state2,year,defense,neutrality,nonaggression,entente,asymmetric


In [106]:
edges_merged.to_parquet("data/edge_features.parquet")

In [108]:
feature_nodes.to_parquet("data/node_features.parquet")

In [109]:
nodes_pd=pd.read_parquet("data/node_features.parquet")
edges_pd=pd.read_parquet("data/edge_features.parquet")

In [110]:
edges_pd[edges_pd.duplicated(subset=["state1", "state2", "year"])]

Unnamed: 0,state1,state2,year,defense,neutrality,nonaggression,entente,asymmetric


In [111]:
nodes_pd[nodes_pd.duplicated(subset=["country", "year"])]

Unnamed: 0,country,year,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem,Democracy,estimate
