In [1]:
import pandas as pd
import country_converter as coco
from functools import reduce
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from torch_geometric.utils import from_networkx
import torch

In [None]:
# Polity IV Index
polity_pd=pd.read_csv("data/polity/democracy_index_polity.csv")
# Correlates of War Alliances
cow_pd=pd.read_csv("data/cow_alliances/alliance_v4.1_by_dyad.csv")
# Worldwide Governance Indicators
wgi_pd=pd.read_csv("data/world_bank_gi/wgidataset.csv", encoding="utf-8", sep=';')
# Varieties of Democracy
vdem_pd=pd.read_csv("data/v-dem/V-Dem-CY-Core-v15.csv")
# Gini
gini_pd=pd.read_csv("data/API_SI.POV.GINI_DS2_en_csv_v2_38260.csv", skiprows=4 )

In [3]:
gini_pd["Country Name"]=coco.convert(names=gini_pd['Country Name'], to='name_short')

Africa Eastern and Southern not found in regex
Africa Western and Central not found in regex
Arab World not found in regex
Central Europe and the Baltics not found in regex
Channel Islands not found in regex
Caribbean small states not found in regex
East Asia & Pacific ( not found in regex
Early-demographic dividend not found in regex
East Asia & Pacific not found in regex
Europe & Central Asia ( not found in regex
Europe & Central Asia not found in regex
Euro area not found in regex
European Union not found in regex
Fragile and conflict affected situations not found in regex
High income not found in regex
Heavily indebted poor countries (HIPC) not found in regex
IBRD only not found in regex
IDA & IBRD total not found in regex
IDA total not found in regex
IDA blend not found in regex
IDA only not found in regex
Not classified not found in regex
Latin America & Caribbean ( not found in regex
Latin America & Caribbean not found in regex
Least developed countries: UN classification not fo

In [4]:
gini_pd=gini_pd[
    (gini_pd['Country Name'] != "not found") &
    (~gini_pd['Country Name'].apply(lambda x: isinstance(x, list)))]

In [5]:
gini_pd = gini_pd.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
    var_name="year",
    value_name="gini"
)

In [6]:
gini_pd=gini_pd[gini_pd["year"]!="Unnamed: 69"]
gini_pd.rename(columns={"Country Name":"country"}, inplace=True)
gini_pd['year'] = pd.to_numeric(gini_pd['year'], errors='coerce')
gini_pd=gini_pd[["country", "year", "gini"]]

In [7]:
gini_pd

Unnamed: 0,country,year,gini
0,Aruba,1960,
1,Afghanistan,1960,
2,Angola,1960,
3,Albania,1960,
4,Andorra,1960,
...,...,...,...
14035,Kosovo,2024,
14036,Yemen,2024,
14037,South Africa,2024,
14038,Zambia,2024,


### Prepping the Polity Dataframe

In [8]:
polity_pd

Unnamed: 0,Entity,Code,Year,Democracy
0,Afghanistan,AFG,1800,-6.0
1,Afghanistan,AFG,1801,-6.0
2,Afghanistan,AFG,1802,-6.0
3,Afghanistan,AFG,1803,-6.0
4,Afghanistan,AFG,1804,-6.0
...,...,...,...,...
22735,Zimbabwe,ZWE,2014,4.0
22736,Zimbabwe,ZWE,2015,4.0
22737,Zimbabwe,ZWE,2016,4.0
22738,Zimbabwe,ZWE,2017,4.0


In [9]:

custom_map = {
    "West Germany":"Germany",
    'USSR': 'Russia',
    'Serbia and Montenegro': 'Serbia',
    "Democratic Republic of Vietnam": "Vietnam"
}
polity_pd['Entity'] = polity_pd['Entity'].replace(custom_map)
polity_pd["entity_convert"]=coco.convert(names=polity_pd['Entity'], to='name_short')
polity_pd=polity_pd[polity_pd["entity_convert"]!="not found"]

Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not found in regex
Africa not f

In [10]:
polity_pd.sort_values("entity_convert")

Unnamed: 0,Entity,Code,Year,Democracy,entity_convert
0,Afghanistan,AFG,1800,-6.0,Afghanistan
126,Afghanistan,AFG,1926,-6.0,Afghanistan
127,Afghanistan,AFG,1927,-6.0,Afghanistan
128,Afghanistan,AFG,1928,-6.0,Afghanistan
129,Afghanistan,AFG,1929,-6.0,Afghanistan
...,...,...,...,...,...
22715,Zimbabwe,ZWE,1994,-6.0,Zimbabwe
22716,Zimbabwe,ZWE,1995,-6.0,Zimbabwe
22717,Zimbabwe,ZWE,1996,-6.0,Zimbabwe
22719,Zimbabwe,ZWE,1998,-6.0,Zimbabwe


### Prepping COW dataframe

In [11]:
cow_pd

Unnamed: 0,version4id,ccode1,state_name1,ccode2,state_name2,dyad_st_day,dyad_st_month,dyad_st_year,dyad_end_day,dyad_end_month,dyad_end_year,left_censor,right_censor,defense,neutrality,nonaggression,entente,asymmetric,version
0,1,200,United Kingdom,235,Portugal,1,1,1816,,,,1,1,1,0,1,0.0,0,4.1
1,2,200,United Kingdom,380,Sweden,1,1,1816,15.0,2.0,1911.0,1,0,0,0,0,1.0,0,4.1
2,3,240,Hanover,245,Bavaria,1,1,1838,15.0,3.0,1848.0,0,0,1,0,1,1.0,0,4.1
3,3,240,Hanover,245,Bavaria,29,11,1850,15.0,6.0,1866.0,0,0,1,0,1,1.0,0,4.1
4,3,240,Hanover,255,Germany,1,1,1838,15.0,3.0,1848.0,0,0,1,0,1,1.0,0,4.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3217,410,365,Russia,370,Belarus,3,2,2009,,,,0,1,1,0,0,0.0,0,4.1
3218,411,365,Russia,371,Armenia,20,8,2010,,,,0,1,1,0,0,1.0,1,4.1
3219,412,625,Sudan,626,South Sudan,10,2,2012,27.0,3.0,2012.0,0,0,0,0,1,0.0,0,4.1
3220,413,651,Egypt,666,Israel,26,3,1979,,,,0,1,0,0,1,0.0,0,4.1


In [12]:
cow_pd=cow_pd[(cow_pd["dyad_st_year"]>=1945)]

In [13]:
cow_pd.loc[cow_pd['state_name1'] == 'German Federal Republic', 'state_name1'] = "Germany"
cow_pd.loc[cow_pd['state_name2'] == 'German Federal Republic', 'state_name2'] = "Germany"
cow_pd["state1_convert"]=coco.convert(names=cow_pd["state_name1"], to='name_short')
cow_pd["state2_convert"]=coco.convert(names=cow_pd["state_name2"], to='name_short')
cow_pd['dyad_end_year'] = cow_pd['dyad_end_year'].astype('Int64')

Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
Yugoslavia not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Democratic Republic not found in regex
German Dem

In [14]:
cow_pd.dropna(subset=['state1_convert', "state2_convert"], inplace=True)
cow_pd=cow_pd[(cow_pd["state1_convert"]!="not found") & (cow_pd["state2_convert"]!="not found")]
cow_pd["dyad_end_year"]=cow_pd["dyad_end_year"].fillna(2025)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cow_pd.dropna(subset=['state1_convert', "state2_convert"], inplace=True)


In [15]:
cow_pd=cow_pd[["state1_convert", "state2_convert", "dyad_st_year","dyad_end_year", "left_censor", "right_censor", "defense", "neutrality", "nonaggression", "entente", "asymmetric"]]

In [16]:
cow_pd

Unnamed: 0,state1_convert,state2_convert,dyad_st_year,dyad_end_year,left_censor,right_censor,defense,neutrality,nonaggression,entente,asymmetric
400,Czechia,Russia,1945,1989,0,0,1,1,0,1.0,0
725,United States,Cuba,1945,1947,0,0,1,0,0,1.0,0
726,United States,Haiti,1945,1947,0,0,1,0,0,1.0,0
727,United States,Dominican Republic,1945,1947,0,0,1,0,0,1.0,0
728,United States,Mexico,1945,1947,0,0,1,0,0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...
3217,Russia,Belarus,2009,2025,0,1,1,0,0,0.0,0
3218,Russia,Armenia,2010,2025,0,1,1,0,0,1.0,1
3219,Sudan,South Sudan,2012,2012,0,0,0,0,1,0.0,0
3220,Egypt,Israel,1979,2025,0,1,0,0,1,0.0,0


### Prepping WGI Governance Indicator

In [17]:
wgi_pd["country_convert"]=coco.convert(names=wgi_pd['countryname'], to='name_short')
wgi_pd=wgi_pd[wgi_pd["country_convert"]!="not found"]
wgi_pd.sort_values("country_convert",ascending=True)

Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles (former) not found in regex
Netherlands Antilles

Unnamed: 0,codeindyr,code,countryname,year,indicator,estimate,stddev,nsource,pctrank,pctranklower,pctrankupper,country_convert
0,AFGcc1996,AFG,Afghanistan,1996,cc,-129,034,2,43,00,274,Afghanistan
14766,AFGrl2010,AFG,Afghanistan,2010,rl,-187,018,10,05,00,24,Afghanistan
30174,AFGrl2022,AFG,Afghanistan,2022,rl,-166,017,9,52,14,71,Afghanistan
14980,AFGrq2010,AFG,Afghanistan,2010,rq,-152,018,6,57,24,91,Afghanistan
856,AFGrq1996,AFG,Afghanistan,1996,rq,-209,039,1,16,00,60,Afghanistan
...,...,...,...,...,...,...,...,...,...,...,...,...
20971,ZWEge2015,ZWE,Zimbabwe,2015,ge,-125,017,12,90,48,167,Zimbabwe
20757,ZWEcc2015,ZWE,Zimbabwe,2015,cc,-134,013,15,67,19,119,Zimbabwe
20543,ZWEva2014,ZWE,Zimbabwe,2014,va,-126,012,14,138,74,202,Zimbabwe
20115,ZWErl2014,ZWE,Zimbabwe,2014,rl,-144,014,15,38,14,72,Zimbabwe


### Prepping V-Dem Democracy Indices

In [18]:
vdem_pd=vdem_pd[["country_name", "country_text_id", "country_id", "year", 
                 "v2x_polyarchy", #electoral democracy index
                 "v2x_libdem", #liberal democracy index
                 "v2x_partipdem", #participatory democracy index
                 "v2x_delibdem", #deliberative democracy index
                 "v2x_egaldem" #egalitarian democracy index
                 ]]

In [19]:
vdem_pd["country_name_converted"]=coco.convert(names=vdem_pd['country_name'], to='name_short')
vdem_pd=vdem_pd[vdem_pd["country_name_converted"]!="not found"]

Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not found in regex
Republic of Vietnam not f

In [20]:
vdem_pd.sort_values("country_name_converted")


Unnamed: 0,country_name,country_text_id,country_id,year,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem,country_name_converted
5382,Afghanistan,AFG,36,1851,0.020,0.029,0.022,,,Afghanistan
5358,Afghanistan,AFG,36,1827,0.020,0.029,0.022,,,Afghanistan
5359,Afghanistan,AFG,36,1828,0.020,0.029,0.022,,,Afghanistan
5360,Afghanistan,AFG,36,1829,0.020,0.029,0.022,,,Afghanistan
5361,Afghanistan,AFG,36,1830,0.020,0.029,0.022,,,Afghanistan
...,...,...,...,...,...,...,...,...,...,...
9387,Zimbabwe,ZWE,62,1934,0.264,0.175,0.135,0.059,0.047,Zimbabwe
9386,Zimbabwe,ZWE,62,1933,0.264,0.175,0.136,0.059,0.047,Zimbabwe
9385,Zimbabwe,ZWE,62,1932,0.264,0.175,0.136,0.059,0.047,Zimbabwe
9398,Zimbabwe,ZWE,62,1945,0.255,0.176,0.132,0.057,0.048,Zimbabwe


### Joining node features

In [21]:
polity_pd=polity_pd[["entity_convert", "Year","Democracy"]]
polity_pd.rename(columns={"entity_convert":"country", "Year":"year"}, inplace=True)
polity_pd

Unnamed: 0,country,year,Democracy
0,Afghanistan,1800,-6.0
1,Afghanistan,1801,-6.0
2,Afghanistan,1802,-6.0
3,Afghanistan,1803,-6.0
4,Afghanistan,1804,-6.0
...,...,...,...
22735,Zimbabwe,2014,4.0
22736,Zimbabwe,2015,4.0
22737,Zimbabwe,2016,4.0
22738,Zimbabwe,2017,4.0


In [22]:
vdem_pd

Unnamed: 0,country_name,country_text_id,country_id,year,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem,country_name_converted
0,Mexico,MEX,3,1789,0.028,0.044,0.006,,,Mexico
1,Mexico,MEX,3,1790,0.028,0.044,0.006,,,Mexico
2,Mexico,MEX,3,1791,0.028,0.044,0.006,,,Mexico
3,Mexico,MEX,3,1792,0.028,0.044,0.006,,,Mexico
4,Mexico,MEX,3,1793,0.028,0.044,0.006,,,Mexico
...,...,...,...,...,...,...,...,...,...,...
27485,Papal States,PPS,361,1866,0.026,0.026,0.011,,,Vatican
27486,Papal States,PPS,361,1867,0.026,0.026,0.011,,,Vatican
27487,Papal States,PPS,361,1868,0.026,0.026,0.011,,,Vatican
27488,Papal States,PPS,361,1869,0.026,0.026,0.011,,,Vatican


In [23]:
vdem_pd=vdem_pd[["country_name_converted","year", "v2x_polyarchy", "v2x_libdem","v2x_partipdem","v2x_delibdem","v2x_egaldem"]]
vdem_pd.rename(columns={"country_name_converted":"country"}, inplace=True)
vdem_pd


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vdem_pd.rename(columns={"country_name_converted":"country"}, inplace=True)


Unnamed: 0,country,year,v2x_polyarchy,v2x_libdem,v2x_partipdem,v2x_delibdem,v2x_egaldem
0,Mexico,1789,0.028,0.044,0.006,,
1,Mexico,1790,0.028,0.044,0.006,,
2,Mexico,1791,0.028,0.044,0.006,,
3,Mexico,1792,0.028,0.044,0.006,,
4,Mexico,1793,0.028,0.044,0.006,,
...,...,...,...,...,...,...,...
27485,Vatican,1866,0.026,0.026,0.011,,
27486,Vatican,1867,0.026,0.026,0.011,,
27487,Vatican,1868,0.026,0.026,0.011,,
27488,Vatican,1869,0.026,0.026,0.011,,


In [24]:
wgi_pd=wgi_pd[["country_convert","year", "estimate"]]
wgi_pd.rename(columns={"country_convert":"country"}, inplace=True)
wgi_pd["estimate"]=wgi_pd["estimate"].replace("..", np.nan).str.replace(",", ".", regex=False) 
wgi_pd["estimate"]=pd.to_numeric(wgi_pd["estimate"], errors="coerce")

wgi_pd=wgi_pd.groupby(['country', 'year'])['estimate'].mean().dropna().reset_index()


In [25]:
wgi_pd

Unnamed: 0,country,year,estimate
0,Afghanistan,1996,-1.946667
1,Afghanistan,1998,-1.923333
2,Afghanistan,2000,-1.961667
3,Afghanistan,2002,-1.631667
4,Afghanistan,2003,-1.486667
...,...,...,...
5231,Zimbabwe,2019,-1.250000
5232,Zimbabwe,2020,-1.263333
5233,Zimbabwe,2021,-1.221667
5234,Zimbabwe,2022,-1.196667


In [26]:

dfs = [vdem_pd, polity_pd, wgi_pd]

feature_nodes = reduce(
    lambda left, right: pd.merge(left, right, on=["country", "year"], how="outer"),
    dfs
)

In [27]:
feature_nodes["country_code"]=coco.convert(names=feature_nodes["country"], to='ISO3')

In [28]:
# Interpolation of countries along time dimension

columns_to_interpolate = ['v2x_polyarchy', 'v2x_libdem', 'v2x_partipdem', 'v2x_delibdem', 'v2x_egaldem', 'Democracy', 'estimate']

for col in columns_to_interpolate:
    print(col,":",feature_nodes[col].isna().sum())

v2x_polyarchy : 3953
v2x_libdem : 4799
v2x_partipdem : 4158
v2x_delibdem : 10038
v2x_egaldem : 10038
Democracy : 10295
estimate : 24205


In [29]:

# Make sure data is sorted by group and time
feature_nodes = feature_nodes.sort_values(['country_code', 'year'])

# Apply interpolation to each column
for col in columns_to_interpolate:
    feature_nodes[col] = feature_nodes.groupby('country')[col].transform(
        lambda g: g.interpolate(method='linear', limit_direction='both')
    )

In [30]:
for col in columns_to_interpolate:
    print(col,":",feature_nodes[col].isna().sum())

v2x_polyarchy : 848
v2x_libdem : 848
v2x_partipdem : 848
v2x_delibdem : 926
v2x_egaldem : 926
Democracy : 1973
estimate : 78


In [31]:
feature_nodes = feature_nodes[(feature_nodes["year"] >= 2000) & (feature_nodes["year"] <= 2022)]
feature_nodes.dropna(inplace=True)

In [32]:
feature_nodes.shape

(3861, 10)

### Edge Features 

In [33]:
cow_pd.sort_values(["state1_convert", "dyad_st_year"])

Unnamed: 0,state1_convert,state2_convert,dyad_st_year,dyad_end_year,left_censor,right_censor,defense,neutrality,nonaggression,entente,asymmetric
2326,Afghanistan,China,1960,1979,0,0,0,0,1,0.0,0
2848,Afghanistan,Pakistan,1988,1989,0,0,0,0,1,0.0,0
3141,Afghanistan,Turkmenistan,2002,2025,0,1,0,0,1,0.0,0
3142,Afghanistan,Tajikistan,2002,2025,0,1,0,0,1,0.0,0
3143,Afghanistan,Uzbekistan,2002,2025,0,1,0,0,1,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
2189,Yemen,Bahrain,1990,2025,0,1,1,0,1,1.0,0
2190,Yemen,Qatar,1990,2025,0,1,1,0,1,1.0,0
2191,Yemen,United Arab Emirates,1990,2025,0,1,1,0,1,1.0,0
3214,Zambia,Sudan,2006,2025,0,1,1,0,1,1.0,0


### Constructing the graph

In [34]:
G = nx.Graph()

# Add nodes
for _, row in feature_nodes.iterrows():
    node_id = (row['country'], row['year'])
    G.add_node(node_id, **row.to_dict())

#Add edges
for _, row in cow_pd.iterrows():
    c1 = row['state1_convert']
    c2 = row['state2_convert']
    start = row['dyad_st_year']
    end = row['dyad_end_year']
    try:
        start = int(start)
        end = int(end)
    except ValueError:
        continue

    # Enforce consistent order (e.g., alphabetically)
    country_a, country_b = sorted([c1, c2])

    for year in range(start, end + 1):
        node_a = (country_a, year)
        node_b = (country_b, year)

        if node_a in G and node_b in G:
            G.add_edge(node_a, node_b, alliance_year=year)


In [None]:
# # Define the year range
# year_range = range(2000, 2023)  # 2010 to 2022 inclusive

# # Select nodes whose second element falls within the year range
# nodes_for_years = [node for node in G.nodes if node[1] in year_range]

# # Create subgraph
# G_years = G.subgraph(nodes_for_years).copy()

In [None]:
# # Filter nodes that have a non-NaN gini value
# nodes_with_gini = [n for n, attr in G.nodes(data=True) if 'gini' in attr and pd.notna(attr['gini'])]

# # Create the subgraph
# G_gini = G.subgraph(nodes_with_gini).copy()

In [35]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")


Number of nodes: 3831
Number of edges: 22171


In [37]:
len(list(G.neighbors(('United States', 2005))))

46

In [38]:

nx.write_graphml(G, "political_graph.graphml")