In [1]:
from pathlib import Path

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np


## We need to link to the datasets. These are too large to put on github, perhaps we should set up a dvc?
# !! UPDATE PATHS AS NEEDED  !!
# Matti here, saving his paths yeehaw
# C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\BACI_sets    (or acled_sets, or gravity_sets)

BACI_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\BACI_sets"
BACI_folder_path = Path(BACI_folder_path_init).as_posix()

ACLED_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\acled_sets"
ACLED_folder_path = Path(ACLED_folder_path_init).as_posix()

Gravity_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\gravity_sets"
Gravity_folder_path = Path(Gravity_folder_path_init).as_posix()

## Combining all of ACLED Africa, Gravity & Refugee

In [2]:
# New link - We should combine all ACLED data from Africa with the Gravity dataset

## THIS IS DEPRECATED - IGNORE unless your name is Zen Rehda


#acled_af = f"{ACLED_folder_path}/africa_acled.csv"
#gravity = f"{Gravity_folder_path}/Gravity_V202211.csv"

#df_a = pd.read_csv(acled_af)
#df_g = pd.read_csv(gravity)


### 01. Prepare ACLED Africa

This run was done locally on Mattis Multimachine (Therefore we re-link the two folders above)
<!--  -->
We need to clean up the ACLED data and only keep relevant data.

In [3]:
acled_af = f"{ACLED_folder_path}/ACLEDAfricaData_1997_2026-02-02.csv"
gravity = f"{Gravity_folder_path}/Gravity_V202211.csv"

df_a = pd.read_csv(acled_af)
df_g = pd.read_csv(gravity)

# We can filter to the relevant columns we want - More can be added here, but update dummy code below if need be
#   inter1 is the perpetrator, inter2 is the target
df_a_filter = df_a[["country", "year", "disorder_type", "event_type", "inter1", "inter2", "fatalities"]
].copy()

# We save a list of all unique possible values for the columns for future use and reference.
country_list = df_a_filter["country"].unique()
country_list.sort()
disorder_types = df_a_filter["disorder_type"].unique()
event_types = df_a_filter["event_type"].unique()
attack_groups = df_a_filter["inter1"].unique()
target_groups = df_a_filter["inter2"].unique()

# Now we need to link the countries to the tags in Gravity, so the countries can be linked between the two datasets
df_a_filter.tail(5)

  df_g = pd.read_csv(gravity)


Unnamed: 0,country,year,disorder_type,event_type,inter1,inter2,fatalities
418055,Niger,2025,Political violence,Violence against civilians,Political militia,Civilians,0
418056,Cameroon,2025,Strategic developments,Strategic developments,Political militia,Civilians,0
418057,Cameroon,2025,Political violence,Violence against civilians,Rebel group,Civilians,0
418058,Cameroon,2025,Political violence,Violence against civilians,Rebel group,Civilians,0
418059,Cameroon,2025,Political violence,Violence against civilians,Rebel group,Civilians,0


In [4]:
df = df_a_filter.copy()

# We create dummy values for each type of disorder, event, attackers and target
dummies = pd.get_dummies(
    df[['disorder_type', 'event_type', 'inter1', 'inter2']],
    prefix=['disorder', 'event', 'perpetrator', 'target']
)

# We add the numeric columns back to the dummy dataset
dummies['fatalities'] = df['fatalities']
dummies['country'] = df['country']
dummies['year'] = df['year']

# Now we can group by country and year, and sum over the dummy categories. Perfect!
acled_result = dummies.groupby(['country', 'year']).sum().reset_index()

## We want to map the countries from ACLED onto the iso names from the Gravity dataset. I made this stupid dictionary BY HAND, please respect that.
country_iso_dict = {
    "Algeria": "DZA", "Angola": "AGO", 'Benin': "BEN", 'Botswana': "BWA", 'Burkina Faso': "BFA", "Burundi": "BDI", 'Cameroon': "CMR", 'Cape Verde': "CPV",
    "Central African Republic": "CAF", 'Chad': "TCD", 'Comoros': "COM", 'Democratic Republic of Congo': "COD", 'Djibouti': "DJI", 'Egypt': "EGY",
    'Equatorial Guinea': "GNQ", 'Eritrea': "ERI", 'Ethiopia': "ETH", 'Gabon': "GAB",'Gambia': "GMB", 'Ghana': "GHA", 'Guinea': "GIN", 'Guinea-Bissau': "GNB", 
    'Ivory Coast': "CIV", 'Kenya': "KEN", 'Lesotho': "LSO", 'Liberia': "LBR", 'Libya': "LBY", 'Madagascar': "MDG", 'Malawi': "MWI",'Mali': "MLI", 
    'Mauritania': "MRT", 'Mauritius': "MUS", 'Mayotte': "MYT", 'Morocco': "MAR",'Mozambique': "MOZ", 'Namibia': "NAM", 'Niger': "NER", 'Nigeria': "NGA", 
    'Republic of Congo': "COG", 'Reunion': "REU", 'Rwanda': "RWA", 'Saint Helena, Ascension and Tristan da Cunha': "SHN", 'Sao Tome and Principe': "STP", 
    'Senegal': "SEN", 'Seychelles': "SYC", 'Sierra Leone': "SLE", 'Somalia': "SOM", 'South Africa': "ZAF", 'South Sudan': "SSD", 'Sudan': "SDN", 
    'Tanzania': "TZA", 'Togo': "TGO", 'Tunisia': "TUN", 'Uganda': "UGA", 'Zambia': "ZMB", 'Zimbabwe': "ZWE", 'eSwatini': "SWZ"
}

# We update the dataframe to have a new column for the iso-tags.
acled_result["iso"] = acled_result["country"].map(country_iso_dict)

# ACLED is now ready for merging

### 02. Prepare Gravity

Gravity requires some extensive cleaning, as the dataset has a lot of superfluous data.

In [5]:
display(df_g.columns)

Index(['year', 'country_id_o', 'country_id_d', 'iso3_o', 'iso3_d', 'iso3num_o',
       'iso3num_d', 'country_exists_o', 'country_exists_d',
       'gmt_offset_2020_o', 'gmt_offset_2020_d', 'distw_harmonic',
       'distw_arithmetic', 'distw_harmonic_jh', 'distw_arithmetic_jh', 'dist',
       'main_city_source_o', 'main_city_source_d', 'distcap', 'contig',
       'diplo_disagreement', 'scaled_sci_2021', 'comlang_off', 'comlang_ethno',
       'comcol', 'col45', 'legal_old_o', 'legal_old_d', 'legal_new_o',
       'legal_new_d', 'comleg_pretrans', 'comleg_posttrans',
       'transition_legalchange', 'comrelig', 'heg_o', 'heg_d', 'col_dep_ever',
       'col_dep', 'col_dep_end_year', 'col_dep_end_conflict', 'empire',
       'sibling_ever', 'sibling', 'sever_year', 'sib_conflict', 'pop_o',
       'pop_d', 'gdp_o', 'gdp_d', 'gdpcap_o', 'gdpcap_d', 'pop_source_o',
       'pop_source_d', 'gdp_source_o', 'gdp_source_d', 'gdp_ppp_o',
       'gdp_ppp_d', 'gdpcap_ppp_o', 'gdpcap_ppp_d', 'pop_pwt_o',

In [6]:
# First, we define the parameters that sounded interesting to us after going over the documentation. Of these we will need to find the statistical significant ones 
target = ["iso3_o", "iso3_d", "country_exists_o", "country_exists_d", "distw_harmonic", "distw_arithmetic", "dist", "distcap", "diplo_disagreement", "scaled_sci_2021", "comlang_off", "comlang_ethno", "comleg_posttrans", "comrelig", "heg_o", "heg_d", "col_dep_ever", "col_dep", "col_dep_end_conflict", "sibling_ever", "sibling", "sever_year", "gdpcap_ppp", "wto", "eu", "fta_wto", "rta_type", "entry_tp", "tradeflow_comtrade_o", "tradeflow_comtrade_d", "tradeflow_baci", "manuf_tradeflow_baci", "tradeflow_imf_o", "tradeflow_imf_d"]


# Some parameters in the Gravity dataset differ from the documentation. These are below:
emp = []
for i in df_g.columns:
    if i in target:
        emp.append(i)

for i in target:
    if i not in emp:
        print(i)


gdpcap_ppp
wto
eu
entry_tp


In [7]:
## We add the missing parameters:
# gdpcap_ppp_o and d
# wto_o and d
# eu_o and d
# entry_tp_o and d

# Now we can start cleaning up the Gravity dataset to prepare for combining with the ACLED dataframe above

df_g_filter = df_g[["year", "iso3_o", "iso3_d", "country_exists_o", "country_exists_d", "distw_harmonic", "distw_arithmetic", "dist", "distcap", "diplo_disagreement", "scaled_sci_2021", "comlang_off", "comlang_ethno", "comleg_posttrans", "comrelig", "heg_o", "heg_d", "col_dep_ever", "col_dep", "col_dep_end_conflict", "sibling_ever", "sibling", "sever_year", "gdpcap_ppp_o", "gdpcap_ppp_d", "wto_o", "wto_d", "eu_o", "eu_d","fta_wto", "rta_type", "entry_tp_o", "entry_tp_d", "tradeflow_comtrade_o", "tradeflow_comtrade_d", "tradeflow_baci", "manuf_tradeflow_baci", "tradeflow_imf_o", "tradeflow_imf_d"]
]

display(df_g_filter.head())

Unnamed: 0,year,iso3_o,iso3_d,country_exists_o,country_exists_d,distw_harmonic,distw_arithmetic,dist,distcap,diplo_disagreement,...,fta_wto,rta_type,entry_tp_o,entry_tp_d,tradeflow_comtrade_o,tradeflow_comtrade_d,tradeflow_baci,manuf_tradeflow_baci,tradeflow_imf_o,tradeflow_imf_d
0,1948,ABW,ABW,0,0,,,,,,...,,,,,,,,,,
1,1949,ABW,ABW,0,0,,,,,,...,,,,,,,,,,
2,1950,ABW,ABW,0,0,,,,,,...,,,,,,,,,,
3,1951,ABW,ABW,0,0,,,,,,...,,,,,,,,,,
4,1952,ABW,ABW,0,0,,,,,,...,,,,,,,,,,


In [8]:
country_iso_dict.values()

dict_values(['DZA', 'AGO', 'BEN', 'BWA', 'BFA', 'BDI', 'CMR', 'CPV', 'CAF', 'TCD', 'COM', 'COD', 'DJI', 'EGY', 'GNQ', 'ERI', 'ETH', 'GAB', 'GMB', 'GHA', 'GIN', 'GNB', 'CIV', 'KEN', 'LSO', 'LBR', 'LBY', 'MDG', 'MWI', 'MLI', 'MRT', 'MUS', 'MYT', 'MAR', 'MOZ', 'NAM', 'NER', 'NGA', 'COG', 'REU', 'RWA', 'SHN', 'STP', 'SEN', 'SYC', 'SLE', 'SOM', 'ZAF', 'SSD', 'SDN', 'TZA', 'TGO', 'TUN', 'UGA', 'ZMB', 'ZWE', 'SWZ'])

In [9]:
''' We can now apply a lot of clean-up filters: 

- remove any rows for years before 1997
- remove rows where any of the two countries do not exist
- remove rows where origin and destination country is identical (these are silly)
- remove rows where neither origin or destination is in Africa (this we can change later if we go beyond Africa)
'''

# We can now apply a lot of clean-up filters: 
# 
# Filter if any of the countries do not exist AND remove any data from before the year 1997.

df_g_filter = df_g_filter[
    (df_g["year"] >= 1997) &
    (df_g["country_exists_o"] == 1) &
    (df_g["country_exists_d"] == 1) &
    (df_g["iso3_o"] != df_g["iso3_d"]) &
    (
        (df_g["iso3_o"].isin(country_iso_dict.values())) |
        (df_g["iso3_d"].isin(country_iso_dict.values()))
    )
]

# This gives us ~1420 origin rows per country 

# Gravity is now ready for merging.

In [10]:
df_g_filter

Unnamed: 0,year,iso3_o,iso3_d,country_exists_o,country_exists_d,distw_harmonic,distw_arithmetic,dist,distcap,diplo_disagreement,...,fta_wto,rta_type,entry_tp_o,entry_tp_d,tradeflow_comtrade_o,tradeflow_comtrade_d,tradeflow_baci,manuf_tradeflow_baci,tradeflow_imf_o,tradeflow_imf_d
197,1997,ABW,AGO,1,1,9590.0,9593.0,9505.0,9505.0,,...,0.0,,,,,,,,,
198,1998,ABW,AGO,1,1,9590.0,9593.0,9505.0,9505.0,,...,0.0,,,,,,,,,
199,1999,ABW,AGO,1,1,9590.0,9593.0,9505.0,9505.0,,...,0.0,,,,,,,,,
200,2000,ABW,AGO,1,1,9584.0,9587.0,9505.0,9505.0,,...,0.0,,,,,,,,,
201,2001,ABW,AGO,1,1,9584.0,9587.0,9505.0,9505.0,,...,0.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4699217,2017,ZWE,ZMB,1,1,484.0,511.0,399.0,399.0,1.266,...,1.0,4.0,70.0,15.5,58320.884,74451.111,82443.668,57765.023,58352.516,74509.453
4699218,2018,ZWE,ZMB,1,1,484.0,511.0,399.0,399.0,0.791,...,1.0,4.0,41.0,15.5,66557.311,73844.045,78839.303,60417.984,66580.102,74169.570
4699219,2019,ZWE,ZMB,1,1,484.0,511.0,399.0,399.0,0.124,...,1.0,4.0,36.0,15.5,59552.535,61651.194,77331.594,58542.078,59552.559,61790.566
4699220,2020,ZWE,ZMB,1,1,479.0,505.0,399.0,399.0,0.226,...,1.0,4.0,,,52563.421,62310.411,71653.898,53614.934,52563.441,62381.145


### 03. Preparing the refugee data.

We have a small extra dataset for refugee data that can be added for extra fun!

In [11]:
refugee_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\bachelor_2026\data\sit_sets\persons_of_concern.csv"
refugee_path = Path(refugee_path_init).as_posix()

# Now we can include the refugee data.
df_s = pd.read_csv(refugee_path)


In [12]:
# Let's rename the columns to fit the conventions of Gravity
df_s = df_s.rename(columns={
    "Year": "year",
    "Country of Asylum": "ref_country_d",
    "Country of Origin": "ref_country_o",
    "Country of Asylum ISO": "ref_iso_d",
    "Country of Origin ISO": "ref_iso_o"    
})

In [13]:
df_s.columns


Index(['year', 'ref_country_d', 'ref_country_o', 'ref_iso_d', 'ref_iso_o',
       'Refugees', 'Asylum-seekers', 'IDPs',
       'Other people in need of international protection', 'Stateless persons',
       'Host community', 'Others of concern'],
      dtype='object')

In [14]:
'''
WIP for the data filtering on the refugee data. We want to remove:

- We remove all rows from before 1997
- We remove all refugees whose origin is "Unknown" (we want the country of origin to be known for training)
'''

df_s_filter = df_s[
    (df_s["year"] >= 1997) &                            
    (df_s["ref_country_o"] != "Unknown ")
]



df_s_filter = df_s_filter[
    (df_s["year"] >= 1997) &
    (df_s["ref_iso_o"] != df_s["ref_iso_d"]) &
    (
        (df_s["ref_iso_o"].isin(country_iso_dict.values())) |
        (df_s["ref_iso_d"].isin(country_iso_dict.values()))
    )
]

df_s_filter

  df_s_filter = df_s_filter[


Unnamed: 0,year,ref_country_d,ref_country_o,ref_iso_d,ref_iso_o,Refugees,Asylum-seekers,IDPs,Other people in need of international protection,Stateless persons,Host community,Others of concern
12087,1997,Algeria,Mali,DZA,MLI,3200,0,0,0,0,0,0
12088,1997,Algeria,Niger,DZA,NER,2516,0,0,0,0,0,0
12089,1997,Algeria,Rwanda,DZA,RWA,11,0,0,0,0,0,0
12090,1997,Algeria,Western Sahara,DZA,ESH,165000,0,0,0,0,0,0
12092,1997,Angola,Burundi,AGO,BDI,5,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
138209,2025,Zimbabwe,Somalia,ZWE,SOM,26,25,0,0,0,0,0
138210,2025,Zimbabwe,Sudan,ZWE,SDN,5,5,0,0,0,0,0
138211,2025,Zimbabwe,South Sudan,ZWE,SSD,6,25,0,0,0,0,0
138212,2025,Zimbabwe,Uganda,ZWE,UGA,10,5,0,0,0,0,0


In [19]:
df_merged_full_situ = pd.merge(
    df_s_filter, 
    df_g_filter, 
    left_on=['ref_iso_o', 'year'], 
    right_on=['iso3_o', 'year'], 
    how='inner'
)
print(f"Situation filter shape: {df_s_filter.shape}")
print(f"Gravity filter shape: {df_g_filter.shape}")
print(f"Merged master shape: {df_merged_full_situ.shape}")



Situation filter shape: (56679, 12)
Gravity filter shape: (579204, 39)
Merged master shape: (10232450, 50)


In [20]:
df_merged_full_situ

Unnamed: 0,year,ref_country_d,ref_country_o,ref_iso_d,ref_iso_o,Refugees,Asylum-seekers,IDPs,Other people in need of international protection,Stateless persons,...,fta_wto,rta_type,entry_tp_o,entry_tp_d,tradeflow_comtrade_o,tradeflow_comtrade_d,tradeflow_baci,manuf_tradeflow_baci,tradeflow_imf_o,tradeflow_imf_d
0,1997,Algeria,Mali,DZA,MLI,3200,0,0,0,0,...,0.0,,,,,,,,,
1,1997,Algeria,Mali,DZA,MLI,3200,0,0,0,0,...,0.0,,,,,,,,,
2,1997,Algeria,Mali,DZA,MLI,3200,0,0,0,0,...,0.0,,,,14.655,,,,,
3,1997,Algeria,Mali,DZA,MLI,3200,0,0,0,0,...,0.0,,,,,,,,,
4,1997,Algeria,Mali,DZA,MLI,3200,0,0,0,0,...,0.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10232445,2021,Zimbabwe,Zambia,ZWE,ZMB,0,0,0,0,0,...,0.0,,,,,,,,,
10232446,2021,Zimbabwe,Zambia,ZWE,ZMB,0,0,0,0,0,...,0.0,,,,,,,,,
10232447,2021,Zimbabwe,Zambia,ZWE,ZMB,0,0,0,0,0,...,0.0,,,,,,,,,
10232448,2021,Zimbabwe,Zambia,ZWE,ZMB,0,0,0,0,0,...,1.0,4.0,,,,,,,,


### 04. Merging the two datasets - ACLED and Gravity

Time to put our differences aside and get these two datasets into one grand dataset! 

In [15]:
df_merged_full = pd.merge(
    acled_result, 
    df_g_filter, 
    left_on=['iso', 'year'], 
    right_on=['iso3_o', 'year'], 
    how='inner'
)
print(f"Raw ACLED shape: {df_a.shape}")
print(f"Gravity filter shape: {df_g_filter.shape}")
print(f"Merged master shape: {df_merged_full.shape}")



Raw ACLED shape: (418060, 31)
Gravity filter shape: (579204, 39)
Merged master shape: (283311, 68)


In [17]:
# We have some extra / superfluous columns. Let's drop them 
#
# iso - We have multiple of these
# country_exists - We filter for these earlier, no longer useful.

df_merged_full = df_merged_full.drop(columns=['iso', 'country_exists_o', 'country_exists_d'])

In [18]:
df_merged_full

Unnamed: 0,country,year,disorder_Demonstrations,disorder_Political violence,disorder_Political violence; Demonstrations,disorder_Strategic developments,event_Battles,event_Explosions/Remote violence,event_Protests,event_Riots,...,fta_wto,rta_type,entry_tp_o,entry_tp_d,tradeflow_comtrade_o,tradeflow_comtrade_d,tradeflow_baci,manuf_tradeflow_baci,tradeflow_imf_o,tradeflow_imf_d
0,Algeria,1997,0,141,0,0,8,17,0,0,...,0.0,,,,,,,,,
1,Algeria,1997,0,141,0,0,8,17,0,0,...,0.0,,,,,,,,,
2,Algeria,1997,0,141,0,0,8,17,0,0,...,0.0,,,,6.097,,,,6.013,6.614
3,Algeria,1997,0,141,0,0,8,17,0,0,...,0.0,,,,,,,,,
4,Algeria,1997,0,141,0,0,8,17,0,0,...,0.0,,,,,214.42,201.176,0.0,,217.467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283306,eSwatini,2021,117,53,12,4,1,0,83,71,...,0.0,,,,,,,,,
283307,eSwatini,2021,117,53,12,4,1,0,83,71,...,0.0,,,,,,,,,
283308,eSwatini,2021,117,53,12,4,1,0,83,71,...,1.0,1.0,,,,,,,,
283309,eSwatini,2021,117,53,12,4,1,0,83,71,...,1.0,4.0,,,,,,,,
