In [26]:
from pathlib import Path

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np


## We need to link to the datasets. These are too large to put on github, perhaps we should set up a dvc?
# !! UPDATE PATHS AS NEEDED  !!
# Matti here, saving his paths yeehaw
# C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\BACI_sets    (or acled_sets, or gravity_sets)

BACI_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\BACI_sets"
BACI_folder_path = Path(BACI_folder_path_init).as_posix()

ACLED_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\acled_sets"
ACLED_folder_path = Path(ACLED_folder_path_init).as_posix()

Gravity_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\gravity_sets"
Gravity_folder_path = Path(Gravity_folder_path_init).as_posix()

## Combining all of ACLED Africa and Gravity

In [28]:
# New link - We should combine all ACLED data from Africa with the Gravity dataset

acled_af = f"{ACLED_folder_path}/africa_acled.csv"
gravity = f"{Gravity_folder_path}/Gravity_V202211.csv"

df_a = pd.read_csv(acled_af)
df_g = pd.read_csv(gravity)


  df_g = pd.read_csv(gravity)


### 01. Prepare ACLED Africa

This run was done locally on Mattis Multimachine (Therefore we re-link the two folders above)

We need to clean up the ACLED data and only keep relevant data.

In [21]:
acled_af = f"{ACLED_folder_path}/ACLEDAfricaData_1997_2026-02-02.csv"
gravity = f"{Gravity_folder_path}/Gravity_V202211.csv"

df_a = pd.read_csv(acled_af)
df_g = pd.read_csv(gravity)

# We can filter to the relevant columns we want - More can be added here, but update dummy code below if need be
#   inter1 is the perpetrator, inter2 is the target
df_a_filter = df_a[["country", "year", "disorder_type", "event_type", "inter1", "inter2", "fatalities"]
].copy()

# We save a list of all unique possible values for the columns for future use and reference.
country_list = df_a_filter["country"].unique()
country_list.sort()
disorder_types = df_a_filter["disorder_type"].unique()
event_types = df_a_filter["event_type"].unique()
attack_groups = df_a_filter["inter1"].unique()
target_groups = df_a_filter["inter2"].unique()

# Now we need to link the countries to the tags in Gravity, so the countries can be linked between the two datasets
df_a_filter.tail(5)

  df_g = pd.read_csv(gravity)


Unnamed: 0,country,year,disorder_type,event_type,inter1,inter2,fatalities
418055,Niger,2025,Political violence,Violence against civilians,Political militia,Civilians,0
418056,Cameroon,2025,Strategic developments,Strategic developments,Political militia,Civilians,0
418057,Cameroon,2025,Political violence,Violence against civilians,Rebel group,Civilians,0
418058,Cameroon,2025,Political violence,Violence against civilians,Rebel group,Civilians,0
418059,Cameroon,2025,Political violence,Violence against civilians,Rebel group,Civilians,0


In [22]:
df = df_a_filter.copy()

# We create dummy values for each type of disorder, event, attackers and target
dummies = pd.get_dummies(
    df[['disorder_type', 'event_type', 'inter1', 'inter2']],
    prefix=['disorder', 'event', 'perpetrator', 'target']
)

# We add the numeric columns back to the dummy dataset
dummies['fatalities'] = df['fatalities']
dummies['country'] = df['country']
dummies['year'] = df['year']

# Now we can group by country and year, and sum over the dummy categories. Perfect!
acled_result = dummies.groupby(['country', 'year']).sum().reset_index()

## We want to map the countries from ACLED onto the iso names from the Gravity dataset. I made this stupid dictionary BY HAND, please respect that.
country_iso_dict = {
    "Algeria": "DZA", "Angola": "AGO", 'Benin': "BEN", 'Botswana': "BWA", 'Burkina Faso': "BFA", "Burundi": "BDI", 'Cameroon': "CMR", 'Cape Verde': "CPV",
    "Central African Republic": "CAF", 'Chad': "TCD", 'Comoros': "COM", 'Democratic Republic of Congo': "COD", 'Djibouti': "DJI", 'Egypt': "EGY",
    'Equatorial Guinea': "GNQ", 'Eritrea': "ERI", 'Ethiopia': "ETH", 'Gabon': "GAB",'Gambia': "GMB", 'Ghana': "GHA", 'Guinea': "GIN", 'Guinea-Bissau': "GNB", 
    'Ivory Coast': "CIV", 'Kenya': "KEN", 'Lesotho': "LSO", 'Liberia': "LBR", 'Libya': "LBY", 'Madagascar': "MDG", 'Malawi': "MWI",'Mali': "MLI", 
    'Mauritania': "MRT", 'Mauritius': "MUS", 'Mayotte': "MYT", 'Morocco': "MAR",'Mozambique': "MOZ", 'Namibia': "NAM", 'Niger': "NER", 'Nigeria': "NGA", 
    'Republic of Congo': "COG", 'Reunion': "REU", 'Rwanda': "RWA", 'Saint Helena, Ascension and Tristan da Cunha': "SHN", 'Sao Tome and Principe': "STP", 
    'Senegal': "SEN", 'Seychelles': "SYC", 'Sierra Leone': "SLE", 'Somalia': "SOM", 'South Africa': "ZAF", 'South Sudan': "SSD", 'Sudan': "SDN", 
    'Tanzania': "TZA", 'Togo': "TGO", 'Tunisia': "TUN", 'Uganda': "UGA", 'Zambia': "ZMB", 'Zimbabwe': "ZWE", 'eSwatini': "SWZ"
}

# We update the dataframe to have a new column for the iso-tags.
acled_result["iso"] = acled_result["country"].map(country_iso_dict)

# ACLED is now ready for merging

### 02. Prepare Gravity

Gravity requires some extensive cleaning, as the dataset has a lot of superfluous data.

In [23]:
display(df_g.columns)

Index(['year', 'country_id_o', 'country_id_d', 'iso3_o', 'iso3_d', 'iso3num_o',
       'iso3num_d', 'country_exists_o', 'country_exists_d',
       'gmt_offset_2020_o', 'gmt_offset_2020_d', 'distw_harmonic',
       'distw_arithmetic', 'distw_harmonic_jh', 'distw_arithmetic_jh', 'dist',
       'main_city_source_o', 'main_city_source_d', 'distcap', 'contig',
       'diplo_disagreement', 'scaled_sci_2021', 'comlang_off', 'comlang_ethno',
       'comcol', 'col45', 'legal_old_o', 'legal_old_d', 'legal_new_o',
       'legal_new_d', 'comleg_pretrans', 'comleg_posttrans',
       'transition_legalchange', 'comrelig', 'heg_o', 'heg_d', 'col_dep_ever',
       'col_dep', 'col_dep_end_year', 'col_dep_end_conflict', 'empire',
       'sibling_ever', 'sibling', 'sever_year', 'sib_conflict', 'pop_o',
       'pop_d', 'gdp_o', 'gdp_d', 'gdpcap_o', 'gdpcap_d', 'pop_source_o',
       'pop_source_d', 'gdp_source_o', 'gdp_source_d', 'gdp_ppp_o',
       'gdp_ppp_d', 'gdpcap_ppp_o', 'gdpcap_ppp_d', 'pop_pwt_o',

In [29]:
# First, we define the parameters that sounded interesting to us after going over the documentation. Of these we will need to find the statistical significant ones 
target = ["iso3_o", "iso3_d", "country_exists_o", "country_exists_d", "distw_harmonic", "distw_arithmetic", "dist", "distcap", "diplo_disagreement", "scaled_sci_2021", "comlang_off", "comlang_ethno", "comleg_posttrans", "comrelig", "heg_o", "heg_d", "col_dep_ever", "col_dep", "col_dep_end_conflict", "sibling_ever", "sibling", "sever_year", "gdpcap_ppp", "wto", "eu", "fta_wto", "rta_type", "entry_tp", "tradeflow_comtrade_o", "tradeflow_comtrade_d", "tradeflow_baci", "manuf_tradeflow_baci", "tradeflow_imf_o", "tradeflow_imf_d"]


# Some parameters in the Gravity dataset differ from the documentation. These are below:
emp = []
for i in df_g.columns:
    if i in target:
        emp.append(i)

for i in target:
    if i not in emp:
        print(i)

    

gdpcap_ppp
wto
eu
entry_tp


In [30]:
## We add the missing parameters:
# gdpcap_ppp_o and d
# wto_o and d
# eu_o and d
# entry_tp_o and d

# Now we can start cleaning up the Gravity dataset to prepare for combining with the ACLED dataframe above

df_g_filter = df_g[["year", "iso3_o", "iso3_d", "country_exists_o", "country_exists_d", "distw_harmonic", "distw_arithmetic", "dist", "distcap", "diplo_disagreement", "scaled_sci_2021", "comlang_off", "comlang_ethno", "comleg_posttrans", "comrelig", "heg_o", "heg_d", "col_dep_ever", "col_dep", "col_dep_end_conflict", "sibling_ever", "sibling", "sever_year", "gdpcap_ppp_o", "gdpcap_ppp_d", "wto_o", "wto_d", "eu_o", "eu_d","fta_wto", "rta_type", "entry_tp_o", "entry_tp_d", "tradeflow_comtrade_o", "tradeflow_comtrade_d", "tradeflow_baci", "manuf_tradeflow_baci", "tradeflow_imf_o", "tradeflow_imf_d"]
]

display(df_g_filter.head())

Unnamed: 0,year,iso3_o,iso3_d,country_exists_o,country_exists_d,distw_harmonic,distw_arithmetic,dist,distcap,diplo_disagreement,...,fta_wto,rta_type,entry_tp_o,entry_tp_d,tradeflow_comtrade_o,tradeflow_comtrade_d,tradeflow_baci,manuf_tradeflow_baci,tradeflow_imf_o,tradeflow_imf_d
0,1948,ABW,ABW,0,0,,,,,,...,,,,,,,,,,
1,1949,ABW,ABW,0,0,,,,,,...,,,,,,,,,,
2,1950,ABW,ABW,0,0,,,,,,...,,,,,,,,,,
3,1951,ABW,ABW,0,0,,,,,,...,,,,,,,,,,
4,1952,ABW,ABW,0,0,,,,,,...,,,,,,,,,,


In [31]:
country_iso_dict.values()

dict_values(['DZA', 'AGO', 'BEN', 'BWA', 'BFA', 'BDI', 'CMR', 'CPV', 'CAF', 'TCD', 'COM', 'COD', 'DJI', 'EGY', 'GNQ', 'ERI', 'ETH', 'GAB', 'GMB', 'GHA', 'GIN', 'GNB', 'CIV', 'KEN', 'LSO', 'LBR', 'LBY', 'MDG', 'MWI', 'MLI', 'MRT', 'MUS', 'MYT', 'MAR', 'MOZ', 'NAM', 'NER', 'NGA', 'COG', 'REU', 'RWA', 'SHN', 'STP', 'SEN', 'SYC', 'SLE', 'SOM', 'ZAF', 'SSD', 'SDN', 'TZA', 'TGO', 'TUN', 'UGA', 'ZMB', 'ZWE', 'SWZ'])

In [32]:
''' We can now apply a lot of clean-up filters: 

- remove any rows for years before 1997
- remove rows where any of the two countries do not exist
- remove rows where origin and destination country is identical (these are silly)
- remove rows where neither origin or destination is in Africa (this we can change later if we go beyond Africa)
'''

# We can now apply a lot of clean-up filters: 
# 
# Filter if any of the countries do not exist AND remove any data from before the year 1997.

df_g_filter = df_g_filter[
    (df_g["year"] >= 1997) &
    (df_g["country_exists_o"] == 1) &
    (df_g["country_exists_d"] == 1) &
    (df_g["iso3_o"] != df_g["iso3_d"]) &
    (
        (df_g["iso3_o"].isin(country_iso_dict.values())) |
        (df_g["iso3_d"].isin(country_iso_dict.values()))
    )
]

# This gives us ~1420 origin rows per country 

# Gravity is now ready for merging.

In [18]:
df_g_filter

Unnamed: 0,year,iso3_o,iso3_d,country_exists_o,country_exists_d,distw_harmonic,distw_arithmetic,dist,distcap,diplo_disagreement,...,fta_wto,rta_type,entry_tp_o,entry_tp_d,tradeflow_comtrade_o,tradeflow_comtrade_d,tradeflow_baci,manuf_tradeflow_baci,tradeflow_imf_o,tradeflow_imf_d
197,1997,ABW,AGO,1,1,9590.0,9593.0,9505.0,9505.0,,...,0.0,,,,,,,,,
198,1998,ABW,AGO,1,1,9590.0,9593.0,9505.0,9505.0,,...,0.0,,,,,,,,,
199,1999,ABW,AGO,1,1,9590.0,9593.0,9505.0,9505.0,,...,0.0,,,,,,,,,
200,2000,ABW,AGO,1,1,9584.0,9587.0,9505.0,9505.0,,...,0.0,,,,,,,,,
201,2001,ABW,AGO,1,1,9584.0,9587.0,9505.0,9505.0,,...,0.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4699217,2017,ZWE,ZMB,1,1,484.0,511.0,399.0,399.0,1.266,...,1.0,4.0,70.0,15.5,58320.884,74451.111,82443.668,57765.023,58352.516,74509.453
4699218,2018,ZWE,ZMB,1,1,484.0,511.0,399.0,399.0,0.791,...,1.0,4.0,41.0,15.5,66557.311,73844.045,78839.303,60417.984,66580.102,74169.570
4699219,2019,ZWE,ZMB,1,1,484.0,511.0,399.0,399.0,0.124,...,1.0,4.0,36.0,15.5,59552.535,61651.194,77331.594,58542.078,59552.559,61790.566
4699220,2020,ZWE,ZMB,1,1,479.0,505.0,399.0,399.0,0.226,...,1.0,4.0,,,52563.421,62310.411,71653.898,53614.934,52563.441,62381.145


### 03. Merging the two datasets

Time to put our differences aside and get these two datasets into one grand dataset! 

In [25]:
acled_result

Unnamed: 0,country,year,disorder_Demonstrations,disorder_Political violence,disorder_Political violence; Demonstrations,disorder_Strategic developments,event_Battles,event_Explosions/Remote violence,event_Protests,event_Riots,...,target_Civilians,target_External/Other forces,target_Identity militia,target_Political militia,target_Protesters,target_Rebel group,target_Rioters,target_State forces,fatalities,iso
0,Algeria,1997,0,141,0,0,8,17,0,0,...,132,0,0,0,0,6,0,2,3579,DZA
1,Algeria,1998,1,47,0,0,14,13,1,0,...,33,0,0,0,0,4,0,10,1269,DZA
2,Algeria,1999,0,63,0,3,27,11,0,0,...,31,0,0,4,0,4,0,24,369,DZA
3,Algeria,2000,2,168,0,0,95,12,2,0,...,69,0,0,6,0,62,0,32,1389,DZA
4,Algeria,2001,94,155,0,1,78,11,18,94,...,56,0,1,6,0,25,2,137,945,DZA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1428,eSwatini,2021,117,53,12,4,1,0,83,71,...,49,1,0,0,0,0,0,48,46,SWZ
1429,eSwatini,2022,81,115,5,40,22,0,63,68,...,93,2,0,0,0,0,3,70,33,SWZ
1430,eSwatini,2023,31,47,1,5,1,0,31,27,...,41,0,0,2,0,0,0,6,32,SWZ
1431,eSwatini,2024,21,16,1,2,0,0,22,10,...,17,0,0,0,0,0,0,2,10,SWZ


### Zen's Zupreme Code below:

This is the good stuff- but the merge is BUGGED 

In [33]:
df_g_filter['year'] = pd.to_numeric(df_g_filter["year"])
df_g_filter_cut = df_g_filter[df_g_filter["year"] >= 1997]
print(df_g_filter_cut.shape)

(579204, 39)


In [42]:
country_iso_dict = {
    "Algeria": "DZA", "Angola": "AGO", 'Benin': "BEN", 'Botswana': "BWA", 'Burkina Faso': "BFA", "Burundi": "BDI", 'Cameroon': "CMR", 'Cape Verde': "CPV",
    "Central African Republic": "CAF", 'Chad': "TCD", 'Comoros': "COM", 'Democratic Republic of Congo': "COD", 'Djibouti': "DJI", 'Egypt': "EGY",
    'Equatorial Guinea': "GNQ", 'Eritrea': "ERI", 'Ethiopia': "ETH", 'Gabon': "GAB",'Gambia': "GMB", 'Ghana': "GHA", 'Guinea': "GIN", 'Guinea-Bissau': "GNB", 
    'Ivory Coast': "CIV", 'Kenya': "KEN", 'Lesotho': "LSO", 'Liberia': "LBR", 'Libya': "LBY", 'Madagascar': "MDG", 'Malawi': "MWI",'Mali': "MLI", 
    'Mauritania': "MRT", 'Mauritius': "MUS", 'Mayotte': "MYT", 'Morocco': "MAR",'Mozambique': "MOZ", 'Namibia': "NAM", 'Niger': "NER", 'Nigeria': "NGA", 
    'Republic of Congo': "COG", 'Reunion': "REU", 'Rwanda': "RWA", 'Saint Helena, Ascension and Tristan da Cunha': "SHN", 'Sao Tome and Principe': "STP", 
    'Senegal': "SEN", 'Seychelles': "SYC", 'Sierra Leone': "SLE", 'Somalia': "SOM", 'South Africa': "ZAF", 'South Sudan': "SSD", 'Sudan': "SDN", 
    'Tanzania': "TZA", 'Togo': "TGO", 'Tunisia': "TUN", 'Uganda': "UGA", 'Zambia': "ZMB", 'Zimbabwe': "ZWE", 'eSwatini': "SWZ"
}
#df_a['iso'] = df_a['country'].map(country_iso_dict)
# Perform an outer merge on 'year' and ISO code (mapping ACLED to the 'origin' side of Gravity)
df_merged_full = pd.merge(
    acled_result, 
    df_g_filter_cut, 
    left_on=['iso', 'year'], 
    right_on=['iso3_o', 'year'], 
    how='outer'
)
print(f"Raw ACLED shape: {df_a.shape}")
print(f"Gravity filter shape: {df_g_filter_cut.shape}")
print(f"Merged master shape: {df_merged_full.shape}")

Raw ACLED shape: (1433, 31)
Gravity filter shape: (579204, 39)
Merged master shape: (579421, 68)


In [37]:
df_merged_full

Unnamed: 0.1,Unnamed: 0,country,year,disorder_Demonstrations,disorder_Political violence,disorder_Political violence; Demonstrations,disorder_Strategic developments,event_Battles,event_Explosions/Remote violence,event_Protests,...,fta_wto,rta_type,entry_tp_o,entry_tp_d,tradeflow_comtrade_o,tradeflow_comtrade_d,tradeflow_baci,manuf_tradeflow_baci,tradeflow_imf_o,tradeflow_imf_d
0,,,1997,,,,,,,,...,0.0,,,,,,,,,
1,,,1997,,,,,,,,...,0.0,,,,,,,,,
2,,,1997,,,,,,,,...,0.0,,,,,,,,,
3,,,1997,,,,,,,,...,0.0,,,,,,,,,
4,,,1997,,,,,,,,...,0.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579416,1399.0,Zimbabwe,2021,61.0,83.0,1.0,7.0,1.0,2.0,51.0,...,1.0,4.0,,,,,,,,
579417,1400.0,Zimbabwe,2022,52.0,140.0,2.0,18.0,2.0,1.0,51.0,...,,,,,,,,,,
579418,1401.0,Zimbabwe,2023,37.0,172.0,1.0,24.0,0.0,1.0,36.0,...,,,,,,,,,,
579419,1402.0,Zimbabwe,2024,29.0,48.0,1.0,12.0,1.0,0.0,25.0,...,,,,,,,,,,


In [35]:
refugee_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\bachelor_2026\data\sit_sets\persons_of_concern.csv"
refugee_path = Path(refugee_path_init).as_posix()

# Now we can include the refugee data.
df_s = pd.read_csv(refugee_path)

# Rename as you already were
df_s.columns = df_s.columns.str.lower().str.replace(' ', '_').str.strip()

# NEW: Remove any duplicate column names created by the lowercasing
df_s = df_s.loc[:, ~df_s.columns.duplicated()]

# Now this will work because 'year' is a single column (Series)
df_s['year'] = pd.to_numeric(df_s["year"], errors='coerce')
df_s = df_s[df_s["year"] >= 1997]
print(df_s.shape)
print(df_s.head())

(126142, 12)
       year country_of_asylum                     country_of_origin  \
12082  1997       Afghanistan                           Afghanistan   
12083  1997       Afghanistan                                  Iraq   
12084  1997           Albania                                  Iraq   
12085  1997           Albania  Serbia and Kosovo: S/RES/1244 (1999)   
12086  1997           Albania                      Syrian Arab Rep.   

      country_of_asylum_iso country_of_origin_iso  refugees  asylum-seekers  \
12082                   AFG                   AFG         0               0   
12083                   AFG                   IRQ         5               0   
12084                   ALB                   IRQ         5               0   
12085                   ALB                   SRB        24               0   
12086                   ALB                   SYR         5               0   

         idps  other_people_in_need_of_international_protection  \
12082  296795     

In [36]:
# 1. Clean df_s (assuming not already done)
df_s.columns = df_s.columns.str.lower().str.replace(' ', '_').str.strip()
df_s = df_s.loc[:, ~df_s.columns.duplicated()]
df_s['year'] = pd.to_numeric(df_s['year'], errors='coerce')

# 2. Merge df_s with df_merged_full
# Match Key: 
#   Year -> Year
#   iso3_o (Gravity Origin) -> country_of_origin_iso
#   iso3_d (Gravity Destination) -> country_of_asylum_iso
df_final = pd.merge(
    df_merged_full,
    df_s,
    left_on=['year', 'iso3_o', 'iso3_d'],
    right_on=['year', 'country_of_origin_iso', 'country_of_asylum_iso'],
    how='outer'
)

# 3. Quick Verification
print(f"Merge Complete.")
print(f"Total Rows: {len(df_final)}")
# Check for a few important columns
all_columns = df_final.columns.tolist()
print(all_columns)
display(df_final.head())

Merge Complete.
Total Rows: 659677
['Unnamed: 0', 'country', 'year', 'disorder_Demonstrations', 'disorder_Political violence', 'disorder_Political violence; Demonstrations', 'disorder_Strategic developments', 'event_Battles', 'event_Explosions/Remote violence', 'event_Protests', 'event_Riots', 'event_Strategic developments', 'event_Violence against civilians', 'perpetrator_Civilians', 'perpetrator_External/Other forces', 'perpetrator_Identity militia', 'perpetrator_Political militia', 'perpetrator_Protesters', 'perpetrator_Rebel group', 'perpetrator_Rioters', 'perpetrator_State forces', 'target_Civilians', 'target_External/Other forces', 'target_Identity militia', 'target_Political militia', 'target_Protesters', 'target_Rebel group', 'target_Rioters', 'target_State forces', 'fatalities', 'iso', 'iso3_o', 'iso3_d', 'country_exists_o', 'country_exists_d', 'distw_harmonic', 'distw_arithmetic', 'dist', 'distcap', 'diplo_disagreement', 'scaled_sci_2021', 'comlang_off', 'comlang_ethno', 'com

Unnamed: 0.1,Unnamed: 0,country,year,disorder_Demonstrations,disorder_Political violence,disorder_Political violence; Demonstrations,disorder_Strategic developments,event_Battles,event_Explosions/Remote violence,event_Protests,...,country_of_origin,country_of_asylum_iso,country_of_origin_iso,refugees,asylum-seekers,idps,other_people_in_need_of_international_protection,stateless_persons,host_community,others_of_concern
0,,,1997,,,,,,,,...,,,,,,,,,,
1,,,1997,,,,,,,,...,,,,,,,,,,
2,,,1997,,,,,,,,...,,,,,,,,,,
3,,,1997,,,,,,,,...,,,,,,,,,,
4,,,1997,,,,,,,,...,,,,,,,,,,
