In [1]:
from pathlib import Path

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np


## We need to link to the datasets. These are too large to put on github, perhaps we should set up a dvc?
# !! UPDATE PATHS AS NEEDED  !!

BACI_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\BACI_sets"
BACI_folder_path = Path(BACI_folder_path_init).as_posix()

ACLED_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\ACLED_sets"
ACLED_folder_path = Path(ACLED_folder_path_init).as_posix()

Gravity_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\Gravity_sets"
Gravity_folder_path = Path(Gravity_folder_path_init).as_posix()

## Combining all of ACLED Africa and Gravity

In [2]:
# New link - We should combine all ACLED data from Africa with the Gravity dataset

acled_af = f"{ACLED_folder_path}/ACLEDAfricaData_1997_2026-02-02.csv"
gravity = f"{Gravity_folder_path}/Gravity_V202211.csv"

df_a = pd.read_csv(acled_af)
df_g = pd.read_csv(gravity)


  df_g = pd.read_csv(gravity)


### 01. Prepare ACLED Africa

In [None]:
# We can filter to the relevant columns we want - More can be added here, but update dummy code below if need be
#   inter1 is the perpetrator, inter2 is the target
df_a_filter = df_a[["country", "year", "disorder_type", "event_type", "inter1", "inter2", "fatalities"]
].copy()

# We save a list of all unique possible values for the columns for future use and reference.
country_list = df_a_filter["country"].unique()
country_list.sort()
disorder_types = df_a_filter["disorder_type"].unique()
event_types = df_a_filter["event_type"].unique()
attack_groups = df_a_filter["inter1"].unique()
target_groups = df_a_filter["inter2"].unique()

# Now we need to link the countries to the tags in Gravity

df_a_filter.tail(5)

Unnamed: 0,country,year,disorder_type,event_type,inter1,inter2,fatalities
418055,Niger,2025,Political violence,Violence against civilians,Political militia,Civilians,0
418056,Cameroon,2025,Strategic developments,Strategic developments,Political militia,Civilians,0
418057,Cameroon,2025,Political violence,Violence against civilians,Rebel group,Civilians,0
418058,Cameroon,2025,Political violence,Violence against civilians,Rebel group,Civilians,0
418059,Cameroon,2025,Political violence,Violence against civilians,Rebel group,Civilians,0


In [None]:
df = df_a_filter.copy()

# We create dummy values for each type of disorder, event, attackers and target
dummies = pd.get_dummies(
    df[['disorder_type', 'event_type', 'inter1', 'inter2']],
    prefix=['disorder', 'event', 'perpetrator', 'target']
)

# We add the numeric columns back to the dummy dataset
dummies['fatalities'] = df['fatalities']
dummies['country'] = df['country']
dummies['year'] = df['year']

# Now we can group by country and year, and sum over the dummy categories. Perfect!
result = dummies.groupby(['country', 'year']).sum().reset_index()

## We want to map the countries from ACLED onto the iso names from the Gravity dataset.
country_iso_dict = {
    "Algeria": "DZA", "Angola": "AGO", 'Benin': "BEN", 'Botswana': "BWA", 'Burkina Faso': "BFA", "Burundi": "BDI", 'Cameroon': "CMR", 'Cape Verde': "CPV",
    "Central African Republic": "CAF", 'Chad': "TCD", 'Comoros': "COM", 'Democratic Republic of Congo': "COD", 'Djibouti': "DJI", 'Egypt': "EGY",
    'Equatorial Guinea': "GNQ", 'Eritrea': "ERI", 'Ethiopia': "ETH", 'Gabon': "GAB",'Gambia': "GMB", 'Ghana': "GHA", 'Guinea': "GIN", 'Guinea-Bissau': "GNB", 
    'Ivory Coast': "CIV", 'Kenya': "KEN", 'Lesotho': "LSO", 'Liberia': "LBR", 'Libya': "LBY", 'Madagascar': "MDG", 'Malawi': "MWI",'Mali': "MLI", 
    'Mauritania': "MRT", 'Mauritius': "MUS", 'Mayotte': "MYT", 'Morocco': "MAR",'Mozambique': "MOZ", 'Namibia': "NAM", 'Niger': "NER", 'Nigeria': "NGA", 
    'Republic of Congo': "COG", 'Reunion': "REU", 'Rwanda': "RWA", 'Saint Helena, Ascension and Tristan da Cunha': "SHN", 'Sao Tome and Principe': "STP", 
    'Senegal': "SEN", 'Seychelles': "SYC", 'Sierra Leone': "SLE", 'Somalia': "SOM", 'South Africa': "ZAF", 'South Sudan': "SSD", 'Sudan': "SDN", 
    'Tanzania': "TZA", 'Togo': "TGO", 'Tunisia': "TUN", 'Uganda': "UGA", 'Zambia': "ZMB", 'Zimbabwe': "ZWE", 'eSwatini': "SWZ"
}

# We update the dataframe to have a new column for the iso-tags.
result["iso"] = result["country"].map(country_iso_dict)

# ACLED is now ready for merging

### 02. Clean up Gravity and merge

In [None]:
# Now we can start cleaning up the Gravity dataset to prepare for combining with the ACLED dataframe above

##### Test run - ignore

In [4]:
# Let's try and link up the Gravity and ACLED datasets.
gravity = f"{Gravity_folder_path}/Gravity_V202211.csv"
acled = f"{ACLED_folder_path}/Middle-East_aggregated_data_up_to-2025-11-01.xlsx"

In [6]:
df_g = pd.read_csv(gravity)
df_a = pd.read_excel(acled)

  df_g = pd.read_csv(gravity)


In [None]:
#print(df_g.info())
#print(df_a.info())

#df_g_filter = df_g[["year", "country_id_o", "distw_harmonic", "diplo_disagreement", "tradeflow_baci", "scaled_sci_2021"]]
#df_a_filter = df_a[["WEEK", "COUNTRY", "EVENT_TYPE", "DISORDER_TYPE", "ADMIN1"]]

In [42]:
df_g_filter = df_g.loc[
    (df_g["year"].between(2012, 2025)) &
    (df_g["country_id_o"] == "TUR") & 
    (df_g["country_id_d"] == "DEU.2"),
    ["year", "country_id_o", "country_id_d", "distw_harmonic", 
    "diplo_disagreement", "tradeflow_baci", "scaled_sci_2021"]
].copy()

In [43]:
df_a_filter = df_a.loc[
    (df_a["COUNTRY"] == "Turkey"),
    ["WEEK", "COUNTRY", "EVENT_TYPE", "DISORDER_TYPE", "ADMIN1"]
].copy()

In [53]:

iso = df_a_filter["WEEK"].dt.isocalendar()
df_a_filter["YEAR"] = iso["year"]


yearly_counts = (
    df_a_filter
    .groupby(["YEAR", "EVENT_TYPE"])
    .size()
    .reset_index(name="COUNT")
    .sort_values(["YEAR", "EVENT_TYPE"])
)


yearly_pivot = yearly_counts.pivot(index="YEAR", columns="EVENT_TYPE", values="COUNT").fillna(0).astype(int)

yearly_counts, yearly_pivot.head()


(    YEAR                  EVENT_TYPE  COUNT
 0   2015                     Battles      6
 1   2015  Explosions/Remote violence      5
 2   2015                    Protests      5
 3   2015                       Riots      1
 4   2015      Strategic developments      4
 ..   ...                         ...    ...
 61  2025  Explosions/Remote violence      4
 62  2025                    Protests   1568
 63  2025                       Riots     31
 64  2025      Strategic developments    184
 65  2025  Violence against civilians     20
 
 [66 rows x 3 columns],
 EVENT_TYPE  Battles  Explosions/Remote violence  Protests  Riots  \
 YEAR                                                               
 2015              6                           5         5      1   
 2016            424                         350       769     67   
 2017            338                         147       367     19   
 2018            286                         130       742     68   
 2019            252

In [None]:
yearly_pivot.reset_index(inplace=True)   # so YEAR becomes a column

yearly_pivot.columns.name = None

yearly_pivot

Unnamed: 0,YEAR,Battles,Explosions/Remote violence,Protests,Riots,Strategic developments,Violence against civilians
0,2015,6,5,5,1,4,3
1,2016,424,350,769,67,196,102
2,2017,338,147,367,19,159,77
3,2018,286,130,742,68,233,113
4,2019,252,134,1153,70,561,166
5,2020,176,246,836,25,901,130
6,2021,134,115,1287,37,935,58
7,2022,116,112,1415,52,810,49
8,2023,76,51,1690,128,696,70
9,2024,47,55,1973,92,726,51


In [80]:
df_g_filter = df_g_filter.rename(columns={"year": "YEAR"})

In [81]:
df_merged = pd.merge(yearly_pivot, df_g_filter, on="YEAR", how="left")

In [82]:
df_merged

Unnamed: 0,YEAR,Battles,Explosions/Remote violence,Protests,Riots,Strategic developments,Violence against civilians,country_id_o,country_id_d,distw_harmonic,diplo_disagreement,tradeflow_baci,scaled_sci_2021
0,2015,6,5,5,1,4,3,TUR,DEU.2,2087.0,0.593,14617270.0,15784.0
1,2016,424,350,769,67,196,102,TUR,DEU.2,2087.0,0.609,15363760.0,15784.0
2,2017,338,147,367,19,159,77,TUR,DEU.2,2087.0,0.518,16692100.0,15784.0
3,2018,286,130,742,68,233,113,TUR,DEU.2,2087.0,0.607,17725910.0,15784.0
4,2019,252,134,1153,70,561,166,TUR,DEU.2,2087.0,0.607,17007150.0,15784.0
5,2020,176,246,836,25,901,130,TUR,DEU.2,2089.0,0.685,16326620.0,15784.0
6,2021,134,115,1287,37,935,58,TUR,DEU.2,2089.0,,,15784.0
7,2022,116,112,1415,52,810,49,,,,,,
8,2023,76,51,1690,128,696,70,,,,,,
9,2024,47,55,1973,92,726,51,,,,,,


In [83]:
df_merged.corr(numeric_only=True)

event_cols = yearly_pivot.columns.drop("YEAR")
param_cols = ["distw_harmonic", "diplo_disagreement", "tradeflow_baci", "scaled_sci_2021"]

for event in event_cols:
    print(f"\nCorrelation for event type: {event}")
    print(df_merged[[event] + param_cols].corr()[event].sort_values(ascending=False))


Correlation for event type: Battles
Battles               1.000000
tradeflow_baci        0.407484
diplo_disagreement   -0.258265
distw_harmonic       -0.373944
scaled_sci_2021            NaN
Name: Battles, dtype: float64

Correlation for event type: Explosions/Remote violence
Explosions/Remote violence    1.000000
diplo_disagreement            0.340550
distw_harmonic                0.122213
tradeflow_baci                0.029466
scaled_sci_2021                    NaN
Name: Explosions/Remote violence, dtype: float64

Correlation for event type: Protests
Protests              1.000000
tradeflow_baci        0.596528
distw_harmonic        0.504345
diplo_disagreement    0.457953
scaled_sci_2021            NaN
Name: Protests, dtype: float64

Correlation for event type: Riots
Riots                 1.000000
tradeflow_baci        0.543016
diplo_disagreement    0.166257
distw_harmonic       -0.246664
scaled_sci_2021            NaN
Name: Riots, dtype: float64

Correlation for event type: Strateg

In [85]:

from sklearn.linear_model import LinearRegression
import numpy as np

X = df_merged[["distw_harmonic", "diplo_disagreement", "tradeflow_baci", "scaled_sci_2021"]]
y = df_merged["Protests"]

model = LinearRegression()
model.fit(X, y)

print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
print("RÂ²:", model.score(X, y))


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values