In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd


## The datasets are too large to put on github, so they are read from local folders (perhaps we should set up a dvc?).
# !! UPDATE PATHS AS NEEDED  !!

# Raw locations of the three dataset folders, written as Windows-style paths.
BACI_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\BACI_sets"
ACLED_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\ACLED_sets"
Gravity_folder_path_init = r"C:\Users\mhm25\Desktop\ITU\6thSemester\bachelorproj\data\Gravity_sets"

# String form of each folder as produced by Path.as_posix(); file names are
# appended to these with "/" when the datasets are loaded.
BACI_folder_path, ACLED_folder_path, Gravity_folder_path = (
    Path(raw_location).as_posix()
    for raw_location in (BACI_folder_path_init, ACLED_folder_path_init, Gravity_folder_path_init)
)

## Combining all of ACLED Africa and Gravity

In [None]:
# New link - we should combine all ACLED data from Africa with the Gravity dataset

# Full file names of the two CSV exports inside the configured dataset folders.
gravity = f"{Gravity_folder_path}/Gravity_V202211.csv"
acled_af = f"{ACLED_folder_path}/ACLEDAfricaData_1997_2026-02-02.csv"

# Read both files into DataFrames: df_a holds ACLED Africa, df_g holds Gravity.
df_a = pd.read_csv(filepath_or_buffer=acled_af)
df_g = pd.read_csv(filepath_or_buffer=gravity)


### 01. Prepare ACLED Africa

In [None]:
# Keep only the ACLED columns used below - more can be added here, but update the dummy code below if need be.
#   inter1 is the perpetrator, inter2 is the target
acled_columns = ["country", "year", "disorder_type", "event_type", "inter1", "inter2", "fatalities"]
df_a_filter = df_a.loc[:, acled_columns].copy()

# Record the distinct values of each categorical column for future use and reference.
country_list = df_a_filter["country"].unique()
country_list.sort()  # sorts in place, so the country names end up ordered
disorder_types, event_types, attack_groups, target_groups = (
    df_a_filter[column].unique()
    for column in ("disorder_type", "event_type", "inter1", "inter2")
)

# Now we need to link the countries to the tags in Gravity

df_a_filter.tail(5)

In [None]:
df = df_a_filter.copy()

# One indicator column per distinct disorder type, event type, perpetrator and target.
dummies = pd.get_dummies(
    df[['disorder_type', 'event_type', 'inter1', 'inter2']],
    prefix=['disorder', 'event', 'perpetrator', 'target']
)

# Add the fatalities count and the two grouping keys back next to the indicator columns.
dummies['fatalities'] = df['fatalities']
dummies['country'] = df['country']
dummies['year'] = df['year']

# Group by country and year and sum: every indicator column becomes a count of
# matching rows, and fatalities becomes the total for that country-year.
result = dummies.groupby(['country', 'year']).sum().reset_index()

## We want to map the countries from ACLED onto the iso names from the Gravity dataset.
country_iso_dict = {
    "Algeria": "DZA", "Angola": "AGO", 'Benin': "BEN", 'Botswana': "BWA", 'Burkina Faso': "BFA", "Burundi": "BDI", 'Cameroon': "CMR", 'Cape Verde': "CPV",
    "Central African Republic": "CAF", 'Chad': "TCD", 'Comoros': "COM", 'Democratic Republic of Congo': "COD", 'Djibouti': "DJI", 'Egypt': "EGY",
    'Equatorial Guinea': "GNQ", 'Eritrea': "ERI", 'Ethiopia': "ETH", 'Gabon': "GAB",'Gambia': "GMB", 'Ghana': "GHA", 'Guinea': "GIN", 'Guinea-Bissau': "GNB",
    'Ivory Coast': "CIV", 'Kenya': "KEN", 'Lesotho': "LSO", 'Liberia': "LBR", 'Libya': "LBY", 'Madagascar': "MDG", 'Malawi': "MWI",'Mali': "MLI",
    'Mauritania': "MRT", 'Mauritius': "MUS", 'Mayotte': "MYT", 'Morocco': "MAR",'Mozambique': "MOZ", 'Namibia': "NAM", 'Niger': "NER", 'Nigeria': "NGA",
    'Republic of Congo': "COG", 'Reunion': "REU", 'Rwanda': "RWA", 'Saint Helena, Ascension and Tristan da Cunha': "SHN", 'Sao Tome and Principe': "STP",
    'Senegal': "SEN", 'Seychelles': "SYC", 'Sierra Leone': "SLE", 'Somalia': "SOM", 'South Africa': "ZAF", 'South Sudan': "SSD", 'Sudan': "SDN",
    'Tanzania': "TZA", 'Togo': "TGO", 'Tunisia': "TUN", 'Uganda': "UGA", 'Zambia': "ZMB", 'Zimbabwe': "ZWE", 'eSwatini': "SWZ"
}

# We update the dataframe to have a new column for the iso-tags.
result["iso"] = result["country"].map(country_iso_dict)

# Series.map leaves any country that is missing from the dictionary as NaN, and
# rows without an iso tag cannot be matched when merging on it. Surface such
# countries immediately instead of losing them silently later on.
unmapped_countries = sorted(result.loc[result["iso"].isna(), "country"].unique())
if unmapped_countries:
    warnings.warn(f"No iso tag in country_iso_dict for ACLED countries: {unmapped_countries}")

# ACLED is now ready for merging

### 02. Clean up Gravity and merge

In [None]:
# Print every column label of the Gravity frame as a reference for the column selection that follows.
print(df_g.columns)

In [None]:
# Now we can start cleaning up the Gravity dataset to prepare for combining with the ACLED dataframe above.
# NOTE(review): no "year" column is selected although the ACLED table is keyed by
#   country and year - confirm this is intended before merging.
# NOTE(review): confirm that unsuffixed labels such as "gdpcap_ppp", "wto", "eu" and
#   "entry_tp" are present in df_g.columns; selecting a missing label raises KeyError.
gravity_columns = [
    "iso3_o", "iso3_d", "country_exists_o", "country_exists_d",
    "distw_harmonic", "distw_arithmetic", "dist", "distcap",
    "diplo_disagreement", "scaled_sci_2021",
    "comlang_off", "comlang_ethno", "comleg_posttrans", "comrelig",
    "heg_o", "heg_d",
    "col_dep_ever", "col_dep", "col_dep_end_conflict",
    "sibling_ever", "sibling", "sever_year",
    "gdpcap_ppp", "wto", "eu", "fta_wto", "rta_type", "entry_tp",
    "tradeflow_comtrade_o", "tradeflow_comtrade_d",
    "tradeflow_baci", "manuf_tradeflow_baci",
    "tradeflow_imf_o", "tradeflow_imf_d",
]
df_g_filter = df_g.loc[:, gravity_columns]

df_g_filter.head(5)

##### Test run - ignore

In [None]:
# Let's try and link up the Gravity and ACLED datasets.
# File names for the test run: the Middle-East ACLED workbook and the Gravity CSV.
acled = f"{ACLED_folder_path}/Middle-East_aggregated_data_up_to-2025-11-01.xlsx"
gravity = f"{Gravity_folder_path}/Gravity_V202211.csv"

In [None]:
# Read the Gravity CSV and the Middle-East ACLED workbook for the test run.
# Both df_g and df_a are rebound here, so from this cell on df_a refers to the
# Middle-East data rather than whatever it held before.
df_g = pd.read_csv(filepath_or_buffer=gravity)
df_a = pd.read_excel(io=acled)

In [None]:
#print(df_g.info())
#print(df_a.info())

#df_g_filter = df_g[["year", "country_id_o", "distw_harmonic", "diplo_disagreement", "tradeflow_baci", "scaled_sci_2021"]]
#df_a_filter = df_a[["WEEK", "COUNTRY", "EVENT_TYPE", "DISORDER_TYPE", "ADMIN1"]]

In [None]:
# Gravity rows for origin "TUR" and destination "DEU.2" in the years 2012-2025
# (both ends inclusive), reduced to the columns used in the test run.
df_g_filter = df_g.loc[
    df_g["year"].ge(2012)
    & df_g["year"].le(2025)
    & df_g["country_id_o"].eq("TUR")
    & df_g["country_id_d"].eq("DEU.2"),
    [
        "year", "country_id_o", "country_id_d",
        "distw_harmonic", "diplo_disagreement", "tradeflow_baci", "scaled_sci_2021",
    ],
].copy()

In [None]:
# ACLED rows whose COUNTRY is "Turkey", reduced to the columns used in the test run.
# Copied so that adding columns later does not touch df_a.
df_a_filter = df_a.loc[
    df_a["COUNTRY"].eq("Turkey"),
    ["WEEK", "COUNTRY", "EVENT_TYPE", "DISORDER_TYPE", "ADMIN1"],
].copy()

In [None]:

# Take the ISO-calendar year of each WEEK value (WEEK must be datetime-typed for
# the .dt accessor) and store it as a YEAR column.
iso = df_a_filter["WEEK"].dt.isocalendar()
df_a_filter["YEAR"] = iso["year"]

# Long format: number of rows per (YEAR, EVENT_TYPE) pair, ordered by both keys.
rows_per_year_and_event = df_a_filter.groupby(["YEAR", "EVENT_TYPE"]).size()
yearly_counts = rows_per_year_and_event.reset_index(name="COUNT")
yearly_counts = yearly_counts.sort_values(["YEAR", "EVENT_TYPE"])

# Wide format: one row per YEAR and one integer column per EVENT_TYPE,
# with 0 where an event type has no rows in that year.
yearly_pivot = yearly_counts.pivot(index="YEAR", columns="EVENT_TYPE", values="COUNT")
yearly_pivot = yearly_pivot.fillna(0).astype(int)

yearly_counts, yearly_pivot.head()


In [None]:
# Move YEAR from the index into a regular column. The guard makes the cell safe
# to re-run: calling reset_index a second time would insert the default integer
# index as a spurious "index" column, which would then be treated as an event type.
if "YEAR" not in yearly_pivot.columns:
    yearly_pivot = yearly_pivot.reset_index()

# Remove the leftover name on the column axis so the frame shows plain column labels.
yearly_pivot.columns.name = None

yearly_pivot

In [None]:
# Give the Gravity year column the same label as the pivot's YEAR column so both frames share a key name.
df_g_filter = df_g_filter.rename({"year": "YEAR"}, axis="columns")

In [None]:
# Left join on YEAR: every row of the yearly event pivot is kept, and the Gravity
# columns are filled in where a matching year exists (NaN otherwise).
df_merged = yearly_pivot.merge(df_g_filter, how="left", on="YEAR")

In [None]:
# Display the merged yearly event counts and Gravity columns.
df_merged

In [None]:
# Event-type columns are every column of the pivot except the YEAR key.
event_cols = yearly_pivot.columns.drop("YEAR")
param_cols = ["distw_harmonic", "diplo_disagreement", "tradeflow_baci", "scaled_sci_2021"]

# Compute the pairwise correlation matrix once and keep it: previously the result
# of this call was discarded because it was not the last expression of the cell,
# and the same correlations were recomputed for every event type in the loop.
corr_matrix = df_merged.corr(numeric_only=True)

for event in event_cols:
    print(f"\nCorrelation for event type: {event}")
    # Correlation of this event type with itself and with each Gravity parameter, strongest first.
    print(corr_matrix.loc[[event] + param_cols, event].sort_values(ascending=False))

# Last expression of the cell, so the full matrix is shown as the cell output.
corr_matrix

In [None]:

from sklearn.linear_model import LinearRegression
import numpy as np

# Predictors taken from Gravity and the event-count column to explain.
feature_cols = ["distw_harmonic", "diplo_disagreement", "tradeflow_baci", "scaled_sci_2021"]
target_col = "Protests"

# df_merged comes from a left join, so years without a Gravity match (and any
# missing Gravity values) show up as NaN. LinearRegression.fit rejects NaN input,
# so only rows that are complete in every predictor and the target are used.
regression_data = df_merged[feature_cols + [target_col]].dropna()
if regression_data.empty:
    raise ValueError("No complete rows left for the regression after dropping missing values")

X = regression_data[feature_cols]
y = regression_data[target_col]

model = LinearRegression()
model.fit(X, y)

# Note: the score is computed on the same rows the model was fitted on.
print("Intercept:", model.intercept_)
print("Coefficients:", model.coef_)
print("R²:", model.score(X, y))
