In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dataframe_image as dfi
from datetime import datetime
import scipy
import itertools
import geopandas as gpd
import xarray as xr
import regionmask
import statsmodels.api as sm
import statsmodels.formula.api as smf

Data from UEA CRU TS4.07 (1991-2022)

In [2]:
file = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\cru_ts4.07.1901.2022.tmx.dat.nc"
file2 = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\cru_ts4.07.1901.2022.pre.dat.nc"

file_paths_list =[file,file2]
monthly_forecast=xr.Dataset()

for file in file_paths_list:
        monthly_forecast = xr.merge([monthly_forecast,xr.open_mfdataset(file)], compat='override')

Data on conflict events from ACLED

In [3]:
file = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\1997-01-01-2023-07-18-Eastern_Africa-Somalia.csv"
df = pd.read_csv(file)

# Keep protests and strategic developments?

df=df[df['event_type'] != 'Protests']
df=df[df['event_type'] != 'Strategic developments']

Shapefile with administrative boundaries of Somalia

In [4]:
path = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\som_adm_ocha_itos_20230308_shp\som_admbnda_adm1_ocha_20230308.shp"
states_gdf = gpd.read_file(path) 

Limit the lat-lon and time 

In [5]:
def get_aoi(shp, world=True):
    lon_lat = {}
    # Get lat min, max
    aoi_lat = [float(shp.total_bounds[1]), float(shp.total_bounds[3])]
    aoi_lon = [float(shp.total_bounds[0]), float(shp.total_bounds[2])]

    lon_lat["lon"] = aoi_lon
    lon_lat["lat"] = aoi_lat
    return lon_lat

bounds = get_aoi(states_gdf)

In [6]:
start_date = '1901-01-16'
end_date = '2022-12-16'

region = monthly_forecast[["pre",'tmx']].sel(
    time=slice(start_date, end_date),
    lon=slice(bounds["lon"][0], bounds["lon"][1]),
    lat=slice(bounds["lat"][0], bounds["lat"][1]))

In [7]:
region_mask = regionmask.mask_3D_geopandas(states_gdf,
                                         monthly_forecast.lon,
                                         monthly_forecast.lat)

somalia = region.where(region_mask).groupby("time").mean(["lat", "lon"]).to_dataframe().reset_index()

In [8]:
replacement_dict = {0  :  'Awdal',
1    :         'Bakool',
2      :       'Banadir',
3      :         'Bari',
4       :         'Bay',
5        :  'Galgaduud',
6          :      'Gedo',
7          :   'Hiraan',
8   :       'Lower Juba',
9   :   'Lower Shabelle',
10  :      'Middle Juba',
11   : 'Middle Shabelle',
12    :          'Mudug',
13    :        'Nugaal',
14      :       'Sanaag',
15       :        'Sool',
16        :   'Togdheer',
17   : 'Woqooyi Galbeed'}

somalia['region'] = somalia['region'].replace(replacement_dict)

In [9]:
# Modify event_date column to datetime

v = df["event_date"].values
dt = [datetime.strptime(v[i], "%d %B %Y") for i in range(len(v))]
q=[]

for i in range(len(dt)):
    q.append(datetime.timestamp(dt[i]))
    
df.insert(loc=3, column='date_timestamp', value=q)
df = df.sort_values("date_timestamp")

df['event_date'] = pd.to_datetime(df['event_date'])
df=df.set_index('event_date') 

In [10]:
conflict = df.groupby([pd.Grouper(freq='40320min'),"admin1"]).count()
conflict.reset_index(level=[0, 1], inplace=True)
conflict = conflict[['event_date','admin1','year']].rename(columns={'year': 'conflicts','event_date': 'time'})
conflict['datetime_column'] = pd.to_datetime(conflict['time'])

# Aggregate the datetime objects by month
conf = conflict.groupby([pd.Grouper(key='time', freq='M'),'admin1'])['conflicts'].sum().to_frame()

In [11]:
dates = conf.index.get_level_values('time').unique()
districts = conf.index.get_level_values('admin1').unique()
all_combinations = pd.MultiIndex.from_product([dates, districts], names=['time', 'admin1'])

conf = conf.reindex(all_combinations, fill_value=0).reset_index()    # Reindex the DataFrame with all combinations

In [12]:
conflicts = conf.sort_values(by=['time', 'admin1'], ascending=[True, True])
conflicts.reset_index(drop=True, inplace=True)

In [13]:
temp_pre=somalia.rename(columns={'region': 'admin1'})
len(temp_pre)

24888

In [14]:
# open csv
spei = pd.read_csv(r'C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\drought\spei_somalia.csv')

In [15]:
# Add Banadir region with tmx and pre as mean of the neighbouring regions

district1 = 'Lower Shabelle'  
district2 = 'Middle Shabelle'  

# Step 2: Calculate the mean tmx and pre for the neighboring districts
mean_t = temp_pre[(temp_pre['admin1']==district1) | (temp_pre['admin1']==district2)].groupby('time')['tmx'].mean()
mean_p = temp_pre[(temp_pre['admin1']==district1) | (temp_pre['admin1']==district2)].groupby('time')['pre'].mean()
m_t = spei[(spei['admin1']==district1) | (spei['admin1']==district2)].groupby('time')['spei'].mean()

new_data = pd.DataFrame({ 'admin1': 'Banadir', 'tmx': mean_t, 'pre': mean_p}).reset_index()
new_data2 = pd.DataFrame({ 'admin1': 'Banadir', 'spei': m_t}).reset_index()

# Step 4: Append the new DataFrame to the original DataFrame
df3 = pd.concat([temp_pre, new_data])
df4 = pd.concat([spei, new_data2])

temp_pre = df3.sort_values(by=['time', 'admin1'], ascending=[True, True]).reset_index(drop=True)
spei = df4.sort_values(by=['time', 'admin1'], ascending=[True, True]).reset_index(drop=True)

In [16]:
temp_pre['time']=pd.to_datetime(temp_pre['time']).dt.strftime('%Y-%m-%d')
spei['time']=pd.to_datetime(spei['time']).dt.strftime('%Y-%m-%d')
temp_pre['month'] = temp_pre['time'].str[5:7]
spei['month'] = spei['time'].str[5:7]
temp_pre['month_year'] = temp_pre['time'].str[:7]
spei['month_year'] = spei['time'].str[:7]
conflicts['time'] = conflicts['time'].dt.strftime('%Y-%m').values

temp_pre = temp_pre[['month_year','admin1','tmx','pre']]
spei = spei[['month_year','admin1','spei']]
temp_pre = temp_pre.rename(columns={'month_year':'time'})
spei = spei.rename(columns={'month_year':'time'})

In [17]:
# Select a subset of the dataframe, where there is conflict data

start='1997-01'
end='2022-12'
temp_pre_97_22 = temp_pre[(temp_pre['time'] >= start) & (temp_pre['time'] <= end)]

In [18]:
# Perform an outer merge on the month_year column, which includes all month and year combinations from both dataframes
df_97_22 = pd.merge(temp_pre_97_22, conflicts, on=['time','admin1'], how='outer')
df_97_22 = df_97_22.fillna(0)

df_97_22=df_97_22[df_97_22['tmx'] != 0]

In [19]:
# Split the dataframe into regions

reg=[]
for admin in temp_pre['admin1'].unique():
    a = temp_pre[temp_pre['admin1']==admin].reset_index(drop=True)
    reg.append(a)

In [20]:
# Calculate the TA (temperature anomaly) and PA (precipitation anomaly) for each region

avg_t = np.zeros(18)
avg_p = np.zeros(18)
std_t = np.zeros(18)
std_p = np.zeros(18)

for i in range(18):

    reg[i]['year'] = reg[i]['time'].str[:4]
    reg[i]['month'] = reg[i]['time'].str[5:7]

    mean_temp_i = reg[i].groupby('month')['tmx'].mean()
    std_temp_i = reg[i].groupby('month')['tmx'].std()
    reg[i]['avg_temp'] = reg[i]['month'].map(mean_temp_i)
    reg[i]['std_temp'] = reg[i]['month'].map(std_temp_i)
    
    reg[i]['difference']= (reg[i]['tmx']-reg[i]['avg_temp'])/reg[i]['std_temp']
    reg[i]['TA'] = (reg[i]['difference'].shift(2) + reg[i]['difference'].shift(1) + reg[i]['difference'])/3
    
    mean_pre_i = reg[i].groupby('month')['pre'].mean()
    std_pre_i = reg[i].groupby('month')['pre'].std()
    reg[i]['avg_pre'] = reg[i]['month'].map(mean_pre_i)
    reg[i]['std_pre'] = reg[i]['month'].map(std_pre_i)
    
    reg[i]['diff_p']= (reg[i]['pre']-reg[i]['avg_pre'])/reg[i]['std_pre']
    reg[i]['PA'] = (reg[i]['diff_p'].shift(2) + reg[i]['diff_p'].shift(1) + reg[i]['diff_p'])/3
    
    reg[i] = reg[i].reindex(columns=['time','admin1','TA','PA'])
    reg[i] = reg[i].reset_index()

In [21]:
df_c_97_22=pd.concat([reg[i] for i in range(18)], axis=0)
df_c_97_22 = df_c_97_22.dropna()

df_c_97_22 = pd.merge(df_97_22, df_c_97_22, on=['time','admin1'], how='outer')
df_c_97_22 = pd.merge(df_c_97_22, spei, on=['time','admin1'], how='outer')
df_c_97_22 = df_c_97_22.dropna()
df_c_97_22 = df_c_97_22[['time','admin1','TA','PA','spei','conflicts']]

In [22]:
df_c_97_22 = df_c_97_22.sort_values(by=['time', 'admin1'], ascending=[True, True]).reset_index(drop=True)

In [23]:
# Create variable DL for drought lenght

df_c_97_22['DL'] = 0
mask = df_c_97_22['TA'] > 0

group_id = (mask != mask.shift()).cumsum()   # Create a group identifier for each consecutive group

count = df_c_97_22.groupby([group_id,'admin1']).cumcount() + 1    # Calculate the count within each group

df_c_97_22['DL'] = np.where(mask, count, 0)      # Assign the count values to the 'DL' column

df_c_97_22 = df_c_97_22[['time','admin1','TA','PA','DL','spei','conflicts']]

In [24]:
df_c_97_22['admin1'] = df_c_97_22['admin1'].str.replace('Lower Shabelle', 'Lower_Shabelle')
df_c_97_22['admin1'] = df_c_97_22['admin1'].str.replace('Middle Shabelle', 'Middle_Shabelle')
df_c_97_22['admin1'] = df_c_97_22['admin1'].str.replace('Lower Juba', 'Lower_Juba')
df_c_97_22['admin1'] = df_c_97_22['admin1'].str.replace('Middle Juba', 'Middle_Juba')
df_c_97_22['admin1'] = df_c_97_22['admin1'].str.replace('Woqooyi Galbeed', 'Woqooyi_Galbeed')

In [25]:
# Create the dummy variables

#one for each country
df_dummies = pd.get_dummies(df_c_97_22['admin1'])
df_with_dummies = df_c_97_22.join(df_dummies)

#one for each month
df_c_97_22['month'] = pd.DatetimeIndex(df_c_97_22['time']).month_name()
df_dummies_m = pd.get_dummies(df_c_97_22['month'])
df_with_dummies = df_with_dummies.join(df_dummies_m)
df_with_dummies['month'] = pd.DatetimeIndex(df_c_97_22['time']).month

#one for each for each country-month pair
df_dummies_mr = pd.get_dummies(df_c_97_22['admin1'] + df_c_97_22['month'])
df_with_dummies = df_with_dummies.join(df_dummies_mr)
df_with_dummies = df_with_dummies.replace({True: 1, False: 0})

In [27]:
y_var_name = 'conflicts'
X_var_names = ['TA','PA','DL','spei']

In [28]:
# Regression expression for OLS with dummies

unit_names = df_c_97_22['admin1'].unique().tolist()
unit_names.sort()
unit_names_t = df_c_97_22['month'].unique().tolist()
unit_names_mr = (df_c_97_22['admin1'] + df_c_97_22['month']).unique().tolist()

lsdv_expr = y_var_name + ' ~ '
i = 0
for X_var_name in X_var_names:
    if i > 0:
        lsdv_expr = lsdv_expr + ' + ' + X_var_name
    else:
        lsdv_expr = lsdv_expr + X_var_name
    i = i + 1
for dummy_name in unit_names[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name
for dummy_name_t in unit_names_t[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name_t
#for dummy_name_mr in unit_names_mr[:-1]:
    #lsdv_expr = lsdv_expr + ' + ' + dummy_name_mr
 
print('Regression expression for OLS with dummies=' + lsdv_expr)

Regression expression for OLS with dummies=conflicts ~ TA + PA + DL + spei + Awdal + Bakool + Banadir + Bari + Bay + Galgaduud + Gedo + Hiraan + Lower_Juba + Lower_Shabelle + Middle_Juba + Middle_Shabelle + Mudug + Nugaal + Sanaag + Sool + Togdheer + January + February + March + April + May + June + July + August + September + October + November


In [32]:
lsdv_model = smf.ols(formula=lsdv_expr, data=df_with_dummies)
lsdv_model_results = lsdv_model.fit()
print(lsdv_model_results.summary())

                            OLS Regression Results                            
Dep. Variable:              conflicts   R-squared:                       0.396
Model:                            OLS   Adj. R-squared:                  0.393
Method:                 Least Squares   F-statistic:                     110.1
Date:                Wed, 26 Jul 2023   Prob (F-statistic):               0.00
Time:                        15:31:01   Log-Likelihood:                -20077.
No. Observations:                5400   AIC:                         4.022e+04
Df Residuals:                    5367   BIC:                         4.044e+04
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.3030      0.748     

In [30]:
#export summary to txt
#with open('summary_spei_no_multicol.txt', 'w') as fh:
   # fh.write(lsdv_model_results.summary().as_text())