In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dataframe_image as dfi
from datetime import datetime
import scipy
import itertools
import geopandas as gpd
import xarray as xr
import regionmask
import statsmodels.api as sm
import statsmodels.formula.api as smf

Data from POWER Data Access Viewer - NASA (1997-2020)

In [2]:
da1 = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\POWER_Regional_monthly_1997_2020.nc"
da2 = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\POWER_Regional_monthly_1997_2020_south.nc"

file_paths_list =[da1,da2]
monthly_forecast=xr.Dataset()

for file in file_paths_list:
        monthly_forecast = xr.merge([monthly_forecast,xr.open_mfdataset(file)], compat='no_conflicts')

Data from UEA CRU TS4.07 (1991-2022)

In [3]:
file = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\cru_ts4.07.1901.2022.tmx.dat.nc"
file2 = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\cru_ts4.07.1901.2022.pre.dat.nc"

file_paths_list =[file,file2]
monthly_forecast=xr.Dataset()

for file in file_paths_list:
        monthly_forecast = xr.merge([monthly_forecast,xr.open_mfdataset(file)], compat='override')

Data on conflict events from ACLED

In [4]:
file = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\1997-01-01-2023-07-18-Eastern_Africa-Somalia.csv"
df = pd.read_csv(file)

# Keep protests and strategic developments?

df=df[df['event_type'] != 'Protests']
df=df[df['event_type'] != 'Strategic developments']

Shapefile with administrative boundaries of Somalia

In [5]:
path = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\som_adm_ocha_itos_20230308_shp\som_admbnda_adm1_ocha_20230308.shp"
states_gdf = gpd.read_file(path) 

Limit the lat-lon and time 

In [6]:
def get_aoi(shp, world=True):
    lon_lat = {}
    # Get lat min, max
    aoi_lat = [float(shp.total_bounds[1]), float(shp.total_bounds[3])]
    aoi_lon = [float(shp.total_bounds[0]), float(shp.total_bounds[2])]

    lon_lat["lon"] = aoi_lon
    lon_lat["lat"] = aoi_lat
    return lon_lat

bounds = get_aoi(states_gdf)

In [7]:
start_date = '1901-01-16'
end_date = '2022-12-16'

region = monthly_forecast[["pre",'tmx']].sel(
    time=slice(start_date, end_date),
    lon=slice(bounds["lon"][0], bounds["lon"][1]),
    lat=slice(bounds["lat"][0], bounds["lat"][1]))

In [8]:
region_mask = regionmask.mask_3D_geopandas(states_gdf,
                                         monthly_forecast.lon,
                                         monthly_forecast.lat)

somalia = region.where(region_mask).groupby("time").mean(["lat", "lon"]).to_dataframe().reset_index()

In [9]:
replacement_dict = {0  :  'Awdal',
1    :         'Bakool',
2      :       'Banadir',
3      :         'Bari',
4       :         'Bay',
5        :  'Galgaduud',
6          :      'Gedo',
7          :   'Hiraan',
8   :       'Lower Juba',
9   :   'Lower Shabelle',
10  :      'Middle Juba',
11   : 'Middle Shabelle',
12    :          'Mudug',
13    :        'Nugaal',
14      :       'Sanaag',
15       :        'Sool',
16        :   'Togdheer',
17   : 'Woqooyi Galbeed'}

somalia['region'] = somalia['region'].replace(replacement_dict)

In [10]:
# Modify event_date column to datetime

v = df["event_date"].values
dt = [datetime.strptime(v[i], "%d %B %Y") for i in range(len(v))]
q=[]

for i in range(len(dt)):
    q.append(datetime.timestamp(dt[i]))
    
df.insert(loc=3, column='date_timestamp', value=q)
df = df.sort_values("date_timestamp")

df['event_date'] = pd.to_datetime(df['event_date'])
df=df.set_index('event_date') 

In [11]:
conflict = df.groupby([pd.Grouper(freq='40320min'),"admin1"]).count()
conflict.reset_index(level=[0, 1], inplace=True)
conflict = conflict[['event_date','admin1','year']].rename(columns={'year': 'conflicts','event_date': 'time'})
conflict['datetime_column'] = pd.to_datetime(conflict['time'])

# Aggregate the datetime objects by month
conf = conflict.groupby([pd.Grouper(key='time', freq='M'),'admin1'])['conflicts'].sum().to_frame()

In [12]:
dates = conf.index.get_level_values('time').unique()
districts = conf.index.get_level_values('admin1').unique()
all_combinations = pd.MultiIndex.from_product([dates, districts], names=['time', 'admin1'])

conf = conf.reindex(all_combinations, fill_value=0).reset_index()    # Reindex the DataFrame with all combinations

In [13]:
conflicts = conf.sort_values(by=['time', 'admin1'], ascending=[True, True])
conflicts.reset_index(drop=True, inplace=True)

In [14]:
file = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\all codes\temp_pre_somalia.csv"
temp_pre = pd.read_csv(file)
#rename columns
temp_pre=temp_pre.rename(columns={'region': 'admin1'})

In [15]:
# Add Banadir region with tmx and pre as mean of the neighbouring regions

district1 = 'Lower Shabelle'  
district2 = 'Middle Shabelle'  

# Step 2: Calculate the mean tmx and pre for the neighboring districts
mean_t = temp_pre[(temp_pre['admin1']==district1) | (temp_pre['admin1']==district2)].groupby('time')['tmx'].mean()
mean_p = temp_pre[(temp_pre['admin1']==district1) | (temp_pre['admin1']==district2)].groupby('time')['pre'].mean()

new_data = pd.DataFrame({ 'admin1': 'Banadir', 'tmx': mean_t, 'pre': mean_p}).reset_index()

# Step 4: Append the new DataFrame to the original DataFrame
df3 = pd.concat([temp_pre, new_data])

temp_pre = df3.sort_values(by=['time', 'admin1'], ascending=[True, True]).reset_index(drop=True)

In [16]:
temp_pre['month'] = temp_pre['time'].str[5:7]
temp_pre['month_year'] = temp_pre['time'].str[:7]
conflicts['time'] = conflicts['time'].dt.strftime('%Y-%m').values

temp_pre = temp_pre[['month_year','admin1','tmx','pre']]
temp_pre = temp_pre.rename(columns={'month_year':'time'})

In [17]:
# Select a subset of the dataframe, where there is conflict data

start='1997-01'
end='2022-12'
temp_pre_97_22 = temp_pre[(temp_pre['time'] >= start) & (temp_pre['time'] <= end)]

In [18]:
# Perform an outer merge on the month_year column, which includes all month and year combinations from both dataframes
df_97_22 = pd.merge(temp_pre_97_22, conflicts, on=['time','admin1'], how='outer')
df_97_22 = df_97_22.fillna(0)

df_97_22=df_97_22[df_97_22['tmx'] != 0]

In [19]:
# Split the dataframe into regions

reg=[]
for admin in temp_pre['admin1'].unique():
    a = temp_pre[temp_pre['admin1']==admin].reset_index(drop=True)
    reg.append(a)

In [20]:
# Calculate the TA (temperature anomaly) and PA (precipitation anomaly) for each region

avg_t = np.zeros(18)
avg_p = np.zeros(18)
std_t = np.zeros(18)
std_p = np.zeros(18)

for i in range(18):

    reg[i]['year'] = reg[i]['time'].str[:4]
    reg[i]['month'] = reg[i]['time'].str[5:7]

    mean_temp_i = reg[i].groupby('month')['tmx'].mean()
    std_temp_i = reg[i].groupby('month')['tmx'].std()
    reg[i]['avg_temp'] = reg[i]['month'].map(mean_temp_i)
    reg[i]['std_temp'] = reg[i]['month'].map(std_temp_i)
    
    reg[i]['difference']= (reg[i]['tmx']-reg[i]['avg_temp'])/reg[i]['std_temp']
    reg[i]['TA'] = (reg[i]['difference'].shift(2) + reg[i]['difference'].shift(1) + reg[i]['difference'])/3
    
    mean_pre_i = reg[i].groupby('month')['pre'].mean()
    std_pre_i = reg[i].groupby('month')['pre'].std()
    reg[i]['avg_pre'] = reg[i]['month'].map(mean_pre_i)
    reg[i]['std_pre'] = reg[i]['month'].map(std_pre_i)
    
    reg[i]['diff_p']= (reg[i]['pre']-reg[i]['avg_pre'])/reg[i]['std_pre']
    reg[i]['PA'] = (reg[i]['diff_p'].shift(2) + reg[i]['diff_p'].shift(1) + reg[i]['diff_p'])/3
    
    #reg[i] = reg[i].reindex(columns=['time','admin1','TA','PA'])
    reg[i] = reg[i].reset_index()

In [21]:
df_c_97_22=pd.concat([reg[i] for i in range(18)], axis=0)
df_c_97_22 = df_c_97_22.dropna()

df_c_97_22 = pd.merge(df_97_22, df_c_97_22, on=['time','admin1'], how='outer')
df_c_97_22 = df_c_97_22.dropna()
df_c_97_22 = df_c_97_22[['time','admin1','TA','PA','conflicts']]

In [22]:
# Create variable DL for drought lenght

df_c_97_22['DL'] = 0
mask = df_c_97_22['TA'] > 0

group_id = (mask != mask.shift()).cumsum()   # Create a group identifier for each consecutive group

count = df_c_97_22.groupby(group_id).cumcount() + 1    # Calculate the count within each group

df_c_97_22['DL'] = np.where(mask, count, 0)      # Assign the count values to the 'DL' column

df_c_97_22 = df_c_97_22[['time','admin1','TA','PA','DL','conflicts']]

In [23]:
df_c_97_22['admin1'] = df_c_97_22['admin1'].str.replace('Lower Shabelle', 'Lower_Shabelle')
df_c_97_22['admin1'] = df_c_97_22['admin1'].str.replace('Middle Shabelle', 'Middle_Shabelle')
df_c_97_22['admin1'] = df_c_97_22['admin1'].str.replace('Lower Juba', 'Lower_Juba')
df_c_97_22['admin1'] = df_c_97_22['admin1'].str.replace('Middle Juba', 'Middle_Juba')
df_c_97_22['admin1'] = df_c_97_22['admin1'].str.replace('Woqooyi Galbeed', 'Woqooyi_Galbeed')

In [24]:
df=df_c_97_22

In [25]:
# Create the dummy variables

#one for each country
df_dummies = pd.get_dummies(df_c_97_22['admin1'])
df_with_dummies = df_c_97_22.join(df_dummies)

#one for each month
df_c_97_22['month'] = pd.DatetimeIndex(df_c_97_22['time']).month_name()
df_dummies_m = pd.get_dummies(df_c_97_22['month'])
df_with_dummies = df_with_dummies.join(df_dummies_m)
df_with_dummies['month'] = pd.DatetimeIndex(df_c_97_22['time']).month

#one for each for each country-month pair
df_dummies_mr = pd.get_dummies(df_c_97_22['admin1'] + df_c_97_22['month'])
df_with_dummies = df_with_dummies.join(df_dummies_mr)
df_with_dummies = df_with_dummies.replace({True: 1, False: 0})

In [26]:
# Noise in the independent variables

#df_with_dummies['TA_noise'] = df_with_dummies['TA'] + np.random.normal(0, 0.1, len(df_with_dummies))
#df_with_dummies['PA_noise'] = df_with_dummies['PA'] + np.random.normal(0, 0.1, len(df_with_dummies))
#df_with_dummies['DL_noise'] = df_with_dummies['DL'] + np.random.normal(0, 0.1, len(df_with_dummies))

In [27]:
y_var_name = 'conflicts'
X_var_names = ['TA','PA','DL']

In [28]:
# Regression expression for OLS with dummies

unit_names = df['admin1'].unique().tolist()
unit_names.sort()
unit_names_t = df['month'].unique().tolist()
unit_names_mr = (df['admin1'] + df['month']).unique().tolist()

lsdv_expr = y_var_name + ' ~ '
i = 0
for X_var_name in X_var_names:
    if i > 0:
        lsdv_expr = lsdv_expr + ' + ' + X_var_name
    else:
        lsdv_expr = lsdv_expr + X_var_name
    i = i + 1
for dummy_name in unit_names[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name
for dummy_name_t in unit_names_t[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name_t
for dummy_name_mr in unit_names_mr[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name_mr
 
print('Regression expression for OLS with dummies=' + lsdv_expr)

Regression expression for OLS with dummies=conflicts ~ TA + PA + DL + Awdal + Bakool + Banadir + Bari + Bay + Galgaduud + Gedo + Hiraan + Lower_Juba + Lower_Shabelle + Middle_Juba + Middle_Shabelle + Mudug + Nugaal + Sanaag + Sool + Togdheer + January + February + March + April + May + June + July + August + September + October + November + AwdalJanuary + BakoolJanuary + BanadirJanuary + BariJanuary + BayJanuary + GalgaduudJanuary + GedoJanuary + HiraanJanuary + Lower_JubaJanuary + Lower_ShabelleJanuary + Middle_JubaJanuary + Middle_ShabelleJanuary + MudugJanuary + NugaalJanuary + SanaagJanuary + SoolJanuary + TogdheerJanuary + Woqooyi_GalbeedJanuary + AwdalFebruary + BakoolFebruary + BanadirFebruary + BariFebruary + BayFebruary + GalgaduudFebruary + GedoFebruary + HiraanFebruary + Lower_JubaFebruary + Lower_ShabelleFebruary + Middle_JubaFebruary + Middle_ShabelleFebruary + MudugFebruary + NugaalFebruary + SanaagFebruary + SoolFebruary + TogdheerFebruary + Woqooyi_GalbeedFebruary + Awd

In [29]:
lsdv_model = smf.ols(formula=lsdv_expr, data=df_with_dummies)
lsdv_model_results = lsdv_model.fit()
print(lsdv_model_results.summary())

                            OLS Regression Results                            
Dep. Variable:              conflicts   R-squared:                       0.411
Model:                            OLS   Adj. R-squared:                  0.387
Method:                 Least Squares   F-statistic:                     17.26
Date:                Tue, 25 Jul 2023   Prob (F-statistic):               0.00
Time:                        11:32:11   Log-Likelihood:                -21005.
No. Observations:                5616   AIC:                         4.245e+04
Df Residuals:                    5397   BIC:                         4.390e+04
Df Model:                         218                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               

In [30]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [31]:
# Assume 'X' contains the independent variables and 'y' contains the dependent variable
X = df_with_dummies.drop(['time','conflicts','admin1'], axis=1)
y = df_with_dummies['conflicts']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform PCA and select the number of components
n_components = 9  # Choose an appropriate number of components based on explained variance or cross-validation
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

X_train_pca = sm.add_constant(X_train_pca, prepend=False)  # Adding constant without centering
X_test_pca = sm.add_constant(X_test_pca, prepend=False)

model = sm.OLS(y_train, X_train_pca)
result = model.fit()

# Print the summary of the regression model
print(result.summary())

# Get the coefficients and other statistics from the result object
coefficients = result.params
intercept = coefficients[0]
other_coefficients = coefficients[1:]


                            OLS Regression Results                            
Dep. Variable:              conflicts   R-squared:                       0.073
Model:                            OLS   Adj. R-squared:                  0.071
Method:                 Least Squares   F-statistic:                     39.12
Date:                Tue, 25 Jul 2023   Prob (F-statistic):           1.25e-67
Time:                        11:32:12   Log-Likelihood:                -17895.
No. Observations:                4492   AIC:                         3.581e+04
Df Residuals:                    4482   BIC:                         3.587e+04
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.1528      0.110      1.387      0.1

In [32]:

# Calculate the coefficients of the initial independent variables
coefficients_pca = result.params[1:]  # Coefficients of the principal components (excluding the intercept)

# Calculate the loadings of the principal components
loadings = pca.components_.T  # Transpose the loadings to match the principal component scores

# Calculate the coefficients of the initial independent variables
coefficients_original = loadings.dot(coefficients_pca)

# Print the coefficients of the initial independent variables
print("Coefficients of the initial independent variables:")
print(coefficients_original)

Coefficients of the initial independent variables:
[-3.50715282e-01  1.81764672e-01 -3.89328697e-02 -9.97909356e-01
  1.01792060e-01 -1.43361419e+00  1.54848349e-01 -2.49461834e-01
  4.20229105e-01  1.93047939e-01  7.18677601e-01  5.79997102e-01
 -6.99593924e-01  8.57065331e-03 -5.32048074e-02  1.39549712e+00
  6.65194436e-01 -1.20214552e+00  2.52156615e-01  3.88901344e-01
 -1.44974770e-01  2.40550412e+00  1.64460096e+00 -6.16341406e-02
 -8.08079037e-01  1.69956157e-01  1.36409066e+00 -8.22583582e-01
 -1.14377666e+00 -1.30730494e+00 -6.80474024e-01 -6.21891780e-01
 -1.91599071e-01  9.55782522e-03  2.69335169e-01  1.11130821e-01
 -2.66618464e-01 -4.86568178e-01 -2.50322655e-01  6.01452336e-02
 -4.38039779e-01 -5.16974746e-01 -5.85234699e-01 -4.31831954e-01
 -4.38305414e-01 -3.23861194e-01  5.72796581e-01  4.05894513e-01
 -9.83858357e-04 -1.48281572e-01  6.23374434e-02  3.49759638e-01
 -1.87354956e-01 -2.21243503e-01 -2.91791947e-01 -1.41593483e-01
 -1.34220192e-01 -2.25700574e-02  1.369

In [33]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Add a constant term (intercept) to the independent variables for VIF calculation
X_with_const = sm.add_constant(X)

# Calculate VIF for each independent variable
vif = pd.DataFrame()
vif["Variable"] = X_with_const.columns
vif["VIF"] = [variance_inflation_factor(X_with_const.values, i) for i in range(X_with_const.shape[1])]

# Print the VIF dataframe
print(vif)

  return 1 - self.ssr/self.centered_tss
  vif = 1. / (1. - r_squared_i)


                     Variable       VIF
0                       const  0.000000
1                          TA  1.682592
2                          PA  1.115559
3                          DL  1.455796
4                       Awdal       inf
..                        ...       ...
246      Woqooyi_GalbeedMarch       inf
247        Woqooyi_GalbeedMay       inf
248   Woqooyi_GalbeedNovember       inf
249    Woqooyi_GalbeedOctober       inf
250  Woqooyi_GalbeedSeptember       inf

[251 rows x 2 columns]


In [34]:
vif

Unnamed: 0,Variable,VIF
0,const,0.000000
1,TA,1.682592
2,PA,1.115559
3,DL,1.455796
4,Awdal,inf
...,...,...
246,Woqooyi_GalbeedMarch,inf
247,Woqooyi_GalbeedMay,inf
248,Woqooyi_GalbeedNovember,inf
249,Woqooyi_GalbeedOctober,inf


In [35]:
#export csv file
dff=df_with_dummies.drop(['month'], axis=1)
#dff.to_csv(r'C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\R\df_with_dummies.csv', index = False, header=True)

In [36]:
#df.to_csv(r'C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\R\df.csv', index = False, header=True)

In [37]:
#df.drop(['month'], axis=1).to_csv(r'C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\R\df.csv', index = False, header=True)