In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import statsmodels.formula.api as smf
import torch

In [2]:
file = r"C:\Users\PcLaptop\Documents\GitHub\Climate-and-conflict\reg_TA_PA.csv"
df = pd.read_csv(file)
df['admin1'] = df['admin1'].str.replace('Lower Shabelle', 'Lower_Shabelle')
df['admin1'] = df['admin1'].str.replace('Middle Shabelle', 'Middle_Shabelle')
df['admin1'] = df['admin1'].str.replace('Lower Juba', 'Lower_Juba')
df['admin1'] = df['admin1'].str.replace('Middle Juba', 'Middle_Juba')
df['admin1'] = df['admin1'].str.replace('Woqooyi Galbeed', 'Woqooyi_Galbeed')

In [3]:
#Create a new column with the drought lenght DL
df['DL'] = 0
mask = df['TA'] > 0
# Create a group identifier for each consecutive group
group_id = (mask != mask.shift()).cumsum()
# Calculate the count within each group
count = df.groupby(group_id).cumcount() + 1
# Assign the count values to the 'DL' column
df['DL'] = np.where(mask, count, 0)
#in df remove all rows that contain a NaN
df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [27]:
#Create the dummy variables, one for each country
df_dummies = pd.get_dummies(df['admin1'])
df_with_dummies = df.join(df_dummies)

#Create the dummy variables, one for each month
df['month'] = pd.DatetimeIndex(df['time_x']).month_name()
df_dummies_m = pd.get_dummies(df['month'])
df_with_dummies = df_with_dummies.join(df_dummies_m)
df_with_dummies['month'] = pd.DatetimeIndex(df['time_x']).month

#Create the dummy variables, one for each for each country-month pair
df_dummies_mr = pd.get_dummies(df['admin1'] + df['month'])
df_with_dummies = df_with_dummies.join(df_dummies_mr)

df_with_dummies = df_with_dummies.replace({True: 1, False: 0})

In [28]:
#Create tensors for TA, PA, DL and random noise

tensors_ta = []
tensors_pa = []
tensors_dl = []

# Iterate over unique regions
for region in df['admin1'].unique():
    # Filter the DataFrame for the current region
    region_df = df[df['admin1'] == region]
    
    # Get the 'TA' and 'PA' values for the current region as numpy arrays
    ta_values = region_df['TA'].values
    pa_values = region_df['PA'].values 
    dl_values = region_df['DL'].values
        
    # Create a tensor from the numpy arrays
    tensor_t = torch.tensor(ta_values)
    tensor_p = torch.tensor(pa_values)
    tensor_d = torch.tensor(dl_values)
    
    # Append the tensor to the list
    tensors_ta.append(tensor_t)
    tensors_pa.append(tensor_p)
    tensors_dl.append(tensor_d)

ta_tensor = torch.stack(tensors_ta, dim=0)
pa_tensor = torch.stack(tensors_pa, dim=0)
dl_tensor = torch.stack(tensors_dl, dim=0)

# Create a tensor with random noise
noise_gauss = torch.tensor(np.random.normal(0, 0.1, ta_tensor.size()), dtype=torch.float)
noise_uniform = torch.randn_like(ta_tensor)

In [58]:
#Regression formula with tensors to get synthetic data

c_tensor = ta_tensor + pa_tensor + dl_tensor + torch.tensor(np.random.normal(0, 1, ta_tensor.size()), dtype=torch.float)

#from c_tensor create an array 

df_with_dummies['conflicts_synth'] = c_tensor.view(-1,1).numpy()

In [59]:
#Create formula for OLS with synthetic data

y_var_name = 'conflicts_synth'
X_var_names = ['TA','PA','DL']

unit_names = df['admin1'].unique().tolist()
unit_names.sort()
unit_names_t = df['month'].unique().tolist()
unit_names_mr = (df['admin1'] + df['month']).unique().tolist()

lsdv_expr = y_var_name + ' ~ '
i = 0
for X_var_name in X_var_names:
    if i > 0:
        lsdv_expr = lsdv_expr + ' + ' + X_var_name
    else:
        lsdv_expr = lsdv_expr + X_var_name
    i = i + 1
for dummy_name in unit_names[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name
for dummy_name_t in unit_names_t[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name_t
for dummy_name_mr in unit_names_mr[:-1]:
    lsdv_expr = lsdv_expr + ' + ' + dummy_name_mr
 
print('Regression expression for OLS with dummies = ' + lsdv_expr)

Regression expression for OLS with dummies = conflicts_synth ~ TA + PA + DL + Awdal + Bakool + Banadir + Bari + Bay + Galgaduud + Gedo + Hiraan + Lower_Juba + Lower_Shabelle + Middle_Juba + Middle_Shabelle + Mudug + Nugaal + Sanaag + Sool + Togdheer + April + May + June + July + August + September + October + November + December + January + February + AwdalApril + AwdalMay + AwdalJune + AwdalJuly + AwdalAugust + AwdalSeptember + AwdalOctober + AwdalNovember + AwdalDecember + AwdalJanuary + AwdalFebruary + AwdalMarch + BakoolApril + BakoolMay + BakoolJune + BakoolJuly + BakoolAugust + BakoolSeptember + BakoolOctober + BakoolNovember + BakoolDecember + BakoolJanuary + BakoolFebruary + BakoolMarch + BanadirApril + BanadirMay + BanadirJune + BanadirJuly + BanadirAugust + BanadirSeptember + BanadirOctober + BanadirNovember + BanadirDecember + BanadirJanuary + BanadirFebruary + BanadirMarch + BariApril + BariMay + BariJune + BariJuly + BariAugust + BariSeptember + BariOctober + BariNovember 

In [60]:
lsdv_model = smf.ols(formula=lsdv_expr, data=df_with_dummies)
lsdv_model_results = lsdv_model.fit()
print(lsdv_model_results.summary())

                            OLS Regression Results                            
Dep. Variable:        conflicts_synth   R-squared:                       0.767
Model:                            OLS   Adj. R-squared:                  0.746
Method:                 Least Squares   F-statistic:                     37.69
Date:                Tue, 11 Jul 2023   Prob (F-statistic):               0.00
Time:                        16:38:52   Log-Likelihood:                -3756.8
No. Observations:                2718   AIC:                             7952.
Df Residuals:                    2499   BIC:                             9245.
Df Model:                         218                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               

In [38]:
with open('ols_synthetic_data.txt', 'w') as f:
    f.write(lsdv_model_results.summary().as_text())