In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score


In [29]:
#parameters setting

#Define the path to your datafolder below
your_datapath = ''

#Define search space for number of trees in random forest and depth of trees
num_trees_min = 8
num_trees_max = 9

depth_min = 4
depth_max = 5

In [35]:
datapath = ''
district_name = 'Afgooye'
prevalence_df = pd.read_csv(datapath + 'prevalence_estimates.csv', parse_dates=['date'])
ipc_df = pd.read_csv(datapath + 'ipc2.csv', parse_dates=['date'])
risk_df = pd.read_csv(datapath + 'FSNAU_riskfactors.csv', parse_dates=['date'])
production_df = pd.read_csv(datapath + 'production.csv', parse_dates=['date'])
    
admissions_df = pd.read_csv(datapath + 'admissions.csv', parse_dates=['date'])
conflict_df = pd.read_csv(datapath + 'conflict.csv', parse_dates=['date'])
    
#date only
covid_df = pd.read_csv(datapath + 'covid.csv', parse_dates=['date'])
    
    
#Select data for specific district
prevalence_df = prevalence_df[prevalence_df['district']==district_name]
ipc_df = ipc_df[ipc_df['district']==district_name]
risk_df = risk_df[risk_df['district']==district_name]
production_df = production_df[production_df['district']==district_name]
admissions_df = admissions_df[admissions_df['district']==district_name]
conflict_df = conflict_df[conflict_df['district']==district_name]
    
conflict_df.fillna(0,inplace=True)
production_df.fillna(0,inplace=True)
    
#GroupBy "key", 6M = 6 months, x.replace(day=1) = the first day of that month
risk_df = risk_df.groupby(pd.Grouper(key='date', freq='6M')).mean() 
risk_df = risk_df.reset_index()
risk_df['date'] = risk_df['date'].apply(lambda x : x.replace(day=1))
    
covid_df = covid_df.groupby(pd.Grouper(key='date', freq='6M')).sum()
covid_df = covid_df.reset_index()
covid_df['date'] = covid_df['date'].apply(lambda x : x.replace(day=1))
    
conflict_df = conflict_df.groupby(pd.Grouper(key='date', freq='6M')).sum()
conflict_df = conflict_df.reset_index()
conflict_df['date'] = conflict_df['date'].apply(lambda x : x.replace(day=1))
    
admissions_df = admissions_df.groupby(pd.Grouper(key='date', freq='6M')).sum()
admissions_df = admissions_df.reset_index()
admissions_df['date'] = admissions_df['date'].apply(lambda x : x.replace(day=1))

production_df = production_df.groupby(pd.Grouper(key='date', freq='6M')).sum()
production_df = production_df.reset_index()
production_df['date'] = production_df['date'].apply(lambda x : x.replace(day=1))

#production_df['cropdiv'] = production_df.count(axis=1)
    
#Sort dataframes on date
prevalence_df.sort_values('date', inplace=True)
covid_df.sort_values('date', inplace=True)
ipc_df.sort_values('date', inplace=True)
risk_df.sort_values('date', inplace=True)
production_df.sort_values('date', inplace=True)
admissions_df.sort_values('date', inplace=True)
conflict_df.sort_values('date', inplace=True)
    

#Merge dataframes, only joining on current or previous dates as to prevent data leakage
df = pd.merge_asof(left=prevalence_df, right=ipc_df, direction='backward', on='date')
df = pd.merge_asof(left=df, right=production_df, direction='backward', on='date')
df = pd.merge_asof(left=df, right=risk_df, direction='backward', on='date')
df = pd.merge_asof(left=df, right=covid_df, direction='backward', on='date')
    
df = pd.merge_asof(left=df, right=admissions_df, direction='backward', on='date')
df = pd.merge_asof(left=df, right=conflict_df, direction='backward', on='date')
    
    
#Calculate prevalence 6lag
df['prevalence_6lag'] = df['GAM Prevalence'].shift(1)
df['next_prevalence'] = df['GAM Prevalence'].shift(-1)

df.rename(columns={"GAM Prevalence": "prevalence", "new_cases": "covid", "ndvi_score": "ndvi", "phase3plus_perc": "ipc", "total population": "population"}, inplace = True)
    
    
#Add month column
df['month'] = df['date'].dt.month
    
#Add target variable: increase for next month prevalence (boolean)
increase = [False if x[1]<x[0] else True for x in list(zip(df['prevalence'], df['prevalence'][1:]))]
increase.append(False)
df['increase'] = increase
df.iloc[-1, df.columns.get_loc('increase')] = np.nan #No info on next month
    
#Add target variable: increase for next month prevalence (boolean)
increase_numeric = [x[1] - x[0] for x in list(zip(df['prevalence'], df['prevalence'][1:]))]
increase_numeric.append(0)
df['increase_numeric'] = increase_numeric
df.iloc[-1, df.columns.get_loc('increase_numeric')] = np.nan #No info on next month
    
df.loc[(df.date < pd.to_datetime('2020-03-01')), 'covid'] = 0
df

Unnamed: 0.1,Unnamed: 0,date,district_x,population,Under-Five Population,GAM,MAM,SAM,prevalence,SAM Prevalence,...,n_protests,n_riots,n_strategicdev,n_violcivilians,n_conflict_total,prevalence_6lag,next_prevalence,month,increase,increase_numeric
0,606,2017-07-01,Afgooye,309683.385,61936.677,22867.021148,17713.889622,5153.131526,0.3692,0.0832,...,0.0,0.0,3.0,22.0,109.0,,0.351,7,False,-0.0182
1,532,2018-01-01,Afgooye,309683.385,61936.677,21739.773627,16747.677461,4992.096166,0.351,0.0806,...,3.0,1.0,12.0,25.0,166.0,0.3692,0.384859,1,True,0.033859
2,453,2018-07-01,Afgooye,262835.818868,52567.163774,20230.94924,16460.215189,3770.734052,0.384859,0.071732,...,1.0,1.0,6.0,19.0,120.0,0.351,0.43834,7,True,0.053481
3,375,2019-01-01,Afgooye,262835.818868,52567.163774,23042.3088,18830.718275,4211.590525,0.43834,0.080118,...,0.0,1.0,0.0,13.0,103.0,0.384859,0.462159,1,True,0.023819
4,297,2019-07-01,Afgooye,262835.818868,52567.163774,24294.394753,20573.68392,3720.710833,0.462159,0.07078,...,0.0,0.0,11.0,15.0,79.0,0.43834,0.449223,7,False,-0.012937
5,260,2020-01-01,Afgooye,191430.835956,38286.167191,17199.009189,14577.634376,2621.374813,0.449223,0.068468,...,0.0,1.0,4.0,19.0,127.0,0.462159,0.458235,1,True,0.009013
6,186,2020-07-01,Afgooye,191431.0,38286.0,17544.0,14278.0,3266.0,0.458235,0.085305,...,1.0,0.0,2.0,21.0,135.0,0.449223,0.484132,7,True,0.025897
7,118,2021-01-01,Afgooye,191431.0,38285.0,18535.0,,3395.0,0.484132,0.088677,...,2.0,0.0,2.0,14.0,141.0,0.458235,0.463764,1,False,-0.020368
8,1,2021-07-01,Afgooye,,94444.6,43800.0,,8930.0,0.463764,0.094553,...,0.0,0.0,1.0,9.0,175.0,0.484132,,7,,


In [31]:
#Function that creates a pandas dataframe for a single district with columns for the baseline model with semiyearly entries
def make_district_df_semiyearly(datapath, district_name):
    """
    Function that creates a pandas dataframe for a single district with columns for the baseline model with semiyearly entries

    Parameters
    ----------
    datapath : string
        Path to the datafolder
    district_name : string
        Name of the district

    Returns
    -------
    df : pandas dataframe
    """

	#Read all relevant datasets
    #date and district 
    prevalence_df = pd.read_csv(datapath + 'prevalence_estimates.csv', parse_dates=['date'])
    ipc_df = pd.read_csv(datapath + 'ipc2.csv', parse_dates=['date'])
    risk_df = pd.read_csv(datapath + 'FSNAU_riskfactors.csv', parse_dates=['date'])
    production_df = pd.read_csv(datapath + 'production.csv', parse_dates=['date'])
    
    admissions_df = pd.read_csv(datapath + 'admissions.csv', parse_dates=['date'])
    conflict_df = pd.read_csv(datapath + 'conflict.csv', parse_dates=['date'])
    
    #date only
    covid_df = pd.read_csv(datapath + 'covid.csv', parse_dates=['date'])
    
    
    #Select data for specific district
    prevalence_df = prevalence_df[prevalence_df['district']==district_name]
    ipc_df = ipc_df[ipc_df['district']==district_name]
    risk_df = risk_df[risk_df['district']==district_name]
    production_df = production_df[production_df['district']==district_name]
    admissions_df = admissions_df[admissions_df['district']==district_name]
    conflict_df = conflict_df[conflict_df['district']==district_name]
    
    conflict_df.fillna(0,inplace=True)
    production_df.fillna(0,inplace=True)
    
    #GroupBy "key", 6M = 6 months, x.replace(day=1) = the first day of that month
    risk_df = risk_df.groupby(pd.Grouper(key='date', freq='6M')).mean() 
    risk_df = risk_df.reset_index()
    risk_df['date'] = risk_df['date'].apply(lambda x : x.replace(day=1))
    
    covid_df = covid_df.groupby(pd.Grouper(key='date', freq='6M')).sum()
    covid_df = covid_df.reset_index()
    covid_df['date'] = covid_df['date'].apply(lambda x : x.replace(day=1))
    
    conflict_df = conflict_df.groupby(pd.Grouper(key='date', freq='6M')).sum()
    conflict_df = conflict_df.reset_index()
    conflict_df['date'] = conflict_df['date'].apply(lambda x : x.replace(day=1))
    
    admissions_df = admissions_df.groupby(pd.Grouper(key='date', freq='6M')).sum()
    admissions_df = admissions_df.reset_index()
    admissions_df['date'] = admissions_df['date'].apply(lambda x : x.replace(day=1))

    production_df = production_df.groupby(pd.Grouper(key='date', freq='6M')).sum()
    production_df = production_df.reset_index()
    production_df['date'] = production_df['date'].apply(lambda x : x.replace(day=1))
    
    #production_df['cropdiv'] = production_df.count(axis=1)
    
    #Sort dataframes on date
    prevalence_df.sort_values('date', inplace=True)
    covid_df.sort_values('date', inplace=True)
    ipc_df.sort_values('date', inplace=True)
    risk_df.sort_values('date', inplace=True)
    production_df.sort_values('date', inplace=True)
    admissions_df.sort_values('date', inplace=True)
    conflict_df.sort_values('date', inplace=True)
    

    #Merge dataframes, only joining on current or previous dates as to prevent data leakage
    df = pd.merge_asof(left=prevalence_df, right=ipc_df, direction='backward', on='date')
    df = pd.merge_asof(left=df, right=production_df, direction='backward', on='date')
    df = pd.merge_asof(left=df, right=risk_df, direction='backward', on='date')
    df = pd.merge_asof(left=df, right=covid_df, direction='backward', on='date')
    
    df = pd.merge_asof(left=df, right=admissions_df, direction='backward', on='date')
    df = pd.merge_asof(left=df, right=conflict_df, direction='backward', on='date')
    
    
    #Calculate prevalence 6lag
    df['prevalence_6lag'] = df['GAM Prevalence'].shift(1)
    df['next_prevalence'] = df['GAM Prevalence'].shift(-1)
    
    '''    
    #Select needed columns
    df = df[['date', 'district', 'GAM Prevalence', 'next_prevalence', 'prevalence_6lag', 'new_cases', 'ndvi_score', 'phase3plus_perc', 'cropdiv', 'total population']]
    df.columns = ['date', 'district', 'prevalence', 'next_prevalence', 'prevalence_6lag', 'covid', 'ndvi', 'ipc', 'cropdiv', 'population']
    '''
    df.rename(columns={"GAM Prevalence": "prevalence", "new_cases": "covid", "ndvi_score": "ndvi", "phase3plus_perc": "ipc", "total population": "population"}, inplace = True)
    
    #Add month column
    df['month'] = df['date'].dt.month
    
    #Add target variable: increase for next month prevalence (boolean)
    increase = [False if x[1]<x[0] else True for x in list(zip(df['prevalence'], df['prevalence'][1:]))]
    increase.append(False)
    df['increase'] = increase
    df.iloc[-1, df.columns.get_loc('increase')] = np.nan #No info on next month
    
    #Add target variable: increase for next month prevalence (boolean)
    increase_numeric = [x[1] - x[0] for x in list(zip(df['prevalence'], df['prevalence'][1:]))]
    increase_numeric.append(0)
    df['increase_numeric'] = increase_numeric
    df.iloc[-1, df.columns.get_loc('increase_numeric')] = np.nan #No info on next month
    
    df.loc[(df.date < pd.to_datetime('2020-03-01')), 'covid'] = 0
    
    return(df)

In [32]:
#Function that combines the semiyearly dataset (from the function make_district_df_semiyearly) of all districts
def make_combined_df_semiyearly(datapath):
    """
    Function that creates a pandas dataframe for all districts with columns for the baseline model with semiyearly entries

    Parameters
    ----------
    datapath : string
        Path to the datafolder

    Returns
    -------
    df : pandas dataframe
    """

    prevdf = pd.read_csv(datapath + 'prevalence_estimates.csv', parse_dates=['date'])
    districts = prevdf['district'].unique()
    
    df_list = []
    for district in districts:
        district_df = make_district_df_semiyearly(datapath, district)
        district_df['district'] = district
        df_list.append(district_df)
        
    df = pd.concat(df_list, ignore_index=True)
    df['district_encoded'] = df['district'].astype('category').cat.codes

    return df


In [33]:
#Function that returns every possible subset (except the empty set) of the input list l
def subsets (l):
    subset_list = []
    for i in range(len(l) + 1):
        for j in range(i):
            subset_list.append(l[j: i])
    return subset_list


In [34]:
'''------------SECTION DATAFRAME CREATION--------------'''
#Create the dataframe for all districts
df = make_combined_df_semiyearly(your_datapath)

#Drop every row with missing values
df.dropna(inplace=True)

#Sort dataframe on date and reset the index
df.sort_values('date', inplace=True)
df.reset_index(inplace=True, drop=True)

#Drop disctricts with less than 7 observations: 'Burco', 'Saakow', 'Rab Dhuure', 'Baydhaba', 'Afmadow'
df.drop(df[df['district'].isin(['Burco', 'Saakow', 'Rab Dhuure', 'Baydhaba', 'Afmadow'])].index, inplace=True)
df

TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'

In [27]:
'''------------SECTION RANDOM FOREST CROSS VALIDATION--------------'''
#WARNING: this process can take some time, since there are a lot of hyperparameters to investigate. The search space can be manually reduced to speed up the process.

#Create empty list to store model scores
parameter_scores = []

#Define target and explanatory variables
X = df.drop(columns = ['increase', 'increase_numeric', 'date', 'district', 'prevalence', 'next_prevalence']) #Note that these columns are dropped, the remaining columns are used as explanatory variables
y = df['next_prevalence'].values

for num_trees in range(num_trees_min, num_trees_max):
    
    for depth in range(depth_min, depth_max):
        
        #Investigate every subset of explanatory variables
        for features in subsets(X.columns):
        
            #First CV split. The 99 refers to the first 3 observations for the 33 districts in the data.
            Xtrain = X[:99][features].copy().values
            ytrain = y[:99]
            Xtest = X[99:132][features].copy().values
            ytest = y[99:132]

            #Create a RandomForestRegressor with the selected hyperparameters and random state 0.
            clf = RandomForestRegressor(n_estimators=num_trees, max_depth=depth, random_state=0)

            #Fit to the training data
            clf.fit(Xtrain, ytrain)

            #Make a prediction on the test data
            predictions = clf.predict(Xtest)

            #Calculate mean absolute error
            MAE1 = mean_absolute_error(ytest, predictions)


            #Second CV split. The 132 refers to the first 4 observations for the 33 districts in the data.
            Xtrain = X[:132][features].copy().values
            ytrain = y[:132]
            Xtest = X[132:165][features].copy().values
            ytest = y[132:165]

            #Create a RandomForestRegressor with the selected hyperparameters and random state 0.
            clf = RandomForestRegressor(n_estimators=num_trees, max_depth=depth, random_state=0)

            #Fit to the training data
            clf.fit(Xtrain, ytrain)

            #Make a prediction on the test data
            predictions = clf.predict(Xtest)

            #Calculate mean absolute error
            MAE2 = mean_absolute_error(ytest, predictions)

            #Calculate the mean MAE over the two folds
            mean_MAE = (MAE1 + MAE2)/2

            #Store the mean MAE together with the used hyperparameters in list 
            parameter_scores.append((mean_MAE, num_trees, depth, features))

#Sort the models based on score and retrieve the hyperparameters of the best model
parameter_scores.sort(key=lambda x: x[0])
best_model_score = parameter_scores[0][0]
best_model_trees = parameter_scores[0][1]
best_model_depth = parameter_scores[0][2]
best_model_columns = list(parameter_scores[0][3])

In [28]:
'''------------SECTION FINAL EVALUATION--------------'''
X = df[best_model_columns].values
y = df['next_prevalence'].values

#If there is only one explanatory variable, the values need to be reshaped for the model
if len(best_model_columns) == 1:
	X = X.reshape(-1, 1)

#Peform evaluation on full data
Xtrain = X[:165]
ytrain = y[:165]
Xtest = X[165:]
ytest = y[165:]

clf = RandomForestRegressor(n_estimators=best_model_trees, max_depth=best_model_depth, random_state=0)
clf.fit(Xtrain, ytrain)
predictions = clf.predict(Xtest)

#Calculate MAE
MAE = mean_absolute_error(ytest, predictions)

#Generate boolean values for increase or decrease in prevalence. 0 if next prevalence is smaller than current prevalence, 1 otherwise.
increase           = [0 if x<y else 1 for x in df.iloc[165:]['next_prevalence'] for y in df.iloc[165:]['prevalence']]
predicted_increase = [0 if x<y else 1 for x in predictions                      for y in df.iloc[165:]['prevalence']]

#Calculate accuracy of predicted boolean increase/decrease
acc = accuracy_score(increase, predicted_increase)

#Print model parameters
print('no. of trees: ' + str(best_model_trees) + '\nmax_depth: ' + str(best_model_depth) + '\ncolumns: ' + str(best_model_columns))

#Print model scores
print(MAE, acc)

no. of trees: 5
max_depth: 2
columns: ['ipc', 'cropdiv', 'population', 'month', 'district_encoded']
0.06461966113961246 0.8250688705234159
