In [1]:
%matplotlib inline 
import matplotlib.pylab as plt 
import numpy as np 
import pandas as pd 
import seaborn as sns 
from sklearn.preprocessing import StandardScaler

In [2]:
def normalize_data(data, ids_list, grouping):
    """Standardise the non-identifier columns of ``data`` separately per group.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the identifier columns plus the columns to scale.
    ids_list : list of str
        Identifier columns. They are carried through unscaled and placed
        first in the output.
    grouping : str or list of str
        Column(s) handed to ``DataFrame.groupby``; ``StandardScaler`` is
        re-fitted on every group, so each group is scaled with its own
        statistics.

    Returns
    -------
    pandas.DataFrame
        Columns ``ids_list`` followed by the scaled columns, rows ordered
        group by group (groupby order), with a fresh 0..n-1 index. When
        ``data`` yields no groups, an empty frame with those columns.
    """
    ids_list = list(ids_list)  # tolerate tuples / other sequences
    colnames = list(data.drop(labels=ids_list, axis=1).columns)
    Znorm = StandardScaler()

    # Collect per-group results and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies all previous rows on
    # every iteration.
    normed_groups = []
    for _, group in data.groupby(grouping):
        # Work on re-indexed copies instead of mutating the groupby slice.
        ids = group[ids_list].reset_index(drop=True)
        scaled = pd.DataFrame(Znorm.fit_transform(group[colnames]), columns=colnames)
        normed_groups.append(pd.concat([ids, scaled], axis=1))

    if not normed_groups:
        # pd.concat([]) raises; keep the expected schema for empty input.
        return pd.DataFrame(columns=ids_list + colnames)

    return pd.concat(normed_groups, ignore_index=True)

In [3]:
# Read the combined satellite/drought-event table, then keep only the Uganda rows.
full_data = pd.read_csv(
    './datasets/Droughts_satelite_and_events.csv',
    index_col=False,
)
is_uganda = full_data['Country'] == 'Uganda'
Uganda_data = full_data[is_uganda]

In [4]:
# Month numbers (matched against the `month` column) of the two harvest windows.
first_harvest = [6, 7]
second_harvest = [11, 12]

# Month numbers of the two planting windows.
first_planting = [3, 4, 5]
second_planting = [8, 9, 10]

# Season identifiers: the harvest month numbers joined by underscores.
first_id = '_'.join(map(str, first_harvest))
second_id = '_'.join(map(str, second_harvest))

# Label (target) columns.
label_list = ['drought_reported', 'drought_news_article', 'drought_desinventar']

# Every column that is neither an identifier/date column nor a label.
feature_list = Uganda_data.drop(
    columns=['Country', 'District', 'year', 'month', 'day', 'date'] + label_list
).columns.tolist()

In [5]:
# Preview the first rows of the Uganda subset.
Uganda_data.head()

Unnamed: 0,Country,District,year,month,day,date,NDVI,EVI,precipitation_per_hour_v1,precipitation_per_hour_v2,...,SPEI_6month,SPEI_7month,SPEI_8month,SPEI_9month,SPEI_10month,SPEI_11month,SPEI_12month,drought_reported,drought_news_article,drought_desinventar
0,Uganda,ABIM,2000,3,1,2000-03-01,0.270712,0.137156,0.031873,5.285528,...,,,,,,,,False,False,False
1,Uganda,ABIM,2000,4,1,2000-04-01,0.368438,0.184616,0.172345,84.424662,...,,,,,,,,False,False,False
2,Uganda,ABIM,2000,5,1,2000-05-01,0.624764,0.36538,0.185952,100.613865,...,,,,,,,,False,False,False
3,Uganda,ABIM,2000,6,1,2000-06-01,0.669868,0.416891,0.205662,91.973897,...,,,,,,,,False,False,False
4,Uganda,ABIM,2000,7,1,2000-07-01,0.717746,0.458402,0.157562,67.037938,...,,,,,,,,False,False,False


In [6]:
# Keys identifying one (year, district, season) observation, and the mapping
# from "belongs to the first window" to the season identifier.
season_keys = ['year', 'District', 'Season']
season_label = {True: first_id, False: second_id}

# --- Features -------------------------------------------------------------
# First 18 entries of feature_list (presumably the non-SPEI columns -- confirm),
# restricted to planting months and tagged with the season they belong to.
raw_features_noSPEI = Uganda_data[['District', 'year', 'month'] + feature_list[0:18]].copy()
raw_features_noSPEI = (
    raw_features_noSPEI[raw_features_noSPEI['month'].isin(first_planting + second_planting)]
    .assign(Season=lambda frame: frame['month'].isin(first_planting).map(season_label))
    .drop(columns='month')
)

# Average over the planting months of each (year, district, season) ...
features_noSPEI = raw_features_noSPEI.groupby(season_keys).mean().reset_index()

# ... and standardise within each (district, season) group.
normal_features = normalize_data(
    features_noSPEI,
    ids_list=season_keys,
    grouping=['District', 'Season'],
)

# --- SPEI -----------------------------------------------------------------
# Take the SPEI value from the last month of each planting window only.
spei_col = 'SPEI_3month'
last_first_planting = first_planting[-1]
last_second_planting = second_planting[-1]

spei = Uganda_data[['year', 'District', 'month', spei_col]].copy()
spei = (
    spei[spei['month'].isin([last_first_planting, last_second_planting])]
    .assign(Season=lambda frame: frame['month'].eq(last_first_planting).map(season_label))
    .drop(columns='month')
    .reset_index(drop=True)
)
normal_features = normal_features.merge(spei, on=season_keys)

# Keys first, then the feature columns in sorted order; rows sorted by key.
new_feature_list = sorted(
    column for column in normal_features.columns if column not in season_keys
)
normal_features = normal_features[season_keys + new_feature_list].sort_values(by=season_keys)

# --- Labels ---------------------------------------------------------------
# A season is labelled True when any of its harvest months has the label set.
label_col = 'drought_reported'
labels = Uganda_data[['District', 'year', 'month', label_col]].copy()
labels = (
    labels[labels['month'].isin(first_harvest + second_harvest)]
    .assign(Season=lambda frame: frame['month'].isin(first_harvest).map(season_label))
    .drop(columns='month')
)

sum_labels = (
    labels.groupby(by=season_keys)
    .sum()
    .reset_index()
    .rename(columns={'drought_reported': 'number_drought_reported'})
)
sum_labels[label_col] = sum_labels['number_drought_reported'].gt(0)

# --- Combine and persist --------------------------------------------------
normal_data = (
    normal_features.merge(sum_labels, on=season_keys)
    .drop(columns='number_drought_reported')
)

normal_data.to_csv('./datasets/Uganda_seasonal_normalized.csv', index=False)

normal_data.head()

Unnamed: 0,year,District,Season,EVI,NDVI,SPEI_3month,SoilMoisture00_10cm,SoilMoisture100_200cm,SoilMoisture10_40cm,SoilMoisture40_100cm,...,SoilTemperature40_100cm,air_temperature,evapotranspiration,precipitation_per_hour_v1,precipitation_per_hour_v2,rainfall,surface_temperature_daytime,surface_temperature_nighttime,wind_speed,drought_reported
0,2000,ABIM,11_12,0.635496,0.232554,0.946693,0.876312,0.199678,0.736812,0.605763,...,-0.552902,-0.435935,1.208654,1.601028,-0.46946,1.336204,-0.151764,-1.343323,-0.412148,False
1,2000,ABIM,6_7,-1.590836,-1.720571,0.075238,-1.302612,-1.113239,-1.262019,-0.937505,...,1.414168,1.570308,-0.740945,-0.358292,-1.030145,-0.388903,1.260564,0.032762,0.306375,False
2,2000,ADJUMANI,11_12,-1.085335,-1.26032,-0.606487,-0.724946,-0.722124,-0.683251,-0.829818,...,0.199455,-0.221168,-1.138716,0.041021,-2.093348,-1.024165,1.721439,-1.782573,-0.412293,False
3,2000,ADJUMANI,6_7,-2.697474,-3.056846,-0.816605,-0.872943,0.24324,-0.161562,0.000243,...,1.332993,1.546512,-0.115219,-1.67168,-1.859843,-0.68141,2.350867,-0.480881,0.608132,False
4,2000,AGAGO,11_12,1.149972,-0.07058,0.824262,0.874588,0.624983,0.793872,0.710458,...,-0.539216,-0.380892,1.12655,1.366786,0.265181,0.963633,-0.414698,-2.063056,-0.467823,False
