# Dengue Data Cleaning

**1. Prepare Workspace**

In [1]:
# Import libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import datetime
pd.set_option('display.max_columns', None)

# Import data
features = pd.read_csv('dengue_features_train.csv')
labels = pd.read_csv('dengue_labels_train.csv')
df = features.merge(labels, how='outer', on=['city', 'year', 'weekofyear'])

**2. Perform Basic Cleaning**

In [2]:
# Drop duplicated column
df = df.drop(['reanalysis_sat_precip_amt_mm'], axis=1)

# Extract 'month' from 'week_start_date'
df['week_start_date'] = pd.to_datetime(df['week_start_date'])
df['month'] = df['week_start_date'].dt.month

# Extract 'season from 'week_start_date' and drop
df['time.season'] = np.where(df['month'].between(5,10), 'may-oct', 'nov-apr')
df['time.season'] = df['time.season'].replace({'may-oct': 1, 'nov-apr': -1})
df = df.drop(['week_start_date'], axis=1)

# Reorganize column names
names = {'year': 'time.year', 'weekofyear': 'time.weekofyear','ndvi_ne': 'vege.ndvi_ne', 'ndvi_nw': 'vege.ndvi_nw',
         'ndvi_se': 'vege.ndvi_se', 'ndvi_sw': 'vege.ndvi_sw', 'precipitation_amt_mm': 'prec.precipitation_amt_mm',
         'reanalysis_air_temp_k': 'temp.reanalysis_air_temp_k', 'reanalysis_avg_temp_k': 'temp.reanalysis_avg_temp_k',
         'reanalysis_dew_point_temp_k': 'temp.reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k':
         'temp.reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k': 'temp.reanalysis_min_air_temp_k',
         'reanalysis_precip_amt_kg_per_m2': 'prec.reanalysis_precip_amt_kg_per_m2', 'total_cases': 'y.total_cases',
         'reanalysis_relative_humidity_percent': 'humi.reanalysis_relative_humidity_percent', 'month': 'time.month',
         'reanalysis_specific_humidity_g_per_kg': 'humi.reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k':
         'temp.reanalysis_tdtr_k', 'station_avg_temp_c': 'temp.station_avg_temp_c', 'station_diur_temp_rng_c':
         'temp.station_diur_temp_rng_c', 'station_max_temp_c': 'temp.station_max_temp_c', 'station_min_temp_c':
         'temp.station_min_temp_c', 'station_precip_mm': 'prec.station_precip_mm', 'season': 'time.season',
         'ndvi_total': 'vege.ndvi_total'}
df = df.rename(names, axis=1)

**3. Transform Categorical Variables**

In [3]:
# Make 'time.month' and 'time.weekofyear' cyclical
df['time.month_sin'] = np.sin(2 * np.pi * df['time.month']/12)
df['time.month_cos'] = np.cos(2 * np.pi * df['time.month']/12)

df['time.weekofyear_sin'] = np.sin(2 * np.pi * df['time.weekofyear']/52)
df['time.weekofyear_cos'] = np.cos(2 * np.pi * df['time.weekofyear']/52)

# Effect code'city'
df['city'] = df['city'].replace({'sj': 1, 'iq': -1})

# Make times continuous
df['time.year'] = df['time.year'].astype(float)
df['time.month'] = df['time.month'].astype(float)
df['time.weekofyear'] = df['time.weekofyear'].astype(float)

**4. Predict Missing Information**

**i. vege.ndvi_ne**

In [4]:
# Predict and replace initial 'vege.ndvi_ne' data
ndvi_ne_train = df.dropna()

X = ndvi_ne_train.drop(['y.total_cases', 'vege.ndvi_ne'], axis=1)
y = ndvi_ne_train['vege.ndvi_ne']
rfc_ndvi_ne = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_ne.fit(X,y)
print('OOB Score: ', rfc_ndvi_ne.oob_score_)

ndvi_ne_predict = df.drop(['y.total_cases', 'vege.ndvi_ne'], axis=1).dropna()
ndvi_ne_predict['ndvi_ne_predict'] = rfc_ndvi_ne.predict(ndvi_ne_predict)

df = df.join(ndvi_ne_predict['ndvi_ne_predict'])
df['vege.ndvi_ne'] = np.where(df['vege.ndvi_ne'].isna(), df['ndvi_ne_predict'], df['vege.ndvi_ne'])
df = df.drop(['ndvi_ne_predict'], axis=1)

# Predict and replace remaining missing 'vege.ndvi_ne' data
missing = ['vege.ndvi_nw', 'vege.ndvi_se', 'vege.ndvi_sw', 'prec.precipitation_amt_mm']
subset = [i for i in list(df.columns) if i not in missing]
ndvi_ne_train2 = df.drop(missing, axis=1).dropna(subset=subset)

X = ndvi_ne_train2.drop(['y.total_cases', 'vege.ndvi_ne'], axis=1)
y = ndvi_ne_train2['vege.ndvi_ne']
rfc_ndvi_ne2 = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_ne2.fit(X,y)
print('OOB Score 2: ', rfc_ndvi_ne2.oob_score_)

ndvi_ne_predict2 = df.drop(missing, axis=1).drop(['y.total_cases', 'vege.ndvi_ne'], axis=1).dropna()
ndvi_ne_predict2['ndvi_ne_predict2'] = rfc_ndvi_ne2.predict(ndvi_ne_predict2)

df = df.join(ndvi_ne_predict2['ndvi_ne_predict2'])
df['vege.ndvi_ne'] = np.where(df['vege.ndvi_ne'].isna(), df['ndvi_ne_predict2'], df['vege.ndvi_ne'])
df = df.drop(['ndvi_ne_predict2'], axis=1)

df['vege.ndvi_ne'] = np.where(df['vege.ndvi_ne'].isna(), df['vege.ndvi_ne'].mean(), df['vege.ndvi_ne'])

OOB Score:  0.7612509061828922
OOB Score 2:  0.5887219201809644


**ii. vege.ndvi_nw**

In [5]:
# Predict and replace initial 'vege.ndvi_nw' data
ndvi_nw_train = df.dropna()

X = ndvi_nw_train.drop(['y.total_cases', 'vege.ndvi_nw'], axis=1)
y = ndvi_nw_train['vege.ndvi_nw']
rfc_ndvi_nw = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_nw.fit(X,y)
print('OOB Score: ', rfc_ndvi_nw.oob_score_)

ndvi_nw_predict = df.drop(['y.total_cases', 'vege.ndvi_nw'], axis=1).dropna()
ndvi_nw_predict['ndvi_nw_predict'] = rfc_ndvi_nw.predict(ndvi_nw_predict)

df = df.join(ndvi_nw_predict['ndvi_nw_predict'])
df['vege.ndvi_nw'] = np.where(df['vege.ndvi_nw'].isna(), df['ndvi_nw_predict'], df['vege.ndvi_nw'])
df = df.drop(['ndvi_nw_predict'], axis=1)

# Predict and replace remaining missing 'vege.ndvi_nw' data
missing = ['vege.ndvi_se', 'vege.ndvi_sw']
subset = [i for i in list(df.columns) if i not in missing]
ndvi_nw_train2 = df.drop(missing, axis=1).dropna(subset=subset)

X = ndvi_nw_train2.drop(['y.total_cases', 'vege.ndvi_nw'], axis=1)
y = ndvi_nw_train2['vege.ndvi_nw']
rfc_ndvi_nw2 = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_nw2.fit(X,y)
print('OOB Score 2: ', rfc_ndvi_nw2.oob_score_)

ndvi_nw_predict2 = df.drop(missing, axis=1).drop(['y.total_cases', 'vege.ndvi_nw'], axis=1).dropna()
ndvi_nw_predict2['ndvi_nw_predict2'] = rfc_ndvi_nw2.predict(ndvi_nw_predict2)

df = df.join(ndvi_nw_predict2['ndvi_nw_predict2'])
df['vege.ndvi_nw'] = np.where(df['vege.ndvi_nw'].isna(), df['ndvi_nw_predict2'], df['vege.ndvi_nw'])
df = df.drop(['ndvi_nw_predict2'], axis=1)

df['vege.ndvi_nw'] = np.where(df['vege.ndvi_nw'].isna(), df['vege.ndvi_nw'].mean(), df['vege.ndvi_nw'])

OOB Score:  0.7558213548325797
OOB Score 2:  0.7547635670399963


**iii. vege.ndvi_se**

In [6]:
# Predict and replace initial 'vege.ndvi_se' data
ndvi_se_train = df.dropna()

X = ndvi_se_train.drop(['y.total_cases', 'vege.ndvi_se'], axis=1)
y = ndvi_se_train['vege.ndvi_se']
rfc_ndvi_se = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_se.fit(X,y)
print('OOB Score: ', rfc_ndvi_se.oob_score_)

ndvi_se_predict = df.drop(['y.total_cases', 'vege.ndvi_se'], axis=1).dropna()
ndvi_se_predict['ndvi_se_predict'] = rfc_ndvi_se.predict(ndvi_se_predict)

df = df.join(ndvi_se_predict['ndvi_se_predict'])
df['vege.ndvi_se'] = np.where(df['vege.ndvi_se'].isna(), df['ndvi_se_predict'], df['vege.ndvi_se'])
df = df.drop(['ndvi_se_predict'], axis=1)

# Predict and replace remaining missing 'vege.ndvi_se' data
missing = ['vege.ndvi_sw']
subset = [i for i in list(df.columns) if i not in missing]
ndvi_se_train2 = df.drop(missing, axis=1).dropna(subset=subset)

X = ndvi_se_train2.drop(['y.total_cases', 'vege.ndvi_se'], axis=1)
y = ndvi_se_train2['vege.ndvi_se']
rfc_ndvi_se2 = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_se2.fit(X,y)
print('OOB Score 2: ', rfc_ndvi_se2.oob_score_)

ndvi_se_predict2 = df.drop(missing, axis=1).drop(['y.total_cases', 'vege.ndvi_se'], axis=1).dropna()
ndvi_se_predict2['ndvi_se_predict2'] = rfc_ndvi_se2.predict(ndvi_se_predict2)

df = df.join(ndvi_se_predict2['ndvi_se_predict2'])
df['vege.ndvi_se'] = np.where(df['vege.ndvi_se'].isna(), df['ndvi_se_predict2'], df['vege.ndvi_se'])
df = df.drop(['ndvi_se_predict2'], axis=1)

df['vege.ndvi_se'] = np.where(df['vege.ndvi_se'].isna(), df['vege.ndvi_se'].mean(), df['vege.ndvi_se'])

OOB Score:  0.6898880793870372
OOB Score 2:  0.484319467163252


**iv. vege.ndvi_sw**

In [7]:
# Predict and replace initial 'vege.ndvi_sw' data
ndvi_sw_train = df.dropna()

X = ndvi_sw_train.drop(['y.total_cases', 'vege.ndvi_sw'], axis=1)
y = ndvi_sw_train['vege.ndvi_sw']
rfc_ndvi_sw = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_sw.fit(X,y)
print('OOB Score: ', rfc_ndvi_sw.oob_score_)

ndvi_sw_predict = df.drop(['y.total_cases', 'vege.ndvi_sw'], axis=1).dropna()
ndvi_sw_predict['ndvi_sw_predict'] = rfc_ndvi_sw.predict(ndvi_sw_predict)

df = df.join(ndvi_sw_predict['ndvi_sw_predict'])
df['vege.ndvi_sw'] = np.where(df['vege.ndvi_sw'].isna(), df['ndvi_sw_predict'], df['vege.ndvi_sw'])
df = df.drop(['ndvi_sw_predict'], axis=1)

df['vege.ndvi_sw'] = np.where(df['vege.ndvi_sw'].isna(), df['vege.ndvi_sw'].mean(), df['vege.ndvi_sw'])

OOB Score:  0.7717504431403441


**v. temp.station_avg_temp_c**

In [8]:
# Predict and replace initial 'temp.station_avg_temp_c' data
mis = ['temp.station_diur_temp_rng_c', 'prec.station_precip_mm', 'temp.station_max_temp_c', 'temp.station_min_temp_c']
subset = [i for i in list(df.columns) if i not in mis]
station_avg_temp_c_train = df.drop(mis, axis=1).dropna(subset=subset)

X = station_avg_temp_c_train.drop(['y.total_cases', 'temp.station_avg_temp_c'], axis=1)
y = station_avg_temp_c_train['temp.station_avg_temp_c']
rfc_station_avg_temp_c = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_station_avg_temp_c.fit(X,y)
print('OOB Score: ', rfc_station_avg_temp_c.oob_score_)

station_avg_temp_c_predict = df.drop(mis + ['y.total_cases', 'temp.station_avg_temp_c'], axis=1).dropna()
station_avg_temp_c_predict['station_avg_temp_c_predict'] = rfc_station_avg_temp_c.predict(station_avg_temp_c_predict)

df = df.join(station_avg_temp_c_predict['station_avg_temp_c_predict'])
preds, actual = df['station_avg_temp_c_predict'], df['temp.station_avg_temp_c']
df['temp.station_avg_temp_c'] = np.where(df['temp.station_avg_temp_c'].isna(), preds, actual)
df = df.drop(['station_avg_temp_c_predict'], axis=1)

# Predict and replace missing 'temp.station_avg_temp_c' data
df = df.join(pd.read_csv('dengue_features_train.csv')[['year', 'weekofyear', 'city']]['weekofyear'])
sj_avg_temp = df[df['city'] == 1].groupby('weekofyear').mean()[['temp.station_avg_temp_c']]
iq_avg_temp = df[df['city'] == -1].groupby('weekofyear').mean()[['temp.station_avg_temp_c']]

sj_dict = sj_avg_temp.to_dict()['temp.station_avg_temp_c']
iq_dict = iq_avg_temp.to_dict()['temp.station_avg_temp_c']

temp, city = df['temp.station_avg_temp_c'], df['city']
df['temp.station_avg_temp_c'] = np.where((temp.isna() & city == 1), df['weekofyear'].replace(sj_dict), temp)
df['temp.station_avg_temp_c'] = np.where((temp.isna() & city == -1), df['weekofyear'].replace(iq_dict), temp)
df = df.drop(['weekofyear'], axis=1)

OOB Score:  0.8190493414507558


**vi. temp.station_diur_temp_rng_c**

In [9]:
# Predict and replace initial 'temp.station_diur_temp_rng_c' data
mis = ['prec.station_precip_mm', 'temp.station_max_temp_c', 'temp.station_min_temp_c']
subset = [i for i in list(df.columns) if i not in mis]
station_diur_temp_rng_c_train = df.drop(mis, axis=1).dropna(subset=subset)

X = station_diur_temp_rng_c_train.drop(['y.total_cases', 'temp.station_diur_temp_rng_c'], axis=1)
y = station_diur_temp_rng_c_train['temp.station_diur_temp_rng_c']
rfc_diur_temp_rng_c = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_diur_temp_rng_c.fit(X,y)
print('OOB Score: ', rfc_diur_temp_rng_c.oob_score_)

diur_temp_rng_c_predict = df.drop(mis + ['y.total_cases', 'temp.station_avg_temp_c'], axis=1).dropna()
diur_temp_rng_c_predict['diur_temp_rng_c_predict'] = rfc_diur_temp_rng_c.predict(diur_temp_rng_c_predict)

df = df.join(diur_temp_rng_c_predict['diur_temp_rng_c_predict'])
preds, actual = df['diur_temp_rng_c_predict'], df['temp.station_diur_temp_rng_c']
df['temp.station_diur_temp_rng_c'] = np.where(df['temp.station_diur_temp_rng_c'].isna(), preds, actual)
df = df.drop(['diur_temp_rng_c_predict'], axis=1)

# Predict and replace missing 'temp.station_diur_temp_rng_c' data
df = df.join(pd.read_csv('dengue_features_train.csv')[['year', 'weekofyear', 'city']]['weekofyear'])
sj_diur_temp = df[df['city'] == 1].groupby('weekofyear').mean()[['temp.station_diur_temp_rng_c']]
iq_diur_temp = df[df['city'] == -1].groupby('weekofyear').mean()[['temp.station_diur_temp_rng_c']]

sj_dict = sj_diur_temp.to_dict()['temp.station_diur_temp_rng_c']
iq_dict = iq_diur_temp.to_dict()['temp.station_diur_temp_rng_c']

temp, city = df['temp.station_diur_temp_rng_c'], df['city']
df['temp.station_diur_temp_rng_c'] = np.where((temp.isna() & city == 1), df['weekofyear'].replace(sj_dict), temp)
df['temp.station_diur_temp_rng_c'] = np.where((temp.isna() & city == -1), df['weekofyear'].replace(iq_dict), temp)
df = df.drop(['weekofyear'], axis=1)

OOB Score:  0.8730694209179345


**vii. prec.station_precip_mm**

In [10]:
# Predict and replace initial 'prec.station_precip_mm' data
missing = 'temp.station_min_temp_c'
subset = [i for i in list(df.columns) if i != missing]
station_precip_mm_train = df.drop(missing, axis=1).dropna(subset=subset)

X = station_precip_mm_train.drop(['y.total_cases', 'prec.station_precip_mm'], axis=1)
y = station_precip_mm_train['prec.station_precip_mm']
rfc_station_precip_mm = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_station_precip_mm.fit(X,y)
print('OOB Score: ', rfc_station_precip_mm.oob_score_)

station_precip_mm_predict = df.drop(['y.total_cases', 'prec.station_precip_mm', missing], axis=1).dropna()
station_precip_mm_predict['station_precip_mm_predict'] = rfc_station_precip_mm.predict(station_precip_mm_predict)

df = df.join(station_precip_mm_predict['station_precip_mm_predict'])
preds, actual = df['station_precip_mm_predict'], df['prec.station_precip_mm']
df['prec.station_precip_mm'] = np.where(df['prec.station_precip_mm'].isna(), preds, actual)
df = df.drop(['station_precip_mm_predict'], axis=1)

# Predict and replace missing 'prec.station_precip_mm' data
sj_station_precip_mm = df[df['city'] == 1]['prec.station_precip_mm'].mean()
iq_station_precip_mm = df[df['city'] == -1]['prec.station_precip_mm'].mean()

prec, city = df['prec.station_precip_mm'], df['city']
df['prec.station_precip_mm'] = np.where((prec.isna() & city == 1), sj_station_precip_mm, prec)
df['prec.station_precip_mm'] = np.where((prec.isna() & city == -1), iq_station_precip_mm, prec)

OOB Score:  0.3176561917017875


**viii. temp.station_max_temp_c**

In [11]:
# Predict and replace initial 'temp.station_max_temp_c' data
missing = 'temp.station_min_temp_c'
subset = [i for i in list(df.columns) if i != missing]
station_max_temp_c_train = df.drop(missing, axis=1).dropna(subset=subset)

X = station_max_temp_c_train.drop(['y.total_cases', 'temp.station_max_temp_c'], axis=1)
y = station_max_temp_c_train['temp.station_max_temp_c']
rfc_station_max_temp_c = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_station_max_temp_c.fit(X,y)
print('OOB Score: ', rfc_station_max_temp_c.oob_score_)

station_max_temp_c_predict = df.drop(['y.total_cases', 'temp.station_max_temp_c', missing], axis=1).dropna()
station_max_temp_c_predict['station_max_temp_c_predict'] = rfc_station_max_temp_c.predict(station_max_temp_c_predict)

df = df.join(station_max_temp_c_predict['station_max_temp_c_predict'])
preds, actual = df['station_max_temp_c_predict'], df['temp.station_max_temp_c']
df['temp.station_max_temp_c'] = np.where(df['temp.station_max_temp_c'].isna(), preds, actual)
df = df.drop(['station_max_temp_c_predict'], axis=1)

# Predict and replace missing 'temp.station_max_temp_c' data
df = df.join(pd.read_csv('dengue_features_train.csv')[['year', 'weekofyear', 'city']]['weekofyear'])
sj_max_temp = df[df['city'] == 1].groupby('weekofyear').mean()[['temp.station_max_temp_c']]
iq_max_temp = df[df['city'] == -1].groupby('weekofyear').mean()[['temp.station_max_temp_c']]

sj_dict = sj_max_temp.to_dict()['temp.station_max_temp_c']
iq_dict = iq_max_temp.to_dict()['temp.station_max_temp_c']

temp, city = df['temp.station_max_temp_c'], df['city']
df['temp.station_max_temp_c'] = np.where((temp.isna() & city == 1), df['weekofyear'].replace(sj_dict), temp)
df['temp.station_max_temp_c'] = np.where((temp.isna() & city == -1), df['weekofyear'].replace(iq_dict), temp)
df = df.drop(['weekofyear'], axis=1)

OOB Score:  0.8715613915996613


**ix. temp.station_min_temp_c**

In [12]:
# Predict and replace initial 'temp.station_min_temp_c' data
station_min_temp_c_train = df.dropna()

X = station_min_temp_c_train.drop(['y.total_cases', 'temp.station_min_temp_c'], axis=1)
y = station_min_temp_c_train['temp.station_min_temp_c']
rfc_station_min_temp_c = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_station_min_temp_c.fit(X,y)
print('OOB Score: ', rfc_station_min_temp_c.oob_score_)

station_min_temp_c_predict = df.drop(['y.total_cases', 'temp.station_min_temp_c', missing], axis=1).dropna()
station_min_temp_c_predict['station_min_temp_c_predict'] = rfc_station_min_temp_c.predict(station_min_temp_c_predict)

df = df.join(station_min_temp_c_predict['station_min_temp_c_predict'])
preds, actual = df['station_min_temp_c_predict'], df['temp.station_min_temp_c']
df['temp.station_min_temp_c'] = np.where(df['temp.station_min_temp_c'].isna(), preds, actual)
df = df.drop(['station_min_temp_c_predict'], axis=1)

OOB Score:  0.8201569973620702


In [13]:
# Export cleaned data
df.to_csv('dengue_partially_cleaned_data.csv', index=False)

**5. Perform Feature Engineering**

In [15]:
# Create a column for 'ndvi_total'
df['vege.ndvi_total'] = df['vege.ndvi_sw'] + df['vege.ndvi_nw'] + df['vege.ndvi_se'] + df['vege.ndvi_ne']

# Create columns for previous week climate and drop 'order'
def get_last_week(year_week):
    year = year_week[:4]
    week = year_week[5:]
    if int(week) > 1:
        return year + '-' + str(int(week) - 1)
    else:
        return str(int(year) - 1) + '-52'

df['year_week'] = df['time.year'].astype(int).astype(str) + '-' + df['time.weekofyear'].astype(int).astype(str)
df['last_week'] = df['year_week'].apply(get_last_week)

clim = ['year_week', 'vege.ndvi_ne', 'vege.ndvi_nw', 'vege.ndvi_se', 'vege.ndvi_sw', 'prec.precipitation_amt_mm',
        'temp.reanalysis_air_temp_k', 'temp.reanalysis_avg_temp_k', 'temp.reanalysis_dew_point_temp_k', 'city',
        'temp.reanalysis_max_air_temp_k', 'temp.reanalysis_min_air_temp_k', 'prec.reanalysis_precip_amt_kg_per_m2',
        'humi.reanalysis_relative_humidity_percent', 'humi.reanalysis_specific_humidity_g_per_kg', 'vege.ndvi_total',
        'temp.reanalysis_tdtr_k', 'temp.station_avg_temp_c', 'temp.station_diur_temp_rng_c', 'prec.station_precip_mm',
        'temp.station_max_temp_c', 'temp.station_min_temp_c']

cols = [(i + '_last_week') for i in clim]
last_week = df[clim]
last_week.columns = cols
last_week = last_week.rename(columns={'year_week_last_week': 'last_week', 'city_last_week': 'city'})
#df = df.merge(last_week, how='left', on=['last_week','city'])
df = df.merge(last_week.drop('city', axis=1), how='left', on=['last_week'])

df = df.drop(['last_week', 'year_week'], axis=1)

# Drop remaining rows with empty values
df = df.dropna()

**6. Check Data**

In [16]:
# See data information
cols = list(df.columns)
cols.sort()
df = df[cols]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2236 entries, 1 to 2263
Data columns (total 50 columns):
city                                                    2236 non-null int64
humi.reanalysis_relative_humidity_percent               2236 non-null float64
humi.reanalysis_relative_humidity_percent_last_week     2236 non-null float64
humi.reanalysis_specific_humidity_g_per_kg              2236 non-null float64
humi.reanalysis_specific_humidity_g_per_kg_last_week    2236 non-null float64
prec.precipitation_amt_mm                               2236 non-null float64
prec.precipitation_amt_mm_last_week                     2236 non-null float64
prec.reanalysis_precip_amt_kg_per_m2                    2236 non-null float64
prec.reanalysis_precip_amt_kg_per_m2_last_week          2236 non-null float64
prec.station_precip_mm                                  2236 non-null float64
prec.station_precip_mm_last_week                        2236 non-null float64
temp.reanalysis_air_temp_k           

In [17]:
# Export data as cleaned data
df.to_csv('dengue_cleaned_data.csv', index=False)

Records

600 trees:
i.
OOB Score:  0.7603505783568217
OOB Score 2:  0.588373272231242
ii.
OOB Score:  0.754559279568142
OOB Score 2:  0.7531451167276504
iii.
OOB Score:  0.6904339738841314
OOB Score 2:  0.48348443965881505
iv.
OOB Score:  0.7712104301588782
v.
OOB Score:  0.8184259486932257
vi.
OOB Score:  0.8500202490660909
vii.
OOB Score:  0.302080877528478
viii.
OOB Score:  0.7880534245013433
ix.
OOB Score:  0.7631150290490145

1000 trees:
i.
OOB Score:  0.761605278165667
OOB Score 2:  0.5877737785372699
ii.
OOB Score:  0.7572935646033195
OOB Score 2:  0.7525599453969516
iii.
OOB Score:  0.6915630804945796
OOB Score 2:  0.4822080789327732
iv.
OOB Score:  0.7720811816430975
v.
OOB Score:  0.8173001177012491
vi.
OOB Score:  0.8503477775469876
vii.
OOB Score:  0.3006247508924371
viii.
OOB Score:  0.7889377464857015
ix.
OOB Score:  0.7641521479553669

Including month and week:
i.
OOB Score:  0.762040637580917
OOB Score 2:  0.5882104733303135
ii.
OOB Score:  0.7564313011160022
OOB Score 2:  0.75598141138214
iii.
OOB Score:  0.6934090389199117
OOB Score 2:  0.4841357428732188
iv.
OOB Score:  0.7720848103836997
v.
OOB Score:  0.820558759140701
vi.
OOB Score:  0.8515297999624688
vii.
OOB Score:  0.3081099754661011
viii.
OOB Score:  0.7894738204716429
ix.
OOB Score:  0.766191345118912

With time variables as float:
i.
OOB Score:  0.7625040487657402
OOB Score 2:  0.5903610585140664
ii.
OOB Score:  0.7562419191505434
OOB Score 2:  0.7553667735480439
iii.
OOB Score:  0.6895697370052626
OOB Score 2:  0.4817764850290346
iv.
OOB Score:  0.7727364376672178
v.
OOB Score:  0.8190704971247947
vi.
OOB Score:  0.8743678835670523
vii.
OOB Score:  0.3176081948607524
viii.
OOB Score:  0.8718028395856982
ix.
OOB Score:  0.8173948612782416