# Dengue Predictions

**1. Prepare Workspace**

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import datetime
green, orange = '#00A57B', '#EF5300'
pd.set_option('display.max_columns', None)

# Import data
df = pd.read_csv('dengue_features_test.csv')

**2. Perform Basic Cleaning**

In [2]:
# Drop duplicated column
df = df.drop(['reanalysis_sat_precip_amt_mm'], axis=1)

# Extract 'month' from 'week_start_date'
df['week_start_date'] = pd.to_datetime(df['week_start_date'])
df['month'] = df['week_start_date'].dt.month

# Extract 'season from 'week_start_date' and drop
df['time.season'] = np.where(df['month'].between(5,10), 'may-oct', 'nov-apr')
df['time.season'] = df['time.season'].replace({'may-oct': 1, 'nov-apr': -1})
df = df.drop(['week_start_date'], axis=1)

# Reorganize column names
names = {'year': 'time.year', 'weekofyear': 'time.weekofyear','ndvi_ne': 'vege.ndvi_ne', 'ndvi_nw': 'vege.ndvi_nw',
         'ndvi_se': 'vege.ndvi_se', 'ndvi_sw': 'vege.ndvi_sw', 'precipitation_amt_mm': 'prec.precipitation_amt_mm',
         'reanalysis_air_temp_k': 'temp.reanalysis_air_temp_k', 'reanalysis_avg_temp_k': 'temp.reanalysis_avg_temp_k',
         'reanalysis_dew_point_temp_k': 'temp.reanalysis_dew_point_temp_k', 'reanalysis_max_air_temp_k':
         'temp.reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k': 'temp.reanalysis_min_air_temp_k',
         'reanalysis_precip_amt_kg_per_m2': 'prec.reanalysis_precip_amt_kg_per_m2', 'total_cases': 'y.total_cases',
         'reanalysis_relative_humidity_percent': 'humi.reanalysis_relative_humidity_percent', 'month': 'time.month',
         'reanalysis_specific_humidity_g_per_kg': 'humi.reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k':
         'temp.reanalysis_tdtr_k', 'station_avg_temp_c': 'temp.station_avg_temp_c', 'station_diur_temp_rng_c':
         'temp.station_diur_temp_rng_c', 'station_max_temp_c': 'temp.station_max_temp_c', 'station_min_temp_c':
         'temp.station_min_temp_c', 'station_precip_mm': 'prec.station_precip_mm', 'season': 'time.season',
         'ndvi_total': 'vege.ndvi_total'}
df = df.rename(names, axis=1)

**3. Transform Categorical Variables**

In [3]:
# Make 'time.month' and 'time.weekofyear' cyclical
df['time.month_sin'] = np.sin(2 * np.pi * df['time.month']/12)
df['time.month_cos'] = np.cos(2 * np.pi * df['time.month']/12)

df['time.weekofyear_sin'] = np.sin(2 * np.pi * df['time.weekofyear']/52)
df['time.weekofyear_cos'] = np.cos(2 * np.pi * df['time.weekofyear']/52)

# Effect code'city'
df['city'] = df['city'].replace({'sj': 1, 'iq': -1})

# Make times continuous
df['time.year'] = df['time.year'].astype(float)
df['time.month'] = df['time.month'].astype(float)
df['time.weekofyear'] = df['time.weekofyear'].astype(float)

**4. Predict Missing Information**

In [4]:
# Combine with partially cleaned data
data = pd.read_csv('dengue_partially_cleaned_data.csv')
df = pd.concat([df, data], sort=False)

df = df.reset_index().drop(['index'], axis=1)

# Make dictionaries for weekly averages for each city
sj_dict = df[df['city']==1].groupby('time.weekofyear').mean().fillna(method='ffill').to_dict()
iq_dict = df[df['city']==-1].groupby('time.weekofyear').mean().fillna(method='ffill').to_dict()

**i. vege.ndvi_ne**

In [5]:
# Predict and replace initial 'vege.ndvi_ne' data
ndvi_ne_train = df.dropna()

X = ndvi_ne_train.drop(['y.total_cases', 'vege.ndvi_ne'], axis=1)
y = ndvi_ne_train['vege.ndvi_ne']
rfc_ndvi_ne = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_ne.fit(X,y)
print('OOB Score: ', rfc_ndvi_ne.oob_score_)

ndvi_ne_predict = df.drop(['y.total_cases', 'vege.ndvi_ne'], axis=1).dropna()
ndvi_ne_predict['ndvi_ne_predict'] = rfc_ndvi_ne.predict(ndvi_ne_predict)

df = df.join(ndvi_ne_predict['ndvi_ne_predict'])
df['vege.ndvi_ne'] = np.where(df['vege.ndvi_ne'].isna(), df['ndvi_ne_predict'], df['vege.ndvi_ne'])
df = df.drop(['ndvi_ne_predict'], axis=1)

# Predict and replace remaining missing 'vege.ndvi_ne' data
missing = ['vege.ndvi_nw', 'vege.ndvi_se', 'vege.ndvi_sw', 'prec.precipitation_amt_mm']
subset = [i for i in list(df.columns) if i not in missing]
ndvi_ne_train2 = df.drop(missing, axis=1).dropna(subset=subset)

X = ndvi_ne_train2.drop(['y.total_cases', 'vege.ndvi_ne'], axis=1)
y = ndvi_ne_train2['vege.ndvi_ne']
rfc_ndvi_ne2 = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_ne2.fit(X,y)
print('OOB Score 2: ', rfc_ndvi_ne2.oob_score_)

ndvi_ne_predict2 = df.drop(missing, axis=1).drop(['y.total_cases', 'vege.ndvi_ne'], axis=1).dropna()
ndvi_ne_predict2['ndvi_ne_predict2'] = rfc_ndvi_ne2.predict(ndvi_ne_predict2)

df = df.join(ndvi_ne_predict2['ndvi_ne_predict2'])
df['vege.ndvi_ne'] = np.where(df['vege.ndvi_ne'].isna(), df['ndvi_ne_predict2'], df['vege.ndvi_ne'])
df = df.drop(['ndvi_ne_predict2'], axis=1)

df['vege.ndvi_ne'] = np.where(df['vege.ndvi_ne'].isna(), df['vege.ndvi_ne'].mean(), df['vege.ndvi_ne'])

OOB Score:  0.7961721821286349
OOB Score 2:  0.5993591551682438


**ii. vege.ndvi_nw**

In [6]:
# Predict and replace initial 'vege.ndvi_nw' data
ndvi_nw_train = df.dropna()

X = ndvi_nw_train.drop(['y.total_cases', 'vege.ndvi_nw'], axis=1)
y = ndvi_nw_train['vege.ndvi_nw']
rfc_ndvi_nw = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_nw.fit(X,y)
print('OOB Score: ', rfc_ndvi_nw.oob_score_)

ndvi_nw_predict = df.drop(['y.total_cases', 'vege.ndvi_nw'], axis=1).dropna()
ndvi_nw_predict['ndvi_nw_predict'] = rfc_ndvi_nw.predict(ndvi_nw_predict)

df = df.join(ndvi_nw_predict['ndvi_nw_predict'])
df['vege.ndvi_nw'] = np.where(df['vege.ndvi_nw'].isna(), df['ndvi_nw_predict'], df['vege.ndvi_nw'])
df = df.drop(['ndvi_nw_predict'], axis=1)

# Predict and replace remaining missing 'vege.ndvi_nw' data
missing = ['vege.ndvi_se', 'vege.ndvi_sw']
subset = [i for i in list(df.columns) if i not in missing]
ndvi_nw_train2 = df.drop(missing, axis=1).dropna(subset=subset)

X = ndvi_nw_train2.drop(['y.total_cases', 'vege.ndvi_nw'], axis=1)
y = ndvi_nw_train2['vege.ndvi_nw']
rfc_ndvi_nw2 = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_nw2.fit(X,y)
print('OOB Score 2: ', rfc_ndvi_nw2.oob_score_)

ndvi_nw_predict2 = df.drop(missing, axis=1).drop(['y.total_cases', 'vege.ndvi_nw'], axis=1).dropna()
ndvi_nw_predict2['ndvi_nw_predict2'] = rfc_ndvi_nw2.predict(ndvi_nw_predict2)

df = df.join(ndvi_nw_predict2['ndvi_nw_predict2'])
df['vege.ndvi_nw'] = np.where(df['vege.ndvi_nw'].isna(), df['ndvi_nw_predict2'], df['vege.ndvi_nw'])
df = df.drop(['ndvi_nw_predict2'], axis=1)

df['vege.ndvi_nw'] = np.where(df['vege.ndvi_nw'].isna(), df['vege.ndvi_nw'].mean(), df['vege.ndvi_nw'])

OOB Score:  0.7657007337416871
OOB Score 2:  0.759770604607026


**iii. vege.ndvi_se**

In [7]:
# Predict and replace initial 'vege.ndvi_se' data
ndvi_se_train = df.dropna()

X = ndvi_se_train.drop(['y.total_cases', 'vege.ndvi_se'], axis=1)
y = ndvi_se_train['vege.ndvi_se']
rfc_ndvi_se = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_se.fit(X,y)
print('OOB Score: ', rfc_ndvi_se.oob_score_)

ndvi_se_predict = df.drop(['y.total_cases', 'vege.ndvi_se'], axis=1).dropna()
ndvi_se_predict['ndvi_se_predict'] = rfc_ndvi_se.predict(ndvi_se_predict)

df = df.join(ndvi_se_predict['ndvi_se_predict'])
df['vege.ndvi_se'] = np.where(df['vege.ndvi_se'].isna(), df['ndvi_se_predict'], df['vege.ndvi_se'])
df = df.drop(['ndvi_se_predict'], axis=1)

# Predict and replace remaining missing 'vege.ndvi_se' data
missing = ['vege.ndvi_sw']
subset = [i for i in list(df.columns) if i not in missing]
ndvi_se_train2 = df.drop(missing, axis=1).dropna(subset=subset)

X = ndvi_se_train2.drop(['y.total_cases', 'vege.ndvi_se'], axis=1)
y = ndvi_se_train2['vege.ndvi_se']
rfc_ndvi_se2 = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_se2.fit(X,y)
print('OOB Score 2: ', rfc_ndvi_se2.oob_score_)

ndvi_se_predict2 = df.drop(missing, axis=1).drop(['y.total_cases', 'vege.ndvi_se'], axis=1).dropna()
ndvi_se_predict2['ndvi_se_predict2'] = rfc_ndvi_se2.predict(ndvi_se_predict2)

df = df.join(ndvi_se_predict2['ndvi_se_predict2'])
df['vege.ndvi_se'] = np.where(df['vege.ndvi_se'].isna(), df['ndvi_se_predict2'], df['vege.ndvi_se'])
df = df.drop(['ndvi_se_predict2'], axis=1)

df['vege.ndvi_se'] = np.where(df['vege.ndvi_se'].isna(), df['vege.ndvi_se'].mean(), df['vege.ndvi_se'])

OOB Score:  0.6953590261800198
OOB Score 2:  0.49709352659207307


**iv. vege.ndvi_sw**

In [8]:
# Predict and replace initial 'vege.ndvi_sw' data
ndvi_sw_train = df.dropna()

X = ndvi_sw_train.drop(['y.total_cases', 'vege.ndvi_sw'], axis=1)
y = ndvi_sw_train['vege.ndvi_sw']
rfc_ndvi_sw = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_ndvi_sw.fit(X,y)
print('OOB Score: ', rfc_ndvi_sw.oob_score_)

ndvi_sw_predict = df.drop(['y.total_cases', 'vege.ndvi_sw'], axis=1).dropna()
ndvi_sw_predict['ndvi_sw_predict'] = rfc_ndvi_sw.predict(ndvi_sw_predict)

df = df.join(ndvi_sw_predict['ndvi_sw_predict'])
df['vege.ndvi_sw'] = np.where(df['vege.ndvi_sw'].isna(), df['ndvi_sw_predict'], df['vege.ndvi_sw'])
df = df.drop(['ndvi_sw_predict'], axis=1)

df['vege.ndvi_sw'] = np.where(df['vege.ndvi_sw'].isna(), df['vege.ndvi_sw'].mean(), df['vege.ndvi_sw'])

OOB Score:  0.7775416938768092


**v. temp.station_avg_temp_c**

In [9]:
# Predict and replace initial 'temp.station_avg_temp_c' data
mis = ['temp.station_diur_temp_rng_c', 'prec.station_precip_mm', 'temp.station_max_temp_c', 'temp.station_min_temp_c']
subset = [i for i in list(df.columns) if i not in mis]
station_avg_temp_c_train = df.drop(mis, axis=1).dropna(subset=subset)

X = station_avg_temp_c_train.drop(['y.total_cases', 'temp.station_avg_temp_c'], axis=1)
y = station_avg_temp_c_train['temp.station_avg_temp_c']
rfc_station_avg_temp_c = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_station_avg_temp_c.fit(X,y)
print('OOB Score: ', rfc_station_avg_temp_c.oob_score_)

station_avg_temp_c_predict = df.drop(mis + ['y.total_cases', 'temp.station_avg_temp_c'], axis=1).dropna()
station_avg_temp_c_predict['station_avg_temp_c_predict'] = rfc_station_avg_temp_c.predict(station_avg_temp_c_predict)

df = df.join(station_avg_temp_c_predict['station_avg_temp_c_predict'])
preds, actual = df['station_avg_temp_c_predict'], df['temp.station_avg_temp_c']
df['temp.station_avg_temp_c'] = np.where(df['temp.station_avg_temp_c'].isna(), preds, actual)
df = df.drop(['station_avg_temp_c_predict'], axis=1)

# Predict and replace missing 'temp.station_avg_temp_c' data
temp, city, target = df['temp.station_avg_temp_c'], df['city'], 'temp.station_avg_temp_c'
df[target] = np.where((temp.isna() & city == 1), df['time.weekofyear'].replace(sj_dict[target]), temp)
df[target] = np.where((temp.isna() & city == -1), df['time.weekofyear'].replace(iq_dict[target]), temp)

OOB Score:  0.8217695649126368


**vi. temp.station_diur_temp_rng_c**

In [10]:
# Predict and replace initial 'temp.station_diur_temp_rng_c' data
mis = ['prec.station_precip_mm', 'temp.station_max_temp_c', 'temp.station_min_temp_c']
subset = [i for i in list(df.columns) if i not in mis]
station_diur_temp_rng_c_train = df.drop(mis, axis=1).dropna(subset=subset)

X = station_diur_temp_rng_c_train.drop(['y.total_cases', 'temp.station_diur_temp_rng_c'], axis=1)
y = station_diur_temp_rng_c_train['temp.station_diur_temp_rng_c']
rfc_diur_temp_rng_c = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_diur_temp_rng_c.fit(X,y)
print('OOB Score: ', rfc_diur_temp_rng_c.oob_score_)

diur_temp_rng_c_predict = df.drop(mis + ['y.total_cases', 'temp.station_avg_temp_c'], axis=1).dropna()
diur_temp_rng_c_predict['diur_temp_rng_c_predict'] = rfc_diur_temp_rng_c.predict(diur_temp_rng_c_predict)

df = df.join(diur_temp_rng_c_predict['diur_temp_rng_c_predict'])
preds, actual = df['diur_temp_rng_c_predict'], df['temp.station_diur_temp_rng_c']
df['temp.station_diur_temp_rng_c'] = np.where(df['temp.station_diur_temp_rng_c'].isna(), preds, actual)
df = df.drop(['diur_temp_rng_c_predict'], axis=1)

# Predict and replace missing 'temp.station_diur_temp_rng_c' data
temp, city, target = df['temp.station_diur_temp_rng_c'], df['city'], 'temp.station_diur_temp_rng_c'
df[target] = np.where((temp.isna() & city == 1), df['time.weekofyear'].replace(sj_dict[target]), temp)
df[target] = np.where((temp.isna() & city == -1), df['time.weekofyear'].replace(iq_dict[target]), temp)

OOB Score:  0.8002793779775578


**vii. prec.station_precip_mm**

In [11]:
# Predict and replace initial 'prec.station_precip_mm' data
missing = 'temp.station_min_temp_c'
subset = [i for i in list(df.columns) if i != missing]
station_precip_mm_train = df.drop(missing, axis=1).dropna(subset=subset)

X = station_precip_mm_train.drop(['y.total_cases', 'prec.station_precip_mm'], axis=1)
y = station_precip_mm_train['prec.station_precip_mm']
rfc_station_precip_mm = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_station_precip_mm.fit(X,y)
print('OOB Score: ', rfc_station_precip_mm.oob_score_)

station_precip_mm_predict = df.drop(['y.total_cases', 'prec.station_precip_mm', missing], axis=1).dropna()
station_precip_mm_predict['station_precip_mm_predict'] = rfc_station_precip_mm.predict(station_precip_mm_predict)

df = df.join(station_precip_mm_predict['station_precip_mm_predict'])
preds, actual = df['station_precip_mm_predict'], df['prec.station_precip_mm']
df['prec.station_precip_mm'] = np.where(df['prec.station_precip_mm'].isna(), preds, actual)
df = df.drop(['station_precip_mm_predict'], axis=1)

# Predict and replace missing 'prec.station_precip_mm' data
sj_station_precip_mm = df[df['city'] == 1]['prec.station_precip_mm'].mean()
iq_station_precip_mm = df[df['city'] == -1]['prec.station_precip_mm'].mean()

prec, city = df['prec.station_precip_mm'], df['city']
df['prec.station_precip_mm'] = np.where((prec.isna() & city == 1), sj_station_precip_mm, prec)
df['prec.station_precip_mm'] = np.where((prec.isna() & city == -1), iq_station_precip_mm, prec)

OOB Score:  0.3321535579792406


**viii. temp.station_max_temp_c**

In [12]:
# Predict and replace initial 'temp.station_max_temp_c' data
missing = 'temp.station_min_temp_c'
subset = [i for i in list(df.columns) if i != missing]
station_max_temp_c_train = df.drop(missing, axis=1).dropna(subset=subset)

X = station_max_temp_c_train.drop(['y.total_cases', 'temp.station_max_temp_c'], axis=1)
y = station_max_temp_c_train['temp.station_max_temp_c']
rfc_station_max_temp_c = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_station_max_temp_c.fit(X,y)
print('OOB Score: ', rfc_station_max_temp_c.oob_score_)

station_max_temp_c_predict = df.drop(['y.total_cases', 'temp.station_max_temp_c', missing], axis=1).dropna()
station_max_temp_c_predict['station_max_temp_c_predict'] = rfc_station_max_temp_c.predict(station_max_temp_c_predict)

df = df.join(station_max_temp_c_predict['station_max_temp_c_predict'])
preds, actual = df['station_max_temp_c_predict'], df['temp.station_max_temp_c']
df['temp.station_max_temp_c'] = np.where(df['temp.station_max_temp_c'].isna(), preds, actual)
df = df.drop(['station_max_temp_c_predict'], axis=1)

# Predict and replace missing 'temp.station_max_temp_c' data
temp, city, target = df['temp.station_max_temp_c'], df['city'], 'temp.station_max_temp_c'
df[target] = np.where((temp.isna() & city == 1), df['time.weekofyear'].replace(sj_dict[target]), temp)
df[target] = np.where((temp.isna() & city == -1), df['time.weekofyear'].replace(iq_dict[target]), temp)

OOB Score:  0.8721323739367368


**ix. temp.station_min_temp_c**

In [13]:
# Predict and replace initial 'temp.station_min_temp_c' data
station_min_temp_c_train = df.dropna()

X = station_min_temp_c_train.drop(['y.total_cases', 'temp.station_min_temp_c'], axis=1)
y = station_min_temp_c_train['temp.station_min_temp_c']
rfc_station_min_temp_c = RandomForestRegressor(n_estimators=1000, oob_score=True)

rfc_station_min_temp_c.fit(X,y)
print('OOB Score: ', rfc_station_min_temp_c.oob_score_)

station_min_temp_c_predict = df.drop(['y.total_cases', 'temp.station_min_temp_c', missing], axis=1).dropna()
station_min_temp_c_predict['station_min_temp_c_predict'] = rfc_station_min_temp_c.predict(station_min_temp_c_predict)

df = df.join(station_min_temp_c_predict['station_min_temp_c_predict'])
preds, actual = df['station_min_temp_c_predict'], df['temp.station_min_temp_c']
df['temp.station_min_temp_c'] = np.where(df['temp.station_min_temp_c'].isna(), preds, actual)
df = df.drop(['station_min_temp_c_predict'], axis=1)

OOB Score:  0.8197917044558559


**x. Remaining Values**

In [14]:
# Estimate missing values for remaining items

cols = list(df.columns)

for i in cols:
    if i != 'city' and i != 'y.total_cases' and i !='time.weekofyear':
        df[i] = np.where((df[i].isna() & df['city'] == 1), df['time.weekofyear'].replace(sj_dict[i]), df[i])
        df[i] = np.where((df[i].isna() & df['city'] == -1), df['time.weekofyear'].replace(iq_dict[i]), df[i])

**5. Perform Feature Engineering**

In [15]:
# Create a column for 'ndvi_total'
df['vege.ndvi_total'] = df['vege.ndvi_sw'] + df['vege.ndvi_nw'] + df['vege.ndvi_se'] + df['vege.ndvi_ne']

# Create columns for previous week climate and drop 'order'
def get_last_week(year_week):
    year = year_week[:4]
    week = year_week[5:]
    if int(week) > 1:
        return year + '-' + str(int(week) - 1)
    else:
        return str(int(year) - 1) + '-52'

df['year_week'] = df['time.year'].astype(int).astype(str) + '-' + df['time.weekofyear'].astype(int).astype(str)
df['last_week'] = df['year_week'].apply(get_last_week)

clim = ['year_week', 'vege.ndvi_ne', 'vege.ndvi_nw', 'vege.ndvi_se', 'vege.ndvi_sw', 'prec.precipitation_amt_mm',
        'temp.reanalysis_air_temp_k', 'temp.reanalysis_avg_temp_k', 'temp.reanalysis_dew_point_temp_k', 'city',
        'temp.reanalysis_max_air_temp_k', 'temp.reanalysis_min_air_temp_k', 'prec.reanalysis_precip_amt_kg_per_m2',
        'humi.reanalysis_relative_humidity_percent', 'humi.reanalysis_specific_humidity_g_per_kg', 'vege.ndvi_total',
        'temp.reanalysis_tdtr_k', 'temp.station_avg_temp_c', 'temp.station_diur_temp_rng_c', 'prec.station_precip_mm',
        'temp.station_max_temp_c', 'temp.station_min_temp_c']

cols = [(i + '_last_week') for i in clim]
last_week = df[clim]
last_week.columns = cols
last_week = last_week.rename(columns={'year_week_last_week': 'last_week', 'city_last_week': 'city'})
df = df.merge(last_week, how='left', on=['last_week','city'])

df = df.drop(['last_week', 'year_week'], axis=1)

# Guess missing values from last week
sj_dict = df[df['city']==1].groupby('time.weekofyear').mean().fillna(method='ffill').to_dict()
iq_dict = df[df['city']==-1].groupby('time.weekofyear').mean().fillna(method='ffill').to_dict()

cols = list(df.columns)

for i in cols:
    if i != 'city' and i != 'y.total_cases' and i != 'time.weekofyear':
        df[i] = np.where((df[i].isna() & df['city'] == 1), df['time.weekofyear'].replace(sj_dict[i]), df[i])
        df[i] = np.where((df[i].isna() & df['city'] == -1), df['time.weekofyear'].replace(iq_dict[i]), df[i])

# Drop remaining rows with empty values
fts = list(df.drop('y.total_cases', axis=1).columns)
df = df.dropna(subset=fts)

In [16]:
cols = list(df.columns)
cols.sort()
df = df[cols]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1872 entries, 0 to 1871
Data columns (total 50 columns):
city                                                    1872 non-null int64
humi.reanalysis_relative_humidity_percent               1872 non-null float64
humi.reanalysis_relative_humidity_percent_last_week     1872 non-null float64
humi.reanalysis_specific_humidity_g_per_kg              1872 non-null float64
humi.reanalysis_specific_humidity_g_per_kg_last_week    1872 non-null float64
prec.precipitation_amt_mm                               1872 non-null float64
prec.precipitation_amt_mm_last_week                     1872 non-null float64
prec.reanalysis_precip_amt_kg_per_m2                    1872 non-null float64
prec.reanalysis_precip_amt_kg_per_m2_last_week          1872 non-null float64
prec.station_precip_mm                                  1872 non-null float64
prec.station_precip_mm_last_week                        1872 non-null float64
temp.reanalysis_air_temp_k           

1872 total
416 to predict

In [20]:
# Export to csv
preds = df[df['y.total_cases'].isna()]
preds.to_csv('dengue_prepared_predictions.csv', index=False)