In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('dengue_features_train.csv')
df.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [3]:
df.shape

(1456, 24)

In [4]:
df.columns

Index(['city', 'year', 'weekofyear', 'week_start_date', 'ndvi_ne', 'ndvi_nw',
       'ndvi_se', 'ndvi_sw', 'precipitation_amt_mm', 'reanalysis_air_temp_k',
       'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
       'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
       'reanalysis_precip_amt_kg_per_m2',
       'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
       'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
       'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
       'station_min_temp_c', 'station_precip_mm'],
      dtype='object')

# Visualizations 

# Extracting features 

In [5]:
!ls

 DengAI.ipynb			     dengue_labels_train.csv
'Dengue Data Visualizations.ipynb'   submission1.csv
 dengue_features_test.csv	     submission_format.csv
 dengue_features_train.csv


In [6]:
X = pd.read_csv('dengue_features_train.csv')
Y = pd.read_csv('dengue_labels_train.csv')
X_new = pd.read_csv('dengue_features_test.csv')

In [7]:
shape_of_df = lambda x: x.shape
list(map(shape_of_df, [X, Y, X_new]))

[(1456, 24), (1456, 4), (416, 24)]

# Trasnform train vectors

In [8]:
def remove_nans(df):
    return df.fillna(df.mean())

In [9]:
def dummify_cities(df):
    city_features = pd.get_dummies(df['city'], prefix='city')
    df_without_cities = df.drop('city', axis=1)
    return pd.concat([df_without_cities, city_features], axis=1)

In [10]:
def remove_unwanted_features(df):
    features_not_used = ['week_start_date', 'total_cases']
    features_to_use = set(df.columns) - set(features_not_used)
    features_to_use = list(features_to_use)
    return df[features_to_use]

In [11]:
def do_all_transforms(df):
    df = remove_nans(df)
    df = dummify_cities(df)
    df = remove_unwanted_features(df)
    return df

## Seperating test and train datasets

In [13]:
X = do_all_transforms(X)
y = Y['total_cases']

# Testing on models

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
list(map(lambda x: x.shape, [X_test, X_train, y_test, y_train]))

[(364, 24), (1092, 24), (364,), (1092,)]

In [15]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.16391794864470866

In [16]:
from sklearn.tree import DecisionTreeRegressor
dtree = DecisionTreeRegressor()
dtree.fit(X_train, y_train)
dtree.score(X_test, y_test)

0.12311666327260307

In [17]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(X_train, y_train)
svr.score(X_test, y_test)

-0.06427309472124065

In [18]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
nb.score(X_test, y_test)

0.07967032967032966

In [19]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor()
forest.fit(X_train, y_train)
forest.score(X_test, y_test)

0.5193441795183671

# Doing a Grid Search on Random Forest

In [20]:
# from sklearn.model_selection import GridSearchCV
# param_grid = { 
#     'n_estimators': [200, 500],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth': [4,5,6,7,8],
#     'criterion': ['mse', 'mae']
# }
# model = GridSearchCV(RandomForestRegressor(), param_grid, cv=3)
# model.fit(X, y)

# Running on unknown data

In [21]:
output = pd.DataFrame()

In [22]:
output = X_new[['city', 'year', 'weekofyear']]

In [23]:
X_new = do_all_transforms(X_new)
X_new.head()

Unnamed: 0,weekofyear,reanalysis_sat_precip_amt_mm,station_min_temp_c,station_max_temp_c,ndvi_nw,city_iq,ndvi_se,precipitation_amt_mm,reanalysis_tdtr_k,reanalysis_specific_humidity_g_per_kg,...,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,ndvi_ne,reanalysis_relative_humidity_percent,station_precip_mm,city_sj,reanalysis_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_dew_point_temp_k,ndvi_sw
0,18,78.6,21.7,33.3,-0.0189,0,0.102729,78.6,3.128571,15.918571,...,301.1,296.4,-0.0189,78.781429,75.2,1,298.492857,25.37,294.527143,0.0912
1,19,12.56,22.2,30.0,-0.0124,0,0.082043,12.56,2.571429,15.791429,...,300.8,296.7,-0.018,78.23,34.3,1,298.475714,21.83,294.395714,0.072314
2,20,3.66,22.8,32.8,0.126803,0,0.151083,3.66,4.428571,16.674286,...,302.2,296.4,-0.0015,78.27,3.0,1,299.455714,4.12,295.308571,0.091529
3,21,0.0,24.4,33.3,-0.019867,0,0.124329,0.0,4.342857,15.775714,...,303.0,296.9,0.12605,73.015714,0.3,1,299.69,2.2,294.402857,0.125686
4,22,0.76,23.3,33.3,0.039833,0,0.062267,0.76,3.542857,16.137143,...,302.3,297.3,0.0568,74.084286,84.1,1,299.78,4.36,294.76,0.075914


In [24]:
predictions = forest.predict(X_new)

In [25]:
predictions = predictions.round()

In [26]:
output['total_cases'] = predictions

In [27]:
output['total_cases'] = pd.to_numeric(output['total_cases'], downcast='integer')

In [28]:
output.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,2
1,sj,2008,19,5
2,sj,2008,20,6
3,sj,2008,21,8
4,sj,2008,22,6


In [29]:
output.to_csv('submission1.csv', index=False)