In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
!ls

 DengAI.ipynb			     dengue_labels_train.csv
'Dengue Data Visualizations.ipynb'  'Just one city.ipynb'
 dengue_features_test.csv	     submission1.csv
 dengue_features_train.csv	     submission_format.csv


In [3]:
X = pd.read_csv('dengue_features_train.csv')
X.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8


In [4]:
Y = pd.read_csv('dengue_labels_train.csv')

In [5]:
san_juan_indexes = X.city == 'sj'
X = X[san_juan_indexes]
Y = Y[san_juan_indexes]

In [6]:
X.shape, Y.shape

((936, 24), (936, 4))

In [7]:
y = Y['total_cases']

## Visualizations

## Transformation functions

In [8]:
ndvi_feature_names = list(filter(lambda s: s.startswith(r'ndvi'), list(X.columns)))
reanalysis_feature_names = list(filter(lambda s: s.startswith(r'reanalysis'), list(X.columns)))
station_feature_names = list(filter(lambda s: s.startswith('station'), list(X.columns)))

numerical_features = ndvi_feature_names + reanalysis_feature_names + station_feature_names

In [9]:
def add_historical_values(df):
    historical_data = df[numerical_features].rolling(4).mean().fillna(method="bfill")
    column_rename = lambda name: 'past_' + name
    historical_data.columns = map(column_rename, historical_data.columns)
    return pd.concat([df, historical_data], axis=1)

In [10]:
def fill_missing_values(df):
    new_df = df.fillna(df.mean())
    return new_df

In [11]:
def drop_unwanted_variables(df):
    new_df = df.drop(columns=['city', 'week_start_date'])
    return new_df

In [12]:
def apply_all_transformations(df):
    df1 = add_historical_values(df)
    df2 = fill_missing_values(df1)
    df3 = drop_unwanted_variables(df2)
    return df3

In [13]:
X = apply_all_transformations(X)

In [14]:
X_and_y = pd.concat([X, y], axis=1)
X_and_y.columns

Index(['year', 'weekofyear', 'ndvi_ne', 'ndvi_nw', 'ndvi_se', 'ndvi_sw',
       'precipitation_amt_mm', 'reanalysis_air_temp_k',
       'reanalysis_avg_temp_k', 'reanalysis_dew_point_temp_k',
       'reanalysis_max_air_temp_k', 'reanalysis_min_air_temp_k',
       'reanalysis_precip_amt_kg_per_m2',
       'reanalysis_relative_humidity_percent', 'reanalysis_sat_precip_amt_mm',
       'reanalysis_specific_humidity_g_per_kg', 'reanalysis_tdtr_k',
       'station_avg_temp_c', 'station_diur_temp_rng_c', 'station_max_temp_c',
       'station_min_temp_c', 'station_precip_mm', 'past_ndvi_ne',
       'past_ndvi_nw', 'past_ndvi_se', 'past_ndvi_sw',
       'past_reanalysis_air_temp_k', 'past_reanalysis_avg_temp_k',
       'past_reanalysis_dew_point_temp_k', 'past_reanalysis_max_air_temp_k',
       'past_reanalysis_min_air_temp_k',
       'past_reanalysis_precip_amt_kg_per_m2',
       'past_reanalysis_relative_humidity_percent',
       'past_reanalysis_sat_precip_amt_mm',
       'past_reanalysis_sp

In [15]:
from sklearn.model_selection import cross_val_score

In [16]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor()

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
list(map(lambda x: x.shape, [X_test, X_train, y_test, y_train]))

[(234, 41), (702, 41), (234,), (702,)]

In [18]:
forest.fit(X_train, y_train)
forest.score(X_test, y_test)

0.6335481342982439

In [19]:
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import KFold
cross_val_score(forest, X, y, cv=KFold(5))



array([-2.46, -5.1 ,  0.  ])

In [20]:
# %%time
# from sklearn.model_selection import GridSearchCV
# param_grid = {
#     'n_estimators': [5, 10, 20],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_depth': [4, 8, 12, None],
#     'criterion': ['mse', 'mae']
# }
# grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=3)

In [21]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree.score(X_test, y_test)

0.55192347244295

In [22]:
cross_val_score(tree, X, y, cv=2)

array([-0.11518712, -9.72527418])

In [25]:
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso.score(X_test, y_test)

0.06623051146712133

In [46]:
from sklearn.linear_model import LassoCV
lasso_cv = LassoCV(alphas=[100, 10, 1])
lasso_cv.fit(X_train, y_train)

LassoCV(alphas=[100, 10, 1], copy_X=True, cv=None, eps=0.001,
    fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=1,
    normalize=False, positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [45]:
lasso_cv.score(X_test, y_test)

0.06623051146712133