In [1]:
#### PACKAGE AND DATA IMPORTS
##########################
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

test_df = pd.read_csv('./kaggle_data/test.csv')
train_df = pd.read_csv('./kaggle_data/train.csv')
weather_df = pd.read_csv('./kaggle_data/weather.csv')

In [2]:
#### CLEANING AND COMBINING
#########################

# creating daily_weather data df using only one station
daily_weather = weather_df[weather_df['Station'] == 1] 

In [3]:
# dropping station label since all are station 1
daily_weather.drop('Station', axis= 1, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [4]:
# datetime index on weather data
daily_weather.reset_index(inplace=True, drop=True) 

daily_weather['Date'] = pd.to_datetime(daily_weather['Date']) 

daily_weather.set_index('Date',inplace=True, drop=True) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [5]:
#datetime index on training data
train_df['Date'] = pd.to_datetime(train_df['Date']) 
test_df['Date'] = pd.to_datetime(test_df['Date'])
train_df.set_index('Date', inplace=True, drop=True)
test_df.set_index('Date', inplace=True, drop = True)

In [6]:
#combine weather data with train and test sets along Date index
train_df = pd.merge(train_df, 
                  daily_weather, 
                  left_index = True, right_index = True)
test_df = pd.merge(test_df, 
                 daily_weather, 
                  left_index = True, right_index = True)

In [7]:
train_df.drop(['Water1','SnowFall'], axis=1, inplace=True) # dropping these columns since they provided no info
test_df.drop(['Water1','SnowFall'], axis=1, inplace=True) # dropping these columns since they provided no info

In [8]:
# storing mode precip value (for rows w/ numeric values) for replacement in next step
mode_precip = float(train_df[train_df['PrecipTotal'] != '  T'].PrecipTotal.mode()[0]) 

In [9]:
# replacing '  T' with mode precip value in training set
train_precip_totals = []
for total in train_df.PrecipTotal:
    if total == '  T':
        train_precip_totals.append(mode_precip)
    else:
        train_precip_totals.append(total)

train_df.PrecipTotal = pd.to_numeric(train_precip_totals) 

In [10]:
# replacing '  T' in test set with SAME mode precip value in training set
test_precip_totals = []
for total in test_df.PrecipTotal:
    if total == '  T':
        test_precip_totals.append(mode_precip)
    else:
        test_precip_totals.append(total)

test_df.PrecipTotal = pd.to_numeric(test_precip_totals) 

In [11]:
# storing mode pressure (for rows with numeric value) for replacement in next step
mode_pressure = train_df[train_df['StnPressure'] != 'M'].StnPressure.mode() 

In [12]:
# replacing 'M' with mode pressure value in train set
train_pressures = []
for pressure in train_df.StnPressure:
    if pressure == 'M':
        train_pressures.append(mode_pressure)
    else:
        train_pressures.append(pressure)
train_pressures = [float(pressure) for pressure in train_pressures]

train_df.StnPressure = pd.to_numeric(train_pressures) 

In [13]:
# replacing 'M' in test set with SAME mode pressure value from train set
test_pressures = []
for pressure in test_df.StnPressure:
    if pressure == 'M':
        test_pressures.append(mode_pressure)
    else:
        test_pressures.append(pressure)
test_pressures = [float(pressure) for pressure in test_pressures]

test_df.StnPressure = pd.to_numeric(test_pressures) # replacing 'M' with mode pressure value

In [14]:
# these are the columns that could be numeric values but are currently object
#new_df.dtypes
cols_to_change = ['Tavg',
                 'Depart',
                 'Cool',
                 'Sunrise',
                 'Sunset',
                 'Depth',
                 'PrecipTotal',
                 'StnPressure',
                 'SeaLevel',
                 'AvgSpeed'
                 ]

In [15]:
#changing columns above to numeric in both train and set
for col in cols_to_change:
    train_df[col] = pd.to_numeric(train_df[col])
    test_df[col] = pd.to_numeric(test_df[col])

In [16]:
# dropping NumMosquitos from training date b/c not in test set
train_df.drop('NumMosquitos', axis = 1, inplace = True)

In [17]:
# undersampling rows that are WNV- to balance the classes
# resaving as train_df
wnv1_df = train_df[train_df['WnvPresent'] == 1]

wnv0_df = train_df[train_df['WnvPresent'] == 0].sample(n = wnv1_df.shape[0], random_state = 21)

train_df = pd.concat([wnv1_df, wnv0_df], axis = 0)

In [18]:
#saving Id col for submission so i can drop it from df during modeling
submission_ids = test_df.Id

test_df.drop('Id', axis = 1, inplace = True)

In [201]:
# we found that the numeric data alone performed better than
## including dummied columns (e.g. Address)
num_train = train_df._get_numeric_data() # training df numerical
num_test = test_df._get_numeric_data() # test df numerical

In [202]:
# split X and y for training
### NOTE THAT THE SPLIT WAS JUST FOR EXPLORATORY PURPOSES
### X and y are used for model building without TTS
X = num_train.drop('WnvPresent', axis = 1)
y = num_train.WnvPresent

In [203]:
# TTS so that we can check the effectiveness of our model and pick the best
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .25, shuffle = True)

In [204]:
# instantiate SS and fit on X (or X_train)
ss = StandardScaler()

ss.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [205]:
# transform X and test data with same SS
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)
num_test = pd.DataFrame(ss.transform(num_test), columns=num_test.columns)

In [1]:
#### MODELS TO TRY (Gridsearch Imported Above)
##################################

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

## Random Forrest on Numerical Data

In [135]:
rf = RandomForestClassifier(n_jobs=4)
rf_params = {'n_estimators':[8,10,12,14,20],
            'min_samples_split':[2,3,4],
            'random_state':[1,21,31,100]}
rf_gs = GridSearchCV(rf, rf_params, n_jobs=4)
rf_gs.fit(X, y)
rf_gs.best_params_

{'min_samples_split': 2, 'n_estimators': 10, 'random_state': 21}

In [26]:
#Create predictions
rf_preds = rf_gs.predict(X_test)

In [27]:
# Precision and Accuracy Scoring
precision_score(y_test, rf_preds), accuracy_score(y_test, rf_preds)

(0.7412587412587412, 0.7681159420289855)

In [28]:
len(rf_preds), len(X_test)

(276, 276)

In [45]:
# predictions as dataframe for export
pred_df = pd.DataFrame({'Id': submission_ids, 'WnvPresent':rf_preds})

In [47]:
#### PREDICTION EXPORT
#######################
# pred_df.to_csv('./predictions1.csv', index=False)

# THE SAME PROCESS ABOVE IS REPEATED FOR OTHER MODELS

## AdaBoost on Numerical Data (our highest scoring kaggle submission)

In [144]:
ada = AdaBoostClassifier()
ada_params = {'n_estimators':[20,30,40,50,60,80, 100, 120],
             'learning_rate':[1, .8, .4,.3,.1],}
ada_gs = GridSearchCV(ada, ada_params, n_jobs=4)
ada_gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [20, 30, 40, 50, 60, 80, 100, 120], 'learning_rate': [1, 0.8, 0.4, 0.3, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [145]:
ada_gs.best_params_

{'learning_rate': 0.8, 'n_estimators': 100}

In [None]:
#best params with train test was learning rate .8 with 100 estimators

In [178]:
#### THIS WAS THE HIGHEST SCORING MODEL
################ SUBMISSION 3

ada_model = AdaBoostClassifier(n_estimators=100, learning_rate=.8)
ada_model.fit(X_train,y_train)

ada_preds = ada_model.predict(num_test)

ada_sub_df = pd.DataFrame({'Id': submission_ids, 'WnvPresent':ada_preds})

#ada_sub_df.to_csv('./predictions3.csv', index=False)

ada_sub_df.WnvPresent.value_counts()

0    90100
1    26193
Name: WnvPresent, dtype: int64

## Gradient Boost on Numerical Data

In [35]:
xgb = GradientBoostingClassifier()
xgb_params = {'learning_rate':[.01, .05, .1, .3, .5],
             'n_estimators':[50,80, 100, 120],
             'min_samples_split':[2,3,4]}
xgb_gs = GridSearchCV(xgb, xgb_params, n_jobs=4)
xgb_gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5], 'n_estimators': [50, 80, 100, 120], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [36]:
xgb_gs.best_params_

{'learning_rate': 0.3, 'min_samples_split': 2, 'n_estimators': 50}

In [37]:
xgb_preds = xgb_gs.predict(X_test)

In [38]:
precision_score(y_test, xgb_preds), accuracy_score(y_test, xgb_preds)

(0.7266187050359713, 0.7463768115942029)

In [2]:
#### THIS WAS THE HIGHEST SCORING MODEL
################ SUBMISSION 3

xgb_model = GradientBoostingClassifier(learning_rate=.2, n_estimators=50)
xgb_model.fit(X,y)

xgb_preds = xgb_model.predict(num_test)

xgb_sub_df = pd.DataFrame({'Id': submission_ids, 'WnvPresent':xgb_preds})

#xgb_sub_df.to_csv('./predictions5.csv', index=False)

xgb_sub_df.WnvPresent.value_counts()

NameError: name 'X' is not defined

## Trying same three models on dummied data

In [40]:
train_df.columns

Index(['Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'WnvPresent', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint', 'WetBulb',
       'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth', 'PrecipTotal',
       'StnPressure', 'SeaLevel', 'ResultSpeed', 'ResultDir', 'AvgSpeed'],
      dtype='object')

In [96]:
dummies_df = pd.get_dummies(train_df.drop(['Address','Street'], axis = 1))
test_dummies_df = pd.get_dummies(test_df)

In [97]:
dummies_df.shape, test_dummies_df.shape

((1102, 344), (116293, 666))

In [98]:
X_dum = dummies_df.drop('WnvPresent', axis = 1)
y_dum = dummies_df.WnvPresent

In [99]:
for col in test_dummies_df.columns:
    if col not in X_dum.columns:
        test_dummies_df.drop(f'{col}', inplace=True, axis = 1)

In [100]:
X_dum.shape, test_dummies_df.shape

((1102, 343), (116293, 320))

In [101]:
for col in X_dum.columns:
    if col not in test_dummies_df.columns:
        X_dum.drop(f'{col}', inplace=True, axis=1)

In [102]:
X_dum.shape, test_dummies_df.shape

((1102, 320), (116293, 320))

In [104]:
Xdum_train, Xdum_test, ydum_train, ydum_test = train_test_split(X_dum, y_dum, test_size = .25)

In [105]:
ss_dum = StandardScaler()
ss_dum.fit(Xdum_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [106]:
Xdum_train = ss_dum.transform(Xdum_train)
Xdum_test = ss_dum.transform(Xdum_test)

## Random Forrest on Dummy Data
### Random forest had highest accuracy of 78.6% but lower precision than AdaBoost using just numerical data

In [107]:
rf_dum = RandomForestClassifier(n_jobs=4)
rf_params = {'n_estimators':[8,10,12,14,20],
            'min_samples_split':[2,3,4, 5,6],
            }
rf_dum_gs = GridSearchCV(rf_dum, rf_params, n_jobs=4)
rf_dum_gs.fit(Xdum_train, ydum_train)
rf_dum_gs.best_params_

{'min_samples_split': 5, 'n_estimators': 12}

In [108]:
rf_dum_gs.best_params_

{'min_samples_split': 5, 'n_estimators': 12}

In [109]:
rf_dum_preds = rf_dum_gs.predict(Xdum_test)

In [110]:
precision_score(ydum_test, rf_dum_preds), accuracy_score(ydum_test, rf_dum_preds)

(0.5847953216374269, 0.6630434782608695)

## AdaBoost on Dummy Data

In [113]:
ada_dum = AdaBoostClassifier()
ada_dum_params = {'n_estimators':[30,40,50,60,80],
             'learning_rate':[1, .9, .8,.4,.1],}
ada_dum_gs = GridSearchCV(ada_dum, ada_params, n_jobs=4)
ada_dum_gs.fit(Xdum_train, ydum_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [30, 40, 50, 60, 80, 100, 120], 'learning_rate': [1, 0.8, 0.4, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [114]:
ada_dum_gs.best_params_

{'learning_rate': 1, 'n_estimators': 40}

In [115]:
ada_dum_preds = ada_dum_gs.predict(Xdum_test)

In [116]:
precision_score(ydum_test, ada_dum_preds), accuracy_score(ydum_test, ada_dum_preds)

(0.5882352941176471, 0.6666666666666666)

## Gradient Boost on Dummy Data

In [117]:
xgb_dum = GradientBoostingClassifier()
xgb_dum_params = {'learning_rate':[.005,.01, .05, .1, .3, .5],
             'n_estimators':[5, 20, 30, 40,50,80, 100, 120, 150]}
xgb_dum_gs = GridSearchCV(xgb_dum, xgb_dum_params, n_jobs=4)
xgb_dum_gs.fit(Xdum_train, ydum_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'learning_rate': [0.005, 0.01, 0.05, 0.1, 0.3, 0.5], 'n_estimators': [5, 20, 30, 40, 50, 80, 100, 120, 150]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [78]:
xgb_dum_sub = GradientBoostingClassifier(learning_rate=.3, n_estimators=50)
xgb_dum_sub.fit(X,y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.3, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [121]:
xgb_dum_preds = xgb_dum_gs.predict(Xdum_test)

In [None]:
xgb_dum_pred_df = pd.DataFrame({'Id': submission_ids, 'WnvPresent':xgb_preds})

#xgb_pred_df.to_csv('./predictions2.csv', index=False)

In [122]:
precision_score(ydum_test, xgb_dum_preds), accuracy_score(ydum_test, xgb_dum_preds)

(0.6037735849056604, 0.677536231884058)

# Logistic Regression with Pipeline

In [None]:
X = df.drop(columns = ['WnvPresent', 'Address', 
                           'Species', 'Street', 'Trap', 
                           'AddressNumberAndStreet', 'WetBulb'
                           , 'Heat', 'CodeSum'],
                            axis = 1)
y = df['WnvPresent']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size = 0.33, 
                                                   random_state = 42)

# Instantiating Preprocessing and Model 
lr = LogisticRegression()
ss = StandardScaler()

# Setting up Pipeline 

lr_pipe = Pipeline([
    ('ss', ss),
    ('lr', lr)
])

# Setting up Parameter Dictionary 
gs_lr_params = {
    'lr__penalty': ['l1', 'l2'], 
    'lr__C': [0.5, 1.0, 1.2]
}

# Instantiating and Fitting my Grid Search
gs_lr = GridSearchCV(lr_pipe, param_grid=gs_lr_params)
gs_lr.fit(X_train, y_train);

print("Best Params:", gs_lr.best_params_)
print("Best Train Score:", gs_lr.best_score_ )
print("Best Test Score:", gs_lr.score(X_test, y_test) )