In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
test_df = pd.read_csv('./kaggle_data/test.csv')
train_df = pd.read_csv('./kaggle_data/train.csv')
weather_df = pd.read_csv('./kaggle_data/weather.csv')

In [3]:
daily_weather = weather_df[weather_df['Station'] == 1] # creating weather data df using only one station

In [4]:
daily_weather.drop('Station', axis= 1, inplace=True) # dropping station label since all are station 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [5]:
daily_weather.reset_index(inplace=True, drop=True) 

daily_weather['Date'] = pd.to_datetime(daily_weather['Date']) 

daily_weather.set_index('Date',inplace=True, drop=True) # datetime index on weather data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
train_df['Date'] = pd.to_datetime(train_df['Date']) #datetime index on training data
test_df['Date'] = pd.to_datetime(test_df['Date'])
train_df.set_index('Date', inplace=True, drop=True)
test_df.set_index('Date', inplace=True, drop = True)

In [7]:
train_df = pd.merge(train_df, 
                  daily_weather, 
                  left_index = True, right_index = True)
test_df = pd.merge(test_df, 
                 daily_weather, 
                  left_index = True, right_index = True)#new_df = combined weather and train data

In [8]:
train_df.drop(['Water1','SnowFall'], axis=1, inplace=True) # dropping these columns since they provided no info
test_df.drop(['Water1','SnowFall'], axis=1, inplace=True) # dropping these columns since they provided no info

In [9]:
mode_precip = float(train_df[train_df['PrecipTotal'] != '  T'].PrecipTotal.mode()[0]) # storing mode precip value for replacement in next step

In [10]:
train_precip_totals = []
for total in train_df.PrecipTotal:
    if total == '  T':
        train_precip_totals.append(mode_precip)
    else:
        train_precip_totals.append(total)

train_df.PrecipTotal = pd.to_numeric(train_precip_totals) # replacing '  T' with mode precip value

In [11]:
test_precip_totals = []
for total in test_df.PrecipTotal:
    if total == '  T':
        test_precip_totals.append(mode_precip)
    else:
        test_precip_totals.append(total)

test_df.PrecipTotal = pd.to_numeric(test_precip_totals) # replacing '  T' with mode precip value

In [12]:
mode_pressure = train_df[train_df['StnPressure'] != 'M'].StnPressure.mode() # storing mode pressure for replacement in next step

In [13]:
train_pressures = []
for pressure in train_df.StnPressure:
    if pressure == 'M':
        train_pressures.append(mode_pressure)
    else:
        train_pressures.append(pressure)
train_pressures = [float(pressure) for pressure in train_pressures]

train_df.StnPressure = pd.to_numeric(train_pressures) # replacing 'M' with mode pressure value

In [14]:
test_pressures = []
for pressure in test_df.StnPressure:
    if pressure == 'M':
        test_pressures.append(mode_pressure)
    else:
        test_pressures.append(pressure)
test_pressures = [float(pressure) for pressure in test_pressures]

test_df.StnPressure = pd.to_numeric(test_pressures) # replacing 'M' with mode pressure value

In [15]:
#new_df.dtypes
cols_to_change = ['Tavg',
                 'Depart',
                 'Cool',
                 'Sunrise',
                 'Sunset',
                 'Depth',
                 'PrecipTotal',
                 'StnPressure',
                 'SeaLevel',
                 'AvgSpeed'
                 ] # columns of type object that can be coerced to numeric values

In [17]:
for col in cols_to_change:
    train_df[col] = pd.to_numeric(train_df[col])
    test_df[col] = pd.to_numeric(test_df[col])#changing columns above to numeric

In [19]:
train_df.drop('NumMosquitos', axis = 1, inplace = True)

In [20]:
wnv1_df = train_df[train_df['WnvPresent'] == 1]

In [21]:
wnv0_df = train_df[train_df['WnvPresent'] == 0].sample(n = wnv1_df.shape[0], random_state = 21)

In [22]:
train_df = pd.concat([wnv1_df, wnv0_df], axis = 0)

In [23]:
submission_ids = test_df.Id

In [24]:
test_df.drop('Id', axis = 1, inplace = True)

In [26]:
num_train = train_df._get_numeric_data()
num_test = test_df._get_numeric_data()

In [28]:
X = num_train.drop('WnvPresent', axis = 1)
y = num_train.WnvPresent

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .25, shuffle = True)

In [30]:
ss = StandardScaler()

In [31]:
ss.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [32]:
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)
num_test = ss.transform(num_test)

In [33]:
pca = PCA()

In [34]:
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [35]:
Z_train = pca.transform(X_train)
Z_test = pca.transform(X_test)
ZTEST = pca.transform(num_test)

In [36]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

## Random Forrest on Numerical Data

In [37]:
rf = RandomForestClassifier(n_jobs=4)
rf_params = {'n_estimators':[8,10,12,14,20],
            'min_samples_split':[2,3,4],
            'random_state':[1,21,31,100]}
rf_gs = GridSearchCV(rf, rf_params, n_jobs=4)
rf_gs.fit(Z_train, y_train)
rf_gs.best_params_

{'min_samples_split': 4, 'n_estimators': 10, 'random_state': 21}

In [38]:
rf_preds = rf_gs.predict(ZTEST)

In [None]:
precision_score(y_test, rf_preds), accuracy_score(y_test, rf_preds)

In [None]:
len(rf_preds), len(X_test)

In [45]:
pred_df = pd.DataFrame({'Id': submission_ids, 'WnvPresent':rf_preds})

In [47]:
pred_df.to_csv('./predictions1.csv', index=False)

## AdaBoost on Numerical Data

In [48]:
ada = AdaBoostClassifier()
ada_params = {'n_estimators':[30,40,50,60,80],
             'learning_rate':[.8,.4,.1],}
ada_gs = GridSearchCV(ada, ada_params, n_jobs=4)
ada_gs.fit(Z_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [30, 40, 50, 60, 80], 'learning_rate': [0.8, 0.4, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [49]:
ada_preds = ada_gs.predict(ZTEST)

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
ada_gs.best_params_

In [None]:
precision_score(y_test, ada_preds), accuracy_score(y_test, ada_preds)

## Gradient Boost on Numerical Data

In [51]:
xgb = GradientBoostingClassifier()
xgb_params = {'learning_rate':[.01, .05, .1, .3, .5],
             'n_estimators':[50,80, 100, 120],
             'min_samples_split':[2,3,4]}
xgb_gs = GridSearchCV(xgb, xgb_params, n_jobs=4)
xgb_gs.fit(Z_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5], 'n_estimators': [50, 80, 100, 120], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [52]:
xgb_gs.best_params_

{'learning_rate': 0.05, 'min_samples_split': 3, 'n_estimators': 100}

In [53]:
xgb_preds = xgb_gs.predict(ZTEST)

In [54]:
xgb_pred_df = pd.DataFrame({'Id': submission_ids, 'WnvPresent':xgb_preds})

In [55]:
xgb_pred_df.to_csv('./predictions2.csv', index=False)

In [None]:
precision_score(y_test, xgb_preds), accuracy_score(y_test, xgb_preds)

## Trying same three models on dummied data

In [None]:
df.columns

In [None]:
dummies_df = pd.get_dummies(df.drop(['Address','Street'], axis = 1))

In [None]:
X_dum = dummies_df.drop('WnvPresent', axis = 1)
y = dummies_df.WnvPresent

In [None]:
Xdum_train, Xdum_test, ydum_train, ydum_test = train_test_split(X_dum, y, test_size = .25)

In [None]:
ss_dum = StandardScaler()
ss_dum.fit(Xdum_train)

In [None]:
Xdum_train = ss_dum.transform(Xdum_train)
Xdum_test = ss_dum.transform(Xdum_test)

In [None]:
pca_dum = PCA(.95)

In [None]:
pca_dum.fit(Xdum_train)

In [None]:
Zdum_train = pca_dum.transform(Xdum_train)
Zdum_test = pca_dum.transform(Xdum_test)

## Random Forrest on Dummy Data
### Random forest had highest accuracy of 78.6% but lower precision than AdaBoost using just numerical data

In [None]:
rf_dum = RandomForestClassifier(n_jobs=4)
rf_params = {'n_estimators':[8,10,12,14,20],
            'min_samples_split':[2,3,4],
            'random_state':[1,21,31,100]}
rf_dum_gs = GridSearchCV(rf_dum, rf_params, n_jobs=4)
rf_dum_gs.fit(Zdum_train, ydum_train)
rf_dum_gs.best_params_

In [None]:
rf_dum_preds = rf_dum_gs.predict(Zdum_test)

In [None]:
precision_score(ydum_test, rf_dum_preds), accuracy_score(ydum_test, rf_dum_preds)

## AdaBoost on Dummy Data

In [None]:
ada_dum = AdaBoostClassifier()
ada_dum_params = {'n_estimators':[30,40,50,60,80],
             'learning_rate':[.8,.4,.1],}
ada_dum_gs = GridSearchCV(ada_dum, ada_params, n_jobs=4)
ada_dum_gs.fit(Zdum_train, ydum_train)

In [None]:
ada_dum_preds = ada_dum_gs.predict(Zdum_test)

In [None]:
ada_dum_gs.best_params_

In [None]:
precision_score(ydum_test, ada_dum_preds), accuracy_score(ydum_test, ada_dum_preds)

## Gradient Boost on Dummy Data

In [None]:
xgb_dum = GradientBoostingClassifier()
xgb_dum_params = {'learning_rate':[.01, .05, .1, .3, .5],
             'n_estimators':[50,80, 100, 120]}
xgb_dum_gs = GridSearchCV(xgb_dum, xgb_dum_params, n_jobs=4)
xgb_dum_gs.fit(Zdum_train, ydum_train)

In [None]:
xgb_dum_gs.best_params_

In [None]:
xgb_dum_preds = xgb_dum_gs.predict(Zdum_test)

In [None]:
precision_score(ydum_test, xgb_dum_preds), accuracy_score(ydum_test, xgb_dum_preds)

# Suzanne Pipeline

In [None]:
X = df.drop(columns = ['WnvPresent', 'Address', 
                           'Species', 'Street', 'Trap', 
                           'AddressNumberAndStreet', 'WetBulb'
                           , 'Heat', 'CodeSum'],
                            axis = 1)
y = df['WnvPresent']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size = 0.33, 
                                                   random_state = 42)

# Instantiating Preprocessing and Model 
lr = LogisticRegression()
ss = StandardScaler()

# Setting up Pipeline 

lr_pipe = Pipeline([
    ('ss', ss),
    ('lr', lr)
])

# Setting up Parameter Dictionary 
gs_lr_params = {
    'lr__penalty': ['l1', 'l2'], 
    'lr__C': [0.5, 1.0, 1.2]
}

# Instantiating and Fitting my Grid Search
gs_lr = GridSearchCV(lr_pipe, param_grid=gs_lr_params)
gs_lr.fit(X_train, y_train);

print("Best Params:", gs_lr.best_params_)
print("Best Train Score:", gs_lr.best_score_ )
print("Best Test Score:", gs_lr.score(X_test, y_test) )