In [4]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
import numpy as np
from sklearn.model_selection import GridSearchCV

#### Read in cleaned test data and train data

In [6]:
# Load the pre-cleaned training and test tables; the first CSV column becomes the row index.
master_train = pd.read_csv("assets/master_clean_train.csv", index_col=0)
master_test = pd.read_csv("assets/master_clean_test.csv", index_col=0)

# Print (rows, columns) of each table as a quick sanity check.
for label, frame in (("master_train shape: ", master_train),
                     ("master_test shape: ", master_test)):
    print(label, frame.shape)

master_train shape:  (9686, 28)
master_test shape:  (116293, 27)


#### Engineer some date features

In [7]:
def clean_dates(df, months=None):
    """Return a copy of ``df`` with ``dtdate`` parsed and month dummy columns added.

    The input frame is not modified.  A temporary ``month`` column is derived
    from ``dtdate`` and one-hot encoded into ``month_<n>`` columns, with the
    first category dropped (``drop_first=True``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``dtdate`` column that ``pd.to_datetime`` can parse.
    months : iterable of int, optional
        Full, ordered set of month numbers to encode (e.g. ``range(5, 11)``).
        When given, the same ``month_<n>`` columns are produced for every
        frame, whether or not each month occurs in it; the first entry is the
        dropped baseline.  Rows whose month is not listed get all-zero
        dummies.  When omitted (the default) the categories are whatever
        months occur in ``df``, so two frames covering different months end
        up with different columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``dtdate`` as datetime and the month dummies
        appended.
    """
    new_df = df.copy()
    new_df['dtdate'] = pd.to_datetime(df['dtdate'])
    month = new_df['dtdate'].dt.month
    if months is not None:
        # A fixed categorical makes get_dummies emit one column per listed
        # month, including months absent from this particular frame.
        month = pd.Categorical(month, categories=list(months))
    new_df['month'] = month
    return pd.get_dummies(new_df, columns=['month'], drop_first=True)

In [8]:
# Run the date feature engineering on both tables, then re-check their shapes.
master_train, master_test = map(clean_dates, (master_train, master_test))
print("master_train shape: ", master_train.shape)
print("master_test shape: ", master_test.shape)

master_train shape:  (9686, 33)
master_test shape:  (116293, 31)


In [6]:
# Show the training-table column names as a plain Python list.
print("Columns in input dataset: ")
print(master_train.columns.tolist())

Columns in input dataset: 
['address', 'block', 'street', 'trap', 'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy', 'nummosquitos', 'wnvpresent', 'neighborhood', 'dtdate', 'species_culex_pipiens', 'species_culex_pipiens_restuans', 'species_culex_restuans', 'neighborhood_infection_category_high', 'neighborhood_infection_category_low', 'neighborhood_infection_category_medium', 'neighborhood_infection_category_none', 'park_score', 'daylight', 'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight', 'timelaggedtemperature', 'timelaggedprecipitation', 'timelaggedwindspeed', 'month_6', 'month_7', 'month_8', 'month_9', 'month_10']


In [7]:
# Show the test-table column names as a plain Python list.
print("Columns in test dataset: ")
print(master_test.columns.tolist())

Columns in test dataset: 
['date', 'address', 'block', 'street', 'trap', 'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy', 'neighborhood', 'dtdate', 'species_culex_pipiens', 'species_culex_pipiens_restuans', 'species_culex_restuans', 'neighborhood_infection_category_high', 'neighborhood_infection_category_low', 'neighborhood_infection_category_medium', 'neighborhood_infection_category_none', 'park_score', 'daylight', 'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight', 'timelaggedtemperature', 'timelaggedprecipitation', 'timelaggedwindspeed', 'month_7', 'month_8', 'month_9', 'month_10']


The test set lacks the `month_6` dummy column that the training set has, so a `month_6` column has to be added to `master_test`.

In [9]:
# Give the test table the same month dummy columns as the training table.
# clean_dates() uses drop_first=True, so the first month present in the test
# table has no dummy column, which is why 'month_6' is missing.  Derive the
# column from the actual date rather than filling it with zeros: an all-zero
# column would encode every June test row as the training baseline month.
month_cols = ['month_7', 'month_8', 'month_9', 'month_10']
month_6 = (master_test['dtdate'].dt.month == 6).astype('uint8').rename('month_6')

# Keep the existing column order and slot 'month_6' in just before 'month_7'.
new_test = pd.concat([master_test.drop(columns=month_cols),
                      month_6,
                      master_test[month_cols]], axis=1)

In [9]:
# Inspect the column order of the rebuilt test table.
new_test.columns

Index(['date', 'address', 'block', 'street', 'trap', 'addressnumberandstreet',
       'latitude', 'longitude', 'addressaccuracy', 'neighborhood', 'dtdate',
       'species_culex_pipiens', 'species_culex_pipiens_restuans',
       'species_culex_restuans', 'neighborhood_infection_category_high',
       'neighborhood_infection_category_low',
       'neighborhood_infection_category_medium',
       'neighborhood_infection_category_none', 'park_score', 'daylight',
       'avg_tavg', 'avg_preciptotal', 'avg_avgspeed', 'timelaggeddaylight',
       'timelaggedtemperature', 'timelaggedprecipitation',
       'timelaggedwindspeed', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10'],
      dtype='object')

In [10]:
# Inspect the dtype of every column in the rebuilt test table.
new_test.dtypes

date                                              object
address                                           object
block                                              int64
street                                            object
trap                                              object
addressnumberandstreet                            object
latitude                                         float64
longitude                                        float64
addressaccuracy                                    int64
neighborhood                                      object
dtdate                                    datetime64[ns]
species_culex_pipiens                              int64
species_culex_pipiens_restuans                     int64
species_culex_restuans                             int64
neighborhood_infection_category_high               int64
neighborhood_infection_category_low                int64
neighborhood_infection_category_medium             int64
neighborhood_infection_category

In [10]:
# Replace master_test with the rebuilt table and drop the temporary name.
# NOTE(review): not idempotent: re-running this cell on its own raises
# NameError, because new_test no longer exists after the del.
master_test = new_test
del new_test

#### Start to home in on predictors/features

In [11]:
# Column to predict.
target = 'wnvpresent'

# Predictor columns, grouped by kind.  The order here is the column order of X.
features = [
    # mosquito species indicators
    'species_culex_pipiens',
    'species_culex_pipiens_restuans',
    'species_culex_restuans',
    # neighborhood infection category indicators
    'neighborhood_infection_category_high',
    'neighborhood_infection_category_low',
    'neighborhood_infection_category_medium',
    'neighborhood_infection_category_none',
    # location and same-period conditions
    'park_score',
    'daylight',
    'avg_tavg',
    'avg_preciptotal',
    'avg_avgspeed',
    # time-lagged conditions
    'timelaggeddaylight',
    'timelaggedtemperature',
    'timelaggedprecipitation',
    'timelaggedwindspeed',
    # month indicators
    'month_6',
    'month_7',
    'month_8',
    'month_9',
    'month_10',
]

X = master_train[features]
y = master_train[target]

In [13]:
# List every model feature together with its dtype.
print(" ", "Features and types:", sep="\n")
print(X.dtypes, "\n--------")

 
Features and types:
species_culex_pipiens                       int64
species_culex_pipiens_restuans              int64
species_culex_restuans                      int64
neighborhood_infection_category_high        int64
neighborhood_infection_category_low         int64
neighborhood_infection_category_medium      int64
neighborhood_infection_category_none        int64
park_score                                float64
daylight                                    int64
avg_tavg                                  float64
avg_preciptotal                           float64
avg_avgspeed                              float64
timelaggeddaylight                        float64
timelaggedtemperature                     float64
timelaggedprecipitation                   float64
timelaggedwindspeed                       float64
month_6                                     uint8
month_7                                     uint8
month_8                                     uint8
month_9                     

#### Fix imbalanced classes by oversampling the minority class (WNV+) with SMOTE up to 50%

In [12]:
# Oversample the minority class with SMOTE.  random_state is fixed so that the
# synthetic rows are the same on every run.
sm = SMOTE(random_state=0)

# imbalanced-learn renamed fit_sample to fit_resample and later removed the
# old name, so call whichever one this installation provides.
_smote_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_os, y_os = _smote_resample(X, y)

In [15]:
# Hold out 25% of the ORIGINAL rows first, then oversample only the training
# part.  Splitting after SMOTE would let synthetic points interpolated from
# held-out rows into the training set, and would fill the held-out set with
# synthetic rows, so held-out scores would be optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.25)
_smote_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_train, y_train = _smote_resample(X_train, y_train)

# Fit the scaler on the training rows only and reuse its statistics for the
# held-out rows.
ss = StandardScaler()

X_train_s = ss.fit_transform(X_train)
X_test_s = ss.transform(X_test)

In [13]:
import time

In [15]:
def output_to_kaggle(test_df, X, model, filename):
    """Write a Kaggle submission CSV of positive-class probabilities.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Test table; its index supplies the ``Id`` column.
    X : array-like
        Feature matrix handed to ``model.predict_proba``.  It must have one
        row per row of ``test_df``, in the same order.
    model : object
        Fitted estimator exposing ``predict_proba``; column 1 of its result
        is written as ``WnvPresent``.
    filename : str
        Destination path of the CSV (written with a header, without the
        frame index).

    Returns
    -------
    pandas.DataFrame
        The submission table with columns ``Id`` and ``WnvPresent``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in
        ``test_df``.  Without this check the mismatch would be padded with
        NaN and written out silently.
    """
    p_array = model.predict_proba(X)
    if len(p_array) != len(test_df):
        raise ValueError(
            'X produced {0} predictions but test_df has {1} rows'.format(len(p_array), len(test_df))
        )
    out_df = pd.DataFrame({'Id': test_df.index.values, 'WnvPresent': p_array[:, 1]},
                          columns=['Id', 'WnvPresent'])
    out_df.to_csv(filename, header=True, index=False)
    return out_df

In [102]:
# Put the current local time in the file name, then write the submission.
# NOTE(review): X_test_r and custom_logreg2 are defined in cells further down
# this notebook, so this cell cannot run in a top-to-bottom pass.
timestamp = time.strftime('%Y-%m-%d_%H-%M-%S')
outfilename = r'output/kagglesubmission' + '_' + timestamp + '.csv'
out_df = output_to_kaggle(master_test, X_test_r, custom_logreg2.model, outfilename)

### Go back to Logistic Regression

In [1]:
from sklearn.linear_model import LogisticRegression

In [18]:
# Baseline: logistic regression with the library's default settings.
logreg1 = LogisticRegression()

In [19]:
# 5-fold cross-validated ROC AUC of the baseline on X and y as built from
# master_train (no scaling or resampling applied).
scores = cross_val_score(logreg1, X, y, cv = 5, scoring='roc_auc')
scores

array([0.82875922, 0.76510885, 0.63319985, 0.48410675, 0.66323529])

In [22]:
# Average ROC AUC across the folds.
scores.mean()

0.6748819928605517

In [20]:
# Fit the baseline model on the full training table.
logreg1.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [23]:
# Standardize every feature column using statistics from the full training table.
# NOTE(review): the scaler sees all rows before cross-validation splits them.
ss = StandardScaler().fit(X)
X_s = ss.transform(X)

In [31]:
# Grid: 11 log-spaced C values x both penalties x two class-weight settings.
gs_params_logreg1 = {
    'C': np.logspace(-2, 2, 11),
    'penalty': ['l1', 'l2'],
    'class_weight': [None, 'balanced'],
}
# Pin the solver: the 'l1' candidates need a solver that supports L1, and the
# library default is no longer guaranteed to be 'liblinear' in newer releases.
logreg1 = LogisticRegression(random_state=0, solver='liblinear')
gs_logreg1 = GridSearchCV(logreg1, gs_params_logreg1, scoring='roc_auc', cv=3, verbose=1, n_jobs=2)

In [32]:
# Run the first grid search on the standardized features.
gs_logreg1.fit(X_s,y)

Fitting 3 folds for each of 44 candidates, totalling 132 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    5.6s
[Parallel(n_jobs=2)]: Done 132 out of 132 | elapsed:  1.8min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'C': array([1.00000e-02, 2.51189e-02, 6.30957e-02, 1.58489e-01, 3.98107e-01,
       1.00000e+00, 2.51189e+00, 6.30957e+00, 1.58489e+01, 3.98107e+01,
       1.00000e+02]), 'penalty': ['l1', 'l2'], 'class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [34]:
# Tabulate the first grid search, best mean test score first.
gs_logreg1_results = pd.DataFrame(gs_logreg1.cv_results_)
gs_logreg1_results = gs_logreg1_results.sort_values(by='mean_test_score', ascending=False)



In [35]:
# Display the ranked results of the first grid search.
gs_logreg1_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,param_class_weight,param_penalty,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
4,0.055301,0.002666,0.671889,0.818085,0.0251189,,l1,"{'C': 0.025118864315095794, 'class_weight': No...",1,0.750194,0.806305,0.656902,0.854965,0.608552,0.792984,0.010365,0.0004713707,0.058787,0.026639
40,6.311141,0.001,0.664762,0.849038,100.0,,l1,"{'C': 100.0, 'class_weight': None, 'penalty': ...",2,0.650878,0.83015,0.767958,0.879176,0.575422,0.837787,1.050145,0.001413549,0.079212,0.021538
32,3.578319,0.0,0.664663,0.848988,15.8489,,l1,"{'C': 15.848931924611142, 'class_weight': None...",3,0.652002,0.830121,0.766373,0.879138,0.575586,0.837705,1.069205,0.0,0.0784,0.021543
36,4.736743,0.003,0.664547,0.849028,39.8107,,l1,"{'C': 39.810717055349734, 'class_weight': None...",4,0.651197,0.830138,0.766908,0.879183,0.57551,0.837764,0.626053,5.15043e-07,0.078704,0.021548
41,0.186667,0.005208,0.664382,0.849029,100.0,,l2,"{'C': 100.0, 'class_weight': None, 'penalty': ...",5,0.650858,0.830149,0.76684,0.879168,0.57542,0.837768,0.020172,0.007365021,0.078728,0.021538
37,0.20876,0.001666,0.664101,0.848983,39.8107,,l2,"{'C': 39.810717055349734, 'class_weight': None...",6,0.651123,0.830151,0.765673,0.879056,0.575478,0.83774,0.015402,0.001246916,0.078185,0.02149
28,1.245298,0.000999,0.664049,0.848949,6.30957,,l1,"{'C': 6.309573444801936, 'class_weight': None,...",7,0.652737,0.830155,0.763651,0.879114,0.575731,0.837577,0.233209,0.0014131,0.077132,0.021544
8,0.097277,0.002999,0.663206,0.835607,0.0630957,,l1,"{'C': 0.06309573444801933, 'class_weight': Non...",8,0.68913,0.819786,0.684234,0.869147,0.616241,0.817887,0.015275,2.973602e-07,0.033264,0.023729
33,0.171863,0.0,0.663003,0.848945,15.8489,,l2,"{'C': 15.848931924611142, 'class_weight': None...",9,0.65158,0.830125,0.76193,0.87906,0.575472,0.837651,0.012758,0.0,0.076547,0.021515
24,1.356129,0.0,0.662654,0.848765,2.51189,,l1,"{'C': 2.5118864315095824, 'class_weight': None...",10,0.653439,0.830046,0.758175,0.878965,0.576321,0.837284,0.561048,0.0,0.074525,0.021558


In [40]:
# Second pass: L1 penalty only, a finer grid of 21 log-spaced C values, 10 folds.
gs_params_logreg2 = {'C': np.logspace(-2, 2, 21)}
# solver='liblinear' is stated explicitly because penalty='l1' requires an
# L1-capable solver and the library default differs between releases.
logreg2 = LogisticRegression(random_state=0, penalty='l1', class_weight=None, tol=1e-4,
                             solver='liblinear')
gs_logreg2 = GridSearchCV(logreg2, gs_params_logreg2, scoring='roc_auc', cv=10, verbose=1, n_jobs=3)

In [44]:
# Run the second (L1-only) grid search on the standardized features.
gs_logreg2.fit(X_s,y)

Fitting 10 folds for each of 21 candidates, totalling 210 fits


[Parallel(n_jobs=3)]: Done  55 tasks      | elapsed:    5.0s
[Parallel(n_jobs=3)]: Done 210 out of 210 | elapsed:  3.4min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'C': array([1.00000e-02, 1.58489e-02, 2.51189e-02, 3.98107e-02, 6.30957e-02,
       1.00000e-01, 1.58489e-01, 2.51189e-01, 3.98107e-01, 6.30957e-01,
       1.00000e+00, 1.58489e+00, 2.51189e+00, 3.98107e+00, 6.30957e+00,
       1.00000e+01, 1.58489e+01, 2.51189e+01, 3.98107e+01, 6.30957e+01,
       1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [46]:
# Tabulate the second grid search, best mean test score first.
gs_logreg2_results = pd.DataFrame(gs_logreg2.cv_results_)
gs_logreg2_results = gs_logreg2_results.sort_values(by='mean_test_score', ascending=False)



In [47]:
# Display the ranked results of the second grid search.
gs_logreg2_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_C,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,...,split7_test_score,split7_train_score,split8_test_score,split8_train_score,split9_test_score,split9_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
11,0.966879,0.002691,0.71299,0.828667,1.58489,{'C': 1.584893192461114},1,0.923019,0.81041,0.685762,...,0.792789,0.818046,0.887691,0.808086,0.641634,0.844123,0.130349,0.004449,0.184598,0.013579
10,0.757973,0.001426,0.71255,0.828348,1.0,{'C': 1.0},2,0.923126,0.810115,0.685186,...,0.794118,0.817732,0.888758,0.807668,0.641285,0.843853,0.156974,0.001215,0.186332,0.013603
12,1.291887,0.002062,0.712541,0.828882,2.51189,{'C': 2.5118864315095824},3,0.923169,0.810697,0.685549,...,0.790349,0.818265,0.886209,0.808273,0.64183,0.84444,0.092941,0.004589,0.183767,0.013571
13,1.808447,0.002362,0.712042,0.82905,3.98107,{'C': 3.981071705534973},4,0.923404,0.810904,0.685464,...,0.788954,0.818423,0.884662,0.808411,0.641895,0.844737,0.180738,0.004586,0.183008,0.013562
14,2.783927,0.001499,0.711582,0.82915,6.30957,{'C': 6.309573444801936},5,0.923681,0.811022,0.685016,...,0.788366,0.818557,0.884074,0.808475,0.641939,0.844889,0.634663,0.001283,0.182411,0.013565
15,3.807954,0.0011,0.711337,0.829205,10.0,{'C': 10.0},6,0.923766,0.811056,0.68493,...,0.788235,0.818615,0.883965,0.808511,0.642004,0.844985,0.985297,0.001135,0.182156,0.013574
16,5.30684,0.000608,0.71071,0.829231,15.8489,{'C': 15.848931924611142},7,0.923766,0.81108,0.684845,...,0.788148,0.818659,0.883856,0.808515,0.642004,0.845022,0.850599,0.001023,0.181847,0.013577
9,0.619501,0.001332,0.710463,0.827824,0.630957,{'C': 0.6309573444801934},8,0.922827,0.809615,0.681858,...,0.794466,0.81701,0.888802,0.807094,0.639695,0.843293,0.175352,0.001154,0.188195,0.013678
17,7.084145,0.001599,0.710428,0.829246,25.1189,{'C': 25.11886431509582},9,0.923766,0.811097,0.684738,...,0.788105,0.818674,0.883834,0.808509,0.642048,0.845041,1.405906,0.00128,0.181726,0.01358
18,8.622489,0.001299,0.710355,0.829255,39.8107,{'C': 39.810717055349734},10,0.923788,0.811102,0.684738,...,0.788083,0.818685,0.883834,0.808512,0.642026,0.845051,1.041156,0.001099,0.181687,0.013581


In [48]:
# Final model: L1-penalized logistic regression with C=1, fitted on every
# standardized training row.  solver='liblinear' is stated explicitly because
# penalty='l1' requires an L1-capable solver and the library default differs
# between releases.
logreg_final = LogisticRegression(random_state=0, penalty='l1', class_weight=None, tol=1e-4, C=1,
                                  solver='liblinear')
logreg_final.fit(X_s,y)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [93]:
# 5-fold cross-validated ROC AUC of the final model on the standardized features.
scores=cross_val_score(logreg_final, X_s, y, cv = 5, scoring = 'roc_auc')
scores

array([0.80095345, 0.83621596, 0.58896608, 0.46476035, 0.65029412])

In [95]:
# Summarize the fold scores: mean and population standard deviation.
print('Mean of ROC AUC scores is {0:.3f}'.format(scores.mean()))
print('Std. dev. of ROC AUC scores is {0:.3f}'.format(scores.std()))

Mean of ROC AUC scores is 0.668
Std. dev. of ROC AUC scores is 0.137


In [65]:
# Pair each feature name with its fitted coefficient, most negative first.
# The names are taken from `features`, the same list X was built from, so
# they cannot drift out of step with the coefficient order the way a
# hand-copied list could.
final_coef_df = pd.DataFrame({'feature': list(features),
                              'coef': logreg_final.coef_[0]}).sort_values(by='coef')

In [66]:
# Display the coefficient table of the final model.
final_coef_df

Unnamed: 0,coef,feature
8,-1.194248,daylight
6,-0.671658,neighborhood_infection_category_none
4,-0.365203,neighborhood_infection_category_low
16,-0.256499,month_6
14,-0.252825,timelaggedprecipitation
15,-0.188598,timelaggedwindspeed
10,-0.18608,avg_preciptotal
20,-0.142273,month_10
7,-0.073265,park_score
5,0.0,neighborhood_infection_category_medium


The p-value wrapper class below is adapted from https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d

In [67]:
from sklearn import linear_model
import numpy as np
import scipy.stats as stat

class CustomLogisticRegression:
    """
    Wrapper Class for Logistic Regression which has the usual sklearn instance
    in an attribute self.model, and pvalues, z scores and estimated
    errors for each coefficient in

    self.z_scores
    self.p_values
    self.sigma_estimates

    as well as the negative hessian of the log Likelihood (Fisher information)

    self.F_ij

    NOTE(review): the information matrix is built from the feature columns
    only (no intercept column) and ignores any regularization penalty, so the
    reported errors and p-values are approximations.
    """

    def __init__(self,*args,**kwargs):
        # All arguments are forwarded unchanged to the wrapped estimator.
        self.model = linear_model.LogisticRegression(*args,**kwargs)

    def fit(self,X,y):
        """Fit the wrapped model, then derive per-coefficient statistics.

        Parameters
        ----------
        X : array-like, one row per sample and one column per feature.
        y : array-like of class labels, passed straight to the model.

        Returns
        -------
        self, so that calls can be chained.

        Raises
        ------
        numpy.linalg.LinAlgError
            If the information matrix is singular and cannot be inverted.
        """
        self.model.fit(X,y)
        #### Get p-values for the fitted model ####
        # Do the matrix arithmetic on a plain float array so the result does
        # not depend on the container type X was passed in as.
        X_arr = np.asarray(X, dtype=float)
        # 2*(1+cosh(z)) equals 1/(p*(1-p)) for p = sigmoid(z), so dividing by
        # it weights each row by its predicted variance.
        denom = 2.0*(1.0+np.cosh(self.model.decision_function(X)))
        F_ij = np.dot((X_arr/denom[:,None]).T,X_arr) ## Fisher Information Matrix
        Cramer_Rao = np.linalg.inv(F_ij) ## Inverse Information Matrix
        sigma_estimates = np.sqrt(np.diag(Cramer_Rao)) # sigma for each coefficient
        z_scores = self.model.coef_[0]/sigma_estimates # z-score for each model coefficient
        p_values = list(stat.norm.sf(np.abs(z_scores))*2) ### two tailed test for p-values

        self.z_scores = z_scores
        self.p_values = p_values
        self.sigma_estimates = sigma_estimates
        self.F_ij = F_ij
        return self

In [68]:
# Same settings as the final model, wrapped so that p-values are computed on fit.
# solver='liblinear' is stated explicitly because penalty='l1' requires an
# L1-capable solver and the library default differs between releases.
custom_logreg1 = CustomLogisticRegression(random_state=0, penalty='l1', class_weight=None, tol=1e-4, C=1,
                                          solver='liblinear')

In [69]:
# Fit the wrapped model on the standardized full feature set.
custom_logreg1.fit(X_s,y)

In [74]:
# Show the fitted coefficients of the wrapped model.
custom_logreg1.model.coef_

array([[ 1.19613592,  1.31513737,  0.836149  ,  0.21789752, -0.36520304,
         0.        , -0.67165834, -0.07326503, -1.19424824,  0.19641165,
        -0.18607964,  0.3293212 ,  0.72511484,  0.20709738, -0.25282495,
        -0.18859848, -0.25649878,  0.        ,  0.54593396,  0.18073102,
        -0.1422735 ]])

In [75]:
# One row per feature: fitted coefficient and its approximate p-value,
# ordered by coefficient.  The names are taken from `features`, the same list
# X was built from, so they cannot drift out of step with the coefficient
# order the way a hand-copied list could.
custom_coef_df = pd.DataFrame({'feature': list(features),
                               'coef': custom_logreg1.model.coef_[0],
                               'p': custom_logreg1.p_values}).sort_values(by='coef')

In [77]:
# Display the coefficient table ordered by p-value, smallest first.
custom_coef_df.sort_values(by='p')

Unnamed: 0,coef,feature,p
8,-1.194248,daylight,4.646658e-12
11,0.329321,avg_avgspeed,5.550898e-07
14,-0.252825,timelaggedprecipitation,9.175197e-06
12,0.725115,timelaggeddaylight,0.0001184212
0,1.196136,species_culex_pipiens,0.0007370398
1,1.315137,species_culex_pipiens_restuans,0.001683634
15,-0.188598,timelaggedwindspeed,0.009425257
10,-0.18608,avg_preciptotal,0.01736614
13,0.207097,timelaggedtemperature,0.0214929
9,0.196412,avg_tavg,0.0244017


In [84]:
# Reduced feature set: the full list without the neighborhood-infection and
# month indicator columns.
reduced_features = [
    'species_culex_pipiens',
    'species_culex_pipiens_restuans',
    'species_culex_restuans',
    'park_score',
    'daylight',
    'avg_tavg',
    'avg_preciptotal',
    'avg_avgspeed',
    'timelaggeddaylight',
    'timelaggedtemperature',
    'timelaggedprecipitation',
    'timelaggedwindspeed',
]
X_r = master_train[reduced_features]

# Standardize the reduced training features.  This rebinds `ss`, replacing
# the scaler that was fitted on the full feature set.
ss = StandardScaler().fit(X_r)
X_r_s = ss.transform(X_r)

In [101]:
# The model that scores these rows is trained on standardized features, so
# the test features must go through the same scaler (`ss`, fitted on the
# reduced training features) instead of being passed in as raw values.
X_test_r = ss.transform(master_test[reduced_features])

In [85]:
# Same model settings, refitted on the standardized reduced feature set.
# solver='liblinear' is stated explicitly because penalty='l1' requires an
# L1-capable solver and the library default differs between releases.
custom_logreg2 = CustomLogisticRegression(random_state=0, penalty='l1', class_weight=None, tol=1e-4, C=1,
                                          solver='liblinear')
custom_logreg2.fit(X_r_s,y)

In [87]:
# One row per reduced feature: fitted coefficient and its approximate p-value.
# The names are taken from `reduced_features`, the same list X_r was built
# from, so they cannot drift out of step with the coefficient order the way a
# hand-copied list could.
custom_coef_df = pd.DataFrame({'feature': list(reduced_features),
                               'coef': custom_logreg2.model.coef_[0],
                               'p': custom_logreg2.p_values})

In [99]:
# Display the reduced-model coefficient table ordered by p-value, smallest first.
custom_coef_df.sort_values(by='p')

Unnamed: 0,coef,feature,p
4,-1.562928,daylight,7.286135999999999e-44
8,1.23973,timelaggeddaylight,6.465939000000001e-31
10,-0.315213,timelaggedprecipitation,2.250784e-09
7,0.271071,avg_avgspeed,5.711878e-06
11,-0.290134,timelaggedwindspeed,3.448055e-05
0,1.191021,species_culex_pipiens,0.0005957049
5,0.259545,avg_tavg,0.001254877
1,1.274247,species_culex_pipiens_restuans,0.001882032
6,-0.209992,avg_preciptotal,0.003273281
2,0.774678,species_culex_restuans,0.03916881


In [97]:
# 5-fold cross-validated ROC AUC of the reduced model's underlying estimator.
scores = cross_val_score(custom_logreg2.model, X_r_s, y, cv=5, scoring='roc_auc')

In [98]:
# Summarize the fold scores: mean and population standard deviation.
print('Mean of ROC AUC scores is {0:.3f}'.format(scores.mean()))
print('Std. dev. of ROC AUC scores is {0:.3f}'.format(scores.std()))

Mean of ROC AUC scores is 0.654
Std. dev. of ROC AUC scores is 0.151
