# Part 4c: Model to Predict Period 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import warnings
import os
from category_encoders import TargetEncoder 
import seaborn as sns
warnings.filterwarnings('ignore')
%matplotlib inline

from scipy import stats
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier

from skopt import BayesSearchCV
import pickle

In [2]:
#reading in file 
booker=pd.read_csv('Datasets/eda_booker_report.csv')
booker=booker.iloc[:,1:]
booker

Unnamed: 0,ACCAP,AGE,GGDUM,BOOKER2,AROFFAP,CIRCDIST,MITDUM,MONCIRC,ONSEX,NEWCIT,...,SAFEVALVE,SENTIMP,OTCHPTS,WEAPON,XCRHISSR,LMIN,FY,USSCIDN,ENSPLT0,GDL
0,0,28,0,2,0,80.0,0,10.0,0,1,...,2,1,5,0,3,10,2005,890751.0,8.0,0
1,0,24,0,2,0,80.0,0,10.0,0,0,...,2,1,3,0,2,168,2005,890752.0,72.0,1
2,0,32,0,0,0,33.0,0,5.0,1,0,...,0,1,3,0,2,12,2005,890755.0,15.0,2
3,0,24,1,0,0,33.0,0,5.0,0,0,...,0,1,10,1,5,228,2005,890756.0,270.0,2
4,0,43,0,0,0,93.0,0,11.0,0,1,...,2,1,9,0,4,57,2005,890757.0,60.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,0,40,0,2,0,47.0,0,7.0,1,1,...,2,4,0,0,1,6,2004,759721.0,0.0,4
799996,0,41,0,0,0,47.0,0,7.0,0,1,...,2,1,0,0,1,24,2004,759722.0,24.0,51
799997,0,23,0,0,0,47.0,0,7.0,0,0,...,2,3,1,0,1,6,2004,759723.0,6.0,8
799998,0,30,0,0,0,47.0,0,7.0,0,1,...,2,1,8,0,4,57,2004,759724.0,61.0,0


In [3]:
booker.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 28 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   ACCAP      800000 non-null  int64  
 1   AGE        800000 non-null  int64  
 2   GGDUM      800000 non-null  int64  
 3   BOOKER2    800000 non-null  int64  
 4   AROFFAP    800000 non-null  int64  
 5   CIRCDIST   800000 non-null  float64
 6   MITDUM     800000 non-null  int64  
 7   MONCIRC    800000 non-null  float64
 8   ONSEX      800000 non-null  int64  
 9   NEWCIT     800000 non-null  int64  
 10  NEWCNVTN   800000 non-null  int64  
 11  EWEDUC     800000 non-null  int64  
 12  NEWRACE    800000 non-null  int64  
 13  OFFTYPE2   800000 non-null  int64  
 14  ERIOD      800000 non-null  int64  
 15  PRIMARY    800000 non-null  int64  
 16  QUARTER    800000 non-null  float64
 17  A          800000 non-null  int64  
 18  SAFEVALVE  800000 non-null  int64  
 19  SENTIMP    800000 non-n

In [4]:
# defining X and y and splitting into traintest sets
y= booker['ERIOD']
X = booker.drop(columns = 'ERIOD')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

In [5]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Logistic Regression

In [6]:
lg = LogisticRegression(random_state = 42)
lg.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [7]:
lg.score(X_train,y_train)

0.9769910714285714

In [8]:
cross_val_score(lg, X_train, y_train, cv=5).mean()

0.9772464285714285

In [9]:
y_pred = lg.predict(X_test)


In [10]:
lg.score(X_test,y_test)

0.9772208333333333

## Random Forest Classifier  

In [11]:
# instantiate and fit model
rf = RandomForestClassifier(n_estimators=200, 
                            max_depth=None, 
                            max_features='log2', 
                            min_samples_split=50)
rf.fit(X_train, y_train)
# scoring train model data
rf.score(X_train, y_train)

0.9928535714285714

In [12]:
cross_val_score(rf,X_train,y_train,cv=5).mean()

0.9896839285714286

In [13]:
rf.score(X_test,y_test)

0.9899375

## GridSearchCV using Logistic Regression

In [34]:
parameters = [{'penalty':['l1','l2']}, 
              {'C':[1, 10, 100, 1000]}]
gs = GridSearchCV(estimator = lg,  
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           verbose=0)


gs.fit(X_train, y_train) 

GridSearchCV(cv=5, estimator=LogisticRegression(random_state=42),
             param_grid=[{'penalty': ['l1', 'l2']}, {'C': [1, 10, 100, 1000]}],
             scoring='accuracy')

In [35]:
gs.score(X_train,y_train)

0.9769910714285714

In [36]:
cross_val_score(gs, X_train, y_train, cv=5).mean()

0.9772464285714285

In [37]:
y_pred = gs.predict(X_test)

In [38]:
gs.score(X_test,y_test)

0.9772208333333333

## BayesSearchCV using Logistic Regression

In [39]:
lg = LogisticRegression(random_state =42)


pipe_lg_params = {
    'C': np.logspace(-4, 4, 50), 
    'max_iter': [10_000, 1_000_000]
}   

bs_lg=BayesSearchCV(
    estimator=lg,
    search_spaces=pipe_lg_params,
    n_iter=50, 
    verbose=1,
    cv=5,
    n_jobs=-1
)

In [40]:
bs_lg.fit(X_train,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

BayesSearchCV(cv=5, estimator=LogisticRegression(random_state=42), n_jobs=-1,
              search_spaces={'C': array([1.00000000e-04, 1.45634848e-04, 2.12095089e-04, 3.08884360e-04,
       4.49843267e-04, 6.55128557e-04, 9.54095476e-04, 1.38949549e-03,
       2.02358965e-03, 2.94705170e-03, 4.29193426e-03, 6.25055193e-03,
       9.10298178e-03, 1.32571137e-02, 1.93069773e-02, 2.81176870e-02,
       4.0949150...
       3.72759372e+00, 5.42867544e+00, 7.90604321e+00, 1.15139540e+01,
       1.67683294e+01, 2.44205309e+01, 3.55648031e+01, 5.17947468e+01,
       7.54312006e+01, 1.09854114e+02, 1.59985872e+02, 2.32995181e+02,
       3.39322177e+02, 4.94171336e+02, 7.19685673e+02, 1.04811313e+03,
       1.52641797e+03, 2.22299648e+03, 3.23745754e+03, 4.71486636e+03,
       6.86648845e+03, 1.00000000e+04]),
                             'max_iter': [10000, 1000000]},
              verbose=1)

In [41]:
bs_lg.score(X_train,y_train)

0.9782035714285714

In [42]:
cross_val_score(bs_lg, X_train, y_train, cv=5).mean()

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

0.9779642857142857

In [43]:
y_pred = bs_lg.predict(X_test)

In [44]:
bs_lg.score(X_test,y_test)

0.9783083333333333

In [45]:
# with open('eriod_bs_lg', 'wb') as pickle_out:
    # pickle.dump(bs_lg, pickle_out)

In [46]:
# with open('eriod_gs_lg', 'wb') as pickle_out:
    # pickle.dump(gs, pickle_out)

In [47]:
# with open('eriod_rf', 'wb') as pickle_out:
    # pickle.dump(rf, pickle_out)