# 4 Training and Modeling Data<a id='4_Training_and_Modeling_Data'></a>

## Contents <a id ="Content" > </a>

* [Introduction](#Introduction)
* [Imports](#Imports)
* [Train Test Split](#Train_Test_Split) 
* [Training and Modeling](#Training_and_Modeling)
    * [Model Selection](#Model_Selection)
    * [Evaluation Metrics](#Evaluation-Metrics)
        * [Training and Modeling](#Train_and_Model)
        * [Hyperparameter Tuning and Model Training](#Hyperparameter_Tuning_Training)
            * [Logistic Regression](#Logistic_Regression)
            * [Evaluation](#Evaluation)
* [Additional Models](#AdditionalModels)
* [Summary](#Summary)
* [Recommendations](#Recom)

## Introduction <a id = 'Introduction'></a>

## Imports <a id="Imports"></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn import metrics
from sklearn import pipeline
from sklearn import model_selection
from sklearn import linear_model 

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from sklearn.model_selection import StratifiedShuffleSplit,GridSearchCV
import matplotlib.pyplot as plt

# remove warning
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load the train/test feature and target tables from CSV.
# The paths are relative ("../data/..."), so they resolve against the current
# working directory; pd.read_csv raises FileNotFoundError if the files are not
# there.
# NOTE(review): presumably these files are written by an earlier notebook in
# the series - confirm and run that step first.
X_train = pd.read_csv("../data/4.X_train.csv")
y_train = pd.read_csv("../data/4.y_train.csv")
X_test = pd.read_csv("../data/4.X_test.csv")
y_test = pd.read_csv("../data/4.y_test.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../data/4.X_train.csv'

In [None]:
type(X_train)

In [None]:
pd.__version__

In [None]:
X_train.shape,y_train.shape

In [None]:
X_test.shape,y_test.shape

In [None]:
X_train.describe().T

In [None]:
X_test.describe().T

## Training and Modeling <a id=Training_and_Modeling ></a>

### Model Selection <a id=Model_Selection ></a>

Four competing supervised classification models/algorithms are considered, namely, 
* Logistic Regression classification

### Training and Modeling <a id=Train_and_Model></a>

### Fit Model on Intercept

In [None]:
X_train = X_train.to_numpy()
y_train = y_train.to_numpy().ravel()

In [None]:
X_test = X_test.to_numpy()
y_test = y_test.to_numpy().ravel()

In [None]:
# Named pipeline step for the baseline classifier.  The model's own intercept
# is switched off because the design matrix it is fitted on is a single
# column of ones.
logistic_regression = (
    "model",
    LogisticRegression(fit_intercept=False, max_iter=500, random_state=632966),
)

# 30 log-spaced candidate values for C, from 1e-4 to 1e4.
model_params = {"model__C": np.logspace(start=-4, stop=4, num=30)}

model_pipeline = Pipeline(steps=[logistic_regression])

# 200 stratified random splits, 80% of the rows used for fitting in each.
cross_validator = StratifiedShuffleSplit(
    n_splits=200,
    train_size=0.8,
    random_state=1337,
)

# Grid search over C scored by ROC AUC; the best setting is refit afterwards.
model_grid = GridSearchCV(
    estimator=model_pipeline,
    param_grid=model_params,
    scoring="roc_auc",
    cv=cross_validator,
    refit=True,
)

In [None]:
%%time
# fit model on intercept (random guesses - baseline performance)
null_mod = model_grid.fit(np.ones(shape=X_train.shape[0]).reshape(-1,1), y_train)

In [None]:
print("Best model parameters - null model ")
print("Cost parameter: {:.03f}".format(null_mod.best_params_["model__C"])) 
print("Best score {:0.3f}".format(null_mod.best_score_))

In [None]:
 np.mean(y_train), np.var(y_train)

In [None]:
null_reg = linear_model.LogisticRegression(fit_intercept= False,max_iter=500,random_state=632966)

In [None]:
X_train_null = np.ones(shape=y_train.shape[0]).reshape(-1,1)
X_test_null = np.ones(shape=y_test.shape[0]).reshape(-1,1)

In [None]:
# Fit the intercept-only baseline on the training data and, separately, on the
# test data (no tuning).
# `fit` returns the estimator itself, so fitting the same object twice would
# leave both names bound to one model whose coefficients come from the *test*
# set.  Fitting a clone for the test data keeps the two models independent and
# keeps `null_train` trained on the training data only.
from sklearn.base import clone

null_train = null_reg.fit(X=X_train_null, y=y_train)
null_test = clone(null_reg).fit(X=X_test_null, y=y_test)

In [None]:
#predict_proba and pred_prob returns two columns, second one is target
train_pred = null_train.predict_proba(X_train_null)
test_pred = null_train.predict_proba(X_test_null)

In [None]:
# Baseline metrics: minimum performance any real model has to beat.
# The positive-class probability (second column) is thresholded at 0.5 once
# and reused.  Every metric is printed explicitly; as bare expressions only
# the last one would be shown and the others silently discarded.
train_labels = np.where(train_pred[:, 1] > 0.5, 1, 0)

print("Precision: {:.3f}".format(
    metrics.precision_score(y_true=y_train, y_pred=train_labels)))
print("Recall: {:.3f}".format(
    metrics.recall_score(y_true=y_train, y_pred=train_labels)))
print("F1: {:.3f}".format(
    metrics.f1_score(y_true=y_train, y_pred=train_labels)))

In [None]:
#check to see if close to population value
#print(f"Observed: {y_train.mean():.3f} Fit: {train_pred[:,1].mean(): .3f}")
print("Observed: {:.3f}, Fit: {:.3f}".format(y_train.mean(),train_pred[:,1].mean()))

In [None]:
model_params = {"model__C": (np.logspace(start=-4,stop=4,num=30))}

In [None]:
#pipe line has just model object
model_pipeline = pipeline.Pipeline(steps=[("model",null_reg)])

In [None]:
# set up cross-validator:
# stratified random splits; 80% of the training rows are used for fitting in
# each split (test data still held out)
# 50 iterations (n_splits=50)
# feel free to use another CV method and explain why
cross_validator = (
  model_selection.StratifiedShuffleSplit(train_size=0.8,
                                          random_state=1337,
                                          n_splits=50))

In [None]:
# set up grid search
model_grid = (
  model_selection.GridSearchCV(estimator=model_pipeline,
                           param_grid=model_params,
                           refit=True, # refit using best estimates
                           scoring="roc_auc", # metric to optimize (can pick another)
                           cv=cross_validator,
                           n_jobs = -1))

In [None]:
# fit model on intercept (random guesses - baseline performance)
null_mod = model_grid.fit(X_train_null,y_train)

In [None]:
# Intercept-only tuning setup, rebuilt in a single cell.

# 30 log-spaced candidate values for C, from 1e-4 to 1e4.
model_params = {"model__C": np.logspace(start=-4, stop=4, num=30)}

# Single-step pipeline containing just the classifier.
logistic_regression = (
    "model",
    LogisticRegression(fit_intercept=False, max_iter=500, random_state=632966),
)
model_pipeline = Pipeline(steps=[logistic_regression])

# 50 stratified random splits, 80% of the rows used for fitting in each.
cross_validator = model_selection.StratifiedShuffleSplit(
    n_splits=50,
    train_size=0.8,
    random_state=1337,
)

# set up grid search
model_grid = model_selection.GridSearchCV(
    estimator=model_pipeline,
    param_grid=model_params,
    scoring="roc_auc",  # metric to optimize (can pick another)
    cv=cross_validator,
    refit=True,  # refit using best estimates
    n_jobs=-1,
)

In [None]:
# fit model on intercept (random guesses - baseline performance)
null_mod = model_grid.fit(X_train_null,y_train)

In [None]:
# Collect one ROC AUC score per split of `cross_validator` for the baseline.
# NOTE(review): `null_mod` is the object returned by `model_grid.fit`, i.e. the
# grid search itself, so each outer split presumably re-runs the whole inner
# grid search (nested cross-validation) - confirm that this cost is intended,
# otherwise pass the tuned `best_estimator_` instead.
cv_scores = (
  model_selection.cross_val_score(null_mod,
                                  X_train_null,
                                  y_train,
                                  scoring="roc_auc",
                                  cv=cross_validator,
                                  n_jobs=-1))

In [None]:
cv_quantiles = np.quantile(a=cv_scores,q=[0.025,0.975])

In [None]:
print("Observed: {:.3f}".format(y_train.mean()))

In [None]:
print(f" 2.5%:{cv_quantiles[0]:.3f},97.5%:{cv_quantiles[1]:.3f}")

In [None]:
def threshhold_and_plot(y_test, target, n_points=50, plot=True):
    """Compute precision, recall and F1 over a grid of probability cut-offs.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    target : array-like
        Scores compared against each threshold; a score strictly greater
        than the threshold is labelled 1, anything else 0.
    n_points : int, default 50
        Number of evenly spaced thresholds between 0 and 1 (both included).
    plot : bool, default True
        When true, draw the three curves with matplotlib.

    Returns
    -------
    tuple of list
        ``(precision, recall, f1)``, each holding one value per threshold.
    """
    probs = np.linspace(0, 1, n_points)
    # Coerce once so plain Python lists can be compared element-wise.
    scores = np.asarray(target)

    precision, recall, f1 = [], [], []
    for thresh in probs:
        # Threshold once per cut-off and reuse the labels for all metrics.
        predicted = np.where(scores > thresh, 1, 0)
        precision.append(metrics.precision_score(y_test, predicted))
        recall.append(metrics.recall_score(y_test, predicted))
        f1.append(metrics.f1_score(y_test, predicted))

    if plot:
        plt.plot(probs, precision, label='precision')
        plt.plot(probs, recall, label='recall')
        plt.plot(probs, f1, label='f1')
        plt.title("Metrics at different threshold")
        plt.xlabel("Probability")
        plt.legend()

    return precision, recall, f1
    
def max_thresh(score_vec):
    """Return the position and value of the largest entry in ``score_vec``.

    Parameters
    ----------
    score_vec : sequence or 1-D array of numbers
        Scores to search, e.g. one metric value per threshold.

    Returns
    -------
    tuple
        ``(max_idx, max_score)`` where ``max_idx`` is the index of the first
        maximum as found by ``np.argmax`` and ``max_score`` is the entry at
        that index.

    Raises
    ------
    ValueError
        If ``score_vec`` is empty (raised by ``np.argmax``).
    """
    max_idx = np.argmax(score_vec)
    # Index the sequence; calling it as a function raises TypeError.
    max_score = score_vec[max_idx]
    return max_idx, max_score

In [None]:
import seaborn as sns
sns.histplot(cv_scores, bins=10)

In [None]:
# The function expects (true labels, predicted scores).  Pass the held-out
# labels and the predicted positive-class probabilities (second column of
# predict_proba) rather than the all-ones feature matrix and the labels.
threshhold_and_plot(y_test, test_pred[:, 1])

In [None]:
# check to see if close to population value
print(f"Observed: {y_train.mean():.3f} Fit: {train_pred[:,1].mean():.3f}")

## Refining The Linear Model

## Additional Models: <a id=AdditionalModels></a>

## Summary <a id =Summary> </a>

## Recommendations <a id = Recom></a>