# 4 Training and Modeling Data<a id='4_Training_and_Modeling_Data'></a>

## Contents <a id ="Content" > </a>

* [Introduction](#Introduction)
* [Imports](#Imports)
* [Train Test Split](#Train_Test_Split) 
* [Training and Modeling](#Training_and_Modeling)
    * [Model Selection](#Model_Selection)
    * [Evaluation Metrics](#Evaluation-Metrics)
        * [Training and Modeling](#Train_and_Model)
        * [Hyperparameter Tuning and Model Training](#Hyperparameter_Tuning_Training)
            * [Logistic Regression](#Logistic_Regression)
            * [Evaluation](#Evaluation)
* [Additional Models](#AdditionalModels)
* [Summary](#Summary)
* [Recommendations](#Recom)

## Introduction <a id = 'Introduction'></a>

## Imports <a id="Imports"></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn import pipeline
from sklearn import model_selection
from sklearn import linear_model 

#from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

#from sklearn.model_selection import StratifiedShuffleSplit,GridSearchCV
import matplotlib.pyplot as plt

# remove warning
#import warnings
#warnings.filterwarnings("ignore")


In [2]:
# Read the pre-split tables in a fixed order:
# train features, train target, test features, test target.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/4.X_train.csv",
        "../data/4.y_train.csv",
        "../data/4.X_test.csv",
        "../data/4.y_test.csv",
    )
)

In [3]:
pd.__version__

'1.2.4'

In [4]:
X_train.shape,y_train.shape

((7088, 16), (7088, 1))

In [5]:
X_test.shape,y_test.shape

((3039, 16), (3039, 1))

In [6]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Customer_Age,7088.0,-3.454018e-16,1.000071,-2.531322,-0.661337,-0.038008,0.709986,3.327966
Credit_Limit,7088.0,8.597793,0.930669,7.271217,7.841395,8.418587,9.298671,10.449178
Total_Revolving_Bal,7088.0,0.9103482,0.638103,0.0,0.248822,1.0,1.396112,1.977219
Total_Trans_Amt,7088.0,8.167921,0.656979,6.390241,7.674617,8.267449,8.468633,9.824661
Avg_Utilization_Ratio,7088.0,0.275307,0.276635,0.0,0.023,0.175,0.502,0.999
Gender_Encoded,7088.0,0.5328725,0.498953,0.0,0.0,1.0,1.0,1.0
Dependent_count,7088.0,2.344949,1.299597,0.0,1.0,2.0,3.0,5.0
Education_Level_sorted,7088.0,2.839306,1.215114,1.0,2.0,3.0,3.0,6.0
Income_Category_sorted,7088.0,2.175226,1.334986,1.0,1.0,2.0,3.0,5.0
Card_Category_sorted,7088.0,1.081546,0.330233,1.0,1.0,1.0,1.0,4.0


In [7]:
X_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Customer_Age,3039.0,1.760867e-16,1.000165,-2.545475,-0.671517,-0.046864,0.702719,2.701607
Credit_Limit,3039.0,8.616519,0.940983,7.271217,7.854381,8.432724,9.328701,10.449178
Total_Revolving_Bal,3039.0,0.9134881,0.640117,0.0,0.349961,1.0,1.401403,1.961808
Total_Trans_Amt,3039.0,8.157574,0.648578,6.234411,7.682021,8.271293,8.453401,9.797849
Avg_Utilization_Ratio,3039.0,0.2739293,0.273522,0.0,0.0225,0.177,0.5035,0.994
Gender_Encoded,3039.0,0.5202369,0.499673,0.0,0.0,1.0,1.0,1.0
Dependent_count,3039.0,2.349128,1.297511,0.0,1.0,2.0,3.0,5.0
Education_Level_sorted,3039.0,2.851925,1.201185,1.0,2.0,3.0,3.0,6.0
Income_Category_sorted,3039.0,2.199079,1.332057,1.0,1.0,2.0,3.0,5.0
Card_Category_sorted,3039.0,1.088516,0.341929,1.0,1.0,1.0,1.0,4.0


## Training and Modeling <a id=Training_and_Modeling ></a>

### Model Selection <a id=Model_Selection ></a>

### Training and Modeling <a id=Train_and_Model></a>

### Fit Model on Intercept (naive model)

In [8]:
# Convert the training tables to plain arrays; the target becomes a 1-D vector.
# np.asarray leaves an existing ndarray untouched, so this cell can be re-run
# safely (a second .to_numpy() call would fail once the names hold arrays).
X_train = np.asarray(X_train)
y_train = np.asarray(y_train).ravel()

In [9]:
# Convert the test tables to plain arrays; the target becomes a 1-D vector.
# np.asarray leaves an existing ndarray untouched, so this cell can be re-run
# safely (a second .to_numpy() call would fail once the names hold arrays).
X_test = np.asarray(X_test)
y_test = np.asarray(y_test).ravel()

In [10]:
def threshhold_and_plot(y_test,target,n_points=50,plot=True):
    """Sweep decision thresholds over `target` and score each against `y_test`.

    For each of `n_points` evenly spaced thresholds in [0, 1], `target` is
    turned into 0/1 predictions (1 where ``target > threshold``) and
    precision, recall and F1 are computed against `y_test`.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    target : array-like
        Scores to threshold (e.g. the positive-class column of predict_proba).
    n_points : int, default 50
        Number of thresholds between 0 and 1 (inclusive).
    plot : bool, default True
        When true, draw the three metric curves against the threshold.

    Returns
    -------
    (precision, recall, f1) : tuple of lists
        One value per threshold, in threshold order.
    """
    probs = np.linspace(0, 1, n_points)
    scores = np.asarray(target)

    precision, recall, f1 = [], [], []
    for thresh in probs:
        # Build the thresholded predictions once and reuse them for all three metrics.
        y_pred = np.where(scores > thresh, 1, 0)
        precision.append(metrics.precision_score(y_test, y_pred, zero_division=1))
        # No `labels=` argument: the values in `target` are scores, not class
        # labels, and the default binary averaging only scores the positive class.
        recall.append(metrics.recall_score(y_test, y_pred))
        f1.append(metrics.f1_score(y_test, y_pred, zero_division=1))

    if plot:
        plt.plot(probs, precision, label='precision')
        plt.plot(probs, recall, label='recall')
        plt.plot(probs, f1, label='f1')
        plt.title("Metrics at different threshold")
        plt.xlabel("Probability")
        plt.legend()

    return precision, recall, f1
    
def max_thresh(score_vec):
    """Return ``(index, value)`` for the first maximum of `score_vec`.

    Note that the first element is the position in `score_vec`, not a
    threshold value.
    """
    best_pos = np.argmax(score_vec)
    return best_pos, score_vec[best_pos]

In [11]:
#plot roc

def plot_roc(actuals,preds):
    """Print ROC diagnostics for `preds` against `actuals` and show the ROC curve.

    Prints the false-positive rates, true-positive rates and thresholds from
    ``metrics.roc_curve``, the area under the curve, and the threshold at which
    TPR - FPR is largest. Then draws the curve with a diagonal reference line.
    Returns None.
    """
    fpr, tpr, cutoffs = metrics.roc_curve(actuals, preds)
    print("FPRs:{}\n TPRs:{}\n thresholds:{}".format(fpr, tpr, cutoffs))
    print("Area under ROC:{}".format(metrics.roc_auc_score(actuals, preds)))

    # Cut-off where the gap between TPR and FPR is widest.
    best_pos = np.argmax(tpr - fpr)
    print("Threshold value is:", cutoffs[best_pos])

    # ROC curve plus the diagonal reference line.
    plt.plot(fpr, tpr, color='red', lw=2, label='ROC curve')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label="Mean")
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title("ROC curve")
    plt.legend(loc="lower right")
    plt.show()

In [12]:
import itertools
# plot confusion matrix
def plot_confusion_matrx(confm):
    """Draw the 2x2 confusion matrix `confm` as an annotated heat map.

    Rows are labelled as the true class and columns as the predicted class.
    Each cell shows its integer count, in white on dark cells and black on
    light ones.

    Parameters
    ----------
    confm : 2-D integer array
        Confusion matrix, e.g. from ``metrics.confusion_matrix``.
    """
    classes = [0, 1]
    plt.imshow(confm, interpolation='nearest', cmap=plt.cm.Greens)
    plt.title("Confusion Matrix")
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    fmt = 'd'
    # The contrast cut-off must come from the matrix being drawn. Reading a
    # notebook-level variable here fails on a fresh kernel and picks the wrong
    # text colour for any other matrix.
    thresh = confm.max() / 2.
    for i, j in itertools.product(range(confm.shape[0]), range(confm.shape[1])):
        plt.text(j, i, format(confm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if confm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [13]:
# Estimator for the naive baseline: no intercept term of its own, because the
# design matrix it is fitted on is a single column of ones.
null_reg = linear_model.LogisticRegression(
    fit_intercept=False,
    max_iter=500,
    random_state=632966,
)

In [14]:
# Intercept-only design matrices: one column of ones, one row per observation.
X_train_null = np.ones((y_train.shape[0], 1))
X_test_null = np.ones((y_test.shape[0], 1))

In [15]:
# Fit the intercept-only model on the training data only (no tuning).
# fit() returns the estimator itself, so fitting the same object on the test
# set afterwards would overwrite the training fit, and every later "train"
# prediction would come from a model that had seen the test labels.
null_train = null_reg.fit(X=X_train_null, y=y_train)
# Name kept for compatibility: the same train-fitted estimator, used to score test data.
null_test = null_train

In [16]:
# predict_proba returns one column per class; column 1 is the target class.
train_pred, test_pred = (
    null_train.predict_proba(design)
    for design in (X_train_null, X_test_null)
)

In [17]:
# baseline metrics: minimum performance any real model should beat
metrics.precision_score(y_true= y_train,y_pred= np.where(train_pred[:,1]>0.5,1,0),zero_division=1)

1.0

In [18]:
metrics.recall_score(y_true= y_train,y_pred= np.where(train_pred[:,1]>0.5,1,0))

0.0

In [19]:
metrics.f1_score(y_true= y_train,y_pred= np.where(train_pred[:,1]>0.5,1,0),zero_division=1)

0.0

In [20]:
#check to see if close to population value
print("Observed: {:.3f}, Fit: {:.3f}".format(y_train.mean(),train_pred[:,1].mean()))

Observed: 0.161, Fit: 0.161


In [21]:
# 30 log-spaced candidate values for C, from 1e-4 to 1e4, addressed to the pipeline's "model" step.
model_params = {"model__C": np.logspace(-4, 4, 30)}

In [22]:
# Single-step pipeline: the estimator is registered under the name "model",
# which is the prefix used by the "model__C" grid key.
model_pipeline = pipeline.Pipeline(steps=[("model", null_reg)])

In [23]:
# Cross-validator: 50 stratified shuffle splits of the training data, each an
# 80/20 train/calibration split (the test data is still held out).
# Feel free to use another CV method and explain why.
cross_validator = model_selection.StratifiedShuffleSplit(
    n_splits=50,
    train_size=0.8,
    random_state=1337,
)

In [24]:
# Grid search over the pipeline's parameter grid, scored by recall.
model_grid = model_selection.GridSearchCV(
    estimator=model_pipeline,
    param_grid=model_params,
    scoring="recall",  # metric to optimize (can pick another)
    cv=cross_validator,
    refit=True,  # refit using the best parameter setting
    n_jobs=-1,
)

In [25]:
# Run the grid search on the intercept-only design matrix (baseline performance).
null_mod = model_grid.fit(X=X_train_null, y=y_train)

In [None]:
# Score the selected baseline pipeline on each cross-validation split.
# Passing the GridSearchCV object itself would re-run the whole grid search
# inside every outer split (nested CV), which is far more work than intended.
# The logistic-regression section also scores best_estimator_.
cv_scores = model_selection.cross_val_score(
    null_mod.best_estimator_,
    X_train_null,
    y_train,
    scoring="recall",
    cv=cross_validator,
    n_jobs=-1,
)

In [None]:
cv_quantiles = np.quantile(a=cv_scores,q=[0.025,0.975])

In [None]:
print("Observed: {:.3f}".format(y_train.mean()))

In [None]:
print(f" 2.5%:{cv_quantiles[0]:.3f},97.5%:{cv_quantiles[1]:.3f}")

In [None]:
import seaborn as sns
sns.histplot(cv_scores, bins=10)

In [None]:
max_thresh(cv_scores)

In [None]:
p, r, f = threshhold_and_plot(y_test,test_pred[:,1])

In [None]:
# now on train data
precision_lg, recall_lg, _ =  threshhold_and_plot(y_test=y_train, target=train_pred[:,1],n_points=50, plot=True)

In [None]:
# AUC of the intercept-only (null) model's scores on the test data.
auc_score = metrics.roc_auc_score(y_test, test_pred[:,1])
# The model scored here is the null logistic model, not an SVC.
print(f"Scikit's ROC-AUC score of the null (intercept-only) model is {auc_score: .4f}")

In [None]:
max_thresh(r)

In [None]:
max_thresh(p)

In [None]:
max_thresh(f)

In [None]:
# check to see if close to population value
print(f"Observed: {y_train.mean():.3f} Fit: {train_pred[:,1].mean():.3f}")

## Logistic Regression <a id='Logistic_Regression'></a>

If we predict that a customer will not exit but the customer actually does, the bank loses income, and it needs to be able to take action in this scenario. This kind of error is a false negative, and the metric that measures it is recall, TP / (TP + FN). The bank therefore wants recall to be maximized: the higher the recall, the lower the chance of a false negative, that is, of predicting that a customer will not exit when in reality they do.

In [None]:
# Logistic regression with its own intercept term.
clf_lg = linear_model.LogisticRegression(
    fit_intercept=True,
    max_iter=500,
    random_state=632966,
)

# 30 log-spaced candidate values for C, from 1e-4 to 1e4.
model_params_lg = {"model__C": np.logspace(-4, 4, 30)}

# Single-step pipeline; "model" is the prefix used by the grid key above.
model_pipeline_lg = pipeline.Pipeline(steps=[("model", clf_lg)])

# 50 stratified 80/20 shuffle splits of the training data.
cross_validator = model_selection.StratifiedShuffleSplit(
    n_splits=50,
    train_size=0.8,
    random_state=1337,
)

# Grid search scored by recall; refit=True refits using the best setting.
model_grid_lg = model_selection.GridSearchCV(
    estimator=model_pipeline_lg,
    param_grid=model_params_lg,
    scoring="recall",
    cv=cross_validator,
    refit=True,
    n_jobs=-1,
)

In [None]:
%%time
# Run the grid search on the full training features.
lg_mod = model_grid_lg.fit(X=X_train, y=y_train)

In [None]:
# Take the pipeline with the best parameter combination.
# The grid search was built with refit=True, so best_estimator_ has already
# been fitted on all of X_train / y_train; fitting it again only repeated that work.
lr_reg = lg_mod.best_estimator_
lr_reg

In [None]:
%%time
# Recall of the selected pipeline on each cross-validation split.
cv_scores_lg = model_selection.cross_val_score(
    estimator=lr_reg,
    X=X_train,
    y=y_train,
    scoring="recall",
    cv=cross_validator,
    n_jobs=-1,
)

In [None]:
cv_quantiles = np.quantile(a=cv_scores_lg,q=[0.025,0.975])
print(f" 2.5%:{cv_quantiles[0]:.3f},97.5%:{cv_quantiles[1]:.3f}")

In [None]:
sns.histplot(cv_scores_lg, bins=10)

In [None]:
max_thresh(cv_scores_lg)

In [None]:
# Class predictions from predict() (labels, not probabilities) for both splits.
true_train_preds, true_test_preds = (
    lr_reg.predict(features) for features in (X_train, X_test)
)

In [None]:
# NOTE(review): `true_test_preds` comes from predict(), i.e. class labels rather
# than probabilities. With 0/1 labels every threshold below 1 yields the same
# predictions, so the curves are flat. Passing predict_proba(...)[:, 1] would
# give a real sweep, but the max(...) summaries further down would need revisiting.
test_precision_lg, test_recall_lg, test_f1_lg = threshhold_and_plot(
    y_test=y_test,
    target=true_test_preds,
    n_points=50,
    plot=True,
)

In [None]:
# NOTE(review): `true_train_preds` comes from predict(), i.e. class labels rather
# than probabilities. With 0/1 labels every threshold below 1 yields the same
# predictions, so the curves are flat. Passing predict_proba(...)[:, 1] would
# give a real sweep, but the max(...) summaries further down would need revisiting.
train_precision_lg, train_recall_lg, train_f1_lg = threshhold_and_plot(
    y_test=y_train,
    target=true_train_preds,
    n_points=50,
    plot=True,
)

In [None]:
confm_train = metrics.confusion_matrix(y_train, true_train_preds)
plot_confusion_matrx(confm_train)

In [None]:
# Read the counts from the training confusion matrix instead of typing them in,
# so they cannot drift from the model. For binary labels sklearn lays the matrix
# out as [[TN, FP], [FN, TP]], so ravel() yields them in that order.
true_neg, false_pos, false_neg, true_pos = confm_train.ravel().tolist()
TN, FP, FN, TP = true_neg, false_pos, false_neg, true_pos

In [None]:
confm_test = metrics.confusion_matrix(y_test, true_test_preds)
plot_confusion_matrx(confm_test)

In [None]:
# Read the counts from the test confusion matrix instead of typing them in.
# The hand-copied values summed to 3041 although the test set has 3039 rows.
# For binary labels sklearn lays the matrix out as [[TN, FP], [FN, TP]],
# so ravel() yields them in that order.
true_neg, false_pos, false_neg, true_pos = confm_test.ravel().tolist()
TN, FP, FN, TP = true_neg, false_pos, false_neg, true_pos

In [None]:
# Use the logistic-regression recall curve on the training data.
# `recall_lg` was computed earlier from the intercept-only baseline's
# predictions, so it does not describe this model.
max_recall_lg_idx, max_recall_lg_val = max_thresh(train_recall_lg)

In [None]:
print("Max recall - {} at index {}".format(max_recall_lg_val,max_recall_lg_idx))

In [None]:
# Plot the ROC curve on the training data.
# A ROC curve needs continuous scores; with hard 0/1 predictions it collapses
# to a single operating point, so use the positive-class probability instead.
plot_roc(y_train, lr_reg.predict_proba(X_train)[:, 1])

In [None]:
# ROC-AUC on the training data, computed from the positive-class probability.
# Hard 0/1 predictions would score a single operating point only.
auc_score = metrics.roc_auc_score(y_train, lr_reg.predict_proba(X_train)[:, 1])
print(f"ROC-AUC score of logistic model is {auc_score: .4f}")

In [None]:
# Report the recall of the class predictions directly. Taking max() over a
# threshold sweep only matches this value while the sweep runs on 0/1 labels.
print("Test Recall - {} and Train Recall {}".format(
    metrics.recall_score(y_test, true_test_preds),
    metrics.recall_score(y_train, true_train_preds)))

The recall on the test data is only 0.34, and there is some discrepancy between the test score and the train score, which may indicate overfitting. Let's try regularization.

In [None]:
# Classifier
clf_lg = linear_model.LogisticRegression(
    fit_intercept=True,
    max_iter=500,
    random_state=632966,
)

# Parameter grid: 30 log-spaced C values, five solvers, L2 penalty only.
model_params_lg = {
    "model__C": np.logspace(-4, 4, 30),
    'model__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
    'model__penalty': ['l2'],
}

# Single-step pipeline; "model" is the prefix used by the grid keys above.
model_pipeline_lg = pipeline.Pipeline(steps=[("model", clf_lg)])

# 50 stratified 80/20 shuffle splits of the training data.
cross_validator = model_selection.StratifiedShuffleSplit(
    n_splits=50,
    train_size=0.8,
    random_state=1337,
)

# Grid search scored by recall; refit=True refits using the best setting.
model_grid_lg = model_selection.GridSearchCV(
    estimator=model_pipeline_lg,
    param_grid=model_params_lg,
    scoring="recall",
    cv=cross_validator,
    refit=True,
    n_jobs=-1,
)
# Run the search over the whole grid on the training data.
model_grid_lg.fit(X_train, y_train)


In [None]:
model_grid_lg.best_params_,model_grid_lg.best_score_

In [None]:
# Take the pipeline with the best parameter combination.
# The grid search was built with refit=True, so best_estimator_ has already
# been fitted on all of X_train / y_train; fitting it again only repeated that work.
lr_reg = model_grid_lg.best_estimator_
lr_reg

In [None]:
# Class predictions from predict() (labels, not probabilities) for both splits.
true_train_preds, true_test_preds = (
    lr_reg.predict(features) for features in (X_train, X_test)
)

In [None]:
%%time
# Recall of the selected pipeline on each cross-validation split.
cv_scores_lg = model_selection.cross_val_score(
    estimator=lr_reg,
    X=X_train,
    y=y_train,
    scoring="recall",
    cv=cross_validator,
    n_jobs=-1,
)

In [None]:
cv_quantiles = np.quantile(a=cv_scores_lg,q=[0.025,0.975])
print(f" 2.5%:{cv_quantiles[0]:.3f},97.5%:{cv_quantiles[1]:.3f}")

In [None]:
sns.histplot(cv_scores_lg, bins=10)

In [None]:
max_thresh(cv_scores_lg)

In [None]:
# NOTE(review): `true_test_preds` comes from predict(), i.e. class labels rather
# than probabilities. With 0/1 labels every threshold below 1 yields the same
# predictions, so the curves are flat. Passing predict_proba(...)[:, 1] would
# give a real sweep, but the max(...) summaries further down would need revisiting.
# The third result is the F1 curve, despite the `_precision_f1` name.
test_precision_lg, test_recall_lg, test_precision_f1 = threshhold_and_plot(
    y_test=y_test,
    target=true_test_preds,
    n_points=50,
    plot=True,
)

In [None]:
# NOTE(review): `true_train_preds` comes from predict(), i.e. class labels rather
# than probabilities. With 0/1 labels every threshold below 1 yields the same
# predictions, so the curves are flat. Passing predict_proba(...)[:, 1] would
# give a real sweep, but the max(...) summaries further down would need revisiting.
# The third result is the F1 curve, despite the `_precision_f1` name.
train_precision_lg, train_recall_lg, train_precision_f1 = threshhold_and_plot(
    y_test=y_train,
    target=true_train_preds,
    n_points=50,
    plot=True,
)

In [None]:
# Plot the ROC curve on the training data.
# A ROC curve needs continuous scores; with hard 0/1 predictions it collapses
# to a single operating point, so use the positive-class probability instead.
plot_roc(y_train, lr_reg.predict_proba(X_train)[:, 1])

In [None]:
# ROC-AUC on the training data, computed from the positive-class probability.
# Hard 0/1 predictions would score a single operating point only.
auc_score = metrics.roc_auc_score(y_train, lr_reg.predict_proba(X_train)[:, 1])
print(f"ROC-AUC score of logistic model is {auc_score: .4f}")

In [None]:
# Report the recall of the class predictions directly. Taking max() over a
# threshold sweep only matches this value while the sweep runs on 0/1 labels.
print("Test Recall - {} and Train Recall {}".format(
    metrics.recall_score(y_test, true_test_preds),
    metrics.recall_score(y_train, true_train_preds)))

TODO: move this evaluation code into a reusable module.

In [None]:
# precision of the tuned logistic regression on the training data
metrics.precision_score(y_true= y_train,y_pred= true_train_preds,zero_division=1)

In [None]:
# recall of the tuned logistic regression on the training data
metrics.recall_score(y_true= y_train,y_pred= true_train_preds,zero_division=1)

In [None]:
# F1 score of the tuned logistic regression on the training data
metrics.f1_score(y_true= y_train,y_pred= true_train_preds,zero_division=1)

In [None]:
# precision of the tuned logistic regression on the test data
metrics.precision_score(y_true= y_test,y_pred= true_test_preds,zero_division=1)

In [None]:
# recall of the tuned logistic regression on the test data
metrics.recall_score(y_true= y_test,y_pred= true_test_preds,zero_division=1)

In [None]:
# F1 score of the tuned logistic regression on the test data
metrics.f1_score(y_true= y_test,y_pred= true_test_preds,zero_division=1)

There is not much improvement even after regularization, so additional models need to be tried.

## Additional Models: <a id=AdditionalModels></a>

## Summary <a id =Summary> </a>

## Recommendations <a id = Recom></a>