In [1]:
import numpy as np
import pandas as pd

In [2]:
# Get multiple outputs in the same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# To supress future warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth',2000)
pd.options.display.float_format='{:.2f}'.format

In [4]:
preprocessed_df = pd.read_csv('../output/PreProcessedData.csv')

**Splitting the data into Train and test split**

In [5]:
X = preprocessed_df.iloc[:,1:]
Y = preprocessed_df.iloc[:,0:1]

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1234, stratify=Y['Loan Status'])

print('X_train.shape - ',X_train.shape)
print('y_train.shape - ',y_train.shape)
print('X_test.shape - ',X_test.shape)
print('y_test.shape - ',y_test.shape)

X_train.shape -  (62237, 16)
y_train.shape -  (62237, 1)
X_test.shape -  (26673, 16)
y_test.shape -  (26673, 1)


In [7]:
preprocessed_df['Loan Status'].value_counts(normalize=True)

0   0.72
1   0.28
Name: Loan Status, dtype: float64

In [8]:
y_train['Loan Status'].value_counts(normalize=True)

0   0.72
1   0.28
Name: Loan Status, dtype: float64

In [9]:
y_test['Loan Status'].value_counts(normalize=True)

0   0.72
1   0.28
Name: Loan Status, dtype: float64

**Scaling the data using MinMax Scaler**

In [10]:
columns_to_be_scaled = X_train.columns.values
columns_to_be_scaled

array(['Current Loan Amount', 'Term', 'Credit Score',
       'Years in current job', 'Home Ownership', 'Annual Income',
       'Purpose', 'Monthly Debt', 'Years of Credit History',
       'Months since last delinquent', 'Number of Open Accounts',
       'Number of Credit Problems', 'Current Credit Balance',
       'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'], dtype=object)

In [11]:
from sklearn.preprocessing import MinMaxScaler

mm_scaler = MinMaxScaler()
X_train_scaled = mm_scaler.fit_transform(X_train[columns_to_be_scaled])
X_train_scaled = pd.DataFrame(X_train_scaled, columns = columns_to_be_scaled)

In [12]:
X_train_scaled

Unnamed: 0,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,0.00,1.00,0.39,0.56,1.00,0.16,0.21,0.21,0.51,0.18,0.11,0.00,0.19,0.14,0.00,0.00
1,0.00,1.00,0.39,1.00,0.00,0.21,0.21,0.21,0.16,0.18,0.23,0.00,0.25,0.26,0.00,0.00
2,0.00,1.00,0.39,0.56,1.00,0.09,0.21,0.14,0.10,0.10,0.12,0.00,0.05,0.11,0.00,0.00
3,0.00,1.00,0.36,1.00,0.00,0.20,0.21,0.36,0.09,0.26,0.34,0.00,0.12,0.18,0.00,0.00
4,0.00,1.00,0.38,0.00,1.00,0.16,0.21,0.03,0.17,0.18,0.07,0.00,0.04,0.15,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62232,1.00,1.00,0.40,1.00,0.00,0.42,0.21,0.52,0.14,0.18,0.41,0.00,0.18,0.50,0.00,0.00
62233,0.00,0.00,0.15,1.00,0.00,0.29,0.36,0.32,0.15,0.39,0.27,0.00,0.38,0.24,0.00,0.00
62234,0.00,1.00,0.35,0.00,1.00,0.17,0.71,0.21,0.09,0.18,0.16,0.00,0.04,0.04,0.00,0.00
62235,0.00,1.00,0.36,0.44,1.00,0.15,0.21,0.21,0.14,0.18,0.16,0.00,0.19,0.13,0.00,0.00


In [13]:
from sklearn.preprocessing import MinMaxScaler

mm_scaler = MinMaxScaler()
X_test_scaled = mm_scaler.fit_transform(X_test[columns_to_be_scaled])
X_test_scaled = pd.DataFrame(X_test_scaled, columns = columns_to_be_scaled)

In [14]:
X_test_scaled

Unnamed: 0,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,0.00,1.00,0.31,0.11,1.00,0.11,0.00,0.21,0.09,0.21,0.07,0.00,0.00,0.04,0.00,0.00
1,0.00,1.00,0.39,0.22,0.00,0.21,0.21,0.21,0.26,0.51,0.20,0.00,0.16,0.38,0.00,0.00
2,0.00,1.00,0.39,0.22,0.00,0.19,0.21,0.21,0.20,0.21,0.15,0.07,0.22,0.21,0.20,0.00
3,0.00,1.00,0.40,1.00,0.50,0.16,0.14,0.25,0.36,0.21,0.20,0.00,0.11,0.29,0.00,0.00
4,0.00,1.00,0.27,0.56,0.00,0.50,0.21,0.93,0.23,0.21,0.24,0.00,0.49,0.36,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26668,0.00,1.00,0.36,0.44,1.00,0.11,0.21,0.21,0.13,0.16,0.16,0.00,0.09,0.17,0.00,0.00
26669,0.00,1.00,0.35,1.00,0.00,0.34,0.21,0.21,0.24,0.21,0.09,0.00,0.07,0.25,0.00,0.00
26670,0.00,1.00,0.30,0.56,0.00,0.30,0.21,0.21,0.22,0.21,0.19,0.00,0.17,0.36,0.00,0.00
26671,0.00,1.00,0.36,0.78,1.00,0.20,0.21,0.20,0.11,0.21,0.12,0.00,0.10,0.10,0.00,0.00


### Training with train dataset on Models and evaluating the Models

#### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state=123)
lr_model.fit(X_train,y_train)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

y_lr_train_predicted = lr_model.predict(X_train_scaled)
y_lr_test_predicted = lr_model.predict(X_test_scaled)

#Using above defined function to get accuracy, recall and precision on train and test set
# lr_score=get_metrics_score(lr_model)

print('\nClassification Report for Training Data- \n',classification_report(y_train, y_lr_train_predicted))
print('\nClassification Report for Test Data- \n',classification_report(y_test, y_lr_test_predicted))
print('Confusion matrix for the train Data- \n',confusion_matrix(y_train, y_lr_train_predicted))
print('\nConfusion matrix for the test Data- \n',confusion_matrix(y_test, y_lr_test_predicted))


Classification Report for Training Data- 
               precision    recall  f1-score   support

           0       0.80      0.00      0.00     44616
           1       0.28      1.00      0.44     17621

    accuracy                           0.28     62237
   macro avg       0.54      0.50      0.22     62237
weighted avg       0.65      0.28      0.13     62237


Classification Report for Test Data- 
               precision    recall  f1-score   support

           0       0.78      0.00      0.00     19121
           1       0.28      1.00      0.44      7552

    accuracy                           0.28     26673
   macro avg       0.53      0.50      0.22     26673
weighted avg       0.64      0.28      0.13     26673

Confusion matrix for the train Data- 
 [[   63 44553]
 [   16 17605]]

Confusion matrix for the test Data- 
 [[   28 19093]
 [    8  7544]]


#### Linear Discriminant Analysis

In [17]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda_model = LinearDiscriminantAnalysis(n_components=1)
lda_model.fit(X_train,y_train)

In [18]:
y_lda_train_predicted = lda_model.predict(X_train)
y_lda_test_predicted = lda_model.predict(X_test)

print('\nClassification Report for Training Data- \n',classification_report(y_train, y_lda_train_predicted))
print('\nClassification Report for Test Data- \n',classification_report(y_test, y_lda_test_predicted))
print('Confusion matrix for the train data- \n',confusion_matrix(y_train, y_lda_train_predicted))
print('\nConfusion matrix for the test data- \n',confusion_matrix(y_test, y_lda_test_predicted))


Classification Report for Training Data- 
               precision    recall  f1-score   support

           0       0.77      0.97      0.86     44616
           1       0.78      0.26      0.39     17621

    accuracy                           0.77     62237
   macro avg       0.78      0.62      0.63     62237
weighted avg       0.77      0.77      0.73     62237


Classification Report for Test Data- 
               precision    recall  f1-score   support

           0       0.77      0.97      0.86     19121
           1       0.79      0.27      0.40      7552

    accuracy                           0.77     26673
   macro avg       0.78      0.62      0.63     26673
weighted avg       0.77      0.77      0.73     26673

Confusion matrix for the train data- 
 [[43329  1287]
 [12973  4648]]

Confusion matrix for the test data- 
 [[18577   544]
 [ 5546  2006]]


#### Guassian Naive Bayes

In [19]:
from sklearn.naive_bayes import GaussianNB

GNB_model = GaussianNB()
GNB_model.fit(X_train, y_train)

In [20]:
# make predictions for train data
y_gnb_train_predicted = GNB_model.predict(X_train)
# make predictions for test data
y_gnb_test_predicted = GNB_model.predict(X_test)


# summarize the fit of the model
print('\nClassification Report for Training Data- \n',classification_report(y_train, y_gnb_train_predicted))
print('\nClassification Report for Test Data- \n',classification_report(y_test, y_gnb_test_predicted))
print('Confusion matrix for the train data- \n',confusion_matrix(y_train, y_gnb_train_predicted))
print('\nConfusion matrix for the test data- \n',confusion_matrix(y_test, y_gnb_test_predicted))


Classification Report for Training Data- 
               precision    recall  f1-score   support

           0       0.99      0.21      0.34     44616
           1       0.33      1.00      0.50     17621

    accuracy                           0.43     62237
   macro avg       0.66      0.60      0.42     62237
weighted avg       0.80      0.43      0.39     62237


Classification Report for Test Data- 
               precision    recall  f1-score   support

           0       0.99      0.21      0.34     19121
           1       0.33      0.99      0.50      7552

    accuracy                           0.43     26673
   macro avg       0.66      0.60      0.42     26673
weighted avg       0.80      0.43      0.38     26673

Confusion matrix for the train data- 
 [[ 9225 35391]
 [   86 17535]]

Confusion matrix for the test data- 
 [[ 3920 15201]
 [   39  7513]]


#### Decision Tree Classifier

In [21]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(random_state=123)
dt_model.fit(X_train,y_train)

In [22]:
y_dt_train_predicted = dt_model.predict(X_train)
y_dt_test_predicted = dt_model.predict(X_test)

# summarize the fit of the model
print('\nClassification report for the training data \n',classification_report(y_train, y_dt_train_predicted))
print('\nClassification report for the test data \n',classification_report(y_test, y_dt_test_predicted))
print('\nConfusion matrix for the train data- \n',confusion_matrix(y_train, y_dt_train_predicted))
print('\nConfusion matrix for the test data- \n',confusion_matrix(y_test, y_dt_test_predicted))


Classification report for the training data 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     44616
           1       1.00      1.00      1.00     17621

    accuracy                           1.00     62237
   macro avg       1.00      1.00      1.00     62237
weighted avg       1.00      1.00      1.00     62237


Classification report for the test data 
               precision    recall  f1-score   support

           0       0.79      0.78      0.79     19121
           1       0.47      0.49      0.48      7552

    accuracy                           0.70     26673
   macro avg       0.63      0.63      0.63     26673
weighted avg       0.70      0.70      0.70     26673


Confusion matrix for the train data- 
 [[44616     0]
 [    0 17621]]

Confusion matrix for the test data- 
 [[14907  4214]
 [ 3865  3687]]


#### Random Forest Classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier

RF_model=RandomForestClassifier(max_depth=7,max_features=5,min_samples_leaf=10,min_samples_split=25,n_estimators=40,
                                random_state=1)
RF_model.fit(X_train, y_train)

In [24]:
y_rf_train_predicted = RF_model.predict(X_train)
y_rf_test_predicted = RF_model.predict(X_test)

# summarize the fit of the model
print('\nClassification report for the training data \n',classification_report(y_train, y_rf_train_predicted))
print('\nClassification report for the test data \n',classification_report(y_test, y_rf_test_predicted))
print('\nConfusion matrix for the train data- \n',confusion_matrix(y_train, y_rf_train_predicted))
print('\nConfusion matrix for the test data- \n',confusion_matrix(y_test, y_rf_test_predicted))


Classification report for the training data 
               precision    recall  f1-score   support

           0       0.77      0.99      0.87     44616
           1       0.92      0.24      0.38     17621

    accuracy                           0.78     62237
   macro avg       0.85      0.62      0.62     62237
weighted avg       0.81      0.78      0.73     62237


Classification report for the test data 
               precision    recall  f1-score   support

           0       0.77      0.99      0.86     19121
           1       0.91      0.23      0.37      7552

    accuracy                           0.78     26673
   macro avg       0.84      0.61      0.62     26673
weighted avg       0.81      0.78      0.72     26673


Confusion matrix for the train data- 
 [[44269   347]
 [13413  4208]]

Confusion matrix for the test data- 
 [[18956   165]
 [ 5798  1754]]


#### Gradient Boosting Classifier

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

gbc_model = GradientBoostingClassifier(random_state=1)
gbc_model.fit(X_train,y_train)

In [26]:
# make predictions for train data
y_gbc_train_predicted = gbc_model.predict(X_train)

# make predictions for test data
y_gbc_test_predicted = gbc_model.predict(X_test)

# summarize the fit of the model
print('\nClassification Report for Training Data- \n',classification_report(y_train, y_gbc_train_predicted))
print('\nClassification Report for Test Data- \n',classification_report(y_test, y_gbc_test_predicted))
print('\nConfusion Matrix for Training Data- \n',confusion_matrix(y_train, y_gbc_train_predicted))
print('\nConfusion Matrix for Test Data - \n',confusion_matrix(y_test, y_gbc_test_predicted))


Classification Report for Training Data- 
               precision    recall  f1-score   support

           0       0.78      0.98      0.87     44616
           1       0.83      0.29      0.43     17621

    accuracy                           0.78     62237
   macro avg       0.80      0.63      0.65     62237
weighted avg       0.79      0.78      0.74     62237


Classification Report for Test Data- 
               precision    recall  f1-score   support

           0       0.77      0.97      0.86     19121
           1       0.81      0.28      0.42      7552

    accuracy                           0.78     26673
   macro avg       0.79      0.63      0.64     26673
weighted avg       0.78      0.78      0.74     26673


Confusion Matrix for Training Data- 
 [[43528  1088]
 [12453  5168]]

Confusion Matrix for Test Data - 
 [[18601   520]
 [ 5404  2148]]


#### Extreme Gradient Boosting Classifier

In [27]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state=123)
xgb_model.fit(X_train, y_train)

In [28]:
y_xgb_train_predicted = xgb_model.predict(X_train)
y_xgb_test_predicted = xgb_model.predict(X_test)

print('Classification Report for Training data for Mobile Users-\n',classification_report(y_train, y_xgb_train_predicted))
print('\n\n\nClassification Report for Test data for Mobile Users-\n',classification_report(y_test, y_xgb_test_predicted))
print('\nConfusion Matrix for Training Data- \n',confusion_matrix(y_train, y_xgb_train_predicted))
print('\nConfusion Matrix for Test Data - \n',confusion_matrix(y_test, y_xgb_test_predicted))

Classification Report for Training data for Mobile Users-
               precision    recall  f1-score   support

           0       0.82      0.98      0.89     44616
           1       0.88      0.45      0.59     17621

    accuracy                           0.83     62237
   macro avg       0.85      0.71      0.74     62237
weighted avg       0.83      0.83      0.81     62237




Classification Report for Test data for Mobile Users-
               precision    recall  f1-score   support

           0       0.79      0.95      0.86     19121
           1       0.72      0.35      0.47      7552

    accuracy                           0.78     26673
   macro avg       0.75      0.65      0.66     26673
weighted avg       0.77      0.78      0.75     26673


Confusion Matrix for Training Data- 
 [[43534  1082]
 [ 9750  7871]]

Confusion Matrix for Test Data - 
 [[18094  1027]
 [ 4943  2609]]


**So far, Extreme Gradient Boosting Algorithm has performed better than the rest of the models. We will try to improve its performance using SMOTE and threshold values for predicting the classes**

### Using SMOTE for Imbalanced DataSet

In [29]:
from imblearn.over_sampling import SMOTE

# Instantiate the SMOTE object
smote = SMOTE(sampling_strategy={1: 30621}, random_state=42)

# Upsample the dataset
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Print the class distribution before and after SMOTE
print("Class distribution before SMOTE:", {label: count for label, count in zip(*np.unique(y_train, return_counts=True))})
print("Class distribution after SMOTE:", {label: count for label, count in zip(*np.unique(y_train_resampled, return_counts=True))})

Class distribution before SMOTE: {0: 44616, 1: 17621}
Class distribution after SMOTE: {0: 44616, 1: 30621}


### Gradient Boosting Classifier using SMOTE

In [30]:
from sklearn.ensemble import GradientBoostingClassifier

gbc_smote_model = GradientBoostingClassifier(random_state=1)
gbc_smote_model.fit(X_train_resampled, y_train_resampled)

In [31]:
# make predictions for train data
y_gbc_train_predicted = gbc_smote_model.predict(X_train_resampled)

# make predictions for test data
y_gbc_test_predicted = gbc_smote_model.predict(X_test)

# summarize the fit of the model
print('\nClassification Report for Training Data- \n',classification_report(y_train_resampled, y_gbc_train_predicted))
print('\nClassification Report for Test Data- \n',classification_report(y_test, y_gbc_test_predicted))
print('\nConfusion Matrix for Training Data- \n',confusion_matrix(y_train_resampled, y_gbc_train_predicted))
print('\nConfusion Matrix for Test Data - \n',confusion_matrix(y_test, y_gbc_test_predicted))


Classification Report for Training Data- 
               precision    recall  f1-score   support

           0       0.76      0.89      0.82     44616
           1       0.80      0.59      0.68     30621

    accuracy                           0.77     75237
   macro avg       0.78      0.74      0.75     75237
weighted avg       0.78      0.77      0.76     75237


Classification Report for Test Data- 
               precision    recall  f1-score   support

           0       0.80      0.89      0.84     19121
           1       0.62      0.43      0.51      7552

    accuracy                           0.76     26673
   macro avg       0.71      0.66      0.68     26673
weighted avg       0.75      0.76      0.75     26673


Confusion Matrix for Training Data- 
 [[39931  4685]
 [12448 18173]]

Confusion Matrix for Test Data - 
 [[17088  2033]
 [ 4294  3258]]


### Extreme Gradient Boosting Classifier using SMOTE

In [32]:
from xgboost import XGBClassifier

xgb_smote_model = XGBClassifier(random_state=123)
xgb_smote_model.fit(X_train_resampled, y_train_resampled)

In [33]:
y_xgb_train_predicted = xgb_smote_model.predict(X_train_resampled)
y_xgb_test_predicted = xgb_smote_model.predict(X_test)

print('Classification Report for Training data for Mobile Users-\n',classification_report(y_train_resampled, y_xgb_train_predicted))
print('\n\n\nClassification Report for Test data for Mobile Users-\n',classification_report(y_test, y_xgb_test_predicted))
print('\nConfusion Matrix for Training Data- \n',confusion_matrix(y_train_resampled, y_xgb_train_predicted))
print('\nConfusion Matrix for Test Data - \n',confusion_matrix(y_test, y_xgb_test_predicted))

Classification Report for Training data for Mobile Users-
               precision    recall  f1-score   support

           0       0.81      0.92      0.86     44616
           1       0.86      0.69      0.76     30621

    accuracy                           0.83     75237
   macro avg       0.84      0.81      0.81     75237
weighted avg       0.83      0.83      0.82     75237




Classification Report for Test data for Mobile Users-
               precision    recall  f1-score   support

           0       0.80      0.89      0.84     19121
           1       0.61      0.44      0.51      7552

    accuracy                           0.76     26673
   macro avg       0.71      0.66      0.68     26673
weighted avg       0.75      0.76      0.75     26673


Confusion Matrix for Training Data- 
 [[41245  3371]
 [ 9574 21047]]

Confusion Matrix for Test Data - 
 [[17005  2116]
 [ 4228  3324]]


### Extreme Gradient Boosting Classifier using Threshold 

In [34]:
from xgboost import XGBClassifier

xgb_threshold_model = XGBClassifier(random_state=123)
xgb_threshold_model.fit(X_train, y_train)

In [35]:
y_xgb_train_prob_predicted = xgb_threshold_model.predict_proba(X_train)
y_xgb_test_prob_predicted = xgb_threshold_model.predict_proba(X_test)

In [36]:
from sklearn.metrics import roc_curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train,y_xgb_train_prob_predicted[:,1])

optimal_index = np.argmax(true_positive_rate - false_positive_rate)
optimal_threshold = thresholds[optimal_index]
optimal_threshold

0.29149655

In [37]:
y_xgb_train_predicted = [1 if i >= 0.20 else 0 for i in y_xgb_train_prob_predicted[:,1]]
y_xgb_test_predicted =  [1 if i >= 0.20 else 0 for i in y_xgb_test_prob_predicted[:,1]]

print('Classification Report for Training data for Mobile Users-\n',classification_report(y_train, y_xgb_train_predicted))
print('\n\n\nClassification Report for Test data for Mobile Users-\n',classification_report(y_test, y_xgb_test_predicted))
print('\nConfusion Matrix for Training Data- \n',confusion_matrix(y_train, y_xgb_train_predicted))
print('\nConfusion Matrix for Test Data - \n',confusion_matrix(y_test, y_xgb_test_predicted))

Classification Report for Training data for Mobile Users-
               precision    recall  f1-score   support

           0       0.96      0.57      0.71     44616
           1       0.46      0.93      0.62     17621

    accuracy                           0.67     62237
   macro avg       0.71      0.75      0.66     62237
weighted avg       0.82      0.67      0.68     62237




Classification Report for Test data for Mobile Users-
               precision    recall  f1-score   support

           0       0.90      0.53      0.66     19121
           1       0.42      0.86      0.56      7552

    accuracy                           0.62     26673
   macro avg       0.66      0.69      0.61     26673
weighted avg       0.76      0.62      0.63     26673


Confusion Matrix for Training Data- 
 [[25254 19362]
 [ 1160 16461]]

Confusion Matrix for Test Data - 
 [[10040  9081]
 [ 1094  6458]]


### Tuning Random Forest Classifier

In [38]:
# from sklearn.model_selection import GridSearchCV
# 
# hyper_parameters = {
#             'max_features': [8,9,10,11,12],
#             'n_estimators': [50, 60, 70, 80], # number of trees in the random forest
#             'max_depth' : [12,15,18,21], # maximum number of levels allowed in each decision tree
#             'min_samples_split' : [1,2,3,6,9], #,15,20,25,30, # minimum sample number to split a node
#             'min_samples_leaf' : [1,2,3,6,9] # minimum sample number that can be stored in a leaf node
# }
# 
# rf_model_tuned = RandomForestClassifier(random_state=123)
# 
# gs_rf_model = GridSearchCV(estimator=rf_model_tuned,
#                                    param_grid=hyper_parameters,
#                                    cv = 5,
#                                    verbose=True,
#                                    n_jobs=-1)
# 
# gs_rf_model.fit(X_train, y_train)

In [39]:
# gs_rf_model.best_params_

In [40]:
# y_gs_rf_train_prob_predicted = gs_rf_model.predict_proba(X_train)
# y_gs_rf_test_prob_predicted = gs_rf_model.predict_proba(X_test)

In [41]:
# y_gs_rf_train_predicted = [1 if i >= 0.50 else 0 for i in y_gs_rf_train_prob_predicted[:,1]]
# y_gs_rf_test_predicted =  [1 if i >= 0.50 else 0 for i in y_gs_rf_test_prob_predicted[:,1]]
# 
# print('Classification Report for Training data for Mobile Users-\n',classification_report(y_train, y_gs_rf_train_predicted))
# print('\n\n\nClassification Report for Test data for Mobile Users-\n',classification_report(y_test, y_gs_rf_test_predicted))
# print('\nConfusion Matrix for Training Data- \n',confusion_matrix(y_train, y_gs_rf_train_predicted))
# print('\nConfusion Matrix for Test Data - \n',confusion_matrix(y_test, y_gs_rf_test_predicted))

### Extreme Gradient Boosting Classifier using Threshold seems to perform better for "Recall" metric. So, we use this model for inference.

**Saving the model for Inference use**

In [42]:
import pickle

with open('../output/xgb_threshold_model.pkl', 'wb') as f:
    pickle.dump(xgb_threshold_model, f)