## Credit Card Fraud Detection

In this project you will predict fraudulent credit card transactions with the help of Machine learning models. Please import the following libraries to get started.

In [None]:
import numpy as np
import pandas as pd
from scipy import stats

from helper import get_detailed_info
from helper import get_numeric_metadata
from helper import get_frequency_distribution
from helper import get_group_frequency_dist
from helper import plot_box_bar_or_scatter_plot
from helper import plot_dist_plot

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.preprocessing import StandardScaler

from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve

pd.set_option("display.float_format", "{:.2f}".format)


# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Testing for Normality in Python
# This function will return the skewness, kurtosis and whether the data is normally distributed or not 
# for the numerical features
# Just pass the dataframe and it will extract the numeric features to perform normality test on the features
# Returns dataframe
# This method is based upon the DataCamp tutorial on Portfolio Risk management

# https://www.spcforexcel.com/knowledge/basic-statistics/are-skewness-and-kurtosis-useful-statistics#:~:text=So%2C%20a%20normal%20distribution%20will,sizes%20of%20the%20two%20tails.&text=If%20the%20kurtosis%20is%20greater,(more%20in%20the%20tails).

# https://campus.datacamp.com/courses/introduction-to-portfolio-risk-management-in-python/univariate-investment-risk-and-returns?ex=10
def test_for_normality(df):
    """Report skewness, kurtosis and a Shapiro-Wilk normality verdict per numeric column.

    Non-numeric columns are skipped, so any DataFrame can be passed in.

    Arguments:
        df {[DataFrame]} -- Input data; only its numeric columns are analysed.

    Returns:
        [DataFrame] -- One row per numeric column, with the columns
        "Features", "skew", "kurtosis" and "isNormal". "isNormal" is False
        when the Shapiro-Wilk p-value is <= 0.05 (null hypothesis of
        normality rejected), True otherwise, and None when the column has
        fewer than 3 non-null values, which is too few to run the test.

    Note:
        SciPy documents that for N > 5000 the Shapiro-Wilk p-value may not
        be accurate, so treat the verdict on very long columns as indicative.
    """
    alpha = 0.05
    numeric_df = df.select_dtypes(include="number")

    rows = []
    # Iterate by position so that duplicated column names still get one row each.
    for position, col in enumerate(numeric_df.columns):
        series = numeric_df.iloc[:, position]
        observed = series.dropna()

        if len(observed) < 3:
            # Not enough data to test: report "unknown" instead of claiming normality.
            is_normal = None
        else:
            p_value = stats.shapiro(observed)[1]
            is_normal = not p_value <= alpha

        rows.append(
            {
                "Features": col,
                "skew": series.skew(),
                "kurtosis": series.kurtosis(),
                "isNormal": is_normal,
            }
        )

    return pd.DataFrame(rows, columns=["Features", "skew", "kurtosis", "isNormal"])


# The name starts with "test_", so pytest would try to collect this helper as a
# test case if it is ever imported into a test module; opt out explicitly.
test_for_normality.__test__ = False

def get_higly_correlated_columns_name(row, positive=0.8, negative=-0.8):
    """Return the labels of `row` whose value is >= `positive` or <= `negative`.

    Arguments:
        row {[Series]} -- One row of a correlation matrix, indexed by feature name
        positive {float} -- Lower bound for a strong positive correlation
        negative {float} -- Upper bound for a strong negative correlation

    Returns:
        [list] -- Matching index labels, in the order they appear in `row`
    """
    return [
        label
        for label in row.index.values
        if row[label] >= positive or row[label] <= negative
    ]


def get_higly_correlated_columns_value(row, positive=0.8, negative=-0.8):
    """Return the values of `row` that are >= `positive` or <= `negative`, as strings.

    Arguments:
        row {[Series]} -- One row of a correlation matrix, indexed by feature name
        positive {float} -- Lower bound for a strong positive correlation
        negative {float} -- Upper bound for a strong negative correlation

    Returns:
        [list] -- Matching values rounded to 2 decimals and converted to str,
        in the order they appear in `row`
    """
    return [
        str(round(row[label], 2))
        for label in row.index.values
        if (row[label] >= positive) or (row[label] <= negative)
    ]


def get_numeric_correlation(df, thresh=0.8):
    """Compute the correlation matrix of `df` and list each feature's strong correlations.

    Arguments:
        df {[DataFrame]} -- Data whose columns are correlated pairwise with DataFrame.corr()
        thresh {float} -- Cut-off; a coefficient counts as strong when it is
            >= thresh or <= -thresh

    Returns:
        [tuple] -- (mapping, corr_matrix). `mapping` is indexed like the
        correlation matrix and has two columns: "cor_columns" (names of the
        strongly correlated features) and "cor_columns_val" (the matching
        coefficients as 2-decimal strings). `corr_matrix` is the full matrix.
    """
    corr_matrix = df.corr()

    def strong_names(row):
        return get_higly_correlated_columns_name(row, thresh, -(thresh))

    def strong_values(row):
        return get_higly_correlated_columns_value(row, thresh, -(thresh))

    mapping = pd.DataFrame(
        {
            "cor_columns": corr_matrix.apply(strong_names, axis=1),
            "cor_columns_val": corr_matrix.apply(strong_values, axis=1),
        }
    )
    return mapping, corr_matrix

## Exploratory data analysis

In [None]:
# Load the transactions from creditcard.csv (path is relative to the working directory)
df = pd.read_csv('creditcard.csv')
# Preview the first rows
df.head()

In [None]:
# Get the detailed summary report of the dataset
# (get_detailed_info comes from the project-local `helper` module)
get_detailed_info(df)

From the detailed report above: the dataset has 30 features, all numeric, with no missing or null values and no column made up entirely of unique values. It does contain a few duplicate rows.

In [None]:
# Duplicate rows as a percentage of all rows
len(df[df.duplicated()]) / len(df) * 100

The fraction of duplicate rows is very small compared with the amount of data we have. Because the data is numeric, even a slight change in any value would make two rows differ, so exact duplicates are rare; we can therefore ignore duplication here.

In [None]:
# Get metadata for the dataset to observe the different feature types present
# (get_numeric_metadata comes from the project-local `helper` module)
get_numeric_metadata(df)

Here we will observe the distribution of our classes

In [None]:
# Create bar plots for the number and percentage of fraudulent vs non-fraudulent transactions

# 1) Absolute counts per class
class_freq_dist = get_frequency_distribution(df, 'Class', normalize=False)
# Replace the class codes in the first column with readable labels
# (assumes row 0 is the non-fraud class and row 1 the fraud class — TODO confirm helper ordering)
class_freq_dist.iloc[0,0] = 'normal_share'
class_freq_dist.iloc[1,0] = 'fraud_share'

# NOTE(review): y is 'normalized_frequency_distribution' even though normalize=False —
# confirm the helper uses this column name for raw counts as well
plot_box_bar_or_scatter_plot(x='Class',
                     y='normalized_frequency_distribution',
                     data=class_freq_dist, 
                     xlabel='Class',
                     ylabel='Class Distribution (Count)',
                     title='Class Frequency Distribution Count',
                     plot=1, 
                     figsize=(10,7),
                     ticksFont_size=14,
                     normalize=False)

# 2) Same plot with the normalized distribution (axis label says percent)
class_freq_dist = get_frequency_distribution(df, 'Class', normalize=True)
class_freq_dist.iloc[0,0] = 'normal_share'
class_freq_dist.iloc[1,0] = 'fraud_share'

plot_box_bar_or_scatter_plot(x='Class',
                     y='normalized_frequency_distribution',
                     data=class_freq_dist, 
                     xlabel='Class',
                     ylabel='Class Distribution (in %)',
                     title='Class Frequency Distribution',
                     plot=1, 
                     figsize=(10,7),
                     ticksFont_size=14)

In [None]:
# Distribution of Amount per class: Class 0 in green, then Class 1 in red
plot_dist_plot(df.loc[df['Class']==0, 'Amount'],figsize=(10,6),color='g')

plot_dist_plot(df.loc[df['Class']==1, 'Amount'],figsize=(10,6), color='r')


In [None]:
# Distribution of Time per class: Class 0 in green, then Class 1 in red
plot_dist_plot(df.loc[df['Class']==0, 'Time'],figsize=(10,6),color='g')

plot_dist_plot(df.loc[df['Class']==1, 'Time'],figsize=(10,6), color='r')

In [None]:
# Categorical plot (seaborn catplot) of Time for each class
sns.catplot(x="Class", y="Time", data=df)

# Categorical plot (seaborn catplot) of Amount for each class
sns.catplot(x="Class", y="Amount", data=df)

#### Correlation between features & between target and features

In [None]:
# Compute the correlation matrix and, per feature, the list of features whose
# correlation is >= 0.3 or <= -0.3. No heatmap is drawn here: the styled
# rendering below is commented out.
mapping, corr_matrix = get_numeric_correlation(df, .3)
#corr_matrix.style.background_gradient(cmap='coolwarm')

In [None]:
# Show, for every feature and the target, which columns are correlated with it
# at the 0.3 threshold used above, together with the coefficients
mapping

In [None]:
# Columns correlated with Class at the 0.3 threshold, then the matching coefficients.
# The two entries are addressed by label: integer keys on a string-labelled
# Series rely on a deprecated positional fallback.
print(mapping.loc['Class', 'cor_columns'])
print(mapping.loc['Class', 'cor_columns_val'])

In [None]:
# Columns correlated with Amount at the 0.3 threshold, then the matching coefficients.
# The two entries are addressed by label: integer keys on a string-labelled
# Series rely on a deprecated positional fallback.
print(mapping.loc['Amount', 'cor_columns'])
print(mapping.loc['Amount', 'cor_columns_val'])

In [None]:
# Columns correlated with Time at the 0.3 threshold, then the matching coefficients.
# The two entries are addressed by label: integer keys on a string-labelled
# Series rely on a deprecated positional fallback.
print(mapping.loc['Time', 'cor_columns'])
print(mapping.loc['Time', 'cor_columns_val'])

### Splitting the data into train & test data
- We will use stratified split to maintain the class proportion in train & test set

In [None]:
# Target: the Class column; features: every other column
y= df['Class']
X = df.drop(['Class'], axis=1)

In [None]:
# 80/20 split, stratified on y so both parts keep the class proportions;
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=123, test_size=0.20)

##### Preserve X_test & y_test to evaluate on the test data once you build the model

In [None]:
# Sum of the Class labels in the full data, the train split and the test split
# (equals the number of Class == 1 rows, assuming 0/1 labels)
print(*(np.sum(labels) for labels in (y, y_train, y_test)), sep="\n")

### Plotting the distribution of a variable

In [None]:
# One subplot per training feature on a 16x2 grid, overlaying the
# distribution of Class 0 rows (green) and Class 1 rows (red)
plt.figure(figsize=(15, 80))
for k, col in enumerate(X_train.columns):
    plt.subplot(16, 2, k + 1)
    for class_label, colour in ((0, 'g'), (1, 'r')):
        sns.distplot(X_train.loc[df['Class'] == class_label, col], color=colour)
plt.show()

In [None]:
# One subplot per test feature on a 16x2 grid, overlaying the
# distribution of Class 0 rows (green) and Class 1 rows (red)
plt.figure(figsize=(15, 80))
for k, col in enumerate(X_test.columns):
    plt.subplot(16, 2, k + 1)
    for class_label, colour in ((0, 'g'), (1, 'r')):
        sns.distplot(X_test.loc[df['Class'] == class_label, col], color=colour)
plt.show()

### If there is skewness present in the distribution use:
- <b>Power Transformer</b> package present in the <b>preprocessing library provided by sklearn</b> to make distribution more gaussian

In [None]:
# Shapiro-Wilk test for checking normality of the feature variables in the train set
# NOTE(review): SciPy documents that Shapiro-Wilk p-values may be inaccurate for
# N > 5000 — confirm the train set size before relying on the isNormal column
test_for_normality(X_train)

In [None]:
# Shapiro-Wilk test for checking normality of the feature variables in the test set
# NOTE(review): SciPy documents that Shapiro-Wilk p-values may be inaccurate for
# N > 5000 — confirm the test set size before relying on the isNormal column
test_for_normality(X_test)

The results above show that none of the feature variables is normally distributed.

In [None]:
# Fit the power transform on the training split only, then apply it to both
# splits, so nothing from the test set influences the fitted parameters.
# The transformer returns a bare array, so the original row index is passed
# back explicitly: without it the frames get a fresh 0..n-1 index and
# label-aligned masks such as `df['Class'] == 0` would select the wrong rows.
tx = PowerTransformer()
tx.fit(X_train)
X_train = pd.DataFrame(tx.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(tx.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# Shapiro-Wilk test on the train set again, now on the power-transformed features
test_for_normality(X_train)

In [None]:
# Shapiro-Wilk test on the test set again, now on the power-transformed features
test_for_normality(X_test)

In [None]:
# plot the histogram of features from the train set to see the result
# The class masks come from y_train and are applied by position. X_train was
# rebuilt from the transformer output, so its row labels may no longer match
# those of df; y_train is still in the same row order as X_train.
train_class_masks = (
    ('g', (y_train == 0).to_numpy()),
    ('r', (y_train == 1).to_numpy()),
)
plt.figure(figsize=(15, 80))
for k, col in enumerate(X_train):
    plt.subplot(16, 2, k + 1)
    for colour, class_mask in train_class_masks:
        sns.distplot(X_train.loc[class_mask, col], color=colour)
plt.show()

In [None]:
# plot the histogram of features from the test set to see the result
# The class masks come from y_test and are applied by position. X_test was
# rebuilt from the transformer output, so its row labels may no longer match
# those of df; y_test is still in the same row order as X_test.
test_class_masks = (
    ('g', (y_test == 0).to_numpy()),
    ('r', (y_test == 1).to_numpy()),
)
plt.figure(figsize=(15, 80))
for k, col in enumerate(X_test):
    plt.subplot(16, 2, k + 1)
    for colour, class_mask in test_class_masks:
        sns.distplot(X_test.loc[class_mask, col], color=colour)
plt.show()

## Model Building
- Build different models on the imbalanced dataset and see the result

In [None]:
def run_cross_validation(model, X_train, y_train, cv):
    """Print the mean cross-validated accuracy, ROC AUC, F1, precision and recall.

    All five metrics are computed in a single cross-validation pass, so the
    model is fitted once per fold rather than once per fold and per metric.

    Arguments:
        model -- Estimator to evaluate
        X_train -- Training features
        y_train -- Training labels
        cv -- Cross-validation strategy, passed through to scikit-learn

    Returns:
        None -- The scores are printed only.
    """
    # Printed label -> scikit-learn scorer name, in the order they are reported
    reported_metrics = (
        ('Accuracy -> (TP+TN)/total:', 'accuracy'),
        ('ROC_AUC Score :', 'roc_auc'),
        ('F1 Score -> 2*(precision*recall)/(precision+recall):', 'f1'),
        ('Precision -> TP/(TP+FP):', 'precision'),
        ('Recall -> TP/(TP+FN):', 'recall'),
    )
    scores = model_selection.cross_validate(
        model,
        X_train,
        y_train,
        scoring=[scorer for _, scorer in reported_metrics],
        cv=cv,
        n_jobs=-1,
    )
    for label, scorer in reported_metrics:
        # With a list of scorer names the per-fold results are keyed "test_<name>"
        print(label, np.mean(scores['test_' + scorer]))
    

In [None]:
# Logistic Regression
def TuneLRModel(X_train, y_train, k=5):
    """Grid-search a LogisticRegression over C, penalty and solver.

    Uses shuffled stratified k-fold cross-validation, scores every candidate
    on accuracy, precision, recall, F1 and ROC AUC, and refits on the best AUC.
    The best penalty, solver, C and AUC are printed.

    Arguments:
        X_train -- Training features
        y_train -- Training labels
        k {int} -- Number of stratified folds (default 5)

    Returns:
        [tuple] -- (cv_results_subset, BEST_C, BEST_PENALTY, BEST_SOLVER) where
        cv_results_subset holds the params and mean test scores per candidate.
    """
    # 50 log-spaced candidates for C between 1e-4 and 1e4
    c_values = np.logspace(-4, 4, 50)

    # Two sub-grids so each penalty is only paired with the solvers listed for it
    search_space = [
        {'C': c_values, 'penalty': ['l1'], 'solver': ['saga', 'liblinear']},
        {'C': c_values, 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs']},
    ]

    lrSearch = GridSearchCV(
        LogisticRegression(random_state=123),
        param_grid=search_space,
        cv=StratifiedKFold(n_splits=k, shuffle=True, random_state=42),
        scoring={
            'Accuracy': 'accuracy',
            'Precision': 'precision',
            'Recall': 'recall',
            'F1': 'f1',
            'AUC': 'roc_auc',
        },
        refit='AUC',
        return_train_score=True,
        verbose=1,
        n_jobs=-1,
    )
    lrSearch.fit(X_train, y_train)

    # Read the winning hyperparameters off the refitted estimator
    best_settings = lrSearch.best_estimator_.get_params()
    BEST_C = best_settings['C']
    BEST_PENALTY = best_settings['penalty']
    BEST_SOLVER = best_settings['solver']

    print('Best Penalty:', BEST_PENALTY)
    print('Best Solver:', BEST_SOLVER)

    print('Best C:', BEST_C)
    print('AUC', lrSearch.best_score_)

    summary_columns = [
        'params',
        'mean_test_Accuracy',
        'mean_test_Precision',
        'mean_test_Recall',
        'mean_test_AUC',
        'mean_test_F1',
    ]
    cv_results_subset = pd.DataFrame(lrSearch.cv_results_).loc[:, summary_columns]

    return cv_results_subset, BEST_C, BEST_PENALTY, BEST_SOLVER

In [None]:
def TuneClassificationModel(X_train, y_train, estimator, param_grid, k=2):
    """Grid-search `estimator` over `param_grid` with shuffled stratified k-fold CV.

    Every candidate is scored on accuracy and ROC AUC; the search refits on
    the best AUC and prints the winning parameters and their AUC.

    Arguments:
        X_train -- Training features
        y_train -- Training labels
        estimator -- Estimator to tune
        param_grid -- Parameter grid handed to GridSearchCV
        k {int} -- Number of stratified folds (default 2)

    Returns:
        [DataFrame] -- Per candidate: params plus mean train/test accuracy and AUC
    """
    gridSearch = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=StratifiedKFold(n_splits=k, shuffle=True, random_state=42),
        scoring={'Accuracy': 'accuracy', 'AUC': 'roc_auc'},
        refit='AUC',
        return_train_score=True,
        verbose=1,
        n_jobs=-1,
    )
    gridSearch.fit(X_train, y_train)

    print('Best Params:', gridSearch.best_params_)
    print('AUC', gridSearch.best_score_)

    report_columns = [
        'params',
        'mean_train_Accuracy',
        'mean_test_Accuracy',
        'mean_train_AUC',
        'mean_test_AUC',
    ]
    return pd.DataFrame(gridSearch.cv_results_).loc[:, report_columns]
    

#### Logistic Regression on Imbalanced dataset

In [None]:
# Base logistic regression; the grid search is built inside TuneClassificationModel
lr = LogisticRegression(random_state=123)

# Candidate values for C: 50 points log-spaced between 1e-4 and 1e4
C = np.logspace(-4,4,50)

# Two sub-grids so each penalty is only paired with the solvers listed for it
param_grid = [
      {'C' :C, 'penalty': ['l1'], 'solver': [ 'saga', 'liblinear']},
      {'C':C, 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs']},
]

# 5-fold stratified search; the returned score table is displayed as the cell output
TuneClassificationModel(X_train, y_train, lr, param_grid, k=5)

#### RandomForest Classifier on Imbalanced dataset

In [None]:
# Base random forest; the grid search is built inside TuneClassificationModel
rf = RandomForestClassifier(random_state=123)


# Step 1 of the one-parameter-group-at-a-time tuning: tree depth (5, 10, 15, 20)
# and split criterion
param_grid = { 
    'max_depth': range(5, 25, 5), 
    'criterion': ["entropy", "gini"]
}
TuneClassificationModel(X_train, y_train, rf, param_grid, k=5)

In [None]:
# Random forest with criterion and max_depth fixed
# (presumably the best values from the previous search — outputs are not recorded here)
rf = RandomForestClassifier(criterion = 'entropy',
                            max_depth = 10,
                            random_state=123)


# Step 2: number of trees, 5 to 95 in steps of 10
param_grid = { 
     'n_estimators': range(5,100,10)
}

TuneClassificationModel(X_train, y_train, rf, param_grid, k=5)

In [None]:
# Random forest with criterion, max_depth and n_estimators fixed
# (presumably the best values from the previous searches — outputs are not recorded here)
rf = RandomForestClassifier(criterion = 'entropy',
                            max_depth = 10,
                            n_estimators = 85,
                            random_state=123)


# Step 3: features considered per split, 5 to 30 in steps of 5
param_grid = { 
     'max_features': range(5,35,5)
}

TuneClassificationModel(X_train, y_train, rf, param_grid, k=5)

In [None]:
# Random forest with criterion, max_depth, n_estimators and max_features fixed
# (presumably the best values from the previous searches — outputs are not recorded here)
rf = RandomForestClassifier(criterion = 'entropy',
                            max_depth = 10,
                            n_estimators = 85,
                            max_features=5,
                            random_state=123)


# Step 4: minimum samples per leaf, 100 to 350 in steps of 50
param_grid = { 
     'min_samples_leaf': range(100,400,50)
}

TuneClassificationModel(X_train, y_train, rf, param_grid, k=5)

In [None]:
# Random forest with every previously searched parameter fixed
# (presumably the best values from the previous searches — outputs are not recorded here)
rf = RandomForestClassifier(criterion = 'entropy',
                            max_depth = 10,
                            n_estimators = 85,
                            max_features=5,
                            min_samples_leaf=250,
                            random_state=123)


# Step 5: minimum samples required to split a node, 200 to 450 in steps of 50
param_grid = { 
      'min_samples_split': range(200,500,50)
}

TuneClassificationModel(X_train, y_train, rf, param_grid, k=5)

In [None]:
# Base k-nearest-neighbours classifier; the grid search is built inside TuneClassificationModel
knn = KNeighborsClassifier(n_jobs=-1)


# Odd neighbour counts from 3 to 19, with Minkowski power p = 1 and p = 2
param_grid = { 
      'n_neighbors':range(3,20,2),
      'p':[1,2]
}

# Only 2 folds here, unlike the 5 used for the other models
TuneClassificationModel(X_train, y_train, knn, param_grid, k=2)

In [None]:
# Logistic Regression
from sklearn import linear_model #import the package

# The template left both values blank (the second line was a syntax error).
# They mirror the settings already used for logistic regression earlier in
# this notebook — adjust as needed.
num_C = list(np.logspace(-4, 4, 50))  # candidate values for C
cv_num = 5  # number of cross-validation folds

#### Perform cross-validation on X_train & y_train to create:
- X_train_cv
- X_test_cv 
- y_train_cv
- y_test_cv 

In [None]:
#perform cross validation

#perform hyperparameter tuning

#print the evaluation result by choosing a evaluation metric

#print the optimum value of hyperparameters

### Similarly explore other algorithms by building models like:
- KNN
- SVM
- Decision Tree
- Random Forest
- XGBoost

#### Proceed with the model which shows the best result 
- Apply the best hyperparameter on the model
- Predict on the test dataset

In [None]:
# Final model on the imbalanced data: the random forest configuration from the
# last tuning cell above. min_samples_split was searched last and its result is
# not recorded in this notebook, so the default is kept — TODO set it once known.
clf = RandomForestClassifier(criterion='entropy',
                             max_depth=10,
                             n_estimators=85,
                             max_features=5,
                             min_samples_leaf=250,
                             random_state=123)
clf.fit(X_train, y_train)
# Evaluate on the held-out test split with ROC AUC, the metric the grid searches refit on
print('Test ROC AUC =', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

### Print the important features of the best model to understand the dataset
- This will not give much explanation on the already transformed dataset
- But it will help us in understanding if the dataset is not PCA transformed

In [None]:
var_imp = []
for i in clf.feature_importances_:
    var_imp.append(i)
print('Top var =', var_imp.index(np.sort(clf.feature_importances_)[-1])+1)
print('2nd Top var =', var_imp.index(np.sort(clf.feature_importances_)[-2])+1)
print('3rd Top var =', var_imp.index(np.sort(clf.feature_importances_)[-3])+1)

# Variable on Index-16 and Index-13 seems to be the top 2 variables
top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-1])
second_top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-2])

X_train_1 = X_train.to_numpy()[np.where(y_train==1.0)]
X_train_0 = X_train.to_numpy()[np.where(y_train==0.0)]

np.random.shuffle(X_train_0)

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [20, 20]

plt.scatter(X_train_1[:, top_var_index], X_train_1[:, second_top_var_index], label='Actual Class-1 Examples')
plt.scatter(X_train_0[:X_train_1.shape[0], top_var_index], X_train_0[:X_train_1.shape[0], second_top_var_index],
            label='Actual Class-0 Examples')
plt.legend()

## Model building with balancing Classes

##### Perform class balancing with :
- Random Oversampling
- SMOTE
- ADASYN

## Model Building
- Build different models on the balanced dataset and see the result

In [None]:
# Logistic Regression
from sklearn import linear_model #import the package

# The template left both values blank (the second line was a syntax error).
# They mirror the settings already used for logistic regression earlier in
# this notebook — adjust as needed.
num_C = list(np.logspace(-4, 4, 50))  # candidate values for C
cv_num = 5  # number of cross-validation folds

#### Perform cross-validation on X_train & y_train to create:
- X_train_cv
- X_test_cv 
- y_train_cv
- y_test_cv 

### Random Oversampling

In [None]:
from imblearn import over_sampling #- import the packages

#perform cross validation & then balance classes on X_train_cv & y_train_cv using Random Oversampling

#perform hyperparameter tuning

#print the evaluation result by choosing a evaluation metric

#print the optimum value of hyperparameters


### Similarly explore other algorithms on balanced dataset by building models like:
- KNN
- SVM
- Decision Tree
- Random Forest
- XGBoost

### Print the class distribution after applying SMOTE 

In [None]:
import warnings
warnings.filterwarnings("ignore")


sm = over_sampling.SMOTE(random_state=0)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
# Artificial minority samples and corresponding minority labels from SMOTE are appended
# below X_train and y_train respectively
# So to exclusively get the artificial minority samples from SMOTE, we do
# (np.asarray first: the resampled data may come back as a DataFrame, which
# does not support the [rows, column] positional indexing used in the plots below)
X_train_smote_1 = np.asarray(X_train_smote)[X_train.shape[0]:]

X_train_1 = X_train.to_numpy()[np.where(y_train==1.0)]
X_train_0 = X_train.to_numpy()[np.where(y_train==0.0)]


plt.rcParams['figure.figsize'] = [20, 20]
fig = plt.figure()

# Panel 1: real minority samples, first two feature columns
plt.subplot(3, 1, 1)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.legend()

# Panel 2: real minority samples against an equally sized slice of synthetic ones
plt.subplot(3, 1, 2)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.scatter(X_train_smote_1[:X_train_1.shape[0], 0], X_train_smote_1[:X_train_1.shape[0], 1],
            label='Artificial SMOTE Class-1 Examples')
plt.legend()

# Panel 3: real minority samples against an equally sized slice of majority samples
plt.subplot(3, 1, 3)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.scatter(X_train_0[:X_train_1.shape[0], 0], X_train_0[:X_train_1.shape[0], 1], label='Actual Class-0 Examples')
plt.legend()

In [None]:
#perform cross validation & then balance classes on X_train_cv & y_train_cv using SMOTE

#perform hyperparameter tuning

#print the evaluation result by choosing a evaluation metric

#print the optimum value of hyperparameters


##### Build models with other algorithms to see which performs best on the SMOTE-balanced data

### Print the class distribution after applying ADASYN

In [None]:
import warnings
warnings.filterwarnings("ignore")

from imblearn import over_sampling

ada = over_sampling.ADASYN(random_state=0)
X_train_adasyn, y_train_adasyn = ada.fit_resample(X_train, y_train)
# Artificial minority samples and corresponding minority labels from ADASYN are appended
# below X_train and y_train respectively
# So to exclusively get the artificial minority samples from ADASYN, we do
X_train_adasyn_1 = X_train_adasyn[X_train.shape[0]:]

X_train_1 = X_train.to_numpy()[np.where(y_train==1.0)]
X_train_0 = X_train.to_numpy()[np.where(y_train==0.0)]



import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [20, 20]
fig = plt.figure()

plt.subplot(3, 1, 1)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.legend()

plt.subplot(3, 1, 2)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.scatter(X_train_adasyn_1[:X_train_1.shape[0], 0], X_train_adasyn_1[:X_train_1.shape[0], 1],
            label='Artificial ADASYN Class-1 Examples')
plt.legend()

plt.subplot(3, 1, 3)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.scatter(X_train_0[:X_train_1.shape[0], 0], X_train_0[:X_train_1.shape[0], 1], label='Actual Class-0 Examples')
plt.legend()

In [None]:
#perform cross validation & then balance classes on X_train_cv & y_train_cv using ADASYN

#perform hyperparameter tuning

#print the evaluation result by choosing a evaluation metric

#print the optimum value of hyperparameters


##### Build models with other algorithms to see which performs best on the ADASYN-balanced data

### Select the oversampling method which shows the best result on a model
- Apply the best hyperparameter on the model
- Predict on the test dataset

In [None]:
# perform the best oversampling method on X_train & y_train
# NOTE(review): SMOTE is a placeholder choice — swap in whichever oversampler
# scored best once the comparison cells above have been filled in.
X_train_balanced, y_train_balanced = over_sampling.SMOTE(random_state=0).fit_resample(X_train, y_train)

# NOTE(review): these hyperparameters are the ones explored on the imbalanced
# data earlier in the notebook; re-tune on the balanced data before relying on them.
clf = RandomForestClassifier(criterion='entropy',
                             max_depth=10,
                             n_estimators=85,
                             max_features=5,
                             min_samples_leaf=250,
                             random_state=123)
clf.fit(X_train_balanced, y_train_balanced) # fit on the balanced dataset
# Evaluate on the untouched test split with ROC AUC, the metric the grid searches refit on
print('Test ROC AUC =', roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

### Print the important features of the best model to understand the dataset

In [None]:
var_imp = []
for i in clf.feature_importances_:
    var_imp.append(i)
print('Top var =', var_imp.index(np.sort(clf.feature_importances_)[-1])+1)
print('2nd Top var =', var_imp.index(np.sort(clf.feature_importances_)[-2])+1)
print('3rd Top var =', var_imp.index(np.sort(clf.feature_importances_)[-3])+1)

# Variable on Index-13 and Index-9 seems to be the top 2 variables
top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-1])
second_top_var_index = var_imp.index(np.sort(clf.feature_importances_)[-2])

X_train_1 = X_train.to_numpy()[np.where(y_train==1.0)]
X_train_0 = X_train.to_numpy()[np.where(y_train==0.0)]

np.random.shuffle(X_train_0)

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [20, 20]

plt.scatter(X_train_1[:, top_var_index], X_train_1[:, second_top_var_index], label='Actual Class-1 Examples')
plt.scatter(X_train_0[:X_train_1.shape[0], top_var_index], X_train_0[:X_train_1.shape[0], second_top_var_index],
            label='Actual Class-0 Examples')
plt.legend()

In [None]:
#### Print the FPR,TPR & select the best threshold from the roc curve

In [None]:
# ROC analysis on the training split, using the fitted model's predicted
# probability for the positive class (the template left the arguments blank
# and was missing a closing parenthesis).
y_train_pred_proba = clf.predict_proba(X_train)[:, 1]
print('Train auc =', metrics.roc_auc_score(y_train, y_train_pred_proba))
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_pred_proba)
# Pick the threshold where TPR - FPR is largest
threshold = thresholds[np.argmax(tpr-fpr)]
print(threshold)