### Importing all necessary libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')


from sklearn.preprocessing import StandardScaler, normalize
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier



from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_curve

In [None]:
# Load the competition's train and test CSV files from the Kaggle input directory.
train = pd.read_csv("../input/jobathon-may-2021-credit-card-lead-prediction/train.csv")
test = pd.read_csv("../input/jobathon-may-2021-credit-card-lead-prediction/test.csv")

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# (rows, columns) of the train and test frames.
train.shape,test.shape

In [None]:
train.info() ## Column dtypes and non-null counts of the training set

In [None]:
# Column dtypes and non-null counts of the test set.
test.info()

In [None]:
# Percentage (0-100) of null values per column in the training set
train.isnull().sum()/train.shape[0] *100

In [None]:
# Percentage (0-100) of null values per column in the test set
test.isnull().sum()/test.shape[0] *100

In [None]:
# Count the categorical (object-dtype) columns of the training set.
# The dtype is selected by the string "object": the `np.object` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
categorical = train.select_dtypes(include=["object"])
print("Categorical Features in Train Set:",categorical.shape[1])

# Count the numerical (float64 / int64) columns of the training set.
numerical = train.select_dtypes(include=[np.float64, np.int64])
print("Numerical Features in Train Set:",numerical.shape[1])

In [None]:
# Count the categorical (object-dtype) columns of the test set.
# The dtype is selected by the string "object": the `np.object` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
categorical = test.select_dtypes(include=["object"])
print("Categorical Features in Test Set:",categorical.shape[1])

# Count the numerical (float64 / int64) columns of the test set.
numerical = test.select_dtypes(include=[np.float64, np.int64])
print("Numerical Features in Test Set:",numerical.shape[1])

In [None]:
# Absolute number of null values per column in the training set.
train.isnull().sum()

In [None]:
# Absolute number of null values per column in the test set.
test.isnull().sum()

In [None]:
# Frequency of each Credit_Product value in the training set (value_counts skips nulls by default).
train['Credit_Product'].value_counts()

In [None]:
# Frequency of each Credit_Product value in the test set (value_counts skips nulls by default).
test['Credit_Product'].value_counts()

In [None]:
## Bar chart of how many training rows fall in each Is_Lead class (shows the class balance)
plt.figure(figsize=(8,5))
sns.set_style('whitegrid')
sns.countplot(x='Is_Lead', data=train, palette='RdBu_r')

## Handling missing Data
### 3 Basic Methods to deal with this problem
#### 1) To delete the Data rows containing NaN Values.  
Con: This also deletes data that may be important for the prediction, so it is not recommended when the dataset is small. Also, for this competition we cannot drop rows with null values, because the submission would then have the wrong number of rows (a dimension error).  
#### 2) To use Mode value  
Con: I tried filling with the mode value, but it gave lower accuracy and also results in bias.

#### 3) Here I fill null values with a separate "Missing" category instead of the mode value, which would introduce bias

In [None]:
# Work on independent copies: a plain assignment would only alias the raw
# frames, so the imputation and encoding below would silently modify
# `train` / `test` as well.
dummy3_train = train.copy()
dummy3_test = test.copy()

In [None]:
# Display the working training frame.
dummy3_train

In [None]:
# Treat a missing Credit_Product as its own category, "Missing".
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained form emits a
# FutureWarning in pandas 2.x and has no effect under Copy-on-Write.
dummy3_train["Credit_Product"] = dummy3_train["Credit_Product"].fillna('Missing')
dummy3_test["Credit_Product"] = dummy3_test["Credit_Product"].fillna('Missing')

In [None]:
# Number of nulls still left in Credit_Product after the fill.
dummy3_train["Credit_Product"].isnull().sum()

In [None]:
# Drop the ID column from the training frame so it is not used as a feature
# (the test frame keeps its ID until the submission step).
dummy3_train = dummy3_train.drop(columns=['ID'])

In [None]:
# Label-encode every object (string) column of the training frame in place.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

objList = dummy3_train.select_dtypes(include = "object").columns
# fit_transform re-fits the encoder for each column, so integer codes are
# assigned independently per feature.
for feat in objList:
    dummy3_train[feat] = le.fit_transform(dummy3_train[feat].astype(str))

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only added a stray "None" line.
dummy3_train.info()

In [None]:
# Label-encode every object (string) column of the test frame in place,
# except the row identifier, which is needed unchanged for the submission.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Exclude ID by name rather than by position: deleting index 0 would drop
# whichever object column happens to come first.
objList = dummy3_test.select_dtypes(include = "object").columns.drop("ID", errors="ignore")
# NOTE(review): the encoder is re-fitted on the test values, so the integer
# codes match the training encoding only if each column has the same set of
# categories in both files -- verify, or reuse encoders fitted on train.
for feat in objList:
    dummy3_test[feat] = le.fit_transform(dummy3_test[feat].astype(str))

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only added a stray "None" line.
dummy3_test.info()

In [None]:
# Split the encoded training frame into the target and the feature matrix.
target_column = dummy3_train['Is_Lead']
feature_frame = dummy3_train.drop(columns=['Is_Lead'])
X, y = feature_frame, target_column
del feature_frame, target_column

In [None]:
# Standardize each feature to zero mean and unit variance
# (StandardScaler does not bound the values to [-1, 1]).
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Then rescale every sample (row) to unit norm using sklearn's `normalize` defaults.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split below, so validation rows influence the preprocessing statistics --
# consider fitting on the training split only.
X = normalize(X)
X

### Modelling of Machine-Learning Models

In [None]:
# 20% data as validation set
# Hold out 20% of the rows as a validation set; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=22,
)

In [None]:
# Report the dimensions of each piece of the train/validation split.
for caption, split in (
    ("X_train shape", X_train),
    ("X_test shape", X_test),
    ("y_train shape", y_train),
    ("y_test shape", y_test),
):
    print(caption, split.shape)

In [None]:
# Baseline model: logistic regression with scikit-learn's default settings,
# fitted on the training split.
model_LR = LogisticRegression()
model_LR.fit(X_train,y_train)

In [None]:
# Evaluate logistic regression on the held-out validation split.
logpred = model_LR.predict(X_test)
print(confusion_matrix(y_test, logpred))
# Accuracy in percent. Multiply before rounding: round(acc, 2) * 100 can
# print float artifacts such as 56.99999999999999 and discards all
# sub-percent precision.
print(round(accuracy_score(y_test, logpred) * 100, 2))

In [None]:
# Validation metrics for logistic regression, kept for the comparison table.
logacc, logf1score, logrecall, logbal = (
    score(y_test, logpred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve together with the dashed chance diagonal.

    fpr, tpr: false-positive and true-positive rates to draw, e.g. the first
    two values returned by sklearn.metrics.roc_curve.
    """
    plt.plot(fpr, tpr, label='ROC', color='orange')
    plt.plot([0, 1], [0, 1], linestyle='--', color='darkblue')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()


# ROC curve and AUC for logistic regression on the validation split.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = model_LR.predict_proba(X_test)[:, 1]
lrauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", lrauc)

In [None]:
# Gaussian Naive Bayes: fit on the training split, evaluate on validation.
nb = GaussianNB().fit(X_train, y_train)
nb_pred = nb.predict(X_test)
print(confusion_matrix(y_test, nb_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, nb_pred) * 100, 2))

In [None]:
# Validation metrics for Gaussian Naive Bayes, kept for the comparison table.
nbacc, nbf1score, nbrecall, nbbal = (
    score(y_test, nb_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for Gaussian Naive Bayes on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = nb.predict_proba(X_test)[:, 1]
nbauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", nbauc)

In [None]:
# Linear model trained with SGD. The "modified_huber" loss is used here and
# predict_proba is called on this model in the following ROC cell.
sgd = SGDClassifier(loss="modified_huber", shuffle=True, random_state=101).fit(X_train, y_train)
sgd_pred = sgd.predict(X_test)
print(confusion_matrix(y_test, sgd_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, sgd_pred) * 100, 2))

In [None]:
# Validation metrics for the SGD classifier, kept for the comparison table.
sgdacc, sgdf1score, sgdrecall, sgdbal = (
    score(y_test, sgd_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for the SGD classifier on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = sgd.predict_proba(X_test)[:, 1]
sgdauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", sgdauc)

In [None]:
# Decision tree limited to depth 10 with at least 30 samples per leaf;
# fitted on the training split and evaluated on validation.
dtree = DecisionTreeClassifier(
    max_depth=10,
    random_state=101,
    max_features=None,
    min_samples_leaf=30,
).fit(X_train, y_train)
dtree_pred = dtree.predict(X_test)
print(confusion_matrix(y_test, dtree_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, dtree_pred) * 100, 2))

In [None]:
# Validation metrics for the decision tree, kept for the comparison table.
dtreeacc, dtreef1score, dtreerecall, dtreebal = (
    score(y_test, dtree_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for the decision tree on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = dtree.predict_proba(X_test)[:, 1]
dtreeauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", dtreeauc)

In [None]:
# LightGBM with default settings: fit on the training split, evaluate on validation.
lgb = LGBMClassifier()
lgb.fit(X_train, y_train)
lgb_pred = lgb.predict(X_test)
print(confusion_matrix(y_test, lgb_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, lgb_pred) * 100, 2))

In [None]:
# Validation metrics for LightGBM, kept for the comparison table.
lgbacc, lgbf1score, lgbrecall, lgbbal = (
    score(y_test, lgb_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for LightGBM on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = lgb.predict_proba(X_test)[:, 1]
lgbauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", lgbauc)

In [None]:
# XGBoost with default settings: fit on the training split, evaluate on validation.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
print(confusion_matrix(y_test, xgb_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, xgb_pred) * 100, 2))

In [None]:
# Validation metrics for XGBoost, kept for the comparison table.
xgbacc, xgbf1score, xgbrecall, xgbbal = (
    score(y_test, xgb_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for XGBoost on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = xgb.predict_proba(X_test)[:, 1]
xgbauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", xgbauc)

In [None]:
# AdaBoost with default settings: fit on the training split, evaluate on validation.
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)
ada_pred = ada.predict(X_test)
print(confusion_matrix(y_test, ada_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, ada_pred) * 100, 2))

In [None]:
# Validation metrics for AdaBoost, kept for the comparison table.
adaacc, adaf1score, adarecall, adabal = (
    score(y_test, ada_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for AdaBoost on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = ada.predict_proba(X_test)[:, 1]
adaauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", adaauc)

In [None]:
# Multi-layer perceptron with default settings: fit on the training split,
# evaluate on validation.
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
mlp_pred = mlp.predict(X_test)
print(confusion_matrix(y_test, mlp_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, mlp_pred) * 100, 2))

In [None]:
# Validation metrics for the MLP, kept for the comparison table.
mlpacc, mlpf1score, mlprecall, mlpbal = (
    score(y_test, mlp_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for the MLP on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = mlp.predict_proba(X_test)[:, 1]
mlpauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", mlpauc)

In [None]:
# Random forest with default settings: fit on the training split, evaluate on validation.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print(confusion_matrix(y_test, rf_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, rf_pred) * 100, 2))

In [None]:
# Validation metrics for the random forest, kept for the comparison table.
rfacc, rff1score, rfrecall, rfbal = (
    score(y_test, rf_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for the random forest on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = rf.predict_proba(X_test)[:, 1]
rfauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", rfauc)

In [None]:
# Gradient boosting with default settings: fit on the training split,
# evaluate on validation.
gr = GradientBoostingClassifier()
gr.fit(X_train, y_train)
gr_pred = gr.predict(X_test)
print(confusion_matrix(y_test, gr_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, gr_pred) * 100, 2))

In [None]:
# Validation metrics for gradient boosting, kept for the comparison table.
gracc, grf1score, grrecall, grbal = (
    score(y_test, gr_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for gradient boosting on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = gr.predict_proba(X_test)[:, 1]
grauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", grauc)

In [None]:
# CatBoost with default settings: fit on the training split, evaluate on validation.
cat = CatBoostClassifier()
cat.fit(X_train, y_train)
cat_pred = cat.predict(X_test)
print(confusion_matrix(y_test, cat_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, cat_pred) * 100, 2))

In [None]:
# Validation metrics for CatBoost, kept for the comparison table.
catacc, catf1score, catrecall, catbal = (
    score(y_test, cat_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for CatBoost on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = cat.predict_proba(X_test)[:, 1]
catauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", catauc)

### Comparing the Models

In [None]:
# One row per fitted model:
# (name, accuracy, F1, recall, balanced accuracy, ROC AUC), all measured on
# the validation split in the cells above.
models = [('Logistic Regression', logacc, logf1score, logrecall, logbal, lrauc),
          ('Naive_Bayes', nbacc, nbf1score, nbrecall, nbbal, nbauc),
          ('SGD Classifier', sgdacc, sgdf1score, sgdrecall, sgdbal, sgdauc),
          ('Decision TreeClassifier', dtreeacc, dtreef1score, dtreerecall, dtreebal, dtreeauc),
          ('LGBM Classifier', lgbacc, lgbf1score, lgbrecall, lgbbal, lgbauc),
          ('XGB Classifier', xgbacc, xgbf1score, xgbrecall, xgbbal, xgbauc),
          ('AdaBoost Classifier', adaacc, adaf1score, adarecall, adabal, adaauc),
          ('MLP Classifier', mlpacc, mlpf1score, mlprecall, mlpbal, mlpauc),
          ('RandomForest Classifier', rfacc, rff1score, rfrecall, rfbal, rfauc),
          ('Gradient Boosting Classifier', gracc, grf1score, grrecall, grbal, grauc),
          ('CatBoost Classifier', catacc, catf1score, catrecall, catbal, catauc)]

In [None]:
# Tabulate the collected metrics and shade each column with a green gradient
# so the highest values stand out.
predict = pd.DataFrame(data=models, columns=['Models', 'Accuracy of model', 'F1 Score', 'Recall Score', 'Balanced Accuracy Score', 'ROC AUC Score'])
cm = sns.light_palette("green", as_cmap=True)
s = predict.style.background_gradient(cmap=cm)
s

In [None]:
# Horizontal bar chart comparing the models by ROC AUC.
sns.set(style="whitegrid")
ax = sns.barplot(y="Models", x="ROC AUC Score", data=predict)

### Hyper-Parameter Tuning and  Ensemble Top Model

The Top Models are:  
1) LGBM Classifier  
2) XGB Classifier  
3) CatBoost Classifier  
4) MLP Classifier  
5) Gradient Boosting Classifier  

Parameter Tuning using RandomizedSearchCV

In [None]:
# from scipy.stats import randint
# from sklearn.model_selection import RandomizedSearchCV

# cbc = CatBoostClassifier()

# # Creating the hyperparameter grid
# param_dist = { "learning_rate": np.linspace(0,0.2,5),
#               "max_depth": randint(3, 10)}
               
# #Instantiate RandomSearchCV object
# rscv = RandomizedSearchCV(cbc , param_dist, scoring='roc_auc', cv =5)

# #Fit the model
# rscv.fit(X_train,y_train)

In [None]:
# # Print the tuned parameters and score
# print(rscv.best_params_)
# print(rscv.best_score_)

In [None]:
# lgb = LGBMClassifier()
# rs_params = {

#         'bagging_fraction': (0.5, 0.8),
#         'bagging_frequency': (5, 8),

#         'feature_fraction': (0.5, 0.8),
#         'max_depth': (10, 13),
#         'min_data_in_leaf': (90, 120),
#         'num_leaves': (1200, 1550)

# }

# # Initialize a RandomizedSearchCV object using 5-fold CV (LGBM estimator with its own grid)
# rs_cv = RandomizedSearchCV(lgb, rs_params, scoring='roc_auc', cv =5)

# # Train on training data-
# rs_cv.fit(X_train, y_train)

In [None]:
# print(rs_cv.best_params_)
# print(rs_cv.best_score_)

Ensembling CatBoostClassifier and LGBMClassifier

In [None]:
from sklearn.ensemble import StackingClassifier


# Level-0 (base) models of the stack: LightGBM and CatBoost with fixed
# learning_rate / max_depth values (presumably taken from the commented-out
# randomized search above -- confirm).
level0 = list()
level0.append(('lgb', LGBMClassifier(learning_rate = 0.05, max_depth = 3)))
level0.append(('cat', CatBoostClassifier(learning_rate = 0.15000000000000002, max_depth = 3)))

lr = LogisticRegression() ## Final (meta) estimator that combines the base models' outputs
# Stacking ensemble; cv=10 is passed to StackingClassifier's cross-validation setting.
model = StackingClassifier(estimators=level0, final_estimator=lr, cv=10)

# Fit the ensemble on the training split only (the validation split stays held out).
model.fit(X_train, y_train)

In [None]:
# Evaluate the stacked ensemble on the held-out validation split.
model_pred = model.predict(X_test)
print(confusion_matrix(y_test, model_pred))
# Accuracy in percent; multiply before rounding to avoid float artifacts
# (e.g. 56.99999999999999) and keep two decimals of precision.
print(round(accuracy_score(y_test, model_pred) * 100, 2))

In [None]:
# Validation metrics for the stacked ensemble, kept for the final comparison table.
modacc, modf1score, modrecall, modbal = (
    score(y_test, model_pred)
    for score in (accuracy_score, f1_score, recall_score, balanced_accuracy_score)
)

In [None]:
# ROC curve and AUC for the stacked ensemble on the validation split.
# plot_roc_curve is the helper already defined in the first ROC cell of this
# notebook; the verbatim re-definition that used to sit here was redundant.
# Column 1 of predict_proba is the score of the model's second class
# (presumably Is_Lead == 1).
probs = model.predict_proba(X_test)[:, 1]
modauc = roc_auc_score(y_test, probs)
fpr, tpr, thresholds = roc_curve(y_test, probs)
plot_roc_curve(fpr, tpr)
print("AUC-ROC :", modauc)

### Comparing Top Models after Hyperparameter Tuning

In [None]:
# Shortlist for the final comparison. The single-model rows reuse the metrics
# computed earlier from the default-configured models; only the
# 'Ensemble Model' row comes from the stacked model fitted above.
models = [('LGBM Classifier', lgbacc, lgbf1score, lgbrecall, lgbbal, lgbauc),
          ('XGB Classifier', xgbacc, xgbf1score, xgbrecall, xgbbal, xgbauc),
          ('CatBoost Classifier', catacc, catf1score, catrecall, catbal, catauc),
          ('MLP Classifier', mlpacc, mlpf1score, mlprecall, mlpbal, mlpauc),
          ('Gradient Boosting Classifier', gracc, grf1score, grrecall, grbal, grauc),
          ('Ensemble Model', modacc, modf1score, modrecall, modbal, modauc)]

In [None]:
# Tabulate the shortlisted models and shade each column with a violet
# gradient so the highest values stand out.
predict = pd.DataFrame(data=models, columns=['Models', 'Accuracy of model', 'F1 Score', 'Recall Score', 'Balanced Accuracy Score', 'ROC AUC Score'])
cm = sns.light_palette("violet", as_cmap=True)
s = predict.style.background_gradient(cmap=cm)
s

### Final Submission

In [None]:
# Keep a copy that still has the ID column for building the submission file.
test1 = dummy3_test.copy()
dummy3_test = dummy3_test.drop('ID', axis=1)
# Apply the scaler that was fitted on the training features. Calling
# fit_transform here would re-estimate mean/variance from the test set, so
# the model would see inputs on a different scale than it was trained on.
dummy3_test = scaler.transform(dummy3_test)
# Same per-row normalization that was applied to the training features.
dummy3_test = normalize(dummy3_test)
dummy3_test

In [None]:
# Score the test set with the stacked ensemble; column 1 of predict_proba is
# the score of the model's second class (presumably Is_Lead == 1).
probstest = model.predict_proba(dummy3_test)[:, 1]
test1["Is_Lead"] = probstest

# Write only the ID and predicted-probability columns to the submission file.
test1.to_csv("Final-Submission.csv", columns=["ID", "Is_Lead"], index=False)
test1[["ID", "Is_Lead"]].head()
print("Submission Successful")