In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found under it.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Raise pandas' display limits (columns, line width, rows) to 500 so wide frames are not truncated.
for display_option in ('display.max_columns', 'display.width', 'display.max_rows'):
    pd.set_option(display_option, 500)

In [None]:
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import category_encoders as ce

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import RandomizedSearchCV
# from sklearn.feature_selection import RFECV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report

In [None]:
df_train_data = pd.read_csv('/kaggle/input/analytics-vidhya-job-a-thon-may-2021/train_s3TEQDk.csv')
df_train_data.head()

In [None]:
df_test_data = pd.read_csv('/kaggle/input/analytics-vidhya-job-a-thon-may-2021/test_mSzZ8RL.csv')
df_test_data.head()

In [None]:
train_data = df_train_data.copy()
test_data = df_test_data.copy()

In [None]:
train_data.shape, test_data.shape

In [None]:
100*len(test_data)/len(train_data)

In [None]:
def details(df):
    """Build a per-column summary of ``df``.

    The returned DataFrame is indexed by the column names of ``df`` and has
    four columns: 'Missing Values' (null count), '% of Total Missing Values'
    (null count as a percentage of ``len(df)``), 'Data_Type' (column dtype)
    and 'Unique values' (``nunique`` of the column).
    """
    null_counts = df.isnull().sum()
    summary_parts = {
        'Missing Values': null_counts,
        '% of Total Missing Values': 100 * (null_counts / len(df)),
        'Data_Type': df.dtypes,
        'Unique values': df.nunique(),
    }
    # Passing the headings as `keys` names the columns directly, so no rename step is needed.
    return pd.concat(list(summary_parts.values()), axis=1, keys=list(summary_parts.keys()))

In [None]:
details(train_data)

In [None]:
details(test_data)

In [None]:
cols_obj = train_data.select_dtypes('object').columns
cols_obj

In [None]:
cols_num = train_data.select_dtypes('number').columns
cols_num

In [None]:
train_data.Credit_Product.value_counts(dropna=False)

In [None]:
train_data.Is_Active.value_counts().plot.bar()
plt.show()

In [None]:
train_data.groupby('Is_Active')['Credit_Product'].count().plot.bar()
plt.show()

We cannot drop the rows containing NaNs, because the submission file must have the same number of rows as the test file. One option is therefore to impute this categorical column with its mode, which is 'No'.

In [None]:
# mode_imputation = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
# train_data['Credit_Product'] = mode_imputation.fit_transform(train_data[['Credit_Product']]).ravel()
# test_data['Credit_Product'] = mode_imputation.transform(test_data[['Credit_Product']]).ravel()

Instead of imputing with mode value we can create a separate category

In [None]:
# Give missing Credit_Product values their own explicit category in both frames.
train_data['Credit_Product'] = train_data['Credit_Product'].fillna("Others")
test_data['Credit_Product'] = test_data['Credit_Product'].fillna("Others")

In [None]:
train_data.Credit_Product.value_counts().plot.bar()
plt.show()

In [None]:
train_data.Gender.value_counts().plot.bar()
plt.show()

In [None]:
train_data.Region_Code.value_counts().plot.bar()
plt.show()

In [None]:
train_data.Occupation.value_counts().plot.bar()
plt.show()

In [None]:
train_data.Channel_Code.value_counts().plot.bar()
plt.show()

In [None]:
train_data.describe(percentiles=(0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99)).apply(lambda s: s.apply('{0:.5f}'.format))

Data Imbalance Check

In [None]:
train_data.Is_Lead.value_counts()

In [None]:
# Percentage share of each target class; value_counts orders the classes by frequency.
lead_share = round(100*train_data['Is_Lead'].value_counts(normalize=True),2)
lead_share.plot(kind='pie', figsize=(6, 6), autopct='%1.2f%%')
plt.title("Lead and Non-Lead Distribution")
# Build the legend from the actual wedge order instead of assuming class 0 is the majority,
# so the labels stay correct whichever class is more frequent.
class_names = {0: "Non-Lead", 1: "Lead"}
plt.legend([class_names.get(label, label) for label in lead_share.index])
plt.show()

Visualizations

In [None]:
# %%time
# row,col,c = 14,3,1
# fig = plt.figure(figsize=(30,80), dpi= 200)

# for i in list(cols_obj):
#     plt.subplot(row,col,c)
#     plt.title(f'{i},subplot:{row}{col}{c}')
#     plt.xlabel(i)
#     train_data[i].value_counts().plot.bar()
#     c = c + 1
# plt.tight_layout()
# plt.show()

In [None]:
sns.barplot(data=train_data, x='Is_Lead', y='Avg_Account_Balance')
plt.show()

In [None]:
sns.barplot(data=train_data, y='Is_Lead', x='Gender')
plt.show()

In [None]:
# 3x2 grid: one count plot per categorical feature, split by the Is_Lead target.
fig, axes = plt.subplots(nrows=3, ncols = 2, figsize=(50,30))

# Listed in row-major panel order: (0,0), (0,1), (1,0), (1,1), (2,0), (2,1).
panel_features = ["Gender", "Region_Code", "Occupation", "Channel_Code", "Credit_Product", "Is_Active"]

for ax, feature in zip(axes.ravel(), panel_features):
    sns.countplot(x=feature, hue='Is_Lead', data=train_data, ax=ax)
    ax.set_title(feature)
    # plt.xticks() acts only on the current (last-created) axes, so the rotation
    # has to be set on each panel's own axis to take effect everywhere.
    ax.tick_params(axis='x', labelrotation=45)

plt.show()

Binary Encoding

In [None]:
#This can be kept before new df creation since its common
train_data.Gender = train_data.Gender.map({'Male':1,'Female':0})
train_data.Credit_Product = train_data.Credit_Product.map({'Yes':1,'No':0,'Others':3})
train_data.Is_Active = train_data.Is_Active.map({'Yes':1,'No':0})

In [None]:
# Apply the same integer coding to the test frame ('Others' is coded as 3).
# NOTE(review): Series.map yields NaN for any value missing from the dict, so running
# this cell a second time on the already-encoded columns would turn them into NaN.
test_data.Gender = test_data.Gender.map({'Male':1,'Female':0})
test_data.Credit_Product = test_data.Credit_Product.map({'Yes':1,'No':0,'Others':3})
test_data.Is_Active = test_data.Is_Active.map({'Yes':1,'No':0})

In [None]:
#OHE
#This can be kept before new df creation since its common
# One-hot encode the three nominal columns. The encoder is fitted on the training frame
# only and the fitted categories are then applied to the test frame.
# NOTE(review): no handle_unknown is set, so a test category unseen in training would
# presumably make transform() raise — confirm the test levels are a subset of the train levels.
# NOTE(review): `sparse=` and `get_feature_names()` appear to have been renamed/removed in
# later scikit-learn releases (`sparse_output=`, `get_feature_names_out()`) — confirm
# against the installed version before upgrading.
ohe = OneHotEncoder(sparse=False)
ohe_df1 = pd.DataFrame(ohe.fit_transform(train_data[['Region_Code','Occupation','Channel_Code']]),columns=ohe.get_feature_names())
ohe_df2 = pd.DataFrame(ohe.transform(test_data[['Region_Code','Occupation','Channel_Code']]),columns=ohe.get_feature_names())

In [None]:
# Append the one-hot columns side by side with the existing features.
# `axis` is passed by keyword: passing it positionally is deprecated and is rejected by pandas >= 2.0.
train_data = pd.concat([train_data, ohe_df1], axis=1)
test_data = pd.concat([test_data, ohe_df2], axis=1)

In [None]:
train_data.head()

In [None]:
# Remove the original string columns now that their one-hot versions are attached.
# `axis` is passed by keyword: passing it positionally is deprecated and is rejected by pandas >= 2.0.
train_data.drop(['Region_Code','Occupation','Channel_Code'], axis=1, inplace=True)
test_data.drop(['Region_Code','Occupation','Channel_Code'], axis=1, inplace=True)

Keeping copy of dataset

In [None]:
df_train = train_data.copy()
df_test = test_data.copy()

Bin Age and Vintage into intervals

In [None]:
# Shared bin edges and labels so the train and test frames are discretised identically.
age_edges = [0,25,50,75,100]
age_labels = ['<25','25-50','50-75','>75']
vintage_edges = [0,15,30,45,60,75,90,105,120,200]
vintage_labels = ['<15','15-30','30-45','45-60','60-75','75-90','90-105','105-120','>120']

# Overwrite the numeric Age and Vintage columns with their binned categories, train first, then test.
for frame in (df_train, df_test):
    frame['Age'] = pd.cut(frame['Age'], age_edges, labels=age_labels)
    frame['Vintage'] = pd.cut(frame['Vintage'], vintage_edges, labels=vintage_labels)

In [None]:
df_train['Age'].value_counts()

In [None]:
df_train['Vintage'].value_counts()

In [None]:
# Side-by-side horizontal count plots of the binned Age and Vintage columns.
bins_fig, (ax_age, ax_vintage) = plt.subplots(1, 2, figsize=(16,8))
sns.countplot(y="Age", data=df_train, ax=ax_age)
ax_age.set_title('Age')
sns.countplot(y="Vintage", data=df_train, ax=ax_vintage)
ax_vintage.set_title('Vintage')

In [None]:
# One-hot encode the binned Age and Vintage columns. This re-fits the same `ohe` object,
# replacing the categories it learned for the nominal columns earlier; the test frame is
# encoded with the categories learned from the training frame.
ohe_df3 = pd.DataFrame(ohe.fit_transform(df_train[['Age','Vintage']]),columns=ohe.get_feature_names())
ohe_df4 = pd.DataFrame(ohe.transform(df_test[['Age','Vintage']]),columns=ohe.get_feature_names())

In [None]:
# Append the one-hot bin columns side by side with the existing features.
# `axis` is passed by keyword: passing it positionally is deprecated and is rejected by pandas >= 2.0.
df_train = pd.concat([df_train, ohe_df3], axis=1)
df_test = pd.concat([df_test, ohe_df4], axis=1)

In [None]:
# Remove the categorical bin columns now that their one-hot versions are attached.
# `axis` is passed by keyword: passing it positionally is deprecated and is rejected by pandas >= 2.0.
df_train.drop(['Age','Vintage'], axis=1, inplace=True)
df_test.drop(['Age','Vintage'], axis=1, inplace=True)

In [None]:
df_train.head()

Data Preparation for Machine Learning

In [None]:
# Features are every column except the row identifier and the target.
# `axis` is passed by keyword: passing it positionally is deprecated and is rejected by pandas >= 2.0.
X = df_train.drop(['ID','Is_Lead'], axis=1)
y = df_train['Is_Lead']
# 70/30 train/validation split with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=23)

In [None]:
df_train.shape

In [None]:
cols_obj = df_train.select_dtypes('object').columns
cols_obj

In [None]:
cols_num = df_train.select_dtypes('number').columns
cols_num

Distplot

In [None]:
plt.figure(figsize=(15,8))
# for i in enumerate(cols):
#     plt.subplot(1,3,i[0]+1)
sns.distplot(X_train['Avg_Account_Balance'])
plt.show()

Power transformer

In [None]:
# Fit the power transform on the training split only, then apply the fitted transform
# to the validation split and the test frame.
# NOTE(review): each column is overwritten with its transformed values, so re-running
# this cell would fit on, and transform, already-transformed data.
pt = PowerTransformer(copy=False)
X_train[['Avg_Account_Balance']] = pt.fit_transform(X_train[['Avg_Account_Balance']])
X_val[['Avg_Account_Balance']] = pt.transform(X_val[['Avg_Account_Balance']])
df_test[['Avg_Account_Balance']] = pt.transform(df_test[['Avg_Account_Balance']])

In [None]:
plt.figure(figsize=(15,8))
# for i in enumerate(cols):
#     plt.subplot(1,3,i[0]+1)
sns.distplot(X_train['Avg_Account_Balance'])
plt.show()

In [None]:
X_train.head()

In [None]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

ML

In [None]:
# Models
# Accumulators appended to by every evaluation run:
# the run's label, its train AUC-ROC and its validation AUC-ROC.
model_list = []
AUCROC_train, AUCROC_val = [], []

In [None]:
model_LR = LogisticRegression()

In [None]:
def model_fit(model, X_train, y_train, X_val, y_val, algo=None):
    """Fit ``model`` on the training split and report train/validation metrics.

    Prints the confusion matrix, classification report and AUC-ROC for both
    splits, draws a ROC curve for each split, and appends the run to the
    module-level ``model_list``, ``AUCROC_train`` and ``AUCROC_val`` lists.

    Parameters
    ----------
    model : estimator providing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels.
    algo : label recorded in ``model_list`` for this run (default ``None``).

    Returns
    -------
    None
    """
    # Fit and score the estimator that was passed in (not the global model_LR),
    # so each call evaluates its own model.
    model.fit(X_train, y_train)

    y_train_prob = model.predict_proba(X_train)
    y_train_pred = model.predict(X_train)
    y_val_prob = model.predict_proba(X_val)
    y_val_pred = model.predict(X_val)

    matrix_train = confusion_matrix(y_train, y_train_pred)
    matrix_val = confusion_matrix(y_val, y_val_pred)
    report_train = classification_report(y_train, y_train_pred)
    report_val = classification_report(y_val, y_val_pred)
    # Column 1 of predict_proba is used as the score for the AUC computation.
    auc_train = roc_auc_score(y_train, y_train_prob[:,1])
    auc_val = roc_auc_score(y_val, y_val_prob[:,1])

    print('Confusion Matrix for train')
    print('='*60)
    print(matrix_train,"\n")
    print('Confusion Matrix for val')
    print('='*60)
    print(matrix_val,"\n")
    print('Classification Report for train')
    print('='*60)
    print(report_train,"\n")
    print('Classification Report for val')
    print('='*60)
    print(report_val,"\n")
    print('AUC-ROC for train')
    print('='*60)
    print(auc_train,'\n')
    print('AUC-ROC for val')
    print('='*60)
    print(auc_val,'\n')
    print('Roc-Auc-Curve for Train set')
    print('='*60)
    print(plot_roc_curve(model, X_train, y_train),'\n')
    print('Roc-Auc-Curve for Val set')
    print('='*60)
    print(plot_roc_curve(model, X_val, y_val),'\n')

    # Record this run for the comparison table built from these lists later.
    model_list.append(algo)
    AUCROC_train.append(auc_train)
    AUCROC_val.append(auc_val)

In [None]:
def model_fit_evaluation(model, params, X_train, y_train, X_val, y_val, algo=None):
    """Tune ``model`` with a randomized search and report train/validation metrics.

    Runs a 5-fold ``RandomizedSearchCV`` scored on ROC-AUC over ``params``,
    prints the best estimator, parameters and score, then prints the confusion
    matrix, classification report and AUC-ROC of the best estimator on both
    splits and draws a ROC curve for each. The run is appended to the
    module-level ``model_list``, ``AUCROC_train`` and ``AUCROC_val`` lists.

    Parameters
    ----------
    model : estimator to tune.
    params : parameter distributions / lists passed to the search.
    X_train, y_train : training features and labels (used for the search).
    X_val, y_val : validation features and labels.
    algo : label recorded in ``model_list`` for this run (default ``None``).

    Returns
    -------
    None
    """
    search = RandomizedSearchCV(model, params, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=23)
    search.fit(X_train, y_train)
    tuned = search.best_estimator_

    print('\n')
    print('best estimator : ', tuned)
    print('best parameters: ', search.best_params_)
    print('best score: ', search.best_score_)
    print('\n')

    train_proba = tuned.predict_proba(X_train)
    train_labels = tuned.predict(X_train)
    val_proba = tuned.predict_proba(X_val)
    val_labels = tuned.predict(X_val)

    # Column 1 of predict_proba is used as the score for the AUC computation.
    auc_train = roc_auc_score(y_train, train_proba[:,1])
    auc_val = roc_auc_score(y_val, val_proba[:,1])

    divider = '='*60

    # (heading, value) pairs, printed in this order.
    report_sections = [
        ('Confusion Matrix for train', confusion_matrix(y_train, train_labels)),
        ('Confusion Matrix for val', confusion_matrix(y_val, val_labels)),
        ('Classification Report for train', classification_report(y_train, train_labels)),
        ('Classification Report for val', classification_report(y_val, val_labels)),
        ('AUC-ROC for train', auc_train),
        ('AUC-ROC for val', auc_val),
    ]
    for heading, value in report_sections:
        print(heading)
        print(divider)
        print(value, "\n")

    # The curve is drawn only after its heading has been printed.
    roc_sections = (
        ('Roc-Auc-Curve for Train set', X_train, y_train),
        ('Roc-Auc-Curve for Val set', X_val, y_val),
    )
    for heading, features, target in roc_sections:
        print(heading)
        print(divider)
        print(plot_roc_curve(tuned, features, target), '\n')

    # Record this run for the comparison table built from these lists later.
    model_list.append(algo)
    AUCROC_train.append(auc_train)
    AUCROC_val.append(auc_val)

In [None]:
# Untuned baseline with default settings, labelled "without" like the other untuned baselines.
model_fit(model_LR, X_train, y_train, X_val, y_val, algo='Logistic Regression without Hyperparameters')

In [None]:
# Search space for the logistic-regression tuning run.
# The solver is pinned to liblinear because it supports both 'l1' and 'l2'; the default
# lbfgs solver rejects penalty='l1', so every sampled 'l1' candidate would fail to fit
# and be scored as NaN.
params_LR = {'C':np.logspace(-1, 5, 10), 'class_weight':[None,'balanced'], 'penalty':['l1','l2'], 'solver':['liblinear']}

In [None]:
model_fit_evaluation(model_LR, params_LR, X_train, y_train, X_val, y_val, algo='Logistic Regression with Hyperparameter tuning')

Decision tree and Random Forest

In [None]:
df_train_tree = train_data.copy()
df_test_tree = test_data.copy()

In [None]:
# Features are every column except the row identifier and the target.
# `axis` is passed by keyword: passing it positionally is deprecated and is rejected by pandas >= 2.0.
X = df_train_tree.drop(['ID','Is_Lead'], axis=1)
y = df_train_tree['Is_Lead']
# 70/30 train/validation split with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=23)

In [None]:
# Decision-tree estimator (fixed seed) and the candidate hyperparameter values
# that the randomized search in model_fit_evaluation samples from.
model_DT = DecisionTreeClassifier(random_state=23)
params_DT = {
    'max_depth': [10, 20, 50, 100, 200],
    'min_samples_leaf': [10, 20, 50, 100, 200],
    'min_samples_split' : [10, 20, 50, 100, 200],
    'criterion': ["gini", "entropy"]
}

In [None]:
model_fit(model_DT, X_train, y_train, X_val, y_val, algo='Decision Tree without Hyperparameters')

In [None]:
%%time
model_fit_evaluation(model_DT, params_DT, X_train, y_train, X_val, y_val, algo='Decision Tree with Hyperparameter tuning')

In [None]:
# Random-forest estimator (fixed seed, out-of-bag scoring enabled) and the candidate
# hyperparameter values that the randomized searches below sample from.
model_RF = RandomForestClassifier(oob_score = True, random_state=23)
params_RF = {
    'n_estimators': [10, 20, 50, 100, 200],
    'max_depth': [10, 20, 50, 100, 200],
    'min_samples_leaf': [10, 20, 50, 100, 200],
    'min_samples_split' : [10, 20, 50, 100, 200],
    'criterion': ["gini", "entropy"]
}

In [None]:
model_fit(model_RF, X_train, y_train, X_val, y_val, algo='Random Forest without Hyperparameters')

In [None]:
%%time
model_fit_evaluation(model_RF, params_RF, X_train, y_train, X_val, y_val, algo='Random Forest with Hyperparameter tuning')

In [None]:
# model_XGB = XGBClassifier(random_state=23)
# params_XGB = {
#     'n_estimators': [5, 10, 20, 50, 100, 200],
#     'max_depth': [5, 10, 20, 50, 100, 200],
#     'sampling_method': ['uniform','gradient_based'],
#     'subsample': [0.2, 0.4, 0.5, 0.6, 0.8, 1],
#     'learning_rate': [0.01,0.05,0.1,0.2,0.3,0.5,1]
# }

In [None]:
# model_fit(model_XGB, X_train, y_train, X_val, y_val, algo='XGB without Hyperparameters')

In [None]:
#Not running due to lack of system properties
# model_fit_evaluation(model_XGB, params_XGB, X_train, y_train, X_val, y_val, algo='XGB with Hyperparameter tuning')

Evaluation

In [None]:
eval_df = pd.DataFrame({'model': model_list, 'train_AUC': AUCROC_train, 'val_AUC': AUCROC_val})
eval_df

Refitting on the entire training set to predict on the test set

In [None]:
%%time
# Repeat the random-forest randomized search, this time on the full training
# features/labels (X, y) rather than on the 70% training split.
rcv = RandomizedSearchCV(model_RF, params_RF, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=23)
rcv.fit(X, y)

# Best configuration found by the search; this estimator is used for the test predictions below.
rcv_best = rcv.best_estimator_
print('\n')
print('best estimator : ', rcv_best)
print('best parameters: ', rcv.best_params_)
print('best score: ', rcv.best_score_)
print('\n')

# Predictions on the same data that was passed to the search.
y_train_prob = rcv_best.predict_proba(X)
y_train_pred = rcv_best.predict(X)

# NOTE(review): these are in-sample metrics (scored on X, y themselves), so they are
# presumably optimistic; rcv.best_score_ above is the cross-validated figure.
matrix_train = confusion_matrix(y, y_train_pred)
report_train = classification_report(y, y_train_pred)
auc_train = roc_auc_score(y, y_train_prob[:,1])

print('Confusion Matrix for train')
print('='*60)
print(matrix_train,"\n")
print('Classification Report for train')
print('='*60)
print(report_train,"\n")
print('AUC-ROC for train')
print('='*60)
print(auc_train,'\n')
print('Roc-Auc-Curve for Train set')
print('='*60)
print(plot_roc_curve(rcv_best, X, y),'\n')

Feature importance

In [None]:
# Pair every feature name with the tuned forest's importance score and show the
# table sorted from most to least important. (The bare `rcv_best.feature_importances_`
# expression that preceded this had no effect and is dropped.)
final_df = pd.DataFrame({'Varname': X.columns, 'feature_imp':rcv_best.feature_importances_})
final_df.sort_values(by='feature_imp', ascending=False)

Predicting on test set

In [None]:
#Final prediction
predictions = rcv_best.predict(df_test_tree.drop('ID', axis=1))

#set the output as a dataframe and convert to csv file named submission.csv
submission = pd.DataFrame({ 'ID' : df_test_tree['ID'], 'Is_Lead': predictions })
submission.shape

In [None]:
submission.head(10)

In [None]:
test_data.head()

In [None]:
# Attach the predicted label to the encoded test frame, matching rows on ID.
test_final = test_data.merge(submission, on='ID')
test_final.head()

In [None]:
# Attach the predicted label to the raw (un-encoded) test frame, matching rows on ID.
test_final_original = df_test_data.merge(submission, on='ID')
test_final_original.head()

In [None]:
# Add binned versions of Age and Vintage (same edges and labels as used for training),
# then drop the raw numeric columns.
test_final['Age_binned'] = pd.cut(test_final['Age'], [0,25,50,75,100], labels=['<25','25-50','50-75','>75'])
test_final['Vintage_binned'] = pd.cut(test_final['Vintage'], [0,15,30,45,60,75,90,105,120,200], labels=['<15','15-30','30-45','45-60','60-75','75-90','90-105','105-120','>120'])
# `axis` is passed by keyword: passing it positionally is deprecated and is rejected by pandas >= 2.0.
test_final.drop(['Age','Vintage'], axis=1, inplace=True)
test_final.head()

In [None]:
# Side-by-side horizontal count plots of the binned Age and Vintage columns of the test frame.
bins_fig, (ax_age, ax_vintage) = plt.subplots(1, 2, figsize=(16,8))
sns.countplot(y="Age_binned", data=test_final, ax=ax_age)
ax_age.set_title('Age')
sns.countplot(y="Vintage_binned", data=test_final, ax=ax_vintage)
ax_vintage.set_title('Vintage')

In [None]:
# 3x2 grid: one count plot per categorical feature of the raw test frame,
# split by the predicted Is_Lead label.
fig, axes = plt.subplots(nrows=3, ncols = 2, figsize=(50,30))

# Listed in row-major panel order: (0,0), (0,1), (1,0), (1,1), (2,0), (2,1).
panel_features = ["Gender", "Region_Code", "Occupation", "Channel_Code", "Credit_Product", "Is_Active"]

for ax, feature in zip(axes.ravel(), panel_features):
    sns.countplot(x=feature, hue='Is_Lead', data=test_final_original, ax=ax)
    ax.set_title(feature)
    # plt.xticks() acts only on the current (last-created) axes, so the rotation
    # has to be set on each panel's own axis to take effect everywhere.
    ax.tick_params(axis='x', labelrotation=45)

plt.show()

In [None]:
sns.countplot(x="Age_binned", hue='Is_Lead', data=test_final)
plt.show()

In [None]:
sns.countplot(x="Vintage_binned", hue='Is_Lead', data=test_final)
plt.show()