In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the training CSV and preview the first rows.
train = pd.read_csv(
    '/kaggle/input/analytics-vidhya-job-a-thon-may-2021/train_s3TEQDk.csv'
)
train.head()

In [None]:
# Read the test CSV and preview the first rows.
test = pd.read_csv(
    '/kaggle/input/analytics-vidhya-job-a-thon-may-2021/test_mSzZ8RL.csv'
)
test.head()

In [None]:
# Show (rows, columns) for the training frame, then the test frame, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Count missing values in each column of the training frame.
train.isna().sum()

In [None]:
# Count missing values in each column of the test frame.
test.isna().sum()

In [None]:
# Fraction of training rows whose Credit_Product is missing.
# isna().mean() divides by the total number of rows; dividing the null count by
# .count() (which only counts non-null rows) would overstate the missing share.
train['Credit_Product'].isna().mean()

In [None]:
# Fraction of test rows whose Credit_Product is missing.
# isna().mean() divides by the total number of rows; dividing the null count by
# .count() (which only counts non-null rows) would overstate the missing share.
test['Credit_Product'].isna().mean()

In [None]:
# Remove the row identifier from both frames; it is not used as a feature.
# Reassigning with errors='ignore' keeps the cell idempotent: re-running it after
# ID is already gone no longer raises KeyError, unlike the in-place drop.
train = train.drop(columns='ID', errors='ignore')
test = test.drop(columns='ID', errors='ignore')

In [None]:
# Print column dtypes and non-null counts for the training frame.
train.info()

**Dividing columns into categorical and numerical for easier EDA**

In [None]:
# Object-typed columns are treated as categorical, everything else as numerical.
cat_cols = train.select_dtypes(include='object').columns.tolist()
num_cols = train.select_dtypes(exclude='object').columns.tolist()
# The target is not a feature, so it is taken out of the numerical list.
num_cols.remove('Is_Lead')
print(cat_cols)
print(num_cols)

# Exploratory Data Analysis

In [None]:
# One count plot per categorical column, laid out on a two-column grid whose
# row count is derived from the number of columns instead of being hard-coded.
grid_cols = 2
grid_rows = max(1, -(-len(cat_cols) // grid_cols))  # ceiling division
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(15, 12), squeeze=False)
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, cat_cols):
    # The column is passed as `x=`: newer seaborn releases interpret the first
    # positional argument as `data`, so the positional form no longer works.
    sns.countplot(x=train[col], ax=ax)
    ax.set_title(col)
# Hide any grid cells left over when the column count is odd.
for ax in flat_axes[len(cat_cols):]:
    ax.set_visible(False)
fig.tight_layout()

In [None]:
# One distribution plot per numerical column on a two-column grid sized from
# the number of columns.
grid_cols = 2
grid_rows = max(1, -(-len(num_cols) // grid_cols))  # ceiling division
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(15, 12), squeeze=False)
flat_axes = axes.ravel()
for ax, col in zip(flat_axes, num_cols):
    # histplot with a KDE overlay on a density scale replaces the deprecated
    # sns.distplot while showing the same histogram-plus-curve view.
    sns.histplot(x=train[col], kde=True, stat='density', ax=ax)
    ax.set_title(col)
# Hide any grid cells left over when the column count is odd.
for ax in flat_axes[len(num_cols):]:
    ax.set_visible(False)
fig.tight_layout()

In [None]:
# Pairwise relationships between the columns, coloured by the target class.
sns.pairplot(data=train, hue='Is_Lead')

**Insights:**
**The data is skewed**


In [None]:
# Correlation heatmap of the numeric columns.
# The frame still holds object (string) columns at this point, and recent pandas
# versions raise when DataFrame.corr() meets non-numeric data, so the numeric
# columns are selected explicitly (this form also works on older pandas).
plt.figure(figsize=(10, 8))
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True)

# Null Values

In [None]:
# Fill missing Credit_Product values with 'Yes' in both frames.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selected with df[col] is chained assignment, which does not update
# the parent frame once pandas copy-on-write is active.
train['Credit_Product'] = train['Credit_Product'].fillna('Yes')
test['Credit_Product'] = test['Credit_Product'].fillna('Yes')
train.info()

In [None]:
# Print column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Categorical columns that will be one-hot encoded; Region_Code is left out
# because it is label-encoded separately.
excluded = ('Region_Code',)
trans = [name for name in cat_cols if name not in excluded]
trans

# Feature Engineering

In [None]:
# One-hot encode the selected categorical columns of the training frame,
# dropping the first level of each to avoid a redundant column.
train_dum = pd.get_dummies(data=train[trans], drop_first=True)
train_dum.head()


In [None]:
# One-hot encode the test frame and align it to the training dummy columns.
# Encoding test independently with drop_first=True can yield a different column
# set (or drop a different baseline level) when a category is missing from one
# split. Encoding every level and reindexing to train_dum.columns guarantees the
# same columns in the same order; levels absent from test are filled with 0 and
# levels never seen in training are discarded.
test_dum = pd.get_dummies(test[trans]).reindex(columns=train_dum.columns, fill_value=0)
test_dum.head()

In [None]:
# Remove the raw categorical columns that have been one-hot encoded.
train = train.drop(columns=trans)
train.head()

In [None]:
# Append the one-hot columns to the training frame side by side.
train = pd.concat(objs=[train, train_dum], axis='columns')
train.head()

In [None]:
# Swap the raw categorical columns of the test frame for their one-hot columns.
test = pd.concat([test.drop(columns=trans), test_dum], axis=1)
test.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the Region_Code -> integer mapping from the training frame only, then
# apply that same mapping to both frames.
enc = LabelEncoder().fit(train['Region_Code'])
train['Region_Code'] = enc.transform(train['Region_Code'])
test['Region_Code'] = enc.transform(test['Region_Code'])
train.head()

In [None]:
# Preview the test frame after Region_Code has been label-encoded.
test.head()

**We use log transformation because the data is skewed**

In [None]:
# Apply a natural-log transform to the two skewed columns in both frames.
for frame in (train, test):
    for skewed in ('Avg_Account_Balance', 'Vintage'):
        frame[skewed] = np.log(frame[skewed])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the listed columns: the scaler is fitted on the training frame
# and the same fitted scaler is then applied to the test frame.
scaled_cols = ['Vintage', 'Avg_Account_Balance', 'Age', 'Region_Code']
ss = StandardScaler()
train[scaled_cols] = ss.fit_transform(train[scaled_cols])
test[scaled_cols] = ss.transform(test[scaled_cols])
train.head()

In [None]:
# Preview the test frame after scaling.
test.head()

In [None]:
# Distribution of the transformed Avg_Account_Balance in the training frame.
# histplot (density scale + KDE overlay) replaces the deprecated sns.distplot.
sns.histplot(x=train['Avg_Account_Balance'], kde=True, stat='density')

In [None]:
# Distribution of the transformed Avg_Account_Balance in the test frame.
# histplot (density scale + KDE overlay) replaces the deprecated sns.distplot.
sns.histplot(x=test['Avg_Account_Balance'], kde=True, stat='density')

In [None]:
# Distribution of the transformed Vintage in the training frame.
# histplot (density scale + KDE overlay) replaces the deprecated sns.distplot.
sns.histplot(x=train['Vintage'], kde=True, stat='density')

In [None]:
# Distribution of the transformed Vintage in the test frame.
# histplot (density scale + KDE overlay) replaces the deprecated sns.distplot.
sns.histplot(x=test['Vintage'], kde=True, stat='density')

# Model Building

In [None]:
from sklearn.model_selection import train_test_split

# Separate the features from the target.
X = train.drop('Is_Lead', axis=1)
y = train['Is_Lead']

# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class balance. random_state pins the shuffle so every
# score reported below is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report

**For the sake of computational simplicity, I use Logistic Regression and RandomForest only. SVM and other classifiers took a lot of time to train, so I decided to go with this here**

In [None]:
# Baseline logistic regression with default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)  # hard class labels on the validation split

# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from the positive-class probability. Feeding it hard 0/1
# labels collapses the curve to a single threshold and misreports the score.
print(roc_auc_score(y_train, lr.predict_proba(X_train)[:, 1]))
print(roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]))

In [None]:
# Baseline random forest; random_state makes the fitted forest reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)  # hard class labels on the validation split

# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from the positive-class probability. Feeding it hard 0/1
# labels collapses the curve to a single threshold and misreports the score.
print(roc_auc_score(y_train, rfc.predict_proba(X_train)[:, 1]))
print(roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1]))

# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for the logistic-regression search.
params = dict(
    C=[0.001, 0.01, 0.1, 1],
    max_iter=[100, 200, 500],
)

# Randomised search over `params`, scored by ROC AUC with 5-fold CV.
random_lr = RandomizedSearchCV(
    estimator=lr,
    param_distributions=params,
    scoring='roc_auc',
    cv=5,
    random_state=42,
)


In [None]:
# Run the randomised search on the training split.
random_lr.fit(X_train,y_train)

In [None]:
# Best logistic regression found by the search. RandomizedSearchCV refits the
# winning configuration on the data passed to fit() by default, so
# best_estimator_ is already trained on X_train and no second fit is needed.
best_lr = random_lr.best_estimator_
y_pred = best_lr.predict(X_test)  # hard class labels on the validation split

# ROC AUC is computed from the positive-class probability, not from hard labels,
# so the numbers are comparable with the search's own 'roc_auc' scoring.
print(roc_auc_score(y_train, best_lr.predict_proba(X_train)[:, 1]))
print(roc_auc_score(y_test, best_lr.predict_proba(X_test)[:, 1]))

In [None]:
# params= {'n_estimators':[50,100,200,500],
#         'max_depth':[10, 100, 500],
#         'min_weight_fraction_leaf':[0, 0.1, 0.01, 0.2]}

# random_rfc= RandomizedSearchCV(rfc, param_distributions= params, cv=5, random_state=42, scoring='roc_auc')

In [None]:
# random_rfc.fit(X_train,y_train)

In [None]:
# random_rfc.best_estimator_
#RandomForestClassifier(max_depth=100, min_weight_fraction_leaf=0.01,
#                        n_estimators=500)

In [None]:
# Random forest with the hyperparameters recorded from the earlier search;
# random_state makes the fitted forest (and its scores) reproducible.
best_rfc = RandomForestClassifier(
    max_depth=100,
    min_weight_fraction_leaf=0.01,
    n_estimators=500,
    random_state=42,
)
best_rfc.fit(X_train, y_train)
y_pred = best_rfc.predict(X_test)  # hard class labels on the validation split

# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from the positive-class probability. Feeding it hard 0/1
# labels collapses the curve to a single threshold and misreports the score.
print(roc_auc_score(y_train, best_rfc.predict_proba(X_train)[:, 1]))
print(roc_auc_score(y_test, best_rfc.predict_proba(X_test)[:, 1]))

# ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# plot_roc_curve is no longer available in current scikit-learn releases, so
# the curves are built from roc_curve and drawn with matplotlib, which works
# on old and new versions alike. Both splits share one axes for comparison.
fig, ax = plt.subplots(figsize=(8, 6))
for split_name, features, target in (
    ('train', X_train, y_train),
    ('validation', X_test, y_test),
):
    scores = best_rfc.predict_proba(features)[:, 1]  # positive-class probability
    fpr, tpr, _thresholds = roc_curve(target, scores)
    ax.plot(fpr, tpr, label=f'{split_name} (AUC = {roc_auc_score(target, scores):.3f})')
ax.plot([0, 1], [0, 1], linestyle='--', color='grey', label='chance')
ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC curve')
ax.legend(loc='lower right')

In [None]:
# Predict class labels for the competition test set.
# Selecting X.columns feeds the model the exact feature set and order it was
# trained on, and fails with a clear KeyError if the test frame lacks a
# training column instead of silently scoring misaligned features.
# NOTE(review): these are hard 0/1 labels; if the submission is scored with
# ROC AUC, predict_proba(...)[:, 1] would be the better output - confirm.
predictions = best_rfc.predict(test[X.columns])

In [None]:
# Display the predicted labels for the test set.
predictions

In [None]:
# Re-read the raw test file to recover the ID column, which was dropped from
# `test` earlier, and start the submission frame from it.
test_data = pd.read_csv(
    '/kaggle/input/analytics-vidhya-job-a-thon-may-2021/test_mSzZ8RL.csv'
)

sub = test_data[['ID']].copy()
sub.head()

# Final Predictions on Test Set

In [None]:
# Add the model output as the Is_Lead column and preview the last rows.
sub = sub.assign(Is_Lead=predictions)
sub.tail()

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv('submission.csv', index=False)

# Upvote if you liked my Notebook :)