# Credit Card Lead Prediction

In [None]:
# Importing required libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stat
import pylab
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from xgboost import XGBClassifier 
from lightgbm import LGBMClassifier

In [None]:
# Loading datasets
# Read the training and test CSV files into pandas DataFrames; the paths are
# relative to the notebook's working directory.

train=pd.read_csv('../input/jobathon-data/Data/train_s3TEQDk.csv')
test=pd.read_csv('../input/jobathon-data/Data/test_mSzZ8RL.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

## Handling Missing Values

In [None]:
# Count missing values per column in the training and test data.
train.isnull().sum(), test.isnull().sum()

In [None]:
# Frequency of each 'Credit_Product' category in the training data.
train['Credit_Product'].value_counts()

In [None]:
# Impute missing 'Credit_Product' values with the category 'Yes'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x and
# does not modify the frame once copy-on-write is enabled.
train['Credit_Product'] = train['Credit_Product'].fillna('Yes')
test['Credit_Product'] = test['Credit_Product'].fillna('Yes')

## Feature Engineering

In [None]:
# Inspect column dtypes to see which features still need encoding.
train.dtypes

In [None]:
# Using LabelEncoder to transform categorical features into numerical features.
# Each encoder is fitted once on the combined train + test values of a column
# and then applied to both frames, so a given category always maps to the same
# integer code. Fitting separately on each frame can assign different codes to
# the same category whenever the two frames do not contain the same set of
# values.

catg_var = ['Gender', 'Occupation', 'Channel_Code', 'Credit_Product', 'Is_Active']
for col in catg_var:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

In [None]:
# One Hot Encoding 'Region_Code'
# One 0/1 indicator column is created per region code observed in the training
# data (the same columns are added to both frames), then the original column
# is dropped.

list_rc = train['Region_Code'].value_counts().index.tolist()
for region in list_rc:
    for frame in (train, test):
        frame[region] = np.where(frame['Region_Code'].eq(region), 1, 0)

for frame in (train, test):
    frame.drop(columns='Region_Code', inplace=True)

In [None]:
# Preview the training data after encoding.
train.head()

#### Q-Q Plot
A Q-Q plot is used to check how closely a feature follows a normal distribution, so that a suitable transformation can be applied if it does not.

In [None]:
def plot_data(df, feature):
    """Plot the distribution and normal Q-Q plot of one column, then print its skew.

    Args:
        df: DataFrame containing the column.
        feature: Name of the column to inspect.
    """
    values = df[feature]

    plt.figure(figsize=(12, 6))

    # Left panel: distribution of the values.
    plt.subplot(1, 2, 1)
    sns.distplot(values)

    # Right panel: probability plot against a normal distribution.
    plt.subplot(1, 2, 2)
    stat.probplot(values, dist='norm', plot=pylab)

    plt.show()
    print(values.skew())

In [None]:
# Inspect the distribution of 'Avg_Account_Balance' before any transformation.
plot_data(train, 'Avg_Account_Balance')

In [None]:
# Apply a natural-log transform to 'Avg_Account_Balance' in both frames, then
# re-plot the training column to see the effect on its distribution.
for frame in (train, test):
    frame['Avg_Account_Balance'] = np.log(frame['Avg_Account_Balance'])
plot_data(train, 'Avg_Account_Balance')

## Splitting dataset and performing oversampling

In [None]:
# Separate the features from the target; 'ID' is an identifier, not a feature.
X = train.drop(columns=['ID', 'Is_Lead'])
y = train['Is_Lead']

# Hold out 33% of the rows for validation. `stratify=y` keeps the class ratio
# the same in both parts, and a fixed `random_state` makes the split (and the
# scores computed from it) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

In [None]:
# Changing dtypes
# Keep the log-transformed balance as float. Casting it to int would truncate
# the log values, and because X is built from `train` before this cell runs,
# the cast would also leave `test` on a different scale than the features the
# models are trained on.
train['Avg_Account_Balance'] = train['Avg_Account_Balance'].astype('float64')
test['Avg_Account_Balance'] = test['Avg_Account_Balance'].astype('float64')

In [None]:
# Performing Oversampling
# Only the training split is resampled. `sampling_strategy=0.75` asks for a
# minority/majority ratio of 0.75 after resampling. It is passed by keyword
# because recent imbalanced-learn releases do not accept it positionally.
# A fixed `random_state` makes the resampled set reproducible.

ros = RandomOverSampler(sampling_strategy=0.75, random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_resampled)))

## Model building and training

In [None]:
# XGBoost
# Train on the oversampled training split and evaluate on the held-out split.

xgb = XGBClassifier(use_label_encoder=False, max_depth=3, n_estimators=500, learning_rate=0.05).fit(X_resampled, y_resampled)
pred_y_xgb = xgb.predict(X_test)

# ROC AUC measures how well the model ranks samples, so it must be computed
# from continuous scores. Column 1 of predict_proba is the probability of the
# class with the greater label, which is what roc_auc_score expects. Feeding
# it hard 0/1 predictions collapses the curve to a single threshold.
pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]

# NOTE: despite its name, `accuracy` holds the ROC AUC score.
accuracy = roc_auc_score(y_test, pred_proba_xgb)
accuracy

In [None]:
# LGBMClassifier
# Train on the oversampled training split and evaluate on the held-out split.

clf = LGBMClassifier(n_estimators=500, learning_rate=0.05)
clf.fit(X_resampled, y_resampled)
pred_y_lgbm = clf.predict(X_test)

# ROC AUC must be computed from continuous scores (probability of the class
# with the greater label), not from hard 0/1 predictions.
pred_proba_lgbm = clf.predict_proba(X_test)[:, 1]

# NOTE: despite its name, `lgbm_accuracy` holds the ROC AUC score.
lgbm_accuracy = roc_auc_score(y_test, pred_proba_lgbm)
lgbm_accuracy

## Final prediction and submission

In [None]:
# Predict with the XGBoost model on the test features ('ID' is not a feature)
# and write the ID / prediction pairs to the submission file.
test_features = test.drop(columns='ID')
predictions = xgb.predict(test_features)

submission = pd.DataFrame({'ID': test['ID'], 'Is_Lead': predictions})
submission.to_csv("submission.csv", index=False)