In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory, then print one per line.
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing Important Libraries

import numpy as np # for mathematical computation
import pandas as pd # for dealing with the data
import matplotlib.pyplot as plt # for visualization
import seaborn as sns # for visualization
%matplotlib inline

In [None]:
# Load the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/jobathon-may-2021-credit-card-lead-prediction/train.csv",
        "../input/jobathon-may-2021-credit-card-lead-prediction/test.csv",
    )
)

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# (rows, columns) of the training set.
train.shape

In [None]:
# Preview the first rows of the test set.
test.head()

In [None]:
# (rows, columns) of the test set.
test.shape

In [None]:
# Column names, non-null counts and dtypes of the training set.
train.info()

*The above information shows that there are nine features to predict the customer's interests in the recommended credit cards.*

*In train dataset there are 245725 observations and 9 features and 1 target column.*

In [None]:
# Column names, non-null counts and dtypes of the test set.
test.info()

*The test dataset contains 105312 observations and 9 features.*

In [None]:
# Statistical summary of the numerical columns in the train dataset
# (the test dataset is summarized in a later cell).

train.describe()

***The above summary shows that the average age of the customers who are eligible to take credit cards is 43 and the minimum age is 23 and the maximum age is 85.***

***Vintage is how long the eligible customers have been on the bank's records. The average is 3 years 8 months, the minimum is 7 months, and the maximum is 11 years (135 months).***

In [None]:
# Statistical summary of the numerical columns in the test dataset.
test.describe()

## *Exploratory Data Analysis*

In [None]:
# Class balance of the target: print the raw counts and show the shares as a pie chart.
lead_counts = train['Is_Lead'].value_counts()
print(lead_counts)

plt.figure(figsize = (12,8))
colors = ['#66b3ff','#ffcc99']
# Take the wedge labels from the counts' own index so every label is guaranteed to
# match its wedge, whatever order value_counts() returns the classes in.
plt.pie(lead_counts, labels = lead_counts.index.astype(str), autopct='%.1f%%', colors = colors)
plt.legend()
plt.show()

***The dataset is quite imbalanced: 76% of customers are not interested in the credit card and 24% are interested.***

***Let's see gender wise customer's response on recommended credit cards.***

In [None]:
# Lead vs. non-lead counts for each gender.
plt.figure(figsize = (12,8))
gender_ax = sns.countplot(data=train, x='Gender', hue='Is_Lead')
gender_ax.set_title('Gender Wise Customer Response in the Recommended Credit Cards')

***The count plot shows that male customers are more interested in the recommended credit cards than female customers.***

***The percentage of customers not interested in the recommended policies is high in both genders.***

In [None]:
# Lead vs. non-lead counts for each occupation.
plt.figure(figsize = (12,8))
# The column is passed as `x=`: seaborn 0.12+ only accepts `data` positionally,
# so the bare positional column name fails on current versions.
sns.countplot(x = 'Occupation', hue = 'Is_Lead', data = train).set_title('Occupation Wise Customer Response in the Recommended Credit Cards')

***Self-employed people are highly interested in the recommended credit cards when compared to others.***

In [None]:
# Lead vs. non-lead counts for each acquisition channel.
plt.figure(figsize = (12,8))
# The column is passed as `x=`: seaborn 0.12+ only accepts `data` positionally,
# so the bare positional column name fails on current versions.
sns.countplot(x = 'Channel_Code', hue = 'Is_Lead', data = train).set_title('Channel Code Wise Customer Response in the Recommended Credit Cards')

***Customers from channels X3 and X2 are highly interested in the recommended credit cards when compared to others, while customers who belong to channel X1 are mostly not interested in the recommended credit cards.***

In [None]:
# Same channel-code count plot as above, drawn on a smaller (10x6) figure.
plt.figure(figsize = (10,6))
# The column is passed as `x=`: seaborn 0.12+ only accepts `data` positionally,
# so the bare positional column name fails on current versions.
sns.countplot(x = 'Channel_Code', hue = 'Is_Lead', data = train).set_title('Channel Code Wise Customer Response in the Recommended Credit Cards')

In [None]:
# Lead counts per Credit_Product value, with missing entries shown as their own 'Missing' bar.
plt.figure(figsize = (10,6))
train_filled = train.fillna('Missing')
sns.countplot(x='Credit_Product', hue='Is_Lead', palette='summer', data=train_filled)

***It is very strange to observe that the missing values do actually have more leads. Thus we should fill the missing values.***

In [None]:
# Region_Code : Code of the Region for the customers

plt.figure(figsize=(15,8))
# Columns are referenced by name with explicit keywords: seaborn 0.12+ only accepts
# `data` positionally, so passing the Series as the first argument fails there.
ax = sns.countplot(x='Region_Code', hue='Is_Lead', data=train)
ax.set_title('Distribution of Region Code')
plt.xticks(rotation=45)
plt.show()

* Each region has a different trend
 
* Dummy encoding of Region Code would increase the complexity of the model and lower its performance
 
* We will use the Lead Probability Score of each region instead of categories.


*probability_score = no_of_leads_in_region / no_of_customers_in_region*

In [None]:
# Mean of Is_Lead per region (the share of leads), as a two-column frame.
region_lead_rate = train.groupby('Region_Code')['Is_Lead'].mean()
rc_encoding = region_lead_rate.reset_index()

# Bar chart of the regions ordered from highest to lowest lead share.
rc_sorted = rc_encoding.sort_values(by='Is_Lead', ascending=False)
plt.figure(figsize=(15,6))
ax = sns.barplot(data=rc_sorted, x='Region_Code', y='Is_Lead')
ax.set_title('Lead Probability Distribution of Region Code')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Avg_Account_Balance : Average Account Balance for the Customer in last 12 Months

plt.figure(figsize=(12,6))
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is the
# documented replacement for the histogram + KDE view it produced.
ax = sns.histplot(train['Avg_Account_Balance'] / 10000, kde=True, stat='density')
ax.set_title('Distribution of Average Account Balance (10k scale)')
plt.show()

In [None]:
# Vertical box plot of the average account balance, used to look for outliers.
plt.figure(figsize=(10,6))
# The Series is bound to `y=` explicitly instead of being passed positionally, so the
# vertical layout no longer depends on how a given seaborn version reads the first argument.
ax = sns.boxplot(y=train['Avg_Account_Balance'], orient = 'v')
ax.set_title('Distribution of Average Account Balance ')
plt.show()

*We have outliers in the Average Account Balance feature*

In [None]:
# Age: Age of the Customer (in Years)

plt.figure(figsize=(12, 6))
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is the
# documented replacement for the histogram + KDE view it produced.
ax = sns.histplot(train['Age'], kde=True, stat='density')
ax.set_title('Distribution of Age')
plt.show()

In [None]:
# Box plot of customer age.
plt.figure(figsize=(12, 6))
# The Series is bound to `x=` explicitly: seaborn 0.12+ no longer accepts the plotted
# variable as a bare positional argument.
sns.boxplot(x=train['Age']).set_title("Distribution of Age")

## *Data Preprocessing*

In [None]:
# Look at the first rows of the training set again before preprocessing.
train.head()

In [None]:
# Missing Credit_Product values become their own class, 'Not Sure', in both train and test.

for frame in (train, test):
    frame['Credit_Product'] = frame['Credit_Product'].fillna("Not Sure")

# Show a few of the training rows that now carry the new class.
train.loc[train['Credit_Product'] == 'Not Sure'].head()

In [None]:
# Keep the target column as a one-column DataFrame named 'Target' for later use

Target = train['Is_Lead'].to_frame()

In [None]:
# Remove the target and the ID from the training features, and the ID from the test features

train = train.drop(columns=['Is_Lead', 'ID'])
test = test.drop(columns=['ID'])

print("Shape of train data:", train.shape)
print("Shape of test data:", test.shape)

In [None]:
# Stack the train rows on top of the test rows so both get identical preprocessing

frames_to_stack = [train, test]
data = pd.concat(frames_to_stack)
data.shape

In [None]:
# Reduce skewness of the account balance with a natural-log transform

log_balance = np.log(data['Avg_Account_Balance'])
data['Avg_Account_Balance'] = log_balance

data.head()

In [None]:
# Getting numeric and categorical columns

# select_dtypes is the public pandas API for picking columns by dtype; it replaces the
# private DataFrame._get_numeric_data() helper, which may change without notice.
data_num_cols = data.select_dtypes(include='number').columns
# Every column that is not numeric is treated as categorical.
data_cat_cols = data.columns.difference(data_num_cols)
print("Numeric columns: ", data_num_cols)
print()
print("Categorical columns: ", data_cat_cols)

In [None]:
# Pull the numeric and the categorical columns into two separate frames

data_num_data = data[data_num_cols]
data_cat_data = data[data_cat_cols]

print("Shape of num data:", data_num_data.shape)
print("Shape of cat data:", data_cat_data.shape)

In [None]:
# Using StandardScaler to scale the data

from sklearn import preprocessing
s_scaler = preprocessing.StandardScaler()
# Learn mean/variance from the training rows only (they come first in `data`, which is
# train stacked on top of test), so test-set statistics do not leak into preprocessing;
# then apply that same transform to every row.
s_scaler.fit(data_num_data.iloc[:len(train)])
data_num_data_s = s_scaler.transform(data_num_data)

data_num_data_s = pd.DataFrame(data_num_data_s, columns = data_num_cols)

fig, (ax1) = plt.subplots(ncols=1, figsize=(8, 5))
ax1.set_title('After StandardScaler')

# Label each curve so the three scaled features can be told apart in the legend.
sns.kdeplot(x=data_num_data_s['Age'], ax=ax1, label='Age')
sns.kdeplot(x=data_num_data_s['Vintage'], ax=ax1, label='Vintage')
sns.kdeplot(x=data_num_data_s['Avg_Account_Balance'], ax=ax1, label='Avg_Account_Balance')
ax1.set_xlabel('Scaled value')
ax1.legend();

***Handle Categorical Variable using Label Encoder***

In [None]:
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
# Replace the string categories of every categorical column with integer codes,
# fitting the encoding independently for each column.
data_cat_data = data_cat_data.apply(lambda column: LabelEncoder().fit_transform(column))

In [None]:
# Storing cleaned data into 'data_new'

# Give both parts a fresh 0..n-1 index so they line up row by row when joined side by side.
data_num_data_s = data_num_data_s.reset_index(drop=True)
data_cat_data = data_cat_data.reset_index(drop=True)
data_new = pd.concat([data_num_data_s, data_cat_data], axis='columns')

In [None]:
# Display the stored target frame.
Target

In [None]:
# Splitting back the data into train and test

# The training rows were stacked first, so the first len(train) rows are train and
# everything after them is test. The previous hard-coded slice started the test part
# at 245726 and silently dropped the first test row.
n_train_rows = len(train)
train_new = data_new.iloc[:n_train_rows]
test_new = data_new.iloc[n_train_rows:]

# Guard against any row being lost or duplicated by the split.
assert len(test_new) == len(test), "test rows lost while splitting the combined data"

print("Shape of train data:", train_new.shape)
print("Shape of test data:", test_new.shape)

In [None]:
# Number of missing values per column in the processed training data.
train_new.isnull().sum()

In [None]:
# Data type of each column in the processed training data.
train_new.dtypes

In [None]:
# Split the dataset into training and testing set

from sklearn.model_selection import train_test_split
# stratify keeps the lead / non-lead ratio identical in both parts of this imbalanced
# dataset, matching the stratified split used later for the resampled data.
X_train, X_test, y_train, y_test = train_test_split(
    train_new, Target, test_size=0.2, random_state=42, stratify=Target
)

In [None]:
# Shapes of the four pieces produced by the train/test split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## *Model Building*

In [None]:
# Now Ensemble Techniques (Bagging and Boosting)

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Fitting random forest classifier

# Fixed seed so the forest (and every score derived from it) is reproducible.
rf = RandomForestClassifier(random_state=42)
# y_train is a one-column DataFrame; flatten it to 1-D, the shape scikit-learn expects
# for a single target (avoids the DataConversionWarning).
rf.fit(X_train, y_train.values.ravel())

In [None]:
# Evaluate Model Performance

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score, auc

try:
    # plot_roc_curve was removed in scikit-learn 1.2; importing it unguarded makes this
    # cell (and every metric imported with it) fail on current versions.
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None

In [None]:
# Hard class predictions of the random forest on the hold-out set.
rf_pred = rf.predict(X_test)

# ROC AUC measures how well the model ranks samples, so it must be computed from the
# predicted probability of the positive class (column 1), not from 0/1 labels.
rf_proba = rf.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_proba)
rf_auc

In [None]:
# Train an XGBoost classifier with default settings on the training split

xgb = XGBClassifier()
xgb.fit(X=X_train, y=y_train)

In [None]:
# Predictions must come from the XGBoost model itself; the previous code called
# rf.predict and scored rf_pred, so the reported "XGB" AUC was the random forest's.
xgb_pred = xgb.predict(X_test)

# ROC AUC is computed from the predicted probability of the positive class (column 1).
xgb_auc = roc_auc_score(y_test, xgb.predict_proba(X_test)[:, 1])
xgb_auc

## *Model Performance Improvement*

In [None]:
from imblearn.over_sampling import SMOTE

# Fixed seed so the synthetic minority samples are identical on every run.
smote = SMOTE(sampling_strategy='minority', random_state=42)
# NOTE(review): X_sm / y_sm are resampled from the full labelled set. An evaluation split
# carved out of them contains synthetic samples, so scores measured on it are optimistic.
X_sm, y_sm = smote.fit_resample(train_new, Target)

In [None]:
from sklearn.model_selection import train_test_split
# Split the ORIGINAL data first so the evaluation set holds only real, untouched samples
# with the true class ratio. Splitting the already-oversampled X_sm / y_sm put synthetic
# points (interpolated from rows that also land in training) into the evaluation set and
# inflated every score computed on it.
X_train, X_test, y_train, y_test = train_test_split(
    train_new, Target, test_size=0.2, random_state=15, stratify=Target
)
# Balance the classes in the training portion only.
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Fitting random forest classifier on balanced dataset

# Fixed seed so the forest (and every score derived from it) is reproducible.
rfb = RandomForestClassifier(random_state=42)
# Flatten the target to 1-D, the shape scikit-learn expects for a single target;
# np.ravel works whether y_train is a one-column DataFrame or already a Series.
rfb.fit(X_train, np.ravel(y_train))

In [None]:
# Hard class predictions of the balanced-data random forest.
rfb_pred = rfb.predict(X_test)

# ROC AUC is computed from the predicted probability of the positive class (column 1),
# not from 0/1 labels.
rfb_auc = roc_auc_score(y_test, rfb.predict_proba(X_test)[:, 1])
rfb_auc

In [None]:
# Train a LightGBM classifier with default settings on the balanced training data

from lightgbm import LGBMClassifier

lgb = LGBMClassifier()
lgb.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions of the LightGBM model on the evaluation split.
lgb_predict = lgb.predict(X_test)

In [None]:
# ROC AUC is computed from the predicted probability of the positive class (column 1);
# scoring the 0/1 labels instead evaluates only a single operating point.
lgb_auc_score = roc_auc_score(y_test, lgb.predict_proba(X_test)[:, 1])
lgb_auc_score

In [None]:
# Train an XGBoost classifier with default settings on the balanced training data

model = XGBClassifier()
model.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions of the XGBoost model on the evaluation split.
xgb_predict = model.predict(X_test)

In [None]:
# ROC AUC is computed from the predicted probability of the positive class (column 1);
# scoring the 0/1 labels instead evaluates only a single operating point.
xgb_auc_score = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
xgb_auc_score

In [None]:
# Plotting roc curve

from sklearn import metrics

# Build the curve from predicted probabilities of the positive class: with hard 0/1
# predictions there is only one threshold, so the "curve" is just two straight segments.
xgb_scores = model.predict_proba(X_test)[:, 1]
false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(y_test, xgb_scores)
# metrics.auc comes from the module imported in this cell, so the cell does not depend
# on a bare `auc` name imported elsewhere.
roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
plt.plot(false_positive_rate, true_positive_rate,label='AUC Level = %0.2f' % (roc_auc))
plt.legend(loc='lower right')
# Diagonal = performance of a random classifier, for reference.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

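***The area under the curve is about 90%, which indicates that the model performs very well. Because this score depends on how the oversampled data is split, it should be confirmed on held-out data that contains no synthetic samples before concluding that no further improvement is needed.***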