In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import scipy.stats as st
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score,roc_curve,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Lift the column-count display limit so wide frames are shown in full.
pd.options.display.max_columns = None

# Read train.csv from the dataset folder under the relative ../input directory.
df_train = pd.read_csv(
    '../input/banking-dataset-marketing-targets/train.csv'
)

In [None]:
# Preview the first five rows of the raw training frame.
df_train.head(n=5)

In [None]:
# Report (rows, columns) of the training frame.
print(f'train shape: {df_train.shape}')

In [None]:
# Column names, non-null counts and dtypes of the training frame.
df_train.info()

In [None]:
# Remove the 'ID' and 'poutcome' columns.
# The frame is reassigned (no inplace=True) and missing labels are ignored, so
# re-running this cell is a no-op instead of raising KeyError on the second run.
df_train = df_train.drop(columns=['ID', 'poutcome'], errors='ignore')

In [None]:
# Number of missing values in each column.
df_train.isnull().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_train.describe()

Inferences:

* 1. Age can be treated as approximately symmetric (and, loosely, close to normal), since its mean and median are almost equal.
* 2. For Balance and Duration (contact duration), mean > median, which indicates that both are right-skewed and that high-valued outliers are present.
* 3. For Campaign (number of contacts performed during this campaign), mean > median, so it is also right-skewed; the difference is small, however, so the number of contacts per customer during the campaign is fairly similar.

# EDA

In [None]:
# Bar chart of how many rows fall into each value of the target column.
ax = df_train['subscribed'].value_counts().plot.bar()
ax.set_xlabel('Subscribed')
ax.set_ylabel('No. of subscription')
plt.show()

In [None]:
# Row count per value of the target column.
df_train.loc[:, 'subscribed'].value_counts()

The target variable is highly imbalanced, so the class imbalance must be treated before modelling.

In [None]:
# Subscription counts per job category on a 20x8 inch figure.
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(data=df_train, x='job', hue='subscribed', ax=ax)
ax.set_ylabel('No. of subscription')
plt.show()

In [None]:
# Subscription counts per marital status.
ax = sns.countplot(data=df_train, x='marital', hue='subscribed')
ax.set_ylabel('No. of subscription')
plt.show()

In [None]:
# Subscription counts per education level.
ax = sns.countplot(data=df_train, x='education', hue='subscribed')
ax.set_ylabel('No. of subscription')
plt.show()

In [None]:
# Subscription counts split by the 'housing' column.
ax = sns.countplot(data=df_train, x='housing', hue='subscribed')
ax.set_ylabel('No. of subscription')
plt.show()

In [None]:
# Subscription counts split by the 'loan' column.
ax = sns.countplot(data=df_train, x='loan', hue='subscribed')
ax.set_ylabel('No. of subscription')
plt.show()

In [None]:
# Subscription counts per contact channel.
ax = sns.countplot(data=df_train, x='contact', hue='subscribed')
ax.set_ylabel('No. of subscription')
plt.show()

In [None]:
# Subscription counts per contact month on a 15x8 inch figure.
plt.figure(figsize=(15,8))
sns.countplot(data=df_train,x='month',hue='subscribed')
# Label the y-axis and call plt.show() like the other count plots in this
# section, so the cell renders only the figure and not the Axes repr.
plt.ylabel('No. of subscription')
plt.show()

From the exploratory data analysis above, the bank should target the following customer segments:
* Customers working in management, blue-collar and technical fields.
* Married customers are the most likely to subscribe to the product.
* Customers should have at least secondary education.
* Customers who have a housing loan and a personal loan are less likely to subscribe to the product.
* Customers contacted by cellular phone have a high probability of subscribing to the product.
* The most suitable months for a second marketing campaign are April to August, with May showing the highest chance of customers subscribing to the product.

In [None]:
# One-hot encode the categorical predictors.
# dtype=int forces 0/1 integer indicators: pandas >= 2.0 produces boolean dummy
# columns by default, and mixing bool with integer columns makes `.values`
# non-numeric for the later VIF computation.
final_train = pd.get_dummies(
    data=df_train,
    columns=['job','marital','education','default','housing','loan','contact','month'],
    dtype=int,
)

In [None]:
# Encode the target as integers: 'no' -> 0, 'yes' -> 1.
# An explicit mapping plus astype(int) replaces the two chained .replace() calls,
# which relied on implicit object -> int downcasting (deprecated in pandas 2.2).
# The identity entries (0 -> 0, 1 -> 1) keep the cell safe to re-run; any other
# label becomes NaN and makes astype(int) fail loudly instead of passing through.
final_train['subscribed'] = (
    final_train['subscribed']
    .map({'no': 0, 'yes': 1, 0: 0, 1: 1})
    .astype(int)
)

In [None]:
# Preview the first five rows of the encoded frame.
final_train.head(n=5)

In [None]:
# (rows, columns) of the one-hot encoded frame.
final_train.shape

In [None]:
# Absolute correlation of every column with the target.
# The target's own entry (always 1.0) is dropped first so that 'subscribed' is
# neither listed nor counted as one of its own significant features.
cor = final_train.corr()
sub_cor = cor['subscribed'].drop('subscribed').abs()

# Keep the features whose absolute correlation with the target exceeds 0.05.
sig_features = sub_cor[sub_cor > 0.05]
print(sig_features)
print(sig_features.count())

The features above have an absolute correlation greater than 0.05 with the target variable (`subscribed`) and are therefore treated as candidate significant features. We will still apply a feature selection technique to find the most significant ones.

# Feature Selection

In [None]:
# Target vector and predictor matrix taken from the full encoded frame.
y = final_train['subscribed']
X = final_train.drop(columns=['subscribed'])

In [None]:
# Variance inflation factor of every predictor in X.
# The design matrix is materialised once as a float array: `X.values` would
# otherwise be rebuilt on every loop iteration, and on a frame that mixes bool
# and integer columns it can come back as an object array.
X_values = X.values.astype(float)
vif = pd.DataFrame({
    "VIF Factor": [variance_inflation_factor(X_values, i) for i in range(X_values.shape[1])],
    "features": X.columns,
})
vif

In [None]:
# Recompute the VIFs after removing 'age' and one dummy level per categorical.
# The reduced matrix is derived from `final_train` (minus the target) rather than
# from `X`, because `X` is rebound to the reduced frame further down; this keeps
# the cell re-runnable after later cells have executed.
x = final_train.drop(
    columns=['subscribed',
             'age','job_admin.','marital_divorced','education_primary','default_no',
             'loan_no','housing_no','contact_unknown','month_apr']
)
# Build the float design matrix once instead of once per column.
x_values = x.values.astype(float)
vif1 = pd.DataFrame({
    "VIF Factor": [variance_inflation_factor(x_values, i) for i in range(x_values.shape[1])],
    "features": x.columns,
})
vif1

In [None]:

# Modelling frame: the encoded data without 'age' and without one dummy level
# per categorical variable (the target column is kept).
df = final_train.drop(
    columns=[
        'age',
        'job_admin.',
        'marital_divorced',
        'education_primary',
        'default_no',
        'loan_no',
        'housing_no',
        'contact_unknown',
        'month_apr',
    ]
)

In [None]:
# (rows, columns) of the reduced modelling frame.
df.shape

In [None]:
# Predictors and target taken from the reduced modelling frame.
X, y = df.drop(columns='subscribed'), df['subscribed']

# Train-Test split

In [None]:
# 70/30 hold-out split with a fixed seed.
# stratify=y keeps the class ratio of the imbalanced target the same in both
# parts, so the test metrics are not distorted by an unlucky share of positives.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3, stratify=y
)

# Imbalanced data treatment: oversampling the minority class

In [None]:
# Rebalance the training split only: class 1 rows are resampled with replacement
# until there are as many of them as class 0 rows. The test split is untouched.
Xytrain = pd.concat([X_train,y_train],axis=1)

print('before oversampling: ','\n', Xytrain['subscribed'].value_counts())
Xytrain0 = Xytrain[Xytrain['subscribed']==0]
Xytrain1 = Xytrain[Xytrain['subscribed']==1]

len0 = len(Xytrain0)
len1 = len(Xytrain1)

# Draw len0 rows from class 1 (seeded for reproducibility) and stack them under class 0.
Xytrain1_os = Xytrain1.sample(len0,replace = True, random_state=3)
Xytrain_os = pd.concat([Xytrain0, Xytrain1_os],axis=0)

# The label previously read 'after undersampling', but this step oversamples.
print('after oversampling: ','\n',Xytrain_os['subscribed'].value_counts())

# Split the balanced frame back into target and predictors.
y_train_os = Xytrain_os['subscribed']
X_train_os = Xytrain_os.drop('subscribed',axis=1)

# Standardizing data

In [None]:
# Fit the scaler on the oversampled training predictors only, then apply the
# same fitted transformation to both the training and the test predictors.
ss = StandardScaler().fit(X_train_os)
Xtrains = ss.transform(X_train_os)
Xtests = ss.transform(X_test)

In [None]:
def model_eval(algo, Xtrains, y_train_os, Xtests, y_test):
    """Fit ``algo`` and report train/test diagnostics plus the test ROC curve.

    Parameters
    ----------
    algo : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        It is fitted in place on ``Xtrains`` / ``y_train_os``.
    Xtrains, y_train_os
        Training features and labels, used for fitting and for the train report.
    Xtests, y_test
        Held-out features and labels, used for the test report and the ROC curve.

    Returns
    -------
    None
        Accuracy, confusion matrix, AUC and classification report are printed
        for both splits, and the test ROC curve is drawn with matplotlib.
    """
    algo.fit(Xtrains,y_train_os)

    # --- training-set diagnostics ---
    ytrain_pred = algo.predict(Xtrains)
    # Column 1 of predict_proba is used as the score for the AUC
    # (the second class in the estimator's class order, i.e. label 1 for a 0/1 target).
    ytrain_prob = algo.predict_proba(Xtrains)[:,1]

    print('Overall accuracy - train:' , accuracy_score(y_train_os, ytrain_pred))
    print('Confusion matrix - train: ','\n',confusion_matrix(y_train_os,ytrain_pred))
    print('AUC - train', roc_auc_score(y_train_os,ytrain_prob))
    print('\n')
    print('Classification report - train: ','\n',classification_report(y_train_os,ytrain_pred))

    # --- test-set diagnostics ---
    ytest_pred = algo.predict(Xtests)
    ytest_prob = algo.predict_proba(Xtests)[:,1]

    print('\n')
    print('Overall accuracy - test:' , accuracy_score(y_test, ytest_pred))
    print('Confusion matrix - test: ','\n',confusion_matrix(y_test,ytest_pred))
    print('AUC - test', roc_auc_score(y_test,ytest_prob))
    print('Classification report - test: ','\n',classification_report(y_test,ytest_pred))

    # --- ROC curve on the test set ---
    fpr,tpr,thresholds = roc_curve(y_test,ytest_prob)
    plt.plot(fpr,tpr,label='ROC')
    # Red diagonal (TPR == FPR) as the no-skill reference line.
    plt.plot(fpr,fpr,'r',label='TPR = FPR')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    # Name the estimator in the title so the plots of different models can be told apart.
    plt.title('ROC curve - test: {}'.format(type(algo).__name__))
    plt.legend()
    plt.show()

# Decision Tree Classifier

In [None]:
# Gini decision tree limited to depth 5, evaluated on the scaled train/test data.
dt = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=3)
model_eval(algo=dt, Xtrains=Xtrains, y_train_os=y_train_os, Xtests=Xtests, y_test=y_test)

#  Random Forest Classifier

In [None]:
# Forest of 100 Gini trees, each limited to depth 5.
rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=5,
    random_state=3,
)
model_eval(algo=rf, Xtrains=Xtrains, y_train_os=y_train_os, Xtests=Xtests, y_test=y_test)

# Logistic Regression

In [None]:
# Logistic regression with an intercept, fitted with the liblinear solver.
lr = LogisticRegression(
    solver='liblinear',
    fit_intercept=True,
    random_state=3,
)
model_eval(algo=lr, Xtrains=Xtrains, y_train_os=y_train_os, Xtests=Xtests, y_test=y_test)

# Boosting : Adaboost

In [None]:
# AdaBoost with library-default settings apart from the fixed seed.
ada = AdaBoostClassifier(random_state=3)
model_eval(algo=ada, Xtrains=Xtrains, y_train_os=y_train_os, Xtests=Xtests, y_test=y_test)

# Naive Bayes Classifier: Gaussian

In [None]:
# Gaussian naive Bayes with default settings.
clf = GaussianNB()
model_eval(algo=clf, Xtrains=Xtrains, y_train_os=y_train_os, Xtests=Xtests, y_test=y_test)

# Conclusion

* Through EDA we identified the customer segments to target for cross-selling the fixed deposit (the bank product).
* The feature selection step provides the significant features for identifying target customers.
* We applied different classification algorithms to check which one gives the most accurate results with the significant features.