# ML Project - Bank Marketing Prediction


> Reading dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw records from the CSV file; the bare name on the last line
# displays the frame as the cell output.
df = pd.read_csv(filepath_or_buffer='bank-marketing.csv')
df

# Cleaning the data

- Dropping the rows whose `education` value is `unknown`, since they do not give proper information.

In [None]:
# Remove, by index label, every row whose education is recorded as 'unknown'.
# df itself is left untouched; the filtered result is a new frame.
df2 = df.drop(index=df.index[df['education'] == 'unknown'])
df2

- Dropping the outliers in the `balance` column (rows more than 3 standard deviations from the mean).

In [None]:
from scipy.stats import zscore

# Show the mean balance before any outlier is removed.
print(df2['balance'].mean())

# Standardise balance into a helper column on df2, flag rows whose z-score
# lies outside [-3, 3], drop them, then remove the helper column again.
df2['baloutliers'] = zscore(df2['balance'])
cle = df2['baloutliers'].abs() > 3
df3 = df2.drop(index=df2.index[cle])
df4 = df3.drop(columns='baloutliers')

- Dropping the column 'contact' because it is of no use.

In [None]:
# New frame without the 'contact' column; df4 is not modified.
df5 = df4.drop(columns=['contact'])

- converting categorical month column to numerical.

In [None]:
# Duplicate the textual month column under a new name; the copy is what gets
# converted to numbers, so the original 'month' labels stay available.
df5['Month'] = df5.loc[:, 'month']

In [None]:
# Calendar position of each three-letter month label.
Month = {"jan":1,"feb":2,"mar":3,"apr":4,"may":5,"jun":6,"jul":7,"aug":8,"sep":9,"oct":10,"nov":11,"dec":12}

# Map from the untouched 'month' column instead of from 'Month' itself, so the
# cell can be re-run: once 'Month' holds integers, looking those integers up
# in the dictionary raises KeyError.
month_numbers = df5['month'].map(Month)

# Series.map returns NaN for labels that are not dictionary keys; keep the
# fail-fast behaviour of a plain dictionary lookup instead of storing NaN.
unknown_months = df5.loc[month_numbers.isna(), 'month'].unique()
if len(unknown_months) > 0:
    raise KeyError("unrecognised month label(s): %s" % list(unknown_months))

df5['Month'] = month_numbers.astype(int)
df5

- Dropping the records of customers who cut the call within the first 5 seconds, i.e. as soon as they knew it was from the bank.

In [None]:
# 'duration' is taken to be in seconds (the conversion below divides by 60 to
# obtain minutes) - NOTE(review): confirm against the dataset description.
#
# Filter on the raw values BEFORE converting. Comparing the rounded minutes
# with 5/60 (~0.0833) misclassifies a 5-second call, because 5/60 rounds to
# 0.08 and therefore falls below the threshold.
df6 = df5.drop(index=df5.index[df5['duration'] < 5])

# Convert to minutes (two decimals) on the filtered frame only. df5 keeps the
# raw values, so re-running this cell does not divide by 60 a second time.
df6['duration'] = (df6['duration'] / 60).round(2)
df6

In [None]:
# Discard, by index label, the rows whose previous-campaign outcome is 'other'.
df7 = df6.drop(index=df6.index[df6['poutcome'] == 'other'])
df7

- converting target column response to numerical for the better understanding of ML algorithm.

In [None]:
# Encode the target as one indicator column. drop_first=True removes the first
# dummy level, so a two-valued 'response' leaves exactly one column.
#
# dtype=int is requested explicitly: since pandas 2.0 get_dummies returns bool
# columns by default, and a bool column mixed with numeric ones turns
# df7.values into an object array further down the notebook.
df7['Response'] = pd.get_dummies(df7['response'], drop_first=True, dtype=int)

- Describing the pdays column:
 > mean
 
 > median
 
 > mode

In [None]:
# Summary statistics of pdays on the raw (uncleaned) frame.
df.loc[:, 'pdays'].describe()

- mean = 40.197
- median = -1
- minimum = -1
Yes, the minimum and the median are the same value: -1.

-  Describe the pdays column : this time limiting to the relevant values of pdays. 

In [None]:
# Independent copy of the raw frame, so the in-place filtering that follows
# does not touch df.
ddf = df.copy(deep=True)

In [None]:
# Remove, in place, the rows where pdays equals -1, then summarise what is left.
ddf.drop(index=ddf.index[ddf['pdays'] == -1], inplace=True)
ddf.loc[:, 'pdays'].describe()

- mean = 224.577
- median = 194
- minimum = 1
Yes, both the mean and the median change: the mean was about 40 before and is now about 224, and the median changes from -1 to 194.

- A horizontal bar graph with the median values of balance for each education level value.

> importing the matplotlib and seaborn libraries for better visualisation of the data

In [None]:
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

In [None]:
# Work on an independent copy so the extra plotting column never reaches df7.
ddf2 = df7.copy(deep=True)

In [None]:
# Start 'Edu' as a copy of the textual education level taken from df7.
ddf2['Edu'] = df7.loc[:, 'education']

In [None]:
# Ordinal code for each education level.
Education = {"primary":1,"secondary":2,"tertiary":3}

# Map from the untouched 'education' column instead of from 'Edu' itself, so
# the cell can be re-run: once 'Edu' holds integers, looking those integers up
# in the dictionary raises KeyError.
edu_codes = ddf2['education'].map(Education)

# Series.map returns NaN for labels that are not dictionary keys; keep the
# fail-fast behaviour of a plain dictionary lookup instead of storing NaN.
unknown_levels = ddf2.loc[edu_codes.isna(), 'education'].unique()
if len(unknown_levels) > 0:
    raise KeyError("unrecognised education level(s): %s" % list(unknown_levels))

ddf2['Edu'] = edu_codes.astype(int)

In [None]:
# Median balance per education code, drawn as horizontal bars.
median_balance = ddf2[['Edu', 'balance']].groupby("Edu").median()
barG = median_balance.plot(kind='barh', legend=False, color='yellowgreen')

# With horizontal bars the grouping key is on the y axis and the value on x.
barG.set_ylabel("Education  \n1:primary , 2:secondary ,3: tertiary")
barG.set_xlabel("balance")
plt.show()

In [None]:
#ddf2.groupby('Edu').median()
#uncomment the line above to see the numbers.

> tertiary group has highest median value according to the graph.

- boxplot on pdays column to see outliers in the data.

In [None]:
# Pass the column as x= explicitly. A bare positional argument is interpreted
# as `x` by seaborn 0.11 but as `data` by seaborn >= 0.12, so only the keyword
# form behaves the same way across versions.
sns.boxplot(x=df7['pdays'])
print('outliers')

 > Here we can see that there are too many outlier points.

# EDA:  Exploratory Data Analysis 

In [None]:
# Categorical scatter of call duration for each response value.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while the keyword form also works on older releases.
sns.catplot(x="response", y="duration", data=df7)

- From this chart we can say that the shorter the call, the higher the chance of "No" as the response.

In [None]:
# Categorical scatter of account balance for each response value.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while the keyword form also works on older releases.
sns.catplot(x="response", y="balance", data=df7)

- From this chart we can say that a person's response does not depend on balance.

In [None]:
# Categorical scatter of pdays for each response value.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while the keyword form also works on older releases.
sns.catplot(x="response", y="pdays", data=df7)

- From this chart we can say that as the pdays count increases, the chance of a "yes" response is higher.

In [None]:
# Categorical scatters of 'previous' and 'campaign' for each response value.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, while the keyword form also works on older releases.
sns.catplot(x="response", y="previous", data=df7)
sns.catplot(x="response", y="campaign", data=df7)

- From these charts we can say that there is no clear relation between the response and previous contact with a person.
The same holds for campaign.

In [None]:
# Pairwise relationship grid over the columns of df7; the grid object is kept
# in g.
g = sns.pairplot(data=df7)

- From this pairplot we can say that the target variable "response" is related to all of these columns, but most strongly to duration.

In [None]:
plt.figure(figsize=(30, 30))

# df7 still holds text columns at this point. Since pandas 2.0 DataFrame.corr
# no longer drops them silently and raises instead, so the numeric/bool
# columns are selected explicitly (select_dtypes also works on older pandas).
numeric_part = df7.select_dtypes(include=['number', 'bool'])

# `linewidths` is the parameter name documented by seaborn.heatmap.
ax = sns.heatmap(numeric_part.corr(), annot=True, linewidths=3)
ax.tick_params(size=10, labelsize=10)
plt.title("bank marketing", fontsize=25)
plt.show()

- From this heatmap we can say that response is highly correlated with the duration column.

#  Machine Learning Algorithm

> Dropping columns and data which are of no use.

In [None]:
# Remove 'marital' from df7 in place, then keep a positional slice of the
# first seven remaining columns in df8.
df7.drop(columns=['marital'], inplace=True)
df8 = df7.iloc[:, :7]

In [None]:
# The textual 'month' column is removed in place.
df7.drop(columns=['month'], inplace=True)

In [None]:
# The textual 'response' column is removed in place.
df7.drop(columns=['response'], inplace=True)

> creating dummies to convert categorical variable to numerical.

In [None]:
# One-hot encode the remaining categorical columns, dropping the first level
# of each.
#
# dtype=int is requested explicitly: since pandas 2.0 get_dummies produces
# bool columns by default, and bool columns mixed with numeric ones turn
# df7.values into an object array further down the notebook.
df7 = pd.get_dummies(df7, drop_first=True, dtype=int)

In [None]:
# Keep an independent copy of the target column.
df10 = df7.loc[:, 'Response'].copy()

In [None]:
# Remove the target column from the frame in place.
df7.drop(columns=['Response'], inplace=True)

In [None]:
# Join df10 back onto df7 by index, which places its column after all the
# columns already in df7.
df7 = df7.merge(df10, left_index=True, right_index=True)

 **LOGISTIC REGRESSION**.

> importing libraries to apply the algorithm to the data.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression estimator with default settings.
LR = LogisticRegression()

In [None]:
import warnings
from sklearn.preprocessing import MinMaxScaler

# Silence every warning from here on.
warnings.filterwarnings(action='ignore')

# Min-max scaler instance with default settings.
sc = MinMaxScaler()

In [None]:
# (label, estimator) pairs to be cross-validated.
models = [('LR', LogisticRegression())]

- splitting the data.

In [None]:
# 80/20 row split of df7 with a fixed seed.
df_train, df_test = train_test_split(df7, random_state=51, test_size=0.2)

In [None]:
# Target vector and feature matrix of the training part.
y_train = df_train['Response']
X_train = df_train.drop(columns='Response')

print('Shape of X = ', X_train.shape)
print('Shape of y = ', y_train.shape)

### RFE

In [None]:
from sklearn.feature_selection import RFE

In [None]:
# Fit the estimator on the full training set (kept from the original cell).
LR.fit(X_train, y_train)

# Recursive feature elimination down to 10 features.
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept only the estimator positionally, and the keyword form also works on
# older releases.
rfe = RFE(estimator=LR, n_features_to_select=10)
rfe = rfe.fit(X_train, y_train)

In [None]:
# One (column name, selected flag, ranking) triple per training column.
list(
    zip(
        X_train.columns,
        rfe.support_,
        rfe.ranking_,
    )
)

In [None]:
# Names of the columns flagged by the RFE support mask.
col = X_train.loc[:, rfe.support_].columns
col

In [None]:
# Training features restricted to the columns in col.
X_train_rfe = X_train.loc[:, col]

In [None]:
import statsmodels.api as sm

# Add the constant (intercept) column expected by the statsmodels fit below.
X_train_rfe = sm.add_constant(data=X_train_rfe)

In [None]:
# Ordinary least squares fit of the target on the selected features; note that
# this rebinds LR to the statsmodels results object.
LR = sm.OLS(endog=y_train, exog=X_train_rfe).fit()

### VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every column of the selected design matrix,
# rounded to two decimals and listed from highest to lowest.
X = X_train_rfe
vif = pd.DataFrame({'Features': X.columns})
vif['VIF'] = [
    variance_inflation_factor(X.values, position)
    for position, _ in enumerate(X.columns)
]
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
import statsmodels.api as sm

# Refit the least-squares model on the selected features and print the full
# regression summary.
X_train_lm = sm.add_constant(data=X_train_rfe)
LR = sm.OLS(endog=y_train, exog=X_train_lm).fit()
print(LR.summary())

In [None]:
# Plain array view of df7: every column but the last is a feature, the last
# column is the target.
array = df7.values
X = array[:, :-1]
Y = array[:, -1]

In [None]:
# 80/20 split of the arrays with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=51, test_size=0.2
)

In [None]:
import warnings
warnings.filterwarnings('ignore')

# 10-fold cross-validation on the training split.
# random_state is omitted on purpose: KFold ignores it unless shuffle=True,
# and scikit-learn >= 0.24 raises ValueError for that combination. The folds
# are therefore the same contiguous ones older releases produced.
# The splitter does not depend on the model, so it is built once.
kfold = KFold(n_splits=10)

result = []
for name, model in models:
    croresult = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    result.append(croresult)
    # label: mean accuracy (standard deviation across folds)
    output = "%s: %f (%f)" % (name, croresult.mean(), croresult.std())
    print(output)

In [None]:
# Fresh default logistic regression, trained on the training split.
LR = LogisticRegression()
LR.fit(X=X_train, y=Y_train)

In [None]:
# Predicted labels for the held-out split.
predictions = LR.predict(X=X_test)

- Accuracy score-

In [None]:
# Fraction of held-out labels predicted correctly.
print(accuracy_score(y_true=Y_test, y_pred=predictions))

In [None]:
from sklearn.metrics import confusion_matrix
import pylab as pl

# Confusion matrix of true vs. predicted labels, drawn as a colour-coded
# matrix image with a colour bar.
cm = confusion_matrix(y_true=Y_test, y_pred=predictions)
pl.matshow(cm)
pl.title('Confusion matrix \n')
pl.colorbar()
pl.show()

**RANDOM FOREST**

In [None]:
# Plain array view of df7: every column but the last is a feature, the last
# column is the target.
array = df7.values
X = array[:, :-1]
Y = array[:, -1]

- Train test split.

In [None]:
# 80/20 split of the arrays with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=51, test_size=0.2
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# (label, estimator) pairs to be cross-validated.
models = [('RFC', RandomForestClassifier())]

In [None]:
# 10-fold cross-validation on the training split.
# random_state is omitted on purpose: KFold ignores it unless shuffle=True,
# and scikit-learn >= 0.24 raises ValueError for that combination.
# The splitter does not depend on the model, so it is built once.
kfold = KFold(n_splits=10)

result = []
for name, model in models:
    # Scored with accuracy, the metric reported for the classifiers elsewhere
    # in this notebook, so the cross-validation figures are directly
    # comparable ('neg_mean_squared_error' is a regression score).
    croresults = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    result.append(croresults)
    # label: mean accuracy (standard deviation across folds)
    output = "%s: %f (%f)" % (name, croresults.mean(), croresults.std())
    print(output)

In [None]:
# Random forest of 50 trees, trained on the training split.
RFC = RandomForestClassifier(n_estimators=50)
RFC.fit(X=X_train, y=Y_train)

In [None]:
# Predicted labels for the held-out split.
predictions = RFC.predict(X=X_test)

- Accuracy score-

In [None]:
# Fraction of held-out labels predicted correctly.
print(accuracy_score(y_true=Y_test, y_pred=predictions))

In [None]:
from sklearn.metrics import confusion_matrix
import pylab as pl

# Confusion matrix of true vs. predicted labels, drawn as a colour-coded
# matrix image with a colour bar.
cm = confusion_matrix(y_true=Y_test, y_pred=predictions)
pl.matshow(cm)
pl.title('Confusion matrix \n')
pl.colorbar()
pl.show()

- The best metric is that of the random forest, but from these confusion matrices we can say that the dataset is highly unbalanced, with nearly all clients actually declining to subscribe. This means that the accuracy score is biased, and further evaluation should be carried out to determine the accuracy of the logistic regression model.

- From all these evaluations we can say that the random forest model performs well on the dataset, as its score is high.

- By the evaluation we can say that the top feature is "Duration".