In [None]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [None]:
# Load the bank-marketing data and report its dimensions
bm0 = pd.read_csv("/kaggle/input/bank-marketing.csv")
n_rows, n_cols = bm0.shape
print("Dataset with rows {} and columns {}".format(n_rows, n_cols))
bm0.head()

In [None]:
# Print each column's dtype and non-null count to check for missing values
bm0.info()

Hence, the dataset does not contain any missing values.

#### Univariate Analysis

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the raw data
bm0.describe()

#### Describe the pdays column, make note of the mean, median and minimum values. Anything fishy in the values?

In [None]:
# Summary statistics of pdays on the raw data (still includes the -1 entries)
bm0["pdays"].describe()

If we look purely at the numerical summary, i.e. the mean and standard deviation, we cannot tell that a large share of the values is -1. The quartiles, however, show that 75% of the pdays values are -1. So -1 has a special meaning here: it marks whether or not the customer was contacted in a previous campaign. If we want to make decisions about customers who were contacted previously, we must exclude all the rows where pdays is -1; what remains is the set of customers who did take part in a previous campaign.

#### Describe the pdays column again, this time limiting yourself to the relevant values of pdays. How different are the mean and the median values?

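 `pdays` uses -1 as an indicator, not as a value, so these entries should be treated as missing:
 - Ignore these values in our average/median/other statistics calculations.
 - Conceptually they are NaN.  
 In the cells below, the rows where pdays is -1 are dropped from a copy of the data (rather than being replaced with NaN).  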

In [None]:
# Work on a copy so the raw frame bm0 stays untouched by the filtering below
bm1=bm0.copy()

In [None]:
# Remove every row whose pdays is negative (the -1 marker rows)
negative_pdays_rows = bm1.index[bm1["pdays"] < 0]
bm1 = bm1.drop(index=negative_pdays_rows)

In [None]:
# Summary statistics of pdays after the negative entries were removed
bm1["pdays"].describe()

This time the mean and median have changed significantly because we have removed the rows where pdays is -1, i.e. the customers who were not contacted in a previous campaign.

#### Plot a horizontal bar graph with the median values of balance for each education level value. Which group has the highest median?

In [None]:
# Median balance per education level, drawn as horizontal bars
median_balance_by_education = bm1.groupby("education")["balance"].median()
median_balance_by_education.plot(kind="barh")

Thus, we can conclude from the graph that customers with a tertiary level of education have the highest median balance.

#### Make a box plot for pdays. Do you see any outliers?

In [None]:
# Box plot of pdays to inspect its spread and any outliers
bm1["pdays"].plot(kind="box")
plt.show()

Yes, from the above box plot we can see that there are outliers present in pdays.

#### The final goal is to make a predictive model to predict if the customer will respond positively to the campaign or not. The target variable is “response”. So performing bi-variate analysis to identify the features that are directly associated with the target variable.


#### Bi- variate Analysis

#### Converting the response variable to a convenient form

In [None]:
# Share of each response class (proportions rather than raw counts)
bm1.response.value_counts(normalize=True)

In [None]:
# Encode the target numerically: "yes" -> 1, "no" -> 0
response_codes = {"yes": 1, "no": 0}
bm1["response"] = bm1["response"].replace(response_codes)

In [None]:
# Class counts of the target after the yes/no -> 1/0 encoding
bm1.response.value_counts()

#### Make suitable plots for associations with numerical features and categorical features

In [None]:
# Partition the columns by dtype: object-typed ones versus everything else
obj_col = [name for name in bm1.columns if bm1[name].dtype == 'O']
num_col = [name for name in bm1.columns if name not in obj_col]

In [None]:
# Show which columns ended up in each dtype group
for label, names in (
    ("Object data type features ", obj_col),
    ("Numerical data type features ", num_col),
):
    print(label, names)

In [None]:
from numpy import median
# Violin plot of the response against each object-typed feature.
# A violin plot combines a box plot with the distribution of the data, so it also
# shows whether the data is skewed; bar plots would be an alternative here.
# NOTE: obj_col[1:] skips the first object column, exactly as the original cell did.
for col in obj_col[1:]:
    plt.figure(figsize=(8,6))
    # seaborn >= 0.12 accepts only `data` positionally, so x and y must be
    # passed as keywords (positional x/y raises TypeError there).
    sns.violinplot(x=bm1[col], y=bm1["response"])
    plt.title("Response vs "+col,fontsize=15)
    plt.xlabel(col,fontsize=10)
    plt.ylabel("Response",fontsize=10)
    plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# DataFrame.corr() raises on object columns in pandas >= 2.0 (numeric_only now
# defaults to False), so the numeric/bool columns are selected explicitly. Older
# pandas dropped the other columns silently, so the picture is the same there.
numeric_corr = bm1.select_dtypes(include=["number", "bool"]).corr()
plt.figure(figsize=(8,6))
sns.heatmap(numeric_corr,annot=True,cmap='RdBu_r')
plt.title("Correlation Of Each Numerical Features")
plt.show()

We can see that the duration variable is highly correlated with the response variable (`response`), whereas the pdays variable is not highly correlated with it.

In [None]:
# Regression joint plot of every numerical feature against the response.
# The target is excluded by name instead of assuming it is the last entry of num_col.
for col in [name for name in num_col if name != "response"]:
    # jointplot is a figure-level function that creates its own figure, so the size
    # is set through `height`; a preceding plt.figure() would only emit an empty figure.
    grid = sns.jointplot(x = bm1[col],y = bm1["response"],kind='reg',height=8)
    # Label and grid the central (joint) axes explicitly rather than whichever
    # axes pyplot currently considers active.
    grid.set_axis_labels(col,"Response",fontsize = 15)
    grid.ax_joint.grid()
    plt.show()

#### Label Encoding of Categorical Variables.

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode every object column; the encoder is re-fitted on each column in turn
label_encoder = LabelEncoder()
bm2 = bm1[obj_col].apply(label_encoder.fit_transform)

In [None]:
# Preview the label-encoded object columns
bm2.head()

In [None]:
# Put the encoded object columns and the numerical columns side by side (aligned on the index)
numeric_part = bm1.loc[:, num_col]
bm3 = bm2.join(numeric_part)

In [None]:
# Preview the combined modelling frame (encoded object columns + numerical columns)
bm3.head()

In [None]:
# Pairwise correlations across all columns of the modelling frame, target included
bm3.corr()

#### Model Building

#### Logistic Regression Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
np.random.seed(42)

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, which also
# hides convergence and deprecation warnings raised by the modelling cells below.
warnings.filterwarnings("ignore")

In [None]:
# Feature matrix: every column of bm3 except the target
X = bm3.drop(columns=["response"])
X.head()

In [None]:
# Target as a 1-D Series. scikit-learn estimators expect y of shape (n_samples,);
# a single-column DataFrame makes fit() emit a column-vector DataConversionWarning
# and forces an implicit reshape.
y = bm3['response']
y.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# The features are fed in unscaled, and the default limit of 100 solver iterations
# can stop before convergence on such data (the resulting ConvergenceWarning is
# hidden by the global warnings filter). A higher cap lets the solver finish.
lr = LogisticRegression(max_iter=1000)

In [None]:
# Fit the logistic regression on the full training feature set
lr.fit(X_train,y_train)

In [None]:
# Mean score of the logistic regression over 5-fold cross-validation on the training data
cv_score = cross_val_score(lr, X_train, y_train, cv=5)
cv_score.mean()

In [None]:
# Class predictions of the logistic regression for the held-out test rows
y_pred = lr.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the logistic regression on the test set
print(classification_report(y_test, y_pred))

In [None]:
# confusion_matrix expects (y_true, y_pred); with the arguments swapped the matrix
# comes out transposed (rows would be predictions instead of true classes).
confusion_matrix(y_test, y_pred)

In [None]:
# Pass (y_true, y_pred) in the documented order, matching the random-forest cells.
# For a binary target the F1 value itself is the same either way.
f1_score(y_test, y_pred)

#### RFE

In [None]:
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): the scaler is created but never applied in the visible cells.
scaler = MinMaxScaler()
# Recursive feature elimination down to 5 features using the logistic regression.
# n_features_to_select is keyword-only in scikit-learn >= 1.0; passing it
# positionally raises TypeError there.
rfe = RFE(lr, n_features_to_select=5)
rfe.fit(X_train,y_train)

In [None]:
# Boolean mask over the training columns marking the features RFE kept
rfe.support_

In [None]:
# Names of the features selected by the logistic-regression RFE
X_train.columns[rfe.support_]

In [None]:
# Keep the RFE-selected feature names for the reduced models below
cols = X_train.columns[rfe.support_]

In [None]:
# Refit the same estimator object on the selected columns only;
# this replaces the earlier fit on all features.
lr.fit(X_train[cols],y_train)

In [None]:
# Test-set predictions of the logistic regression refitted on the selected columns
y_pred2 = lr.predict(X_test[cols])

In [None]:
# Pass (y_true, y_pred) in the documented order, matching the random-forest cells.
# For a binary target the F1 value itself is the same either way.
f1_score(y_test, y_pred2)

In [None]:
# confusion_matrix expects (y_true, y_pred); with the arguments swapped the matrix
# comes out transposed (rows would be predictions instead of true classes).
confusion_matrix(y_test, y_pred2)

#### Using statsmodels

In [None]:
import statsmodels.api as sm

In [None]:
# Preview the training features before building the statsmodels design matrix
X_train.head()

Add the intercept (constant) column manually, because statsmodels does not add one on its own.

In [None]:
# Design matrix for statsmodels: the selected columns plus an explicit constant column
selected_features = X_train.loc[:, cols]
X_train_sm = sm.add_constant(selected_features)
X_train_sm.head()

In [None]:
# The target is binary (0/1), so fit a logistic model rather than ordinary least
# squares: OLS on a 0/1 response is a linear probability model whose standard
# errors and p-values are not those of the logistic regression studied in this section.
lr1 = sm.Logit(y_train, X_train_sm).fit()

In [None]:
# Coefficient table with standard errors and p-values for the fitted statsmodels model
lr1.summary()

#### VIF

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Create a dataframe that will contain the names of all the feature variables and their respective VIFs.
# variance_inflation_factor regresses column i on the other columns of the matrix it is
# given and does not add an intercept itself, so a constant column is prepended here.
# Without it the auxiliary regressions use the uncentred R^2 and the VIFs come out inflated.
design = np.column_stack([np.ones(len(X_train)), X_train.values])
vif = pd.DataFrame({
    'Features': X_train.columns,
    # column 0 of `design` is the intercept, so feature j sits at position j + 1
    'VIF': [variance_inflation_factor(design, j + 1) for j in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif

Housing, loan, default and poutcome are the important features from the logistic regression model's perspective.

#### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with depth and leaf-count limits and a fixed seed
rfc = RandomForestClassifier(
    max_depth=5,
    max_leaf_nodes=50,
    random_state=42,
)

In [None]:
# Fit the random forest on the full training feature set
rfc.fit(X_train,y_train)

In [None]:
# Mean score of the random forest over 5-fold cross-validation on the training data
cv1_score = cross_val_score(rfc, X_train, y_train, cv=5)
cv1_score.mean()

In [None]:
# Class predictions of the random forest for the held-out test rows
y_pred1 = rfc.predict(X_test)

In [None]:
# Per-class precision, recall and F1 of the random forest on the test set
print(classification_report(y_test, y_pred1))

In [None]:
# F1 score of the random forest on the test set
f1_score(y_test,y_pred1)

In [None]:
# Confusion matrix of the random forest (true labels first, predictions second)
confusion_matrix(y_test,y_pred1)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC needs a continuous score, so use the predicted probability of the positive
# class. Feeding hard 0/1 predictions collapses the ROC curve to a single operating point.
# rfc is expected to be the forest fitted on all of X_train at this point.
roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])

#### RFE

In [None]:
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): the scaler is created but never applied in the visible cells.
scaler = MinMaxScaler()
# Recursive feature elimination down to 5 features using the random forest.
# n_features_to_select is keyword-only in scikit-learn >= 1.0; passing it
# positionally raises TypeError there.
rfe1 = RFE(rfc, n_features_to_select=5)
rfe1.fit(X_train,y_train)

In [None]:
# Boolean mask over the training columns marking the features the random-forest RFE kept
rfe1.support_

In [None]:
# Names of the features selected by the random-forest RFE
X_train.columns[rfe1.support_]

In [None]:
# NOTE: this rebinds `cols`, which previously held the logistic-regression RFE selection;
# re-running the earlier cells that read `cols` after this one will use the forest's columns.
cols = X_train.columns[rfe1.support_]

In [None]:
# Refit the same forest object on the selected columns only;
# this replaces the earlier fit on all features.
rfc.fit(X_train[cols],y_train)

In [None]:
# Test-set predictions of the random forest refitted on the selected columns
y_pred3 = rfc.predict(X_test[cols])

In [None]:
# Pass (y_true, y_pred) in the documented order, matching the earlier random-forest cell.
# For a binary target the F1 value itself is the same either way.
f1_score(y_test, y_pred3)

In [None]:
# confusion_matrix expects (y_true, y_pred); with the arguments swapped the matrix
# comes out transposed (rows would be predictions instead of true classes).
confusion_matrix(y_test, y_pred3)

Housing, month, pdays, poutcome and duration are the important features from the random forest's perspective.