# ML Project - Bank Marketing Prediction

By - A.Yash Kumar Rao

In [None]:
# Core data libraries: pandas for data processing and CSV file I/O, numpy for numerical operations
import pandas as pd
import numpy as np

# Plotting libraries: seaborn (statistical graphics) and matplotlib's pyplot interface
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")  # NOTE(review): this silences ALL warnings for the rest of the notebook, not only seaborn's
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)  # apply seaborn's "white" style to every plot that follows

In [None]:
# Next, we'll load the bank marketing dataset, which is in the "../input/" directory
df = pd.read_csv(r"../input/banckingmarket/bank.csv") # the bank marketing data is now a Pandas DataFrame

# Let's see what's in the data - Jupyter notebooks print the result of the last thing you do
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.columns

In [None]:
# Keep the object-typed columns; the [:-1] slice leaves the last column of the
# frame out of the candidates.
categorical_columns = list(
    filter(lambda name: df[name].dtype == "O", df.columns[:-1])
)
print(categorical_columns)

In [None]:
# List the distinct values found in each categorical column.
for column in categorical_columns:
    distinct_values = df[column].unique()
    print("Unique values in", column, "are", distinct_values)

In [None]:
df.describe()

### Describe the pdays column, make note of the mean, median and minimum values. Anything fishy in the values?

In [None]:
df['pdays'].describe()

In [None]:
# Central tendency and minimum of pdays computed over every row (nothing filtered out).
pdays_all = df['pdays']
print("Mean of the pdays column is", pdays_all.mean())
print("Median of the pdays column is", pdays_all.median())
print("Min of the pdays column is", pdays_all.min())

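__The "pdays" column gives the number of days that passed after the client was last contacted in a previous campaign.
A value of -1 is not a real number of days (days passed can't be negative); it is a placeholder marking clients for whom the previous campaign data is missing, and it distorts the mean and median.__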

### Describe the pdays column again, this time limiting yourself to the relevant values of pdays. How different are the mean and the median values?

In [None]:
# Same statistics restricted to the rows whose pdays value is not -1.
pdays_valid = df.loc[df['pdays'] != -1, 'pdays']
print("Mean of pdays column after eliminating -1 values is", pdays_valid.mean() )
print("Median of pdays column after eliminating -1 values is", pdays_valid.median() )

__After excluding the -1 values, the mean and median change to a large extent.__

### Plot a horizontal bar graph with the median values of balance for each education level value. Which group has the highest median?

In [None]:
# Horizontal bar chart of the median balance for each education level.
# In a horizontal bar plot the categories (education) sit on the y-axis and the
# values (median balance) on the x-axis, so the axes are labelled accordingly.
ax = df.groupby('education')['balance'].median().plot.barh(color='green')
ax.set_ylabel('Education')
ax.set_xlabel('Balance')
ax.set_title('Grouping Education based on balance');

__Tertiary education level has the highest median.__

### Make a box plot for pdays. Do you see any outliers?

In [None]:
# Vertical box plot of pdays; passing the column through the y keyword draws the
# same vertical box on both old and current seaborn releases.
sns.boxplot(y=df['pdays']);

__Yes, there are a lot of outliers in the pdays column.__

In [None]:
df.head()

In [None]:
df.isnull().sum()

__There are no null values.__

In [None]:
# Encode the target as integers: 'no' -> 0, 'yes' -> 1.
# The result is assigned back to the column so the DataFrame itself is guaranteed to be
# updated, and astype(int) forces a numeric dtype. Values that are already 0/1 are left
# untouched, so the cell can be re-run; any other leftover label makes astype(int) fail loudly.
df['response'] = df['response'].replace({'no': 0, 'yes': 1}).astype(int)

In [None]:
df.response.sample(10)

__We have successfully handled the Target variable i.e Response columns.__

In [None]:
# Pairwise correlation of the numeric columns only; the string-typed columns are
# excluded explicitly because corr() cannot handle them on recent pandas releases.
df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlations between the numeric columns
# (string-typed columns are excluded before calling corr()).
plt.figure(figsize=(12,10))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True);

In [None]:
# Number of clients per education level, split by response.
# Keyword arguments are used because current seaborn treats a positional argument as `data`.
sns.countplot(x='education', hue='response', data=df);

In [None]:
# Age distribution per education level, split by response.
# x/y are passed by keyword: current seaborn rejects more than one positional argument.
sns.boxplot(x='education', y='age', hue='response', data=df);

### Are pdays and poutcome associated with the target?

In [None]:
sns.pairplot(df,hue='response');

### Associations of categorical variables

In [None]:
sns.countplot(x='response', data=df);

In [None]:
sns.countplot(y='job', data=df);

In [None]:
sns.countplot(x='marital', data=df);

In [None]:
sns.countplot(y='education', data=df)

### Are the features about the previous campaign data useful?

In [None]:
# One bar chart per categorical column: counts of each category split by response.
for column in categorical_columns:
    response_counts = pd.crosstab(df[column], df['response'])
    axes = response_counts.plot.bar()
    axes.set_title(column)

In [None]:
# Counts of each poutcome category split by response.
poutcome_counts = pd.crosstab(df['poutcome'], df['response'])
poutcome_axes = poutcome_counts.plot.bar()
poutcome_axes.set_title('poutcome');

__The 'poutcome' column is not treated as associated with the target column because more than 80% of its values are missing.__

In [None]:
# Remove the poutcome column from the working frame.
df.drop(columns='poutcome', inplace=True)

In [None]:
df['pdays'].value_counts()

In [None]:
df['previous'].value_counts().head()

### How do you handle the pdays column with a value of -1, where the previous campaign data is missing?

In [None]:
# Indicator column: 1 where pdays is -1, 0 otherwise.
df['pdays_no_contact'] = (df['pdays'] == -1).astype(int)
df['pdays_no_contact'].value_counts()

__We created a new column since majority of users were not previously contacted. We are capturing importance of missing values.__ 

### Handling Missing Values in Categorical columns

In [None]:
categorical_columns

In [None]:
# categorical_columns was built before 'poutcome' was dropped from df, so it can still
# name columns that no longer exist. Skip those explicitly instead of relying on the
# dropped column being the last entry of the list.
for column in categorical_columns:
    if column not in df.columns:
        continue
    print(df[column].value_counts(),"\n")

__Missing values are represented as "unknown". Keeping them as a category of their own is a better way to handle the missing data than imputing them with the mode of the respective column.__

### Handling Outliers in the Data.

In [None]:
# Every column of df that is not listed as categorical, in the frame's column order.
categorical_lookup = set(categorical_columns)
num_columns = [name for name in df.columns if name not in categorical_lookup]
print(num_columns)

In [None]:
dist=df.hist(figsize=(12,10)) # display numerical feature distribution

In [None]:
##### Assuming age follows a Gaussian distribution, values more than three standard
##### deviations away from the mean are treated as outliers.

age_mean = df['age'].mean()
age_std = df['age'].std()
upper_boundary = age_mean + 3 * age_std
lower_boundary = age_mean - 3 * age_std
# Separate print statements: chaining print calls with commas builds a tuple of
# None values that the notebook would echo as the cell output.
print(lower_boundary)
print(upper_boundary)
print(age_mean)

In [None]:
# Remove the rows whose age lies outside [lower_boundary, upper_boundary].
age_outlier_mask = (df['age'] > upper_boundary) | (df['age'] < lower_boundary)
index = df.loc[age_outlier_mask].index
df.drop(index=index, inplace=True)

In [None]:
df[(df['age']>upper_boundary) | (df['age']<lower_boundary)]

In [None]:
##### Balance outliers are bounded with the interquartile range (IQR) rule instead of mean +/- 3 standard deviations

#### Let's compute the interquartile range to calculate the boundaries
IQR=df.balance.quantile(0.75)-df.balance.quantile(0.25)

# Lower/upper limits sit 1.5 * IQR below the first quartile and above the third quartile
lower_bridge = df['balance'].quantile(0.25)-(IQR*1.5)
upper_bridge = df['balance'].quantile(0.75)+(IQR*1.5)
print(lower_bridge)
print(upper_bridge)

In [None]:
df[(df['balance']>upper_bridge) | (df['balance']<lower_bridge)]

In [None]:
# Remove the rows whose balance lies outside [lower_bridge, upper_bridge].
balance_outlier_mask = (df['balance'] > upper_bridge) | (df['balance'] < lower_bridge)
index = df.loc[balance_outlier_mask].index
df.drop(index=index, inplace=True)

In [None]:
df[(df['balance']>upper_bridge) | (df['balance']<lower_bridge)]

In [None]:
# Renumber the rows after the outlier drops. Without drop=True the old index is kept
# as a new 'index' column, which a later cell removes again.
df.reset_index(inplace=True)

In [None]:
df['balance'].hist()

In [None]:
df['age'].hist()

__Outliers are handled now.__

## Handling Categorical columns.

In [None]:
df.head()

In [None]:
# Remove the leftover 'index' column created by reset_index().
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df.drop(columns='index', errors='ignore', inplace=True)
df.head()

In [None]:
df['month'].unique()

In [None]:
# Month abbreviation -> calendar month number.
dictionary={'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12
}

# Only map while the column still holds the text abbreviations: running .map() again on
# an already converted (numeric) column would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['month']):
    df['month']=df['month'].map(dictionary)

In [None]:
df['month'].unique()

In [None]:
df.head()

In [None]:
df1 = df.copy()
df1.head()

In [None]:
df1 = pd.get_dummies(df1,drop_first=True)
df1.head()

In [None]:
df1.shape

In [None]:
# Replace the -1 entries of pdays with 0 in the encoded frame.
df1['pdays'] = df1['pdays'].replace(-1, 0)
df1['pdays'].head()

## Feature Selection

In [None]:
# from sklearn.ensemble import ExtraTreesClassifier
# import matplotlib.pyplot as plt
# model=ExtraTreesClassifier()
# model.fit(X,y)

In [None]:
# print(model.feature_importances_)

In [None]:
# plt.figure(figsize=(10,10))
# ranked_features=pd.Series(model.feature_importances_,index=X.columns)
# ranked_features.nlargest(32).plot(kind='barh');

### Handling Imbalanced Dataset

In [None]:
# Share of each target class (0 = no subscription, 1 = subscription), as percentages.
response = df['response']
total_rows = len(response)
No_sub = int((response == 0).sum())
Sub = int((response == 1).sum())
percent_No_sub = (No_sub/total_rows) * 100
percent_sub = (Sub/total_rows) * 100

print('Percentage of subsription : ',percent_sub)
print('Percentage of no subscription : ', percent_No_sub)


# Bar chart of the raw class counts.
response.value_counts().plot.bar();

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
# Target vector and feature matrix taken from the encoded frame.
y = df1['response']
X = df1.drop(columns='response')

In [None]:
from collections import Counter

# Balance the classes with SMOTE over-sampling followed by Tomek-link cleaning.
# - sampling_strategy is passed by keyword: recent imbalanced-learn releases no longer
#   accept constructor arguments positionally.
# - fit_resample replaces fit_sample, which was removed from imbalanced-learn.
# - random_state makes the synthetic samples reproducible across runs.
# - the sampler is no longer called `os`, which would shadow the standard-library module name.
# NOTE(review): resampling happens before the train/test split further down, so synthetic
# points derived from test rows can end up in the training data — confirm this is intended.
smote_tomek = SMOTETomek(sampling_strategy=1.0, random_state=0)
X_ns,y_ns = smote_tomek.fit_resample(X,y)
print("The number of classes before fit {}".format(Counter(y)))
print("The number of classes after fit {}".format(Counter(y_ns)))

In [None]:
y_ns.value_counts()

In [None]:
y_ns.head()

In [None]:
X_ns.head()

## Feature Selection for model development

In [None]:
# X_ns.drop(['job_management', 'job_technician', 'job_entrepreneur',
#        'job_blue-collar', 'job_unknown', 'job_retired', 'job_admin.',
#        'job_services', 'job_self-employed', 'job_unemployed',
#        'job_housemaid', 'job_student'],axis=1)

In [None]:
job = 'job_'+df['job'].unique()

In [None]:
job

In [None]:
# Remove the listed job dummy columns from the resampled feature matrix.
job_dummy_columns = [
    'job_management', 'job_technician', 'job_entrepreneur',
    'job_blue-collar', 'job_unknown', 'job_retired',
    'job_services', 'job_self-employed', 'job_unemployed',
    'job_housemaid', 'job_student',
]
X_ns.drop(columns=job_dummy_columns, inplace=True)

In [None]:
'marital_'+df['marital'].unique()

In [None]:
# Remove the marital-status dummy columns from the resampled feature matrix.
marital_dummy_columns = ['marital_married', 'marital_single']
X_ns.drop(columns=marital_dummy_columns, inplace=True)

In [None]:
# Remove the 'targeted_yes' and 'default_yes' dummy columns as well.
flag_dummy_columns = ['targeted_yes', 'default_yes']
X_ns.drop(columns=flag_dummy_columns, inplace=True)

In [None]:
X_ns.head()

In [None]:
X_ns.shape

In [None]:
y_ns.shape

## Feature Scaling

In [None]:
#### standarisation: We use the Standardscaler from sklearn library
from sklearn.preprocessing import StandardScaler

In [None]:
scaler=StandardScaler()
### fit_transform learns each column's mean/std and returns the scaled values in one call;
### here the resulting array is only displayed, not stored
scaler.fit_transform(X_ns)

In [None]:
# Standardise the features and wrap the resulting array in a DataFrame that
# carries the original column names.
scaled_values = scaler.fit_transform(X_ns)
X_scaled = pd.DataFrame(scaled_values, columns=X_ns.columns)
X_scaled.head()

## Model Development

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the resampled, scaled data for testing; random_state=0 fixes the shuffle.
# NOTE(review): both the resampling and the scaler were fitted on the full data before this
# split, so the test set is not fully independent of the training data.
X_train, X_test, y_train, y_test= train_test_split(X_scaled, y_ns, test_size=0.3, random_state=0)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the training split and
# predict the held-out test split.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [None]:
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

## Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
# 15-fold cross-validation of the logistic regression on the full resampled, scaled data;
# `score` holds one value per fold.
score=cross_val_score(lr,X_scaled,y_ns,cv=15)

score

In [None]:
score.mean()

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 20 trees and a fixed seed, trained on the training split
# and used to predict the held-out test split.
clf = RandomForestClassifier(random_state=0, n_estimators=20).fit(X_train, y_train)
y_pred = clf.predict(X_test)


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

## Hyperparameter Optimization

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 50, num = 5)]
# Number of features to consider at every split.
# 'auto' is intentionally not listed: for a classifier it was an alias of 'sqrt', and
# scikit-learn releases that removed the alias reject it as an invalid parameter value.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,15)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid: each key is a RandomForestClassifier parameter, each value the
# list of candidates the randomized search samples from
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

In [None]:
# Randomized hyperparameter search over random_grid: 100 sampled parameter combinations,
# each evaluated with 3-fold cross-validation; n_jobs=-1 uses all available cores and
# random_state=100 fixes which combinations are sampled.
clf_randomcv = RandomizedSearchCV(estimator=clf,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
clf_randomcv.fit(X_train,y_train)

In [None]:
clf_randomcv.best_estimator_

In [None]:
clf_randomcv.best_score_

In [None]:
clf_randomcv.best_params_

In [None]:
clf_best_random = clf_randomcv.best_estimator_

In [None]:
y_pred = clf_best_random.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))

## K-Fold cross validation

In [None]:
from sklearn.model_selection import cross_val_score
score=cross_val_score(clf_best_random,X_scaled,y_ns,cv=15)

score

In [None]:
score.mean()

__Random Forest performs better: its average accuracy score is 87.3%, compared with 85.8% for Logistic Regression.__

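__I used accuracy as the comparison metric because the class imbalance was handled by resampling; had the data remained imbalanced, F1-score would have been the more appropriate metric. Note that the resampling and scaling were applied before the train/test split and cross-validation, so these accuracy figures may be optimistic.__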