# Bank-Marketing-Prediction

In [None]:
# import all necessary libraries
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# data splitting and resampling
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE 

# model metrics 
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

# import models

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

# Get the data

In [None]:
# load the data with a delimiter = ;
df = pd.read_csv('dataset/bank-full.csv', delimiter=';')

In [None]:
# take a quick look at the first rows of the data
df.head()

In [None]:
df.shape

In [None]:
# get a description of our features
df.info()

In [None]:
df.describe()

# Explore and visualize the data 

In [None]:
# create an additional dataframe that holds features upon exploration
df_features = pd.DataFrame()

In [None]:
# check the datatypes of our features
df.dtypes

In [None]:
# describe our numerical data
df.describe()

In [None]:
# describe our categorical data
df.describe(include=['O'])

In [None]:
df.head()

**Target variable: y**

In [None]:
# used to convert columns with values (no, yes) into numerical values of (0, 1)
def yes_no_encoder(data):
    """Encode a single 'no' / 'yes' cell as 0 / 1.

    The whole (whitespace-trimmed) value is compared, not a substring, so a
    category such as 'unknown' -- which merely contains the letters "no" --
    is no longer silently turned into 0.

    Parameters
    ----------
    data : object
        One cell value, typically passed in by ``Series.apply``.

    Returns
    -------
    object
        0 for 'no', 1 for 'yes'; every other value (including values that
        are already numeric) is returned unchanged, which makes it safe to
        apply the encoder to a column more than once.
    """
    # already-encoded numbers, NaN, etc. are not strings: pass them through
    if not isinstance(data, str):
        return data

    codes = {'no': 0, 'yes': 1}
    return codes.get(data.strip(), data)

In [None]:
# convert our categorical target to numeric
df['y'] = df['y'].apply(yes_no_encoder)

In [None]:
df_features['y'] = df['y']

In [None]:
fig = plt.figure(figsize=(20, 1))
sns.countplot(y='y', data=df)
print(df.y.value_counts())

**Feature: Age**

In [None]:
df_features['age'] = df['age']

In [None]:
sns.distplot(df_features['age'], kde=False)

**Feature: Job**

In [None]:
df_features['job'] = df['job']

In [None]:
plt.figure(figsize=(15, 5))
sns.countplot(y='job', data=df_features)

**Feature: marital**

In [None]:
df_features['marital'] = df['marital']

In [None]:
# returns percentage distribution of all categorical items in a specified column
def value_perc(feature):
    """Build a two-column table with the share of each distinct value.

    Parameters
    ----------
    feature : pandas.Series
        Column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Column 'value' holds each distinct value and column 'perc' holds its
        relative frequency as a percentage rounded to two decimals, in the
        order produced by ``value_counts``.
    """
    shares = feature.value_counts(normalize=True)
    # name the index and the values directly while flattening to a frame
    table = shares.rename_axis('value').reset_index(name='perc')
    table['perc'] = (table['perc'] * 100).round(2)
    return table

In [None]:
sns.countplot(y='marital', data=df_features)
print(value_perc(df_features.marital))

**Feature: Education**

In [None]:
df_features['education'] = df['education']

In [None]:
sns.countplot(y='education', data=df_features)

**Feature: default**

In [None]:
df_features['default'] = df['default']

In [None]:
# convert categorical default values to numeric values
df_features['default'] = df_features['default'].apply(yes_no_encoder)

In [None]:
sns.countplot(y='default', data=df_features)
print(value_perc(df['default']))

**Feature: Balance**

In [None]:
df_features['balance'] = df['balance']

In [None]:
# plot the balance distribution and report its centre and spread
sns.distplot(df_features['balance'])
print("The mean balance: ", round(df_features['balance'].mean(), 2))
# this figure is the standard deviation, so it must not be labelled as the mean
print("Std.Dev balance: ", round(df_features['balance'].std(), 2))

In [None]:
df.head()

**Feature: Housing**

In [None]:
df_features['housing'] = df['housing']

In [None]:
df_features['housing'] = df_features['housing'].apply(yes_no_encoder) # convert yes/no to numeric equivalent

In [None]:
sns.countplot(y='housing', data=df)
print(value_perc(df['housing']))

**Feature: Loan**

In [None]:
df_features['loan'] = df['loan']

In [None]:
df_features['loan'] = df_features['loan'].apply(yes_no_encoder)

In [None]:
sns.countplot(y='loan', data=df)

**Feature: Contact**

In [None]:
df_features['contact'] = df['contact']

In [None]:
sns.countplot(y='contact', data=df_features)

**Feature: Day**

In [None]:
df_features['day'] = df['day']

In [None]:
# plot the histogram of the 'day' column and report its centre and spread
sns.distplot(df_features['day'], kde=False)
print("Mean number of day: ", df_features['day'].mean())
# this figure is the standard deviation, so it must not be labelled as the mean
print("Std.Dev number of day: ", df_features['day'].std())

**Feature: month**

In [None]:
df_features['month'] = df['month']

In [None]:
sns.countplot(y='month', data=df_features)
print(value_perc(df_features['month']))

**Feature: Duration**

In [None]:
df_features['duration'] = df['duration']

In [None]:
sns.distplot(df_features['duration'])
print("Mean duration: ", df_features['duration'].mean())
print("Std.Dev duration: ", df_features['duration'].std())

**Feature: Campaign**

In [None]:
df_features['campaign'] = df['campaign']

In [None]:
sns.distplot(df_features['campaign'])

**Feature: Pdays**

In [None]:
df_features['pdays'] = df['pdays']

In [None]:
sns.distplot(df_features['pdays'])

**Feature: Previous**

In [None]:
df_features['previous'] = df['previous']

In [None]:
sns.distplot(df_features['previous'])

**Feature: Poutcome**

In [None]:
df_features['poutcome']=  df['poutcome']

In [None]:
sns.countplot(y='poutcome', data=df_features)

In [None]:
# count the missing entries in every column
def missing_values(data):
    """Return the number of null entries per column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.Series
        Null count for each column, indexed by column name.
    """
    null_mask = data.isnull()
    return null_mask.sum()

missing_values(df_features)

# Prepare the data

**Feature encoding**

In [None]:
# get all categorical feature
obj_cols = df_features.select_dtypes(include=['object']).columns

In [None]:
# get dummies for string features
df_features = pd.get_dummies(df_features, columns=obj_cols, drop_first=True)

**Train/test split**

In [None]:
y = df_features['y']
X = df_features.drop(['y'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Sampling our data**

In [None]:
from imblearn.under_sampling import ClusterCentroids

In [None]:
cc = ClusterCentroids(random_state=0)

In [None]:
X_train, y_train = cc.fit_resample(X_train, y_train)

**Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# learn the scaling statistics from the training data only, then apply the
# very same transformation to the test data; without the second step the
# model is trained on standardized features but evaluated on raw ones
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Building the machine learning model

In [None]:
# train the model and use it to predict the label for unseen data
def fit_ml_algo(algo, X_train, y_train, X_test, y_test):
    """Fit ``algo`` on the training split and score it on the test split.

    Parameters
    ----------
    algo : object
        Estimator exposing ``fit(X, y)`` and whose fitted result exposes
        ``predict(X)``.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Features and true labels used for evaluation.

    Returns
    -------
    tuple
        ``(acc, cf_matrix, precision, recall, f1)`` where ``acc`` is the
        accuracy as a percentage rounded to two decimals, ``cf_matrix`` is
        the confusion matrix, and the remaining entries are the precision,
        recall and F1 scores of the predictions.
    """
    model = algo.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # every metric is called as (y_true, y_pred); recall uses recall_score
    # directly instead of precision_score with swapped arguments
    acc = round(accuracy_score(y_test, y_pred) * 100, 2)
    cf_matrix = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return acc, cf_matrix, precision, recall, f1

In [None]:
# fit a gradient boosting classifier and collect its test-set metrics
acc, cf_matrix, precision, recall, f1 = fit_ml_algo(GradientBoostingClassifier(), X_train, y_train, X_test, y_test)
# draw the confusion matrix as an annotated heatmap
ax = sns.heatmap(cf_matrix, annot=True, fmt='g') # the keyword is "annot", not "annote"
# move the first y-limit up by 0.5 and the second down by 0.5
# NOTE(review): presumably a workaround for heatmap rows being clipped by
# some matplotlib releases -- confirm it is still needed
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
# report the scalar metrics returned by fit_ml_algo
print("Accuracy: ", acc)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)

**Model: Logistic Regression**