In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Both source files are semicolon-delimited, so they share the same read options.
CSV_OPTIONS = {'sep': ';'}

# Full campaign data set (used for exploration and model fitting below).
train = pd.read_csv('../input/bank-marketing-campaign/bank-full - Copy.csv', **CSV_OPTIONS)
# Second file shipped with the data set.
test = pd.read_csv('../input/bank-marketing-campaign/bank.csv', **CSV_OPTIONS)

# **Data Exploration**

In [None]:
# Show the first rows of each frame with the rich notebook renderer.
for frame in (train, test):
    display(frame.head())

In [None]:
# DataFrame.info() prints its summary and returns None, so it is called as a
# statement for each frame; building a tuple from the two calls made the cell
# echo a meaningless `(None, None)` after the summaries.
for frame in (train, test):
    frame.info()

> No null values were detected in either data set.

***Dividing Columns into Numeric Columns and Object Columns***

In [None]:
# Partition the column names by dtype: 'object' columns are the categorical
# candidates, 'int64' columns are the numeric ones. Any other dtype is ignored.
obj_columns = [col for col, dtype in train.dtypes.items() if dtype == 'object']
int_columns = [col for col, dtype in train.dtypes.items() if dtype == 'int64']

In [None]:
# Report how many distinct values each object column holds; this drives the
# choice of encoding for every column.
unique_counts = train[obj_columns].nunique()
for column_name, n_unique in unique_counts.items():
    print(f'Number of Unique Values in {column_name} column: ', n_unique)

**Encoding Plan** <br>
* **One Hot Encoding** : 'job, marital, contact, poutcome' <br>
* **Binary Encoding** : 'loan, housing, default' <br>
* **Label Encoding**: 'y' <br>
* **Ordinal Encoding**: 'education' <br>
* **Sin/Cosine Encoding**: 'month' <br>

# **Data Visualization**

> **Distribution of the Categorical Features**

In [None]:
def graph(name, u):
    """Draw a labelled bar chart of the value counts of ``train[name]``.

    Parameters
    ----------
    name : str
        Column of the global ``train`` frame to summarise; also used as the
        subplot title.
    u : matplotlib Axes
        Axes the bars are drawn on.

    Notes
    -----
    Relies on the module-level ``train`` frame and ``colors`` palette, which
    must exist by the time the function is called.
    """
    counts = train[name].value_counts()
    counts.plot(kind="bar", ax=u, color=colors)

    # Keep the category names horizontal and put the column name on top.
    plt.setp(u.get_xticklabels(), rotation=0)
    u.set_title(name, fontsize=11, fontdict={"fontweight": "bold"})

    # Write each bar's count just above it, centred on the bar.
    for bar in u.patches:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width() / 2
        u.annotate(str(int(height)), (x_center, height + 100),
                   ha="center", va='center', fontsize=8, fontweight="bold")

###############################################################################
# EXPLORATORY DATA ANALYSIS

# 4 x 2 grid: one panel per categorical column.
fig2, ax2 = plt.subplots(4,2, figsize=(11, 10), gridspec_kw={"wspace" : 0.4, "hspace" : 0.3, "top": 0.95})

# Bar palette read by graph(); it has to be defined before the first call.
colors=["#ff0000","#ff8000","#ffff00","#80ff00","#00ff00", "#00ff80", "#00ffff", "#0080ff", "#0000ff", "#8000ff", "#ff00ff", "#ff0080"]

# Columns listed in row-major panel order (left to right, top to bottom),
# which is the order ax2.flat walks the grid.
panel_columns = ["loan", "marital", "education", "default",
                 "contact", "poutcome", "month", "housing"]
for column_name, panel in zip(panel_columns, ax2.flat):
    graph(column_name, panel)
plt.rcParams['axes.axisbelow'] = True

> Referred to @datark1's notebook on Mushroom Classification. 'Mushrooms - EDA, logistic regression, features'

In [None]:
# Distinct job labels, in order of first appearance in the data.
jobs = train['job'].drop_duplicates().tolist()

In [None]:
# Build one tick label per job: the job name plus, on a second line, the
# percentage of clients with that job who subscribed (y == 'yes').
labels1 = []
for job_name in jobs:
    has_job = train['job'] == job_name
    subscribed = has_job & (train['y'] == 'yes')
    rate = round(len(train[subscribed]) / len(train[has_job]) * 100, 2)
    labels1.append(job_name + '\n' + str(rate) + '%')

In [None]:
# Subscription counts per job, with the subscription rate in each tick label.
fig, ax = plt.subplots(figsize=(10, 5))

# `labels1` was built by iterating over `jobs`, so the bars are drawn in that
# same explicit order; otherwise the labels only line up with the bars if the
# plotting library happens to pick the identical category order.
sns.countplot(x=train['job'], hue=train['y'], order=jobs, ax=ax)

# One tick per job, at positions 0..len(jobs)-1, relabelled with the rates.
ax.set_xticks(range(len(jobs)))
ax.set_xticklabels(labels1, rotation=45)
ax.set_xlabel('Job (Subscription Rate)');

# **Data Preprocessing**

> Work on a copy of the training data frame so that the original `train` stays untouched.

In [None]:
# Deep copy: preprocessing below must not modify the frame used for the plots.
c_train = train.copy(deep=True)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from numpy import asarray
from sklearn.preprocessing import StandardScaler

In [None]:
# Encoder instances reused by the preprocessing cells below.
l_encoder = preprocessing.LabelEncoder()      # target column 'y'
b_encoder = preprocessing.LabelBinarizer()    # two-valued feature columns
o_encoder = OrdinalEncoder()                  # 'education'

In [None]:
# Target vector and feature matrix (every column except the target).
y = c_train['y']
X = c_train.drop(columns=['y'])

> One Hot Encoding

In [None]:
# Nominal columns that are expanded into dummy (one-hot) columns.
oh_list = [
    'job',
    'marital',
    'contact',
    'poutcome',
]

In [None]:
# get_dummies options per column: 'contact' and 'poutcome' keep every level
# under a short prefix; all other columns use the bare category names and drop
# the first level.
dummy_options = {
    'contact': {'prefix': 'ct'},
    'poutcome': {'prefix': 'pc'},
}
default_options = {'drop_first': True}

for column_name in oh_list:
    dummies = pd.get_dummies(X[column_name], **dummy_options.get(column_name, default_options))
    # Append the dummy columns, then remove the original categorical column.
    X = pd.concat([X, dummies], axis=1)
    del X[column_name]
X.head()

>  

> Ordinal Encoding

In [None]:
# Encode 'education' as ordinal codes. The encoder expects a 2-D input, hence
# the reshape to a single column.
# NOTE(review): the encoder is used with its default category ordering; confirm
# that this ordering matches the intended ranking of the education levels
# (including where any "unknown"-style label ends up).
education_codes = o_encoder.fit_transform(X['education'].to_numpy().reshape(-1, 1))

# Flatten with ravel() instead of reshaping to a hard-coded row count, so the
# cell keeps working when the number of rows in the data changes.
X['education'] = education_codes.ravel()
X.head()

> Binary Encoding

In [None]:
# Feature columns that are converted to a single binary indicator each.
b_list = [
    'loan',
    'housing',
    'default',
]

In [None]:
# Replace each listed column by the output of the label binarizer, refitting
# the shared encoder on every column in turn.
for binary_col in b_list:
    X[binary_col] = b_encoder.fit_transform(X[binary_col])

X.head()

> Label Encoding

In [None]:
# Turn the target labels into integer class codes and display the result.
y = l_encoder.fit_transform(y)
y

> Sin/Cosine Encoding (Month)

**The values of the 'month' column should not be treated as plain numbers: December is not "12 times" January, and December and January are exactly as far apart as January and February. To reflect this cyclical structure, I convert the month number with a sin() function so that neighbouring months, including December and January, stay close to each other.**

In [None]:
import math

# Calendar position of each month abbreviation.
month_numbers = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
                 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
                 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
month_index = X['month'].map(month_numbers)

# .map() silently produces NaN for anything missing from the mapping (an
# unexpected label, or a column already encoded by an earlier run of this
# cell). Stop here with a clear message instead of feeding NaN to the models.
unmapped = month_index.isna()
if unmapped.any():
    bad_values = sorted(X.loc[unmapped, 'month'].astype(str).unique())
    raise ValueError(f"'month' contains values that are not month abbreviations: {bad_values}")

# Angle of each month on a 12-step circle.
month_angle = 2 * math.pi * month_index / 12

# The sine alone cannot tell apart months that mirror each other on the circle
# (e.g. jan and may both map to 0.5), so the cosine is stored as well; the pair
# (sin, cos) identifies every month uniquely. 'month' keeps the sine component,
# the cosine goes into the new 'month_cos' column.
X['month'] = month_angle.apply(math.sin)
X['month_cos'] = month_angle.apply(math.cos)

In [None]:
# Numeric columns that are standardised before modelling.
scale_col = [
    'age',
    'balance',
    'duration',
    'day',
]

In [None]:
# Standardise the listed columns. StandardScaler works column by column, so a
# single call over all of them gives the same result as scaling each column on
# its own, and it no longer depends on a hard-coded number of rows.
# The fitted scaler is kept so the same transformation can be reapplied later.
# NOTE(review): the scaler is fitted before the train/test split further down,
# so statistics of the held-out rows leak into the features; consider fitting
# it on the training part only.
feature_scaler = StandardScaler()
X[scale_col] = feature_scaler.fit_transform(X[scale_col])
X.head()

# **Machine Learning**

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score
from sklearn import metrics

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=41,
)

In [None]:
# Logistic regression baseline.
model1 = LogisticRegression(max_iter=1000)
model1.fit(X_train, y_train)
pred1 = model1.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, but the
# classification report is not: with the arguments swapped, precision and
# recall are reported the wrong way round.
acc1 = accuracy_score(y_test, pred1)
print(classification_report(y_test, pred1))
print(acc1)

In [None]:
# Gaussian naive Bayes.
model2 = GaussianNB()
model2.fit(X_train, y_train)
pred2 = model2.predict(X_test)

# Metrics take (y_true, y_pred); swapping them exchanges precision and recall
# in the classification report.
acc2 = accuracy_score(y_test, pred2)
print(classification_report(y_test, pred2))
print(acc2)

In [None]:
# k-nearest neighbours with the library defaults.
model3 = KNeighborsClassifier()
model3.fit(X_train, y_train)
pred3 = model3.predict(X_test)

# Metrics take (y_true, y_pred); swapping them exchanges precision and recall
# in the classification report.
acc3 = accuracy_score(y_test, pred3)
print(classification_report(y_test, pred3))
print(acc3)

In [None]:
# Depth- and leaf-size-limited decision tree. The seed (same value as the
# train/test split) makes the fitted tree reproducible between runs.
model4 = DecisionTreeClassifier(max_depth=10, min_samples_leaf=15, random_state=41)
model4.fit(X_train, y_train)
pred4 = model4.predict(X_test)

# Metrics take (y_true, y_pred); swapping them exchanges precision and recall
# in the classification report.
acc4 = accuracy_score(y_test, pred4)
print(classification_report(y_test, pred4))
print(acc4)

In [None]:
# Random forest with default hyper-parameters. The seed (same value as the
# train/test split) makes the score reproducible between runs.
model5 = RandomForestClassifier(random_state=41)
model5.fit(X_train, y_train)
pred5 = model5.predict(X_test)

# Metrics take (y_true, y_pred); swapping them exchanges precision and recall
# in the classification report.
acc5 = accuracy_score(y_test, pred5)
print(classification_report(y_test, pred5))
print(acc5)

In [None]:
# Support vector classifier with the library defaults.
model6 = SVC()
model6.fit(X_train, y_train)
pred6 = model6.predict(X_test)

# Metrics take (y_true, y_pred); swapping them exchanges precision and recall
# in the classification report.
acc6 = accuracy_score(y_test, pred6)
print(classification_report(y_test, pred6))
print(acc6)

In [None]:
# Sanity check: number of feature columns vs. number of distinct column names.
# The two values must match, i.e. the dummy encoding created no duplicate names.
X_test.shape[1], X_test.columns.nunique()

In [None]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters.
model7 = XGBClassifier()
model7.fit(X_train, y_train)
pred7 = model7.predict(X_test)

# Metrics take (y_true, y_pred); swapping them exchanges precision and recall
# in the classification report.
acc7 = accuracy_score(y_test, pred7)
print(classification_report(y_test, pred7))
print(acc7)

# **Feature Importance**

In [None]:
# Pair every logistic-regression coefficient with its feature name and sort
# from the most negative to the most positive coefficient.
importance = (
    pd.DataFrame({"value": model1.coef_[0], "name": X_train.columns})
    .sort_values("value")
    .set_index("name")
)

# TOP20 FACTORS: the ten most negative followed by the ten most positive.
most_negative = importance["value"].head(10)
most_positive = importance["value"].tail(10)
top20 = pd.concat([most_negative, most_positive])

fig, ax = plt.subplots(figsize=(12,5), gridspec_kw={"top": 0.90, "bottom":0.05, "left":0.2})

top20.plot.barh(ax=ax)

plt.rcParams['axes.axisbelow'] = True
ax.set_ylabel("variable name")
ax.grid(True)
ax.set_title("Classification - TOP20 features (importance)")

> Referred to @datark1's notebook on Mushroom Classification. 'Mushrooms - EDA, logistic regression, features'

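**According to the logistic-regression coefficients, a successful previous campaign outcome (poutcome = success) and the call duration have the strongest positive effect on the prediction, while an unknown previous outcome (poutcome = unknown) and an unknown contact type (contact = unknown) have the strongest negative effect.**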

# **Model Comparison Table**

In [None]:
# One (model name, test accuracy) record per fitted classifier.
model_scores = [
    ('Logistic Regression', acc1),
    ('Naive Bayes', acc2),
    ('KNN', acc3),
    ('Decision Tree', acc4),
    ('Random Forest Tree', acc5),
    ('SVC', acc6),
    ('XGB', acc7),
]
acc_table = pd.DataFrame(model_scores, columns=['Model', 'Accuracy Score'])

# Best model first, shaded by score.
acc_table = acc_table.sort_values(by='Accuracy Score', ascending=False)
acc_table.style.background_gradient(cmap='Blues')

# **ROC Curve of the Best Model**

In [None]:
# Plot the ROC curve of the XGBoost model. `plot_roc_curve` no longer exists in
# recent scikit-learn releases, where RocCurveDisplay.from_estimator replaces
# it; fall back to the old helper only when the new API is unavailable.
if hasattr(metrics, 'RocCurveDisplay') and hasattr(metrics.RocCurveDisplay, 'from_estimator'):
    metrics.RocCurveDisplay.from_estimator(model7, X_test, y_test)
else:
    metrics.plot_roc_curve(model7, X_test, y_test)

# The area under the curve must be computed from scores, not from hard 0/1
# predictions; labels collapse the curve to a single operating point and give
# a value that disagrees with the plot. Use the positive-class probability.
proba7 = model7.predict_proba(X_test)[:, 1]
print('roc_auc_score is: ', roc_auc_score(y_test, proba7))