In [None]:
# Get libraries
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, accuracy_score
import time
from sklearn.cluster import KMeans

In [None]:
# Read the bank-marketing campaign data (comma-separated) into a DataFrame.
# The path is relative to the notebook's working directory.
df1 = pd.read_csv('../input/bank-marketing-dataset/bank.csv', sep=',')

# Exploratory Data Analysis

In [None]:
# Preview the first rows of the raw data
df1.head()

In [None]:
# Preview the last rows of the raw data
df1.tail()

In [None]:
# Structural summary: column dtypes, non-null counts and memory usage.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
df1.info()
print()

# Summary statistics of the numeric columns
print(df1.describe(), '\n')
print('The shape of the data matrix: ', df1.shape)

In [None]:
# Give the target column the conventional short name 'y'
df1 = df1.rename(columns={'deposit': 'y'})

In [None]:
# Count the number of rows for each outcome ('y' value) to see the class balance
print(df1.groupby('y').size())

In [None]:
# Encode the outcome as a 0/1 integer column named 'deposit' (1 == 'yes'),
# then discard the original string column 'y'.
is_subscriber = df1['y'].eq('yes')
df1['deposit'] = is_subscriber.astype('int')
df1.drop(columns='y', inplace=True)

In [None]:
# Share of customers that subscribed (deposit == 1)
n_positive = sum(df1['deposit'])
n_total = len(df1['deposit'])
print(n_positive / n_total)

# Feature Engineering

In [None]:
# Numeric features used as model inputs
cols_num = ['age', 'balance', 'day', 'duration', 'campaign']
# Uncomment to check the numeric columns for missing values:
#print(df1[cols_num].isnull().sum())

# Categorical and binary (yes/no) features; these are one-hot encoded further below.
# NOTE(review): any column of df1 not listed in cols_num or cols_cat is left out of the
# model inputs — confirm that every omission is intentional.
cols_cat = ['job', 'marital', 'education', 'contact', 'month', 'default', 'housing', 'loan']
# Uncomment to check the categorical columns for missing values:
#print(df1[cols_cat].isnull().sum())

In [None]:
# List the levels of every categorical feature together with their row counts,
# largest group first.
for feature in cols_cat:
    level_sizes = df1.groupby(feature).size()
    print(level_sizes.sort_values(ascending=False), '\n')

In [None]:
# One-hot encode the categorical features, keeping every level (drop_first=False),
# and append the indicator columns to the original frame.
cols_new_cat = pd.get_dummies(df1[cols_cat], drop_first=False)
df2 = pd.concat([df1, cols_new_cat], axis='columns')

In [None]:
# Preview the frame that holds both the original and the one-hot encoded columns
df2.head()

In [None]:
# Names of all one-hot indicator columns, kept as a plain list for later selection
cols_all_cat = cols_new_cat.columns.tolist()

In [None]:
# Model matrix: numeric features, then the encoded categoricals, with the
# target 'deposit' as the last column.
cols_input = [*cols_num, *cols_all_cat]
df_encoded = df2[[*cols_input, 'deposit']]

In [None]:
# Preview the fully encoded model matrix (features + 'deposit')
df_encoded.head()

# Pre-processing

In [None]:
# Split into features (all but the last column) and target (the last column,
# which is 'deposit' by construction of df_encoded).
X, y = df_encoded.iloc[:, :-1], df_encoded.iloc[:, -1]

In [None]:
# Standardize every feature to zero mean / unit variance and keep the column labels.
# NOTE(review): the scaler is fitted on the full data set before cross-validation, so
# statistics of the validation folds leak into training; wrapping scaler + model in a
# sklearn Pipeline would avoid that.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

# Stratified KFold

When splitting the data into folds, I want each fold to be a good representative of the whole data set. Because term deposit subscribers are underrepresented in this dataset (7.24%), the class distribution has to be preserved in each of the five folds.

In [None]:
# Stratified CV splits the data into k folds while preserving the class proportions
# of the target in every fold, so each fold is representative of the original data.
# Every instance is weighted equally (no term deposit vs. term deposit).
# Note: shuffle is left at its default, so the folds are built from the rows in their
# stored order and random_state has no effect.

skf = StratifiedKFold(n_splits=5, random_state=None)

# Example of iterating over the folds manually (positional indexing via .iloc,
# because X_scaled is a DataFrame and y is a Series):
# skf.get_n_splits(X_scaled, y)
# for train_index, test_index in skf.split(X_scaled, y):
#    print('TRAIN:', train_index, 'TEST:', test_index)
#    X_train, X_test = X_scaled.iloc[train_index], X_scaled.iloc[test_index]
#    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


# Model Selection

In [None]:
# Baseline models: maps a model label to its mean cross-validated accuracy in percent.
# The cells below fill it, one entry per model.
baseline_model_dict = {}

In [None]:
# K Nearest Neighbors (KNN) with default settings, scored by stratified 5-fold CV
knn = KNeighborsClassifier()
knn_results = cross_val_score(knn, X_scaled, y, cv=skf)
knn_accuracy_pct = knn_results.mean() * 100.0
baseline_model_dict['KNN'] = knn_accuracy_pct
print("KNN Accuracy: %.2f%%" % knn_accuracy_pct)

In [None]:
# Logistic Regression with default settings, scored by stratified 5-fold CV
lr = LogisticRegression()
lr_results = cross_val_score(lr, X_scaled, y, cv=skf)
lr_accuracy_pct = lr_results.mean() * 100.0
baseline_model_dict['LogReg'] = lr_accuracy_pct
print("LogReg Accuracy: %.2f%%" % lr_accuracy_pct)

In [None]:
# Support Vector Classifier with default settings, scored by stratified 5-fold CV
svc = SVC()
svc_results = cross_val_score(svc, X_scaled, y, cv=skf)
svc_accuracy_pct = svc_results.mean() * 100.0
baseline_model_dict['SVC'] = svc_accuracy_pct
print("SVC Accuracy: %.2f%%" % svc_accuracy_pct)

In [None]:
# Gaussian Naive Bayes with default settings, scored by stratified 5-fold CV
nb = GaussianNB()
nb_results = cross_val_score(nb, X_scaled, y, cv=skf)
nb_accuracy_pct = nb_results.mean() * 100.0
baseline_model_dict['NB'] = nb_accuracy_pct
print("NB Accuracy: %.2f%%" % nb_accuracy_pct)

In [None]:
# Decision Tree with default hyper-parameters, scored by stratified 5-fold CV.
# random_state is fixed so the reported accuracy is reproducible across runs
# (tree construction breaks ties between equally good splits at random).
tree = DecisionTreeClassifier(random_state=42)
tree_results = cross_val_score(tree, X_scaled, y, cv=skf)
print("Decision Tree Accuracy: %.2f%%" % (tree_results.mean() * 100.0))
baseline_model_dict['Decision Tree'] = tree_results.mean() * 100.0

In [None]:
# Random Forest with default hyper-parameters, scored by stratified 5-fold CV.
# random_state is fixed (as for the tuned model further below) so the bootstrap
# samples and feature subsets, and therefore the accuracy, are reproducible.
rf = RandomForestClassifier(random_state=42)
rf_results = cross_val_score(rf, X_scaled, y, cv=skf)
print("Random Forest Accuracy: %.2f%%" % (rf_results.mean() * 100.0))
baseline_model_dict['Random Forest'] = rf_results.mean() * 100.0

In [None]:
# Gradient Boosting Classifier with default hyper-parameters, scored by stratified 5-fold CV.
# random_state is fixed (as for the tuned model further below) so the result is reproducible.
gbc = GradientBoostingClassifier(random_state=42)
gbc_results = cross_val_score(gbc, X_scaled, y, cv=skf)
print("GBC Accuracy: %.2f%%" % (gbc_results.mean() * 100.0))
baseline_model_dict['GBC'] = gbc_results.mean() * 100.0

In [None]:
# XGBoost Classifier with default settings, scored by stratified 5-fold CV
xgb = XGBClassifier()
xgb_results = cross_val_score(xgb, X_scaled, y, cv=skf)
xgb_accuracy_pct = xgb_results.mean() * 100.0
baseline_model_dict['XGB'] = xgb_accuracy_pct
print("XGB Accuracy: %.2f%%" % xgb_accuracy_pct)

In [None]:
# Plot baseline model accuracies as a bar chart, one bar per model.
# The dict views are materialised as lists: matplotlib expects real sequences,
# and lists keep labels and heights aligned by position.
keys = list(baseline_model_dict.keys())
values = list(baseline_model_dict.values())

fig1, ax1 = plt.subplots()
ax1.bar(keys, values)
ax1.set_xlabel('Baseline Models')
ax1.set_ylabel('Accuracy(%)')
# Rotate the tick labels that bar() already created instead of re-setting them;
# set_xticklabels() without fixed tick positions triggers a matplotlib warning.
ax1.tick_params(axis='x', labelrotation=70)
plt.show()

 It can be concluded that 'Stochastic Gradient Descent' base model has the highest accuracy score with 92.47%.

# Hyperparameter Tuning

I am only going to optimize the hyper parameters for stochastic gradient descent, random forest, and gradient boosting classifiers. I will not optimize KNN because it took a while to train. I will not optimize logistic regression since it performs similarly to stochastic gradient descent. Similarly, I will not optimize decision trees since they tend to overfit and perform worse than random forests and gradient boosting classifiers.

In [None]:
# Maps a model label to the mean cross-validated accuracy (in percent) of its tuned version
tuned_model_dict = {}

### Random Forest 

In [None]:
# Search space for the random forest.

# number of trees
n_estimators = range(200, 1000, 200)

# maximum number of features to use at each split.
# 'auto' was an alias of 'sqrt' for classifiers (so the old grid held the same
# setting twice) and is no longer accepted by current scikit-learn releases;
# 'log2' is offered as the genuinely different alternative.
max_features = ['sqrt', 'log2']

# maximum depth of the tree
max_depth = range(2, 20, 2)

# criterion for evaluating a split
criterion = ['gini', 'entropy']

# Random Grid
random_grid_rf = {'n_estimators': n_estimators,
                  'max_features': max_features,
                  'max_depth': max_depth,
                  'criterion': criterion}

# Accuracy scorer object; the searches below select accuracy via scoring='accuracy'.
acc = make_scorer(accuracy_score)

In [None]:
# Randomized hyper-parameter search for the random forest (100 sampled settings,
# stratified 5-fold CV, all cores).
# The estimator itself is seeded as well: without it each candidate is fitted with a
# different random forest on every run and best_params_ is not reproducible.
rf = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid_rf,
                                scoring='accuracy', n_iter=100,
                                cv=skf, verbose=1, random_state=42, n_jobs=-1)

# Wall-clock duration of the search, in seconds
t1 = time.time()
rf_random.fit(X_scaled, y)
t2 = time.time()
print(t2-t1)

print(rf_random.best_params_)

In [None]:
# Tuned Random Forest, scored by stratified 5-fold CV.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from
# rf_random.best_params_ — re-check them whenever the search is re-run.
rf_tuned = RandomForestClassifier(criterion='entropy', max_depth=2, max_features='sqrt',
                                  n_estimators=600, random_state=42)
rf_tuned_results = cross_val_score(rf_tuned, X_scaled, y, cv=skf)
rf_tuned_accuracy_pct = rf_tuned_results.mean() * 100.0
tuned_model_dict['Random Forest'] = rf_tuned_accuracy_pct
print("Tuned Random Forest Accuracy: %.2f%%" % rf_tuned_accuracy_pct)

In [None]:
# Get feature importances
def plot_feature_importance(importance, names, model_type, top_n=None):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : array-like
        Importance score of each feature (e.g. an estimator's ``feature_importances_``).
    names : array-like
        Feature names, in the same order and of the same length as ``importance``.
    model_type : str
        Model label; used as the prefix of the chart title.
    top_n : int, optional
        If given, only the ``top_n`` most important features are plotted.
        The default (None) plots every feature.

    Returns
    -------
    None
        The chart is drawn on a newly created 10x8 inch matplotlib figure.
    """
    # Pair every name with its score; pandas raises if the lengths differ
    fi_df = pd.DataFrame({'feature_names': np.array(names),
                          'feature_importance': np.array(importance)})

    # Most important feature first (new frame, no in-place mutation)
    fi_df = fi_df.sort_values(by='feature_importance', ascending=False)
    if top_n is not None:
        fi_df = fi_df.head(top_n)

    # Explicit figure/axes so the chart never lands on a previously active axes
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.barplot(x='feature_importance', y='feature_names', data=fi_df, ax=ax)
    ax.set_title(model_type + ' Feature Importance')
    ax.set_xlabel('Feature Importance')
    ax.set_ylabel('Feature Names')

In [None]:
# Refit the tuned random forest on the full data and chart its feature importances.
# fit() returns the estimator itself, so results_rf and rf_tuned are the same object.
rf_tuned.fit(X_scaled, y)
results_rf = rf_tuned
plot_feature_importance(rf_tuned.feature_importances_, X_scaled.columns, 'Tuned Random Forest')

### Gradient Boosting Classifier(GBC)

In [None]:
# Search space for the gradient boosting classifier.
# Note: n_estimators and max_depth re-bind the names used for the random forest grid above.

# number of trees (boosting stages): 50, 100, 150
n_estimators = range(50, 200, 50)

# maximum depth of the individual trees: 1 to 4
max_depth = range(1, 5, 1)

# learning rate (shrinkage applied to each tree's contribution)
learning_rate = [0.001, 0.01, 0.1]

random_grid_gbc = {'n_estimators': n_estimators,
                  'max_depth': max_depth,
                  'learning_rate': learning_rate}

In [None]:
# Randomized hyper-parameter search for gradient boosting (20 sampled settings,
# stratified 5-fold CV).
# Both the estimator and the search are seeded, and all cores are used, to match
# the random forest search; unseeded, best_params_ changes from run to run.
gbc = GradientBoostingClassifier(random_state=42)
gbc_random = RandomizedSearchCV(estimator=gbc, param_distributions=random_grid_gbc,
                                n_iter=20, cv=skf, scoring='accuracy',
                                random_state=42, n_jobs=-1)

# Wall-clock duration of the search, in seconds
t1 = time.time()
gbc_random.fit(X_scaled, y)
t2 = time.time()
print(t2-t1)

print(gbc_random.best_params_)

In [None]:
# Tuned Gradient Boosting Classifier, scored by stratified 5-fold CV.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from
# gbc_random.best_params_ — re-check them whenever the search is re-run.
gbc_tuned = GradientBoostingClassifier(learning_rate=0.001, max_depth=2,
                                       n_estimators=100, random_state=42)
gbc_tuned_results = cross_val_score(gbc_tuned, X_scaled, y, cv=skf)
gbc_tuned_accuracy_pct = gbc_tuned_results.mean() * 100.0
tuned_model_dict['GBC'] = gbc_tuned_accuracy_pct
print("Tuned GBC Accuracy: %.2f%%" % gbc_tuned_accuracy_pct)

In [None]:
# Refit the tuned gradient boosting model on the full data and chart its feature importances.
# fit() returns the estimator itself, so results_gbc and gbc_tuned are the same object.
gbc_tuned.fit(X_scaled, y)
results_gbc = gbc_tuned
plot_feature_importance(gbc_tuned.feature_importances_, X_scaled.columns, 'Tuned  Gradient Boosting')

### XGBoost Classifier

In [None]:
# Search space for the XGBoost classifier.
# Note: n_estimators, max_depth and learning_rate re-bind the names used for the grids above.

# number of trees (boosting rounds): 50, 100, 150
n_estimators = range(50, 200, 50)

# maximum depth of the individual trees: 1 to 4
max_depth = range(1, 5, 1)

# learning rate
learning_rate = [0.001, 0.01, 0.1]

# fraction of the feature columns sampled when building each tree
colsample_bytree = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

# gamma, sampled from a continuous uniform distribution on [0, 10]
gamma = st.uniform(0, 10)

random_grid_xgb = {'n_estimators': n_estimators,
                  'max_depth': max_depth,
                  'learning_rate': learning_rate,
                  'colsample_bytree': colsample_bytree,
                  'gamma': gamma}

In [None]:
# Randomized hyper-parameter search for XGBoost (20 sampled settings, stratified 5-fold CV).
# The search is seeded so that the sampled settings — including the continuous
# 'gamma' draws — and therefore best_params_ are reproducible.
xgb = XGBClassifier()
xgb_random = RandomizedSearchCV(estimator=xgb, param_distributions=random_grid_xgb,
                                n_iter=20, cv=skf, scoring='accuracy',
                                random_state=42)

# Wall-clock duration of the search, in seconds
t1 = time.time()
xgb_random.fit(X_scaled, y)
t2 = time.time()
print(t2-t1)

print(xgb_random.best_params_)

In [None]:
# Tuned XGBoost Classifier, scored by stratified 5-fold CV.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from
# xgb_random.best_params_ — re-check them whenever the search is re-run.
xgb_tuned = XGBClassifier(objective= 'binary:logistic', colsample_bytree=0.2, gamma=0.01,
                         learning_rate=0.01, max_depth=3, n_estimators=150)
xgb_tuned_results = cross_val_score(xgb_tuned, X_scaled, y, cv=skf)
xgb_tuned_accuracy_pct = xgb_tuned_results.mean() * 100.0
tuned_model_dict['XGB'] = xgb_tuned_accuracy_pct
print("Tuned XGB Accuracy: %.2f%%" % xgb_tuned_accuracy_pct)

In [None]:
# Refit the tuned XGBoost model on the full data and chart its feature importances.
# fit() returns the estimator itself, so results_xgb and xgb_tuned are the same object.
xgb_tuned.fit(X_scaled, y)
results_xgb = xgb_tuned
plot_feature_importance(xgb_tuned.feature_importances_, X_scaled.columns, 'Tuned XGBoost')

### Evaluation

In [None]:
# Aggregate the results and compare to the baseline models.
# Rows are models, columns are 'Base' / 'Tuned'; a model missing from one of the
# dictionaries gets NaN in that column and therefore no bar.
df_accuricies = pd.DataFrame([baseline_model_dict, tuned_model_dict], index=['Base','Tuned']).transpose()

fig2, ax2 = plt.subplots(1, 1, figsize=(6, 5))

df_accuricies.plot.bar(ax=ax2)
ax2.set_ylabel('Accuracy(%)')
ax2.set_xlabel('All Models')
# Take the tick labels from the plotted frame itself so labels and bars cannot drift apart
ax2.set_xticklabels(df_accuricies.index, rotation=70)

fig2.tight_layout()
# plt.show() instead of fig2.show(): Figure.show() needs a GUI backend and only
# emits a warning under the inline notebook backend.
plt.show()


Looking at the results, we can see that hyperparameter tuning improved the models. Random Forest and Gradient Boosting are powerful algorithms that are frequently used for this kind of classification problem. The Tuned Random Forest and the Tuned Gradient Boosting Classifier share the highest accuracy with 92.76%. The Tuned Stochastic Gradient Descent Classifier is also very successful with 92.64% prediction accuracy.

# Bonuses

## Exploratory Data Analysis

In [None]:
# Examine the numerical data distributions

# Work on a copy so the EDA columns added below do not alter df1
df_main = df1.copy()
# One 20-bin histogram per numeric column
df_main.hist(bins=20, figsize=(14,10))
plt.show()

In [None]:
# Share of positive ('yes') and negative ('no') respondents
yes_ratio = sum(df_main['deposit']) / len(df_main['deposit'])
print("The ratio of outcome 'yes':" , yes_ratio)
print("The ratio of outcome 'no':", 1 - yes_ratio)

##### Notes:

92.76% refused to subscribe to a term deposit, while 7.24% agreed to subscribe.

In [None]:
# Create a pairplot in order to see pairwise distributions of data points,
# coloured by outcome (deposit 0 / 1)
sns.pairplot(df_main, hue='deposit')

##### Notes:
- For most variable pairs, the two classes overlap heavily in the pair plot.
- The age-campaign and day-campaign pair plots separate the classes much better, with very few overlaps.

### Age Variable

In [None]:
# Categorize the customers by age decade: 20 (<30), 30, 40, 50 and 60 (>=60).
# Bins are left-closed, e.g. [30, 40) -> 30.
age_edges = [-np.inf, 30, 40, 50, 60, np.inf]
age_decades = [20, 30, 40, 50, 60]

age_binned = pd.cut(df_main['age'], bins=age_edges, labels=age_decades, right=False)
df_main['age_cat'] = age_binned.astype(np.int64)

In [None]:
# Share (%) of each outcome within every age category; every column sums to 100
age_outcome_share = pd.crosstab(df_main['deposit'], df_main['age_cat'], normalize='columns')
table_age_cat = (age_outcome_share * 100).round(2)
table_age_cat

In [None]:
# Number of customers in each age category (most frequent first)
print(df_main['age_cat'].value_counts())

In [None]:
# Bar chart with the number of customers per age category
fig6, ax6 = plt.subplots()
sns.countplot(data=df_main, x='age_cat', ax=ax6)
ax6.set_xlabel('Age Categories')
ax6.set_title('Age Categories', fontsize=10)
plt.show()

In [None]:
# Deposits by age: kernel density estimate of the age distribution per outcome
fig7, ax7 = plt.subplots(figsize=(12, 4))

# fill=True replaces the deprecated shade=True
sns.kdeplot(df_main.loc[(df_main['deposit'] == 0), 'age'],
            fill=True, label='Refused', ax=ax7)
sns.kdeplot(df_main.loc[(df_main['deposit'] == 1), 'age'],
            fill=True, label='Accepted', ax=ax7)

# A KDE curve shows a probability density, not a frequency
ax7.set(xlabel='Age', ylabel='Density')
# The labels above are only rendered once a legend is requested
ax7.legend()
plt.title('Deposits by Age')
plt.show()

#### Notes:
- Most of the customers the bank targeted are 30-39 years old.
- There are many customers in their 40s and 50s, but their deposit subscription rate is low.
- The youngest and oldest population segments were the most likely to open a term deposit account, with 10.88% and 18.02% respectively.


### Job Variable

In [None]:
# Share (%) of each outcome within every job category; every column sums to 100
job_outcome_share = pd.crosstab(df_main['deposit'], df_main['job'], normalize='columns')
table_job_cat = (job_outcome_share * 100).round(2)
table_job_cat

In [None]:
# Outcome share per job category as grouped bars.
# Reshape the percentage table to long form: one row per (deposit, job) pair.
stacked = table_job_cat.stack().reset_index(name='value')

fig7 = plt.figure(figsize=(15, 5))
ax7 = sns.barplot(data=stacked, x='deposit', y='value', hue='job', palette='Paired')

In [None]:
# Number of customers in each job category (most frequent first)
print(df_main['job'].value_counts())

In [None]:
# Bar chart with the number of customers per job category
fig8, ax8 = plt.subplots(figsize=(16, 4))
sns.countplot(data=df_main, x='job', palette='Paired', ax=ax8)
ax8.set_xlabel('Jobs')
ax8.set_title('Job Categories', fontsize=10)
plt.show()

In [None]:
# Age distribution per occupation, split by outcome (box plots)
plt.figure(figsize=(15, 8))
ax9 = sns.boxplot(data=df_main, x='job', y='age', hue='deposit')

ax9.set_xlabel('Jobs', fontsize=15)
ax9.set_ylabel('Age', fontsize=15)
ax9.set_title('Age vs Occupation', fontsize=15)

# Keep seaborn's legend handles but show readable labels instead of 0 / 1
handles, _ = ax9.get_legend_handles_labels()
ax9.legend(handles, ["Refused", "Accepted"])
plt.show()

#### Notes:
- Blue-collar workers, people working in management, and technicians received the most offers from the call center to subscribe to term deposits.
- Students, entrepreneurs and unemployed people received the fewest offers from the call center.
- Students and retired people have the highest subscription rates, with 15.65% and 10.51% respectively. This result is consistent with the findings for the age variable.
- In the boxplot, the customers in the retired category who refused to subscribe to a term deposit are much younger than those who agreed to subscribe.
- In addition, among self-employed customers the median age differs considerably between those who subscribed and those who did not.


### Marital Variable

In [None]:
# Share (%) of each outcome within every marital status; every column sums to 100
marital_outcome_share = pd.crosstab(df_main['deposit'], df_main['marital'], normalize='columns')
table_marital = (marital_outcome_share * 100).round(2)
table_marital

In [None]:
# Number of customers for each marital status (most frequent first)
print(df_main['marital'].value_counts())

In [None]:
# Number of customers per marital status, split by outcome
fig11, ax11 = plt.subplots(figsize=(12, 8))
sns.countplot(data=df_main, x="marital", hue='deposit', ax=ax11)

ax11.set_xlabel('Marital Status', fontsize=12)
ax11.legend(labels=['Refused', 'Accepted'])
plt.show()

##### Notes:
- Single and divorced customers tend to be more likely to suscribe a term deposit than married customers.
- In other words, married customers are less likely to subscribe for term deposit.

### Education Variable

In [None]:
# Share (%) of each outcome within every education level; every column sums to 100
education_outcome_share = pd.crosstab(df_main['deposit'], df_main['education'], normalize='columns')
table_education = (education_outcome_share * 100).round(2)
table_education

In [None]:
# Number of customers for each education level (most frequent first)
print(df_main['education'].value_counts())

In [None]:
# Number of customers per education level, split by outcome
fig12, ax12 = plt.subplots(figsize=(12, 8))
sns.countplot(data=df_main, x="education", hue='deposit', ax=ax12)

ax12.set_xlabel('Education Levels', fontsize=12)
ax12.legend(labels=['Refused', 'Accepted'])
plt.show()

##### Notes:
- Customers with tertiary level education have the highest subscription rate with 9.18%.
- It could be argued that as the level of education increases, people are more likely to invest in less risky assets like term deposit.
- Moreover, it can be asserted that income level is directly proportional to the education of people especially in Europe. Therefore, people may subscribe more frequently to a term deposit with more income.

### Default Variable

In [None]:
# Share (%) of each outcome for customers with / without credit in default;
# every column sums to 100
default_outcome_share = pd.crosstab(df_main['deposit'], df_main['default'], normalize='columns')
table_default = (default_outcome_share * 100).round(2)
table_default

In [None]:
# Number of customers for each value of the 'default' column (most frequent first)
print(df_main['default'].value_counts())

In [None]:
# Number of customers per default status, split by outcome
fig13, ax13 = plt.subplots(figsize=(12, 8))
sns.countplot(data=df_main, x="default", hue='deposit', ax=ax13)

ax13.set_xlabel('Default Info', fontsize=12)
ax13.legend(labels=['Refused', 'Accepted'])
plt.show()

##### Notes:
- I would have expected a much larger gap between the term deposit success rates of customers with and without credit in default. (Success rate: default 'no' - 7.26% vs. default 'yes' - 6.06%)
- However, the sample of customers who have credit in default is inherently very small.

### Balance Variable

In [None]:
# Categorize the customers by balance amount.
# Right-closed, non-overlapping bins:
#   (-inf, 0]     -> 'no balance'
#   (0, 1000]     -> 'low balance'
#   (1000, 5000]  -> 'avg balance'
#   (5000, inf)   -> 'high balance'
# The previous rules used "<= 5000" for 'avg balance' and ">= 5000" for 'high balance',
# so a balance of exactly 5000 matched both and was silently relabelled as 'high balance'.
balance_edges = [-np.inf, 0, 1000, 5000, np.inf]
balance_labels = ['no balance', 'low balance', 'avg balance', 'high balance']

# astype(object) keeps plain string labels (as before) instead of a categorical dtype
df_main['balance_cat'] = pd.cut(df_main['balance'], bins=balance_edges,
                                labels=balance_labels).astype(object)

df_main['balance_cat'].value_counts()

In [None]:
# Number of customers per balance category, split by outcome
fig10, ax10 = plt.subplots(figsize=(12, 8))
sns.countplot(data=df_main, x="balance_cat", hue='deposit', ax=ax10)

ax10.set_xlabel('Balance Categories', fontsize=12)
ax10.legend(labels=['Refused', 'Accepted'])
plt.show()

In [None]:
# For different age groups, countplot of each balance category for each outcome.
# sns.catplot / height= replace sns.factorplot / size=, which were renamed and
# later removed from seaborn.
g = sns.catplot(x='balance_cat',
                hue='deposit', col='age_cat',
                data=df_main, kind='count', height=4)

g.set_xticklabels(rotation=60)
g.fig.set_size_inches(15,4)
g.set(xlabel='Balance Category')
# One facet per age category; the titles assume all five categories (20..60) are
# present and appear in ascending order.
titles = ['Age 20s', 'Age 30s', 'Age 40s', 'Age 50s', 'Age 60s']
for ax, title in zip(g.axes.flat, titles):
    ax.set_title(title)
# Replace the 0 / 1 legend entries with readable labels
labels = ['Refused', 'Accepted']
for t, l in zip(g._legend.texts, labels):
    t.set_text(l)
plt.show()

##### Notes:
- The marketing campaign disproportionately targets customers who have a low balance.
- On the other hand, it targets fewer people who have an average or high balance.

### Housing Variable

In [None]:
# Share (%) of each outcome for customers with / without a housing loan;
# every column sums to 100
housing_outcome_share = pd.crosstab(df_main['deposit'], df_main['housing'], normalize='columns')
table_house = (housing_outcome_share * 100).round(2)
table_house

In [None]:
# Number of customers for each value of the 'housing' column (most frequent first)
print(df_main['housing'].value_counts())

In [None]:
# Share (%) of customers with / without a housing loan inside each balance category;
# every column sums to 100
housing_balance_share = pd.crosstab(df_main["housing"], df_main["balance_cat"], normalize='columns')
table_house_balance = (housing_balance_share * 100).round(2)
table_house_balance

##### Notes:
- Apparently, having a housing loan was a major reason for not subscribing to a term deposit.
- People with no balance or a low balance are more likely to have a housing loan, which in turn makes them more likely to refuse a term deposit.

### Loan Variable

In [None]:
# Share (%) of each outcome for customers with / without a personal loan;
# every column sums to 100
loan_outcome_share = pd.crosstab(df_main['deposit'], df_main['loan'], normalize='columns')
table_loan = (loan_outcome_share * 100).round(2)
table_loan

In [None]:
# Number of customers for each value of the 'loan' column (most frequent first)
print(df_main['loan'].value_counts())

In [None]:
# Share (%) of customers with / without a personal loan inside each balance category;
# every column sums to 100
loan_balance_share = pd.crosstab(df_main['loan'], df_main['balance_cat'], normalize='columns')
table_loan_balance = (loan_balance_share * 100).round(2)
table_loan_balance

##### Notes:
- It seems that customers with a personal loan have financial commitments to repay it and therefore have no spare cash to put into a term deposit account.

### Day Variable

In [None]:
# Share (%) of each outcome for every last-contact day of the month;
# every column sums to 100
day_outcome_share = pd.crosstab(df_main['deposit'], df_main['day'], normalize='columns')
table_day = (day_outcome_share * 100).round(2)
table_day

In [None]:
# Number of customers per last contact day of the month (most frequent first)
print(df_main['day'].value_counts())

In [None]:
# Number of customers per last contact day of the month, split by outcome
fig15, ax15 = plt.subplots(figsize=(16, 8))
sns.countplot(data=df_main, x="day", hue='deposit', ax=ax15)

ax15.set_xlabel('Last Contact Day', fontsize=12)
ax15.legend(labels=['Refused', 'Accepted'])
plt.show()

In [None]:
# One row per day, ranked by the share of accepted offers (column 1), best day first
df_day = table_day.T
df_day.sort_values(by=1, ascending=False)

##### Notes:
- Although there are few observations for the first day of the month, this day has the highest success rate (22.73%).
- The 30th of the month has the second-best figure with a 15.37% success rate.
- These results are not surprising, because salaries are usually paid around this time of the month.
- The call center should consider contacting many more customers on the first day of the month.

### Month Variable

In [None]:
# Create a column with the numeric values of the months (jan -> 1 ... dec -> 12)
month_to_num = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}

# Abbreviations missing from the mapping become NaN and make the integer cast below fail
df_main['month_num'] = df_main['month'].map(month_to_num)
df_main["month_num"] = df_main["month_num"].astype(np.int64)

In [None]:
# Number of subscribed vs non-subscribed term deposit accounts per month.
# margins=True appends an 'All' row and column holding the totals.
months_table = pd.crosstab(index=df_main['deposit'], columns=df_main['month_num'], margins=True)
months_table

In [None]:
# Share (%) of each outcome per month: every count divided by its column total ('All' row)
monthly_totals = months_table.loc['All']
months_table_pct = months_table.div(monthly_totals, axis='columns') * 100
months_table_pct.round(2)

In [None]:
# Deposits by month: kernel density estimate of the contact month per outcome
fig4, ax4 = plt.subplots(figsize=(15, 8))

# fill=True replaces the deprecated shade=True
sns.kdeplot(df_main.loc[(df_main['deposit'] == 0), 'month_num'],
            fill=True, label='Refused', ax=ax4)
sns.kdeplot(df_main.loc[(df_main['deposit'] == 1), 'month_num'],
            fill=True, label='Accepted', ax=ax4)

# A KDE curve shows a probability density, not a frequency
ax4.set(xlabel='Months of the Year', ylabel='Density')
# The labels above are only rendered once a legend is requested
ax4.legend()
plt.title('Deposits by Month')
plt.show()

#### Notes:
- There is a wide gap between rejected and accepted term deposit subscriptions during the month of May.
- October (61.25%), March (48.45%) and April (16.59%) have high ratios, meaning there were more accepted requests for term deposit subscriptions than rejected requests.

### Duration Variable

In [None]:
# Call duration in minutes (source column divided by 60), rounded to two decimals
df_main['duration_min'] = (df_main['duration'] / 60).round(2)
df_main.head()

In [None]:
# Deposits by call duration: kernel density estimate of the duration (minutes) per outcome
fig16, ax16 = plt.subplots(figsize=(12, 4))

# fill=True replaces the deprecated shade=True
sns.kdeplot(df_main.loc[(df_main['deposit'] == 0), 'duration_min'],
            fill=True, label='Refused', ax=ax16)
sns.kdeplot(df_main.loc[(df_main['deposit'] == 1), 'duration_min'],
            fill=True, label='Accepted', ax=ax16)

# A KDE curve shows a probability density, not a frequency
ax16.set(xlabel='Duration', ylabel='Density')
# The labels above are only rendered once a legend is requested
ax16.legend()
plt.title('Deposits by Duration')
plt.show()

##### Notes:
- As we can see from the plot, 'accepted' clients and 'refused' clients form two relatively separate distributions. Compared to 'refused' clients, 'accepted' clients had longer call durations.

### Campaign Variable

In [None]:
# Share (%) of each outcome for every value of the 'campaign' column;
# every column sums to 100
campaign_outcome_share = pd.crosstab(df_main['deposit'], df_main['campaign'], normalize='columns')
campaign_pct = (campaign_outcome_share * 100).round(2)

# Table of success rates per campaign value
campaign_pct

In [None]:
# Outcome share per campaign value as grouped bars.
# Reshape the percentage table to long form: one row per (deposit, campaign) pair.
stacked = campaign_pct.stack().reset_index(name='value')

fig5 = plt.figure(figsize=(10, 5))
ax5 = sns.barplot(data=stacked, x='deposit', y='value', hue='campaign')
# The legend is dropped from the figure
ax5.get_legend().remove()

##### Notes:
- Based on the graph above, the first contact of the campaign is the most successful, with an 8.59% success rate.
- Notice that the rejection rate increases after four calls; this should be the threshold for the marketing team.
- Obviously, the more often a customer is called within a short period of time, the more irritated the customer becomes, and thus the more likely the customer is to refuse a term deposit.

### Contact Variable

In [None]:
# Share (%) of each outcome for every contact type; every column sums to 100
contact_outcome_share = pd.crosstab(df_main['deposit'], df_main['contact'], normalize='columns')
table_contact = (contact_outcome_share * 100).round(2)
table_contact

In [None]:
# Number of customers for each contact type (most frequent first)
print(df_main['contact'].value_counts())

In [None]:
# Number of customers per contact type, split by outcome
fig14, ax14 = plt.subplots(figsize=(12, 8))
sns.countplot(data=df_main, x="contact", hue='deposit', ax=ax14)

ax14.set_xlabel('Contact Types', fontsize=12)
ax14.legend(labels=['Refused', 'Accepted'])
plt.show()

##### Notes:
- Customers contacted by cell phone tend to be more likely to subscribe to a term deposit than customers contacted by landline telephone.

### Correlation Matrix

In [None]:
# Pairwise Pearson correlations between the numeric columns.
# The string columns (job, marital, ...) are excluded explicitly: recent pandas
# versions raise on non-numeric columns in corr() instead of silently dropping them.
corr = df_main.select_dtypes(include='number').corr()

plt.figure(figsize=(16,6))
sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=0.2, cbar=True)
plt.title('Correlation Matrix', fontsize=16)
plt.show()

##### Notes:
- Duration of the call is the feature that most positively correlates with whether a potential client will open a term deposit or not.

## Recommendations

- First of all, the marketing team should try to engage customers and have longer calls. The correlation of the ‘duration’ variable with the target variable shows that the higher the duration, the more likely it is that the customer will subscribe to the term deposits.This makes intuitive sense because longer duration shows that the customer is interested in the product.

- The cell phone seems to be the most suitable mode of communication (for this case).

- I think, the customer's account balance has a high influence on the campaign's outcome. People who are in average or high balance categories are more likely to subscribe for term deposit. Therefore, future campaigns should concentrate on these customers.

- The marketing team should target relatively old customers, who are likely to be looking for safe and profitable investment options. In addition, they should try to reach many more younger customers, because these may not have enough information about sophisticated investment products such as stocks and bonds and may therefore respond positively even if the return is small.

- The call center should shift its marketing focus from blue-Collars, technicians to students and retired clients which is consistent with the previous finding of higher subscription rates among the younger and older.

- The call center should resist calling a client for more than four times, which can be disturbing and increase dissatisfaction.

- Timing is important. To improve the marketing campaign, the marketing managers should consider initiating the campaign at the end of a month or at the very beginning of a month, when the subscription rate tends to be higher. 

- If a client has a long term loan such as a mortgage, it will be very difficult for her or him to subscribe a term deposit. Therefore, more attention should be given to customers who do not have mortgage in future campaigns.

- There is no information about interest rates in this data set. I think it is most important factor when a customer subscribe to a term deposit. In fact, people are considering not only the interest rate but also the actual rate of return, especially in countries with high inflation like Turkey.(If we can know the exact date of the calls, short term treasury bill interest can be used as an indicator that can be a reference to the interest offered to the customers. It can be useful for prediction purposes.)

### Cluster Analysis

In [None]:
# Eliminate columns that we don´t want to include in the analysis (all marketing campaign columns).

In [None]:
# Customer attributes only; the marketing-campaign columns (contact, day, month,
# duration, campaign) and the outcome are left out of the cluster analysis
customer_cols = ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan']
df_customer = df_main[customer_cols]
df_customer.head()

In [None]:
# One-Hot Encoding of the categorical customer attributes (numeric columns pass through)
df_cluster = pd.get_dummies(df_customer)
df_cluster.head()

In [None]:
# Standardize the encoded customer attributes to zero mean / unit variance.
# Note: this re-binds `scaler`, replacing the scaler fitted on the model features.
scaler = StandardScaler()
scaler.fit(df_cluster)
X_cluster_scaled = pd.DataFrame(scaler.transform(df_cluster), columns=df_cluster.columns)

In [None]:
# Preview the standardized customer attributes
X_cluster_scaled.head()

In [None]:
def plot_cluster(X, y, title="Cluster plot"):
    """Scatter 'age' against 'balance', coloured by cluster label, as a plotly figure.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns 'age' and 'balance'.
    y : array-like
        One colour value (e.g. cluster label) per row of ``X``.
    title : str, default "Cluster plot"
        Figure title, centred above the plot.

    Returns
    -------
    The figure object produced by the plotly plotting backend.
    """
    # Request the plotly backend explicitly: the layout / trace calls below only exist
    # on plotly figures, so the function must not depend on the global
    # pd.options.plotting.backend setting having been changed beforehand.
    fig = X.plot.scatter(x='age', y='balance', color=y, backend='plotly')
    fig.layout.update(autosize=False, width=600, height=600,
                  coloraxis = dict(showscale=False, colorscale='Portland'),
                  font=dict(size=18),
                  title=dict(text=title, x=0.5, y=0.95, xanchor='center'))
    # Small markers keep dense regions readable
    fig.update_traces(marker=dict(size=3))
    return fig

In [None]:
# Switch pandas' plotting backend to plotly for the rest of the session.
# This is a global setting: every later DataFrame.plot / .hist call is affected,
# including earlier cells if they are re-run without restarting the kernel.
pd.options.plotting.backend = "plotly"

In [None]:
# Fit k-means with 3 clusters and k-means++ initialisation on the one-hot encoded customer data
# NOTE(review): the model is fitted on the unscaled dummies, not on X_cluster_scaled, so
# 'balance' probably dominates the Euclidean distances — confirm this is intended.
model = KMeans(n_clusters=3, random_state=123, init='k-means++').fit(pd.get_dummies(df_customer))
# Cluster label assigned to every customer
pred = model.labels_
fig = plot_cluster(df_customer, pred, title="Encoded Categorical Data")
fig

In [None]:
# Inertia on the fitted data: sum of squared distances of the samples to their
# closest cluster centre (lower means tighter clusters)
model.inertia_

In [None]:
# Elbow method: fit k-means for k = 1..19 and record the inertia of every fit.
# - n_jobs is no longer passed: the parameter was deprecated and then removed from KMeans.
# - random_state is fixed (same seed as the other KMeans fits) so the curve is reproducible.
# - The one-hot encoding does not depend on k, so it is computed once outside the loop.
cluster_features = pd.get_dummies(df_customer)
cluster_range = range(1, 20)

errors = []
for cluster in cluster_range:
    model = KMeans(n_clusters=cluster, init='k-means++', random_state=123)
    model.fit(cluster_features)
    errors.append(model.inertia_)

# converting the results into a dataframe and plotting them
frame = pd.DataFrame({'Cluster': list(cluster_range), 'Errors': errors})
plt.figure(figsize=(12,6))
plt.plot(frame['Cluster'], frame['Errors'], marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Fit k-means with the chosen number of clusters (5) and k-means++ initialisation
# NOTE(review): 5 is presumably read off the elbow plot above — confirm after re-running it.
# NOTE(review): as for the 3-cluster model, the fit uses the unscaled dummies, not X_cluster_scaled.
model_optimum = KMeans(n_clusters=5, random_state=123, init='k-means++').fit(pd.get_dummies(df_customer))
# Cluster label assigned to every customer
pred = model_optimum.labels_
fig = plot_cluster(df_customer, pred, title="Encoded Categorical Data")
fig