In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Disclaimers
This is my applied statistics term project. It is not optimized: I have not done any feature elimination or used a state-of-the-art ensemble model, because the assignment required me to use GLMs.

In [None]:
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import scipy

In [None]:
# Load the bank marketing data; the file uses ";" as its field separator.
df = pd.read_csv(
    "../input/bank-marketing-campaign/bank.csv",
    sep=";",
)

In [None]:
# Preview the first rows of the freshly loaded frame to check that the columns parsed correctly.
df.head()

## Bank client data:

- age (numeric)
- job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student","blue-collar","self-employed","retired","technician","services") 
- marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
- education (categorical: "unknown","secondary","primary","tertiary")
- default: has credit in default? (binary: "yes","no")
- balance: average yearly balance, in euros (numeric) 
- housing: has housing loan? (binary: "yes","no")
- loan: has personal loan? (binary: "yes","no")


## Related to the last contact of the current campaign:

- contact: contact communication type (categorical: "unknown","telephone","cellular") 
- day: last contact day of the month (numeric)
- month: last contact month of year (categorical: "jan", "feb", "mar", …, "nov", "dec")
- duration: last contact duration, in seconds (numeric)

## Other attributes:

- campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
- pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
- previous: number of contacts performed before this campaign and for this client (numeric)
- poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

## Output variable (desired target):
- y: has the client subscribed to a term deposit? (binary: "yes","no")
- Missing Attribute Values: None

## Exploratory data analysis

In [None]:
# Summary statistics of the frame's columns as computed by pandas' describe().
df.describe()

In [None]:
# Show column names, non-null counts and dtypes.
# `info` is a method: without the call parentheses the cell only displays the
# bound-method repr instead of the summary.
df.info()

In [None]:
# Remove the day-of-month column from the frame.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps the
# cell safe to re-run: a second execution no longer raises KeyError because
# "day" is already gone.
df = df.drop(columns=["day"], errors="ignore")

Looking at class imbalances and feature values

In [None]:
# Count of rows per target value, to see how imbalanced the classes are.
sns.histplot(data=df, x="y")

In [None]:
# Count of rows per last-contact month.
sns.histplot(data=df, x="month")

Does age affect the subscription?

In [None]:
# One bar per target value; bar height is seaborn's default aggregate of age (the mean).
sns.barplot(data=df, x="y", y="age")

Can we come up with a function that maps balance or last-contact duration to the target?

In [None]:
# Scatter every client's balance against the target value to eyeball whether
# balance alone separates the two classes.
# An explicit Axes with labels/title lets the figure stand on its own, and the
# trailing ";" suppresses the textual repr of the return value.
_, ax = plt.subplots()
ax.plot(df["balance"], df["y"], "o")
ax.set(xlabel="balance", ylabel="y", title="Target vs. balance");

In [None]:
# Scatter every client's last-contact duration against the target value to
# eyeball whether duration alone separates the two classes.
# An explicit Axes with labels/title lets the figure stand on its own, and the
# trailing ";" suppresses the textual repr of the return value.
_, ax = plt.subplots()
ax.plot(df["duration"], df["y"], "o")
ax.set(xlabel="duration", ylabel="y", title="Target vs. duration");

A higher client balance does appear to be related to a higher chance of subscription.

In [None]:
# One bar per target value; bar height is seaborn's default aggregate of balance (the mean).
sns.barplot(data=df, x="y", y="balance")

What about the occupation of the client?

In [None]:
# Number of rows for every (target, job) combination, drawn as a bar chart.
# Note these are raw counts, not subscription rates per job.
counts_by_job = df.groupby(['y', 'job']).size()
counts_by_job.plot.bar()

Management level workers tend to subscribe more than others.

What about month?
Note that the campaign was conducted mostly in certain months, so the counts for those months look higher.

In [None]:
# Number of rows for every (target, month) combination, drawn as a bar chart.
# Note these are raw counts, not subscription rates per month.
counts_by_month = df.groupby(['y', 'month']).size()
counts_by_month.plot.bar()

## Preprocessing

I will encode the education as ordinal.

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit rank order for the education levels; "unknown" is deliberately
# listed first, so it receives the lowest code (0), below "primary".
education_levels = ["unknown", "primary", "secondary", "tertiary"]
ord_enc = OrdinalEncoder(categories=[education_levels])

# The encoder expects a 2-D input, hence the single-column frame selection.
df["education"] = ord_enc.fit_transform(df[["education"]])
df.head()

Is there any correlation between the variables?

In [None]:
# Pairwise correlation of the numeric columns, shown as an annotated heatmap.
# At this point the frame still contains string-valued columns (job, marital,
# contact, month, poutcome, the yes/no flags and y). Recent pandas versions no
# longer drop those silently in `corr()` and raise instead, so restrict the
# frame to numeric columns explicitly; this works on old and new pandas alike.
corr = df.select_dtypes(include="number").corr()
sns.heatmap(corr, annot=True)

Directly replacing the binary features with 1s and 0s.

In [None]:
# Encode the yes/no columns (loan, housing, default and the target y) as 1/0.
#
# A single mapping applied per column replaces eight chained `replace` calls.
# The explicit `astype(int)` guarantees an integer dtype instead of relying on
# `replace` implicitly downcasting an object column (deprecated in recent
# pandas). That matters because the next step dummy-encodes every remaining
# object column, which would otherwise swallow these four columns.
#
# Values that are not "yes"/"no" are passed through unchanged, so re-running
# the cell on already-encoded 0/1 data is a no-op, while any unexpected string
# fails loudly in `astype(int)`.
binary_map = {"yes": 1, "no": 0}
for col in ["loan", "housing", "default", "y"]:
    df[col] = df[col].map(lambda value: binary_map.get(value, value)).astype(int)

I will encode the rest of the variables as dummies.

In [None]:
# Select the variables to encode first: the names of all remaining
# object-typed (string) columns.
cols_to_encode = df.select_dtypes(include="object").columns.tolist()

# One-hot encode them in a single call. `get_dummies` removes the original
# columns and appends "<column>_<value>" indicator columns, which is what the
# previous concat + in-place drop loop built one column at a time.
df = pd.get_dummies(df, columns=cols_to_encode)

Scaling the numerical variables.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of the frame (features, dummy indicators and the
# target alike) and rebuild a DataFrame with the original column names.
# NOTE(review): the scaler is fitted on the full data set before any
# train/test split, so test rows influence the scaling of the training rows.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df = pd.DataFrame(scaled_values, columns=df.columns)

In [None]:
# Preview the encoded and scaled frame; showing only the first rows keeps the
# cell output small instead of dumping the whole (now very wide) frame.
df.head()

## Predicting subscriptions with Logistic Regression

In [None]:
# Target is the subscription flag; the features are every other column.
y = df["y"]
X = df.drop(columns="y")

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier with scikit-learn's default hyperparameters.
reg = LogisticRegression()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing, stratified on y so both parts keep the
# same class ratio. The fixed random_state makes the split (and therefore every
# metric computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

In [None]:
# Fit on the training split, then predict labels for the held-out rows
# (`fit` returns the estimator itself, so the calls can be chained).
pred = reg.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import classification_report, recall_score, accuracy_score

In [None]:
# Per-class precision / recall / F1 on the held-out split.
# `classification_report` takes (y_true, y_pred) in that order; passing the
# predictions first swaps the roles of truth and prediction, which swaps the
# reported precision and recall.
print(classification_report(y_test, pred))

We want to minimize false negatives (people who would subscribe but are predicted as not subscribing), so recall is the metric of interest.

In [None]:
from sklearn.model_selection import cross_val_score

# Mean recall of the logistic regression over 10 cross-validation folds.
recall_per_fold = cross_val_score(reg, X, y, scoring="recall", cv=10)
recall_per_fold.mean()

In [None]:
# Mean accuracy of the logistic regression over 10 cross-validation folds.
# `cross_val_score` is already imported by the preceding cell, so the
# duplicate import has been dropped.
accuracy_per_fold = cross_val_score(reg, X, y, scoring="accuracy", cv=10)
accuracy_per_fold.mean()

## Chi-square test of independence between housing loan and target

Housing loan has the strongest association with subscription. Credit default has the weakest, and personal loan has a stronger association than credit default.

* Null hypothesis: There's no relationship between the person currently using a housing loan and the person's subscription.
* Alternative hypothesis: There's a relationship between the person currently using a housing loan and the person's subscription.
* The significance level is set to 0.05.

In [None]:
from scipy.stats import chi2_contingency

# Cross-tabulate the target (rows) against the housing-loan flag (columns).
contingency = pd.crosstab(index=df["y"], columns=df["housing"])

# chi2_contingency returns (statistic, p-value, degrees of freedom, expected counts).
test_result = chi2_contingency(contingency)
chi_2, p_val, dof, expected_freq = test_result
print(f"Chi-square score between housing loan and subscription is {chi_2}, p-value is {p_val}")

There's a significant relationship between housing loan and the subscription.

In [None]:
# Show the observed y-by-housing counts. Leaving the table as the cell's last
# expression uses the notebook's rich (HTML) rendering instead of plain text.
contingency

In [None]:
# Cross-tabulate the target (rows) against the credit-default flag (columns).
contingency = pd.crosstab(index=df["y"], columns=df["default"])

# chi2_contingency returns (statistic, p-value, degrees of freedom, expected counts).
test_result = chi2_contingency(contingency)
chi_2, p_val, dof, expected_freq = test_result
print(f"Chi-square score between credit and subscription is {chi_2}, p-value is {p_val}")

In [None]:
# Cross-tabulate the target (rows) against the personal-loan flag (columns).
contingency = pd.crosstab(index=df["y"], columns=df["loan"])

# chi2_contingency returns (statistic, p-value, degrees of freedom, expected counts).
test_result = chi2_contingency(contingency)
chi_2, p_val, dof, expected_freq = test_result
print(f"Chi-square score between loan and subscription is {chi_2}, p-value is {p_val}")

## Ridge regression for predicting balance

In [None]:
from sklearn.linear_model import Ridge, LinearRegression, PoissonRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Regression target is balance (kept as a one-column frame); the features are
# every other column, including the subscription flag y.
y = df[["balance"]]
X = df.loc[:, df.columns != "balance"]

# Hold out 10% of the rows; the fixed random_state makes the split reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [None]:
# Ridge regression with default settings: fit on the training split, then
# predict the held-out rows (`fit` returns the estimator, so the calls chain).
ridge = Ridge()
ridge_pred = ridge.fit(X_train, y_train).predict(X_test)

In [None]:
# Per-fold R^2 of the ridge model over 10 cross-validation folds.
cross_val_score(ridge, X, y, scoring="r2", cv=10)

## Multivariate linear regression

In [None]:
# Ordinary least squares on the same split: fit, predict the held-out rows,
# then report per-fold R^2 over 10 cross-validation folds.
linear = LinearRegression()
linear_pred = linear.fit(X_train, y_train).predict(X_test)
cross_val_score(linear, X, y, scoring="r2", cv=10)

## Poisson regressor

The number of contacts performed for a client is predicted with a Poisson regressor.
The model's `score` method reports $D^2$, the fraction of deviance explained, which is a generalization of $R^2$.

In [None]:
# Regression target is the number of contacts in this campaign; the features
# are every other column.
# The target is selected as a 1-D Series: PoissonRegressor expects a 1-D y and
# emits a DataConversionWarning when handed a one-column frame.
y = df["campaign"]
X = df.loc[:, df.columns != "campaign"]

# Hold out 10% of the rows; the fixed random_state makes the split reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [None]:
# Poisson GLM with default settings: fit on the training split, then predict
# the held-out rows (`fit` returns the estimator, so the calls chain).
poisson = PoissonRegressor()
poisson_pred = poisson.fit(X_train, y_train).predict(X_test)

In [None]:
# Mean squared error of the Poisson predictions on the held-out rows.
# Arguments are given in scikit-learn's (y_true, y_pred) order; the squared
# error is symmetric, so the value is the same either way.
mean_squared_error(y_test, poisson_pred)