# Predicting campaign success

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
sns.set()

from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

## Load data and preprocess

In [2]:
raw_train_data = pd.read_csv('Bank_data_train.csv')
raw_train_data.head(10)

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.12,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no
5,5,0.899,0.0,0.0,1.0,0.0,126.0,no
6,6,4.962,0.0,0.0,0.0,0.0,84.0,no
7,7,4.858,0.0,1.0,0.0,0.0,17.0,no
8,8,4.962,0.0,0.0,0.0,0.0,704.0,yes
9,9,4.865,0.0,0.0,0.0,0.0,185.0,no


In [3]:
data_train = raw_train_data.copy()
# Remove Unnamed column as it is same as index
data_train = data_train.drop(['Unnamed: 0'], axis=1)

In [4]:
# Change may values -> each value > 0 map to 1
data_train['may'] = data_train['may'].map(lambda x: 1 if x >= 1 else 0)
# Change targets values -> 'yes' to 1 and 'no' to 0
data_train['y'] = data_train['y'].map({'yes':1, 'no':0})
data_train

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0,0.0,117.0,0
1,0.767,0.0,0.0,1,1.0,274.0,1
2,4.858,0.0,1.0,0,0.0,167.0,0
3,4.120,0.0,0.0,0,0.0,686.0,1
4,4.856,0.0,1.0,0,0.0,157.0,0
...,...,...,...,...,...,...,...
513,1.334,0.0,1.0,0,0.0,204.0,0
514,0.861,0.0,0.0,1,1.0,806.0,1
515,0.879,0.0,0.0,0,0.0,290.0,0
516,0.877,0.0,0.0,1,1.0,473.0,1


## Regression

In [5]:
# Dependent and independent variables
y_train = data_train['y']
X1_train = data_train[['interest_rate', 'credit', 'march', 'previous', 'duration']]
#X1_train = data_train[['interest_rate','credit','march', 'may','previous','duration']]

In [6]:
# Scale inputs
scaler = StandardScaler()
# Fit inputs
scaler.fit(X1_train)
# Scale features
X1_train = scaler.transform(X1_train)
X1_train

array([[-0.80090846, -0.18973666,  1.65940447, -0.38212262, -0.77094694],
       [-1.10329382, -0.18973666, -0.60262583,  2.61696099, -0.31450316],
       [ 1.07846723, -0.18973666,  1.65940447, -0.38212262, -0.62558268],
       ...,
       [-1.04356338, -0.18973666, -0.60262583, -0.38212262, -0.26798659],
       [-1.04463   , -0.18973666, -0.60262583,  2.61696099,  0.26404661],
       [ 1.13553113, -0.18973666, -0.60262583, -0.38212262, -0.69826481]])

### Create logistic regression

In [7]:
# Create regression without constant and 'may' as they were insignificant
#x_train = sm.add_constant(X1_train)

reg_log = sm.Logit(y_train, X1_train)
#reg_log = sm.Logit(y_train, X1_train)
results_log = reg_log.fit()
# Get summary
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.337827
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,513.0
Method:,MLE,Df Model:,4.0
Date:,"Sun, 03 Dec 2023",Pseudo R-squ.:,0.5126
Time:,17:12:13,Log-Likelihood:,-174.99
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,2.1520000000000002e-78

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
x1,-1.5353,0.167,-9.188,0.000,-1.863,-1.208
x2,0.3995,0.179,2.232,0.026,0.049,0.750
x3,-0.8165,0.146,-5.602,0.000,-1.102,-0.531
x4,0.4795,0.156,3.065,0.002,0.173,0.786
x5,2.3601,0.251,9.417,0.000,1.869,2.851


### Make predictions on train data

In [8]:
# Predict values
pred_values_train = (results_log.predict(X1_train) >= 0.5).astype(int)
pred_values_train


array([0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,

### Metrics

In [9]:
# Create confusion matrix
cm_train = confusion_matrix(y_train, pred_values_train)
cm_train

array([[222,  37],
       [ 31, 228]], dtype=int64)

In [10]:
# Format to DataFrame
cm_df_train = pd.DataFrame(cm_train)
cm_df_train.columns = ['Predicted 0', 'Predicted 1']
cm_df_train = cm_df_train.rename(index={0:'Actual 0', 1:'Actual 1'})
cm_df_train

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,222,37
Actual 1,31,228


In [11]:
# Accuracy
accuracy_score(y_train, pred_values_train)

0.8687258687258688

In [12]:
# Precision
precision_score(y_train, pred_values_train)

0.8603773584905661

In [13]:
# Recall
recall_score(y_train, pred_values_train)

0.8803088803088803

## Test model

### Load test data and preprocess

In [14]:
raw_test_data = pd.read_csv('Bank_data_testing.csv')
data_test = raw_test_data.copy()
# Remove Unnamed column as it is same as index
data_test = data_test.drop(['Unnamed: 0'], axis=1)

In [15]:
# Change targets values -> 'yes' to 1 and 'no' to 0
data_test['y'] = data_test['y'].map({'yes':1, 'no':0})
data_test

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.313,0.0,1.0,0.0,0.0,487.0,0
1,4.961,0.0,0.0,0.0,0.0,132.0,0
2,4.856,0.0,1.0,0.0,0.0,92.0,0
3,4.120,0.0,0.0,0.0,0.0,1468.0,1
4,4.963,0.0,0.0,0.0,0.0,36.0,0
...,...,...,...,...,...,...,...
217,4.963,0.0,0.0,0.0,0.0,458.0,1
218,1.264,0.0,1.0,1.0,0.0,397.0,1
219,1.281,0.0,1.0,0.0,0.0,34.0,0
220,0.739,0.0,0.0,2.0,0.0,233.0,0


### Make predictions on test data

In [16]:
# Dependent and independent variables
y_test = data_test['y']
X1_test = data_test[['interest_rate', 'credit', 'march', 'previous', 'duration']]
#X_test = sm.add_constant(X1_test)

In [17]:
# Scale new data
scaler.fit(X1_test)
X1_test = scaler.transform(X1_test)
X1_test

array([[-0.85250017, -0.18043874,  1.62460588, -0.33166248,  0.21516363],
       [ 1.08021443, -0.18043874, -0.6155339 , -0.33166248, -0.65145084],
       [ 1.02458531, -0.18043874,  1.62460588, -0.33166248, -0.74909754],
       ...,
       [-0.86945381, -0.18043874,  1.62460588, -0.33166248, -0.89068526],
       [-1.15660603, -0.18043874, -0.6155339 , -0.33166248, -0.40489292],
       [-0.99395708, -0.18043874, -0.6155339 , -0.33166248, -0.39268709]])

In [18]:
# Predict values
y_predict_test = (results_log.predict(X1_test) >= 0.5).astype(int)

### Metrics

In [19]:
# Create confusion matrix
cm_test = confusion_matrix(y_test, y_predict_test)
cm_test

array([[94, 17],
       [15, 96]], dtype=int64)

In [20]:
# Format to DataFrame
cm_df_test = pd.DataFrame(cm_test)
cm_df_test.columns = ['Predicted 0', 'Predicted 1']
cm_df_test = cm_df_test.rename(index={0:'Actual 0', 1:'Actual 1'})
cm_df_test

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,94,17
Actual 1,15,96


In [21]:
# Accuracy
accuracy_score(y_test, y_predict_test)

0.8558558558558559

In [22]:
# Precision
precision_score(y_test, y_predict_test)

0.8495575221238938

In [23]:
# Recall
recall_score(y_test, y_predict_test)

0.8648648648648649