In [25]:
#Matrices
import numpy as np 
import pandas as pd 

#Stats
import scipy
import scipy.stats as st
from scipy.optimize import fmin
from scipy import integrate
from scipy.stats.mstats import mquantiles
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import statsmodels.api as sm 
import statsmodels.formula.api as smf 
from itertools import combinations 
from patsy import dmatrix
#!pip install mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector


#Plotting
import matplotlib.pyplot as plt
import seaborn as sns

#Misc
#!pip install stargazer
from stargazer.stargazer import Stargazer #I also have the .py in the folder (in case it fails to install or run)



In [30]:
# Load the bank-marketing data set (comma-separated file expected next to the notebook)
data_bank = pd.read_csv("bank-additional-full.csv", sep=",")

# Human-readable codebook: section title -> {column name -> meaning}
data_bank_dictionary = {
    "******** CLIENT DATA ********": {
        "age": "years",
        "job": "type of job",
        "marital": "marital status",
        "education": "education",
        "default": "has credit default?",
        "housing": "has housing loan",
        "loan": "has personal loan?",
    },
    "******** LAST CONTACT CURRENT CAMPAIGN ********": {
        "contact": "contact communication type",
        "month": "last contact month of year",
        "day_of_week": "last contact day of the week",
        # Adjacent literals are concatenated into one single description string
        "duration": (
            "last contact duration, in seconds. "
            "Important note: this attribute affects the output target "
            "(e.g., if duration=0 then y='no'). "
            "Yet, the duration is not known before a call is performed. "
            "Also, after the end of the call y is obviously known. "
            "Thus, this input should only be included for benchmark purposes "
            "and should be discarded if the intention is to have a realistic predictive model"
        ),
    },
    "******** OTHER ********": {
        "campaign": "number of contacts performed during this campaign and for this client",
        "pdays": "days that passed by after last contact from a previous campaign (999 means not previously contacted)",
        "previous": "number of contacts performed before this campaign and for this client",
        "poutcome": "outcome of the previous marketing campaign",
    },
    "******** SOCIOECONOMIC ********": {
        "emp.var.rate": "employment variation rate - quarterly indicator",
        "cons.price.idx": "consumer price index - monthly indicator",
        "cons.conf.idx": "consumer confidence index - monthly indicator",
        "euribor3m": "euribor 3 month rate - daily indicator",
        "nr.employed": "number of employees - quarterly indicator",
    },
    "******** DEPENDENT/ENDOGENOUS VARIABLE ********": {
        "y": "has the client subscribed a term deposit?",
    },
}

print("VARIABLE MEANINGS (BY TYPE):\n")
data_bank_dictionary  # last expression of the cell -> displayed as the cell output

VARIABLE MEANINGS (BY TYPE):



{'******** CLIENT DATA ********': {'age': 'years',
  'job': 'type of job',
  'marital': 'marital status',
  'education': 'education',
  'default': 'has credit default?',
  'housing': 'has housing loan',
  'loan': 'has personal loan?'},
 '******** LAST CONTACT CURRENT CAMPAIGN ********': {'contact': 'contact communication type',
  'month': 'last contact month of year',
  'day_of_week': 'last contact day of the week',
  'duration': "last contact duration, in seconds. Important note: this attribute affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model"},
 '******** OTHER ********': {'campaign': 'number of contacts performed during this campaign and for this client',
  'pdays': 'days that passed by after last contact from a previ

In [31]:
# Show the dtype pandas inferred for every column when reading the CSV
# (text columns come out as `object`, numeric ones as int64 / float64)
data_bank.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [32]:
# Names of the columns with a numeric dtype (the int64 and float64 ones in this data set)
numerical_cols = data_bank.select_dtypes(include="number").columns
print(numerical_cols)

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'],
      dtype='object')


In [39]:
# Names of every non-numeric column, i.e. the categorical ones
# (the selection also picks up the target `y`, since it is stored as text)
categorical_cols = data_bank.select_dtypes(exclude="number").columns
print(categorical_cols)

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome', 'y'],
      dtype='object')


In [38]:
#Standardize (z-score) the numerical variables: st.zscore subtracts each column's mean
#AND divides by its standard deviation, so this is more than centering.
#It facilitates interpretability + algorithm optimization
#
#Whole-column assignment (data_bank[cols] = ...) is used on purpose instead of
#data_bank.loc[:, cols] = ...: age, duration, campaign, pdays and previous are int64,
#and writing float z-scores into int64 columns through .loc is a silent upcast that
#pandas >= 2.1 deprecates (FutureWarning) and plans to turn into an error.
#Re-running the cell is harmless: already standardized columns keep the same values.
data_bank[numerical_cols] = st.zscore(data_bank[numerical_cols])
data_bank.loc[:, numerical_cols]

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,1.533034,0.010471,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
1,1.628993,-0.421501,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
2,-0.290186,-0.124520,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
3,-0.002309,-0.413787,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
4,1.533034,0.187888,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
...,...,...,...,...,...,...,...,...,...,...
41183,3.164336,0.292025,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697
41184,0.573445,0.481012,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697
41185,1.533034,-0.267225,-0.204909,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697
41186,0.381527,0.708569,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697


In [48]:
#Dummy coding of categorical data
#Regressions work with numbers, so every categorical variable has to become 0/1 columns

#Worked example on `job` first; afterwards the same is done for all of them
jobs = data_bank["job"].unique().tolist()
print("JOBS IN DATA:\n", jobs)
#We want "unemployed" as the reference category in the regression.
#The first element of the list will be the reference, so take "unemployed" out of
#wherever it sits and rebuild the list with it in front.
ref_cat = 'unemployed'
jobs.remove(ref_cat)
jobs = [ref_cat, *jobs]
print("\n\nJOBS WITH THE REFERENCE CATEGORY FIRST:\n", jobs)



JOBS IN DATA:
 ['housemaid', 'services', 'admin.', 'blue-collar', 'technician', 'retired', 'management', 'unemployed', 'self-employed', 'unknown', 'entrepreneur', 'student']


JOBS WITH THE REFERENCE CATEGORY FIRST:
 ['unemployed', 'housemaid', 'services', 'admin.', 'blue-collar', 'technician', 'retired', 'management', 'self-employed', 'unknown', 'entrepreneur', 'student']


In [79]:
#Dummy code `job` in the dataset; the levels follow the order of `jobs`,
#so its first element (the reference category) gets no column of its own
dc = dmatrix("~ C(job, levels=jobs)", data_bank)

#Everything to report, as (heading, value) pairs printed in order
report = (
    ("NUMBER OF ORIGINAL JOB CATEGORIES:\n", len(jobs)),
    #minus one because we do not count the intercept
    ("\nNUMBER OF JOB DUMMIES:\n", dc.shape[1]-1),
    ("\nDUMMY COLUMNS (NOTE THAT THE REFERENCE IS NOT PRESENT)\n", dc.design_info.column_names),
    #Each column is a job (without the reference): one if the row has that job, else zero
    ("\nDUMMY CODES\nCOLUMNS AS ORGANIZED IN jobs (WITHOUT THE REFERENCE)\n", dc[:, 1:]),
    ("\nEXAMPLE. IN DUMMY CODES THE FIRST COLUMN IS FOR HOUSEMAID.\n THE FIRST ROW IS A 1 BECAUSE IN data_bank['job'] THE FIRST ROW IS HOUSEMAID\n", data_bank.loc[:, "job"]),
)
for heading, value in report:
    print(heading, value)

NUMBER OF ORIGINAL JOB CATEGORIES:
 12

NUMBER OF JOB DUMMIES:
 11

DUMMY COLUMNS (NOTE THAT THE REFERENCE IS NOT PRESENT)
 ['Intercept', 'C(job, levels=jobs)[T.housemaid]', 'C(job, levels=jobs)[T.services]', 'C(job, levels=jobs)[T.admin.]', 'C(job, levels=jobs)[T.blue-collar]', 'C(job, levels=jobs)[T.technician]', 'C(job, levels=jobs)[T.retired]', 'C(job, levels=jobs)[T.management]', 'C(job, levels=jobs)[T.self-employed]', 'C(job, levels=jobs)[T.unknown]', 'C(job, levels=jobs)[T.entrepreneur]', 'C(job, levels=jobs)[T.student]']

DUMMY CODES
COLUMNS AS ORGANIZED IN jobs (WITHOUT THE REFERENCE)
 [[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

EXAMPLE. IN DUMMY CODES THE FIRST COLUMN IS FOR HOUSEMAID.
 THE FIRST ROW IS A 1 BECAUSE IN data_bank['job'] THE FIRST ROW IS HOUSEMAID
 0          housemaid
1           services
2           services
3             admin.
4           services

In [None]:
# NOTE(review): `League_l` and `Hitters` are not defined in any cell visible above this one,
# and this cell has no execution count. Its lines are repeated verbatim inside the later
# Hitters preprocessing cell, so this looks like a leftover draft: on a fresh kernel it would
# fail with a NameError. Confirm, then delete it or move it after those names are defined.
Div_l = ['E', 'W']
dm = dmatrix("~ 1 + C(League, levels=League_l) + C(Division, levels=Div_l) + C(NewLeague, levels=League_l)", 
             Hitters) #Dummy coding of categorical variables
print(dm.design_info.column_names) #to change manually to more readable names

In [None]:
#Dummy coding of categorical data

In [None]:
# NOTE(review): `Hitters` must already be loaded before this cell runs; no visible cell does it.

# Drop incomplete rows. The index is reset so it lines up with the default 0..n-1 index of the
# dummy-variable frame built below (pd.concat with axis=1 aligns rows on the index).
Hitters = Hitters.dropna().reset_index(drop=True)
non_cat_cols = np.append(np.array(Hitters.columns[0:13]), Hitters.columns[15:19]) #numerical columns

# Standardize (z-score) the numerical variables. "Salary" is one of them (y is read from
# Hitters_d below, which only holds the dummies plus these columns), so the target is
# standardized too.
# Whole-column assignment is used instead of Hitters.loc[:, cols] = ... so that integer-typed
# columns are simply replaced by float ones; writing floats into integer columns through .loc
# is a silent upcast that pandas >= 2.1 deprecates.
Hitters[list(non_cat_cols)] = st.zscore(Hitters[list(non_cat_cols)])

#Dummy coding of categorical data
League_l = ['N','A'] #first element will be the reference 0
Div_l = ['E', 'W']
dm = dmatrix("~ 1 + C(League, levels=League_l) + C(Division, levels=Div_l) + C(NewLeague, levels=League_l)",
             Hitters) #Dummy coding of categorical variables
print(dm.design_info.column_names) #to change manually to more readable names

#Concatenate dummy coded variables with non-categorical variables
# Readable names for the design-matrix columns, in the order patsy produced them;
# the intercept column is dropped right away because it is not a predictor here.
dummy_names = ["Intercept", "League_1_American", "Division_1_West", "New_League_1_American"]
dummies = pd.DataFrame(dm, columns=dummy_names).drop(columns=["Intercept"])
Hitters_d = pd.concat([dummies, Hitters.loc[:, non_cat_cols]], axis=1)
print(Hitters_d.shape)

# Predictors = every column except the target, selected by NAME. The previous positional
# slice (first 19 columns) gave the same result only when "Salary" happened to be the last
# column; anywhere else it would have leaked the target into X.
X = Hitters_d.drop(columns=["Salary"]) #Predictors
y = Hitters_d.loc[:, "Salary"] #variable to predict