#Header



Ensemble Modeling and Logistic Regression

Sarah Kim

# Task I: Setup, Data Importation, and Data Preparation for Modeling

In [5]:
# Mathematics and Dataframe Packages
import numpy as np
import pandas as pd

# Google Colab and CSV Packages
from google.colab import drive
import csv

# Graphical Packages
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Data Preparation Packages, 
from sklearn.model_selection import train_test_split

# Cross-Validation and Grid-Searching Packages
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Modeling Packages - Single
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Modeling Packages - Ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import StackingClassifier

# Model Evaluation Packages
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Suppress Warning Messages
import warnings
warnings.filterwarnings('ignore')

##Task I: Setup, Data Importation and Preparation


###B: Import Data

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [7]:
cd_df = pd.read_csv('/content/gdrive/MyDrive/Data_Sets/CD_additional_balanced.csv')


###C: Overall Structure

In [8]:
cd_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9280 entries, 0 to 9279
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             9280 non-null   int64  
 1   job             9280 non-null   object 
 2   marital         9280 non-null   object 
 3   education       9280 non-null   object 
 4   default         9280 non-null   object 
 5   housing         9280 non-null   object 
 6   loan            9280 non-null   object 
 7   contact         9280 non-null   object 
 8   month           9280 non-null   object 
 9   day_of_week     9280 non-null   object 
 10  duration        9280 non-null   int64  
 11  campaign        9280 non-null   int64  
 12  pdays           9280 non-null   int64  
 13  previous        9280 non-null   int64  
 14  poutcome        9280 non-null   object 
 15  emp.var.rate    9280 non-null   float64
 16  cons.price.idx  9280 non-null   float64
 17  cons.conf.idx   9280 non-null   f

In [9]:
cd_df.describe()


Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,9280.0,9280.0,9280.0,9280.0,9280.0,9280.0,9280.0,9280.0,9280.0,9280.0
mean,40.403448,387.361746,2.333297,887.282435,0.315302,-0.496272,93.479178,-40.218125,2.960209,5135.306487
std,12.06203,357.711742,2.334467,313.802415,0.700671,1.721204,0.63451,5.360642,1.890402,87.105317
min,17.0,1.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,31.0,145.0,1.0,999.0,0.0,-1.8,92.893,-42.7,1.244,5076.2
50%,38.0,265.0,2.0,999.0,0.0,-0.1,93.444,-41.8,4.021,5191.0
75%,48.0,528.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.959,5228.1
max,98.0,4199.0,39.0,999.0,6.0,1.4,94.767,-26.9,5.045,5228.1


In [10]:
cd_df.head(15)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,41,blue-collar,divorced,basic.4y,unknown,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes
1,49,entrepreneur,married,university.degree,unknown,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes
2,49,technician,married,basic.9y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes
3,41,technician,married,professional.course,unknown,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes
4,45,blue-collar,married,basic.9y,unknown,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes
5,42,blue-collar,married,basic.9y,no,yes,yes,telephone,may,mon,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes
6,39,housemaid,married,basic.9y,no,yes,no,telephone,may,mon,...,3,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes
7,28,unknown,single,unknown,unknown,yes,yes,telephone,may,tue,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes
8,44,services,married,high.school,no,yes,no,telephone,may,tue,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes
9,42,technician,married,professional.course,no,no,no,telephone,may,tue,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,yes


###D:Convert integer variables to a float datatype

In [11]:
int_list = list(cd_df.select_dtypes(include='int64').columns)
int_list

['age', 'duration', 'campaign', 'pdays', 'previous']

In [12]:
cd_df[int_list] = cd_df[int_list].astype('float64')


###E: Handle y variable

In [13]:
#extract y variable
y_target = cd_df.pop('y')

#index y variable
y_target = pd.get_dummies(y_target,drop_first=True)

#add back to df
cd_df['y_target'] = y_target

###F: Convert all object variables to category datatypes


In [14]:
object_list = list(cd_df.select_dtypes(include='object').columns)
cd_df[object_list] = cd_df[object_list].astype('category')

In [15]:
cd_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9280 entries, 0 to 9279
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             9280 non-null   float64 
 1   job             9280 non-null   category
 2   marital         9280 non-null   category
 3   education       9280 non-null   category
 4   default         9280 non-null   category
 5   housing         9280 non-null   category
 6   loan            9280 non-null   category
 7   contact         9280 non-null   category
 8   month           9280 non-null   category
 9   day_of_week     9280 non-null   category
 10  duration        9280 non-null   float64 
 11  campaign        9280 non-null   float64 
 12  pdays           9280 non-null   float64 
 13  previous        9280 non-null   float64 
 14  poutcome        9280 non-null   category
 15  emp.var.rate    9280 non-null   float64 
 16  cons.price.idx  9280 non-null   float64 
 17  cons.conf.idx 

###G: One-hot encode the categorical variables


In [16]:
category_list = list(cd_df.select_dtypes(include='category').columns)
category_list

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome']

In [17]:
cd_onehot_df = pd.get_dummies(cd_df,columns=category_list)


###H: Splittings

In [18]:
random_state = 0

In [19]:
# independent/dependent variable for the training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(cd_onehot_df.drop('y_target',axis=1),
                                                    cd_onehot_df['y_target'],
                                                    test_size=0.2,
                                                    random_state=random_state)

# independent/dependent variable for full dataset
X = cd_onehot_df
y = cd_onehot_df.pop('y_target')

##Task II: Simple Model


###A:5 fold model on full dataset


In [20]:
#define
dtmodel = DecisionTreeClassifier(random_state=random_state).fit(X,y)

#5-fold
five_fold = cross_validate(dtmodel,X,y,cv=5,scoring=['f1'],return_train_score=True)

In [21]:
pd.DataFrame(five_fold)


Unnamed: 0,fit_time,score_time,test_f1,train_f1
0,0.075135,0.011425,0.060718,1.0
1,0.063527,0.005357,0.282943,1.0
2,0.065974,0.005228,0.12489,1.0
3,0.067241,0.005162,0.187026,1.0
4,0.073887,0.005236,0.167939,1.0


###B: Display results as a dataframe


In [22]:
## Display the Cross-Validation Results as a Dataframe
dtmodel_df = pd.DataFrame(five_fold).agg('mean').round(5)
dtmodel_df


fit_time      0.06915
score_time    0.00648
test_f1       0.16470
train_f1      1.00000
dtype: float64

##Task III: Grid Searches


###A: RandomForestClassifier() model


In [23]:
# Choose Parameter Options
parameters = {'max_depth':list(range(5,20)),
              'n_estimators':[2,3,4,5,6,7,8,9,10,100,300,500,700,900,1000]}

# Sequential Grid Search
rf_grid = GridSearchCV(RandomForestClassifier(n_jobs=-1,random_state=random_state),parameters,cv=5,scoring=['f1'],return_train_score=True,refit=False,verbose=3)
rf_grid = rf_grid.fit(X,y)

# Get Results
# Display Specific Result Dataframe Columns
# Sort by Highest Testing F1-Score
results = rf_grid.cv_results_
pd.DataFrame(results)[['param_max_depth',
                       'param_n_estimators',
                       'mean_fit_time',
                       'mean_score_time',
                       'mean_train_f1',
                       'mean_test_f1']].sort_values('mean_test_f1',ascending=False).head(10)

Fitting 5 folds for each of 225 candidates, totalling 1125 fits
[CV 1/5] END max_depth=5, n_estimators=2; f1: (train=0.843, test=0.067) total time=   1.5s
[CV 2/5] END max_depth=5, n_estimators=2; f1: (train=0.888, test=0.520) total time=   0.2s
[CV 3/5] END max_depth=5, n_estimators=2; f1: (train=0.882, test=0.160) total time=   0.1s
[CV 4/5] END max_depth=5, n_estimators=2; f1: (train=0.798, test=0.757) total time=   0.2s
[CV 5/5] END max_depth=5, n_estimators=2; f1: (train=0.856, test=0.456) total time=   0.2s
[CV 1/5] END max_depth=5, n_estimators=3; f1: (train=0.860, test=0.000) total time=   0.2s
[CV 2/5] END max_depth=5, n_estimators=3; f1: (train=0.850, test=0.254) total time=   0.2s
[CV 3/5] END max_depth=5, n_estimators=3; f1: (train=0.881, test=0.189) total time=   0.2s
[CV 4/5] END max_depth=5, n_estimators=3; f1: (train=0.879, test=0.625) total time=   0.2s
[CV 5/5] END max_depth=5, n_estimators=3; f1: (train=0.785, test=0.438) total time=   0.3s
[CV 1/5] END max_depth=5, 

KeyboardInterrupt: ignored

###B:SVC() model using the sigmoid kernel


In [None]:
parameters_svm_sig = {'C':list(range(1,11))}
sig_grid = GridSearchCV(SVC(kernel='sigmoid',random_state=random_state),parameters_svm_sig,cv=5,scoring=['f1'],return_train_score=True,refit='f1',verbose=3)
sig_grid = sig_grid.fit(X,y)

results = sig_grid.cv_results_
pd.DataFrame(results)[['param_C',
                       'mean_fit_time',
                       'mean_score_time',
                       'mean_train_f1',
                       'mean_test_f1']].sort_values('mean_test_f1',ascending=False).head(10)

In [None]:
results = sig_grid.cv_results_
pd.DataFrame(results)[['param_C',
                       'mean_fit_time',
                       'mean_score_time',
                       'mean_train_f1',
                       'mean_test_f1']].sort_values('mean_test_f1',ascending=False).head(10)

###C:SVC() model using radial basis function kernel


In [None]:
parameters_svm_sig = {'C':list(range(1,11))}
basis_grid = GridSearchCV(SVC(kernel='rbf',random_state=random_state),parameters_svm_sig,cv=5,scoring=['f1'],return_train_score=True,refit='f1',verbose=3)
basis_grid = basis_grid.fit(X,y)

basis_results = basis_grid.cv_results_
pd.DataFrame(basis_results)[['param_C',
                       'mean_fit_time',
                       'mean_score_time',
                       'mean_train_f1',
                       'mean_test_f1']].sort_values('mean_test_f1',ascending=False).head(10)


##Task IV: Majority Voting


###A: Instantiate three new respective models

In [None]:
# Random Forest
model1_grid = RandomForestClassifier(random_state=random_state,n_estimators=2,max_depth=5)
# SVM with Sigmoid Kernel
model2_grid =SVC(kernel='sigmoid',random_state=random_state, C=1)
# SVM with RBF Kernel 
model3_grid = SVC(kernel='rbf',random_state=random_state, C=9)

##B: Include models in a VotingClassifier()


In [None]:
voting_grid_model = VotingClassifier(estimators=[('rf',model1_grid),
                                                 ('svm_sigmoid',model2_grid),
                                                 ('svm_rbf',model3_grid)],
                                     voting='hard')

###C: Evaluate model


In [None]:
voting_grid_df = pd.DataFrame(cross_validate(voting_grid_model,X,y,cv=5,scoring=['f1'],return_train_score=True)).agg('mean').round(5)
voting_grid_df

##Task V: Gradient Boosting


###A: Select parameters


In [None]:
grb_parameters = {'max_depth':list(range(5,11)),
              'n_estimators':[2,4,6,8,10,100,300,400,700]}

###B: Create and fit a GradientBoostingClassifier()


In [None]:
grb_grid = GridSearchCV(GradientBoostingClassifier(random_state=random_state),grb_parameters,cv=5,scoring=['f1'],return_train_score=True,refit=False,verbose=3)
grb_grid = grb_grid.fit(X,y)


###C: Show results


In [None]:
grb_results = grb_grid.cv_results_
pd.DataFrame(grb_results)[['param_max_depth',
                       'param_n_estimators',
                       'mean_fit_time',
                       'mean_score_time',
                       'mean_train_f1',
                       'mean_test_f1']].sort_values('mean_test_f1',ascending=False).head(10)


##Task VI: Stacked Gradient Boosting


###A: Assign three new models


In [None]:
# Random Forest
model1 = RandomForestClassifier(random_state=random_state,n_estimators=2,max_depth=5)
# SVM with Sigmoid Kernel
model2 =SVC(kernel='sigmoid',random_state=random_state, C=1)
# SVM with RBF Kernel 
model3 = SVC(kernel='rbf',random_state=random_state, C=9)

##B: Define estimators and final_estimator


In [None]:
# Collection of Models for the Stack
estimators = [('rf',model1),
              ('svm_sigmoid',model2),
              ('svm_rbf',model3)]

# Define the Gradient Boosting Estimator for the Stack
final_estimator = GradientBoostingClassifier(n_estimators=25,
                                             subsample=0.5,
                                             min_samples_leaf=25,
                                             max_features=1,
                                             random_state=random_state)

# Define the Stacking Model
stacked_model = StackingClassifier(estimators=estimators,final_estimator=final_estimator)


###C: Display results


In [None]:
stacked_df = pd.DataFrame(cross_validate(stacked_model,X,y,scoring=['f1'],return_train_score=True)).agg('mean').round(5)
stacked_df

##Task VII: Logistic Regression


###A: Define and Fit a LogisticRegression() model


In [None]:
lr_model = LogisticRegression(random_state=random_state).fit(X_train,y_train)


###B: Make predictions on test data


In [None]:
lr_predictions = lr_model.predict(X_test)


###C: Evaluate the predictions


In [None]:
#Confusion Matrix
con_mat_test = confusion_matrix(y_test,lr_predictions,labels=[0,1])
ax = ConfusionMatrixDisplay(confusion_matrix=con_mat_test,display_labels=[0,1])
ax.plot(values_format='',cmap=plt.cm.Blues)
plt.show()


In [None]:
print(classification_report(y_test,lr_predictions))


##Task VIII: IPYNB and HTML Files
