## Model Building

### Load Libraries

In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np

In [15]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import StackingClassifier

### Read data

In [4]:
df = pd.read_csv("cleaned_v2.csv")
df.head()

Unnamed: 0,salary,mins_beerdrinking_year,works_hours,mins_exercising_year,education_rank,coffee_per_year,great_customer_class
0,70773.0,0.0,40.0,0.0,9.0,359.708169,0.0
1,76597.0,0.0,30.0,0.0,9.0,359.708169,0.0
2,47947.25,0.0,10.0,0.0,10.0,276.0,0.0
3,41740.25,0.0,20.0,0.0,7.0,359.708169,0.0
4,37149.297355,447.920607,36.0,0.0,9.0,120.0,0.0


In [6]:
y = df.great_customer_class
X = df[["salary", 'mins_beerdrinking_year', "works_hours", "mins_exercising_year", "education_rank", "coffee_per_year"]]

### Split data

In [8]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.3, random_state = 1)

### Build models with 5 different algorithms

In [10]:
logreg = LogisticRegression()
knn = KNeighborsClassifier()
svm = SVC()
bayes = GaussianNB()
rf = RandomForestClassifier(100)

models = {'logreg': logreg, 
         'knn': knn, 
         'svm': svm,
         'bayes': bayes,
         'rf': rf}

In [12]:
def evaluate_model(model, Xtrain, ytrain):
    cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
    scores = cross_val_score(model, X, y, scoring= 'accuracy', cv = cv, n_jobs = -1)
    return scores

In [13]:
results = algorithms = []
for algo, model in models.items():
    scores = evaluate_model(model, Xtrain, ytrain)
    results.append(scores)
    algorithms.append(algo)
    print(f"Algorithm {algo}'s Accuracy >>> {np.mean(scores)} & Standard Deviation >>> {np.std(scores)}")

Algorithm logreg's Accuracy >>> 0.9238917745170178 & Standard Deviation >>> 0.0035594471547441345
Algorithm knn's Accuracy >>> 0.9209991848100534 & Standard Deviation >>> 0.0042340228281918385
Algorithm svm's Accuracy >>> 0.9240876725966326 & Standard Deviation >>> 0.00227186659866084
Algorithm bayes's Accuracy >>> 0.9092092116463951 & Standard Deviation >>> 0.0050077832880320175
Algorithm rf's Accuracy >>> 0.9266858921640768 & Standard Deviation >>> 0.005884853154727841


### Stack above models with another one

In [16]:
#if using stacking
#logreg as level 1 final estimator
stacking_model = StackingClassifier(estimators = [('logreg', logreg), 
                                                 ('knn', knn), 
                                                 ('svm', svm), 
                                                 ('bayes', bayes),
                                                 ('rf', rf)], 
                                   final_estimator = logreg, cv = 5)

In [17]:
algo = 'Stacking'
scores = evaluate_model(stacking_model, Xtrain, ytrain)
print(f"Algorithm {algo}'s Accuracy >>> {np.mean(scores)} & Standard Deviation >>> {np.std(scores)}")

Algorithm Stacking's Accuracy >>> 0.934309072414838 & Standard Deviation >>> 0.003463137599966917


Multiple models are used to make predictions in level 0, and in the top level 1, a simple logistic regression is applied over the predictions of level 0 models. The accuracy has increased slightly to 0.934309072414838.