# Exercise - Ensemble

In this exercise, we will focus on underage drinking. The data set contains data about high school students. Each row represents a single student. The columns include the characteristics of deidentified students. This is a binary classification task: predict whether a student drinks alcohol or not (this is the **alc** column: 1=Yes, 0=No). This is an important prediction task to detect underage drinking and deploy intervention techniques. 

## Description of Variables

The descriptions of the variables are provided in "Alcohol - Data Dictionary.docx".

## Goal

Use the **alcohol.csv** data set and build a model to predict **alc**. 

# Read and Prepare the Data

In [1]:
# Common imports

import pandas as pd
import numpy as np

np.random.seed(42)

# Get the data

In [2]:
# Load the student data set; the prediction target is the binary "alc" column (1=Yes, 0=No)

alcohol = pd.read_csv("alcohol.csv")
alcohol.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,health,absences,gender,alc
0,18,2,1,4,2,0,5,4,2,5,2,M,1
1,18,4,3,1,0,0,4,4,2,3,9,M,1
2,15,4,3,2,3,0,5,3,4,5,0,F,0
3,15,3,3,1,4,0,4,3,3,3,10,F,0
4,17,3,2,1,2,0,5,3,5,5,2,M,1


# Split data (train/test)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. The explicit random_state makes the
# split repeatable on its own, instead of depending on how much of the global
# NumPy random stream earlier cells have already consumed.
train, test = train_test_split(alcohol, test_size=0.3, random_state=42)

# Data Prep

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

## Separate the target variable 

In [5]:
# Predictors: every column except the label
train_inputs = train.drop(columns=['alc'])
test_inputs = test.drop(columns=['alc'])

# Labels: the 'alc' column on its own
train_target = train['alc']
test_target = test['alc']

## Feature Engineering: Derive a new column

Examples:
- Ratio of study time to travel time
- Student is younger than 18 or not
- Average of father's and mother's level of education
- (etc.)

In [6]:
def new_col_fps(df):
    """Derive the ratio of past failures to study time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'failures' and 'studytime'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A single column, 'failures_per_studytime', indexed like ``df``.
        NaN ratios (e.g. 0/0 or a missing input) become 0 and positive-infinite
        ratios (non-zero failures over zero study time) become 1.
    """
    # Work on a standalone Series with the non-inplace fillna/replace.
    # Calling replace(..., inplace=True) on a column selected from a frame is
    # chained assignment: newer pandas warns about it, and under copy-on-write
    # it no longer updates the frame.
    ratio = (df['failures'] / df['studytime']).fillna(0)
    ratio = ratio.replace(np.inf, 1)
    return ratio.to_frame(name='failures_per_studytime')

In [7]:
def new_col_fpt(df):
    """Derive the ratio of past failures to travel time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'failures' and 'traveltime'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A single column, 'failures_per_traveltime', indexed like ``df``.
        NaN ratios (e.g. 0/0 or a missing input) become 0 and positive-infinite
        ratios (non-zero failures over zero travel time) become 1.
    """
    # Work on a standalone Series with the non-inplace fillna/replace.
    # Calling replace(..., inplace=True) on a column selected from a frame is
    # chained assignment: newer pandas warns about it, and under copy-on-write
    # it no longer updates the frame.
    ratio = (df['failures'] / df['traveltime']).fillna(0)
    ratio = ratio.replace(np.inf, 1)
    return ratio.to_frame(name='failures_per_traveltime')

##  Identify the numeric, binary, and categorical columns

In [8]:
# Numeric predictors: every column with a numeric dtype
numeric_columns = list(train_inputs.select_dtypes(include='number').columns)

# Categorical predictors: every column stored with object dtype
categorical_columns = list(train_inputs.select_dtypes(include='object').columns)

In [9]:
numeric_columns

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'health',
 'absences']

In [10]:
categorical_columns

['gender']

In [11]:
# Raw columns handed to both derived-feature pipelines; each one divides 'failures' by one of the other two
feat_eng_columns = ['failures','studytime','traveltime']

# Pipeline

In [12]:
# Numeric columns: fill gaps with the column median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [13]:
# Categorical columns: fill gaps with a placeholder category, then one-hot encode.
# The placeholder is a string because these columns hold strings; an integer
# fill value would mix types within a column, and the encoder requires each
# column to be uniformly strings or uniformly numbers.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [14]:
# Each derived ratio gets its own pipeline: compute the ratio, then standardize it
fps = Pipeline([
    ('fps', FunctionTransformer(new_col_fps)),
    ('scaler', StandardScaler()),
])
fpt = Pipeline([
    ('fpt', FunctionTransformer(new_col_fpt)),
    ('scaler', StandardScaler()),
])

In [15]:
# Route each column group through its transformer; any column not listed is dropped.
# Output columns follow the order of this list: scaled numerics, one-hot
# categoricals, then the two derived ratios.
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('trans1', fps, feat_eng_columns),
        ('trans2', fpt, feat_eng_columns)],
        remainder='drop')

# Transform: fit_transform() for TRAIN

In [16]:
# Fit the preprocessor on the training inputs and transform them in one step,
# so imputation values and scaling statistics are learned from training rows only
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[ 0.66643886,  0.96597412,  0.90362635, ...,  0.        ,
        -0.09949927, -0.10893723],
       [ 0.66643886, -0.93881619, -1.68666277, ...,  1.        ,
        -0.09949927, -0.10893723],
       [ 0.66643886,  0.33104402,  0.04019664, ...,  1.        ,
        -0.09949927, -0.10893723],
       ...,
       [ 0.66643886, -2.20867639, -2.55009248, ...,  0.        ,
        -0.09949927, -0.10893723],
       [ 1.6195814 , -0.30388608, -1.68666277, ...,  1.        ,
        -0.09949927, -0.10893723],
       [ 1.6195814 , -0.30388608, -2.55009248, ...,  1.        ,
        -0.09949927, -0.10893723]])

In [17]:
train_x.shape

(23800, 15)

# Transform: transform() for TEST

In [18]:
# Transform the test data with the already-fitted preprocessor (transform only, no refit on test rows)
test_x = preprocessor.transform(test_inputs)

test_x

array([[-1.23984621,  0.33104402,  1.76705606, ...,  0.        ,
        -0.09949927, -0.10893723],
       [-1.23984621, -0.30388608,  0.04019664, ...,  1.        ,
        -0.09949927, -0.10893723],
       [-0.28670367,  0.33104402,  0.04019664, ...,  1.        ,
        -0.09949927, -0.10893723],
       ...,
       [ 0.66643886, -0.30388608,  0.04019664, ...,  0.        ,
        -0.09949927, -0.10893723],
       [-1.23984621, -0.93881619,  0.04019664, ...,  1.        ,
        -0.09949927, -0.10893723],
       [-1.23984621,  0.96597412,  0.04019664, ...,  0.        ,
        -0.09949927, -0.10893723]])

In [19]:
test_x.shape

(10200, 15)

# Calculate the Baseline

In [20]:
from sklearn.dummy import DummyClassifier

# Baseline model: ignores the features and always predicts the most frequent training label
dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_target)

In [21]:
from sklearn.metrics import accuracy_score

# Score the most-frequent-label baseline on the training rows
dummy_train_pred = dummy_clf.predict(train_x)
baseline_train_acc = accuracy_score(y_true=train_target, y_pred=dummy_train_pred)
print('Baseline Train Accuracy: {}'.format(baseline_train_acc))

Baseline Train Accuracy: 0.5234873949579832


In [22]:
# Score the most-frequent-label baseline on the held-out rows
dummy_test_pred = dummy_clf.predict(test_x)
baseline_test_acc = accuracy_score(y_true=test_target, y_pred=dummy_test_pred)
print('Baseline Test Accuracy: {}'.format(baseline_test_acc))

Baseline Test Accuracy: 0.5194117647058824


# Train a voting classifier 

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# Three different base learners for the ensemble
dtree_clf = DecisionTreeClassifier(max_depth=20)
# multi_class is left at its default: the target is binary, and scikit-learn
# has deprecated the parameter (passing 'multinomial' on a binary problem
# triggers a FutureWarning).
log_clf = LogisticRegression(solver='lbfgs', C=10, max_iter=1000)
sgd_clf = SGDClassifier(max_iter=10000, tol=1e-3)

# Hard voting: each base learner casts one vote and the majority label wins
voting_clf = VotingClassifier(
            estimators=[('dt', dtree_clf),
                        ('lr', log_clf),
                        ('sgd', sgd_clf)],
            voting='hard')

voting_clf.fit(train_x, train_target)

In [24]:
from sklearn.metrics import accuracy_score

In [25]:
# Train accuracy of the voting ensemble
train_y_pred = voting_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8369747899159664


In [26]:
# Test accuracy of the voting ensemble
test_y_pred = voting_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8213725490196079


In [27]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes
confusion_matrix(y_true=test_target, y_pred=test_y_pred)

array([[4379,  919],
       [ 903, 3999]], dtype=int64)

In [28]:
# Inspect the accuracy of each base learner next to the ensemble.
# Every estimator is refit here, and none of them has a fixed random_state,
# so a score can differ slightly from the one reported in an earlier cell.
for clf in (dtree_clf, log_clf, sgd_clf, voting_clf):
    # Pass the target Series directly, as the other cells do; Series.ravel()
    # is deprecated in recent pandas and the Series is already one-dimensional.
    clf.fit(train_x, train_target)
    test_y_pred = clf.predict(test_x)
    print(clf.__class__.__name__, 'Test acc=', accuracy_score(test_target, test_y_pred))

# In the run shown, logistic regression is the best single model; SGD and the
# voting ensemble are within half a point of it, while the decision tree trails
# by several points.

DecisionTreeClassifier Test acc= 0.7549019607843137
LogisticRegression Test acc= 0.8202941176470588
SGDClassifier Test acc= 0.8156862745098039
VotingClassifier Test acc= 0.8192156862745098


# Train a bagging classifier

In [29]:
from sklearn.ensemble import BaggingClassifier

# 60 SGD classifiers, each fit on 1000 rows drawn with replacement
bag_clf = BaggingClassifier(
    SGDClassifier(),
    n_estimators=60,
    max_samples=1000,
    bootstrap=True,
    n_jobs=-1,
)

bag_clf.fit(train_x, train_target)

In [30]:
# Train accuracy of the bagging ensemble
train_y_pred = bag_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8220588235294117


In [31]:
# Test accuracy of the bagging ensemble
test_y_pred = bag_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

# No overfitting, but not quite as good as hard voting

Test acc: 0.8201960784313725


# Train a random forest classifier

In [32]:
from sklearn.ensemble import RandomForestClassifier

# 500 shallow trees (depth capped at 5), trained on all available cores
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=5,
    n_jobs=-1,
)

rnd_clf.fit(train_x, train_target)

In [33]:
# Train accuracy of the random forest
train_y_pred = rnd_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.7878151260504201


In [34]:
# Test accuracy of the random forest
test_y_pred = rnd_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.7884313725490196


In [35]:
# Rows are the true classes, columns are the predicted classes
confusion_matrix(y_true=test_target, y_pred=test_y_pred)

# A max depth of 5 gave the least overfitting, at the cost of about 3 points of accuracy

array([[4332,  966],
       [1192, 3710]], dtype=int64)

# Train an adaboost classifier

In [36]:
from sklearn.ensemble import AdaBoostClassifier

# Boost 500 depth-5 decision trees with a small learning rate
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=5),
    n_estimators=500,
    learning_rate=0.05,
)

ada_clf.fit(train_x, train_target)

In [37]:
# Train accuracy of the AdaBoost ensemble
train_y_pred = ada_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8651680672268908


In [38]:
# Test accuracy of the AdaBoost ensemble
test_y_pred = ada_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8233333333333334


In [39]:
# Rows are the true classes, columns are the predicted classes
confusion_matrix(y_true=test_target, y_pred=test_y_pred)

# Setting max_depth to 5 and the learning rate to 0.05 reduced overfitting and increased test accuracy

array([[4411,  887],
       [ 915, 3987]], dtype=int64)

# Train a gradient boosting classifier

In [40]:
from sklearn.ensemble import GradientBoostingClassifier

# 100 boosting stages of depth-5 trees with a small learning rate
gbclf = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.05,
)

gbclf.fit(train_x, train_target)

In [41]:
# Train accuracy of the gradient boosting model
train_y_pred = gbclf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_y_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8323529411764706


In [42]:
# Test accuracy of the gradient boosting model
test_y_pred = gbclf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_y_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8219607843137255


In [43]:
confusion_matrix(test_target, test_y_pred)
# Updated max_depth to 5 and learning rate to 0.05 to limit overfitting with no effect on accuracy

# Summary, using the accuracies printed by the cells in this notebook.
# AdaBoost has the highest test accuracy but also the largest train/test gap;
# gradient boosting and hard voting are within about a tenth of a point of it
# on the test set with far less overfitting.

#Baseline:
#Train: 52.35%
#Test: 51.94%

#Voting:
#Train: 83.70%
#Test: 82.14%

#Bagging:
#Train: 82.21%
#Test: 82.02%

#Random Forest:
#Train: 78.78%
#Test: 78.84%

#adaboost:
#Train: 86.52%
#Test: 82.33%

#Gradient Boosting:
#Train: 83.24%
#Test: 82.20%

array([[4416,  882],
       [ 934, 3968]], dtype=int64)