# Exercise - Ensemble

In this exercise, we will focus on underage drinking. The data set contains data about high school students. Each row represents a single student. The columns include the characteristics of deidentified students. This is a binary classification task: predict whether a student drinks alcohol or not (this is the **alc** column: 1=Yes, 0=No). This is an important prediction task to detect underage drinking and deploy intervention techniques. 

## Description of Variables

The descriptions of the variables are provided in "Alcohol - Data Dictionary.docx".

## Goal

Use the **alcohol.csv** data set and build a model to predict **alc**. 

# Read and Prepare the Data

In [22]:
# Common imports

import pandas as pd
import numpy as np

# Seed NumPy's global random number generator up front so that a fresh
# top-to-bottom run of the notebook is intended to be repeatable.
np.random.seed(42)

# Get the data

In [23]:
# Load the student records; the prediction target is the binary "alc" column.

alcohol = pd.read_csv("alcohol.csv")
alcohol.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,health,absences,gender,alc
0,18,2,1,4,2,0,5,4,2,5,2,M,1
1,18,4,3,1,0,0,4,4,2,3,9,M,1
2,15,4,3,2,3,0,5,3,4,5,0,F,0
3,15,3,3,1,4,0,4,3,3,3,10,F,0
4,17,3,2,1,2,0,5,3,5,5,2,M,1


# Split data (train/test)

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. random_state is pinned so that
# re-running this cell always yields the same partition, regardless of how much
# of NumPy's global random stream earlier cells have already consumed. Without
# it, a re-run reshuffles the rows and a previously fitted model could be
# scored on rows it was trained on.
train, test = train_test_split(alcohol, test_size=0.3, random_state=42)

# Data Prep

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

## Separate the target variable 

In [26]:
# Model inputs: every column except the label
train_inputs = train.drop(columns='alc')
test_inputs = test.drop(columns='alc')

# Label to predict
train_target = train.loc[:, 'alc']
test_target = test.loc[:, 'alc']

## Feature Engineering: Derive a new column

Examples:
- Ratio of study time to travel time
- Student is younger than 18 or not
- Average of father's and mother's level of education
- (etc.)

In [27]:
def new_col(df):
    """Derive a one-column frame flagging students younger than 18.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'age' column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A single column 'minor' (1 where age < 18, otherwise 0) carrying the
        same index as ``df``.
    """
    under_18 = np.where(df['age'] < 18, 1, 0)
    # Build the result directly instead of copying the whole input frame
    return pd.DataFrame({'minor': under_18}, index=df.index)

In [28]:
# Sanity check: apply the feature function to the training inputs and inspect the result
new_col(train_inputs)

Unnamed: 0,minor
12759,1
4374,1
8561,1
10697,1
19424,1
...,...
16850,1
6265,1
11284,1
860,0


##  Identify the numeric, binary, and categorical columns

In [29]:
# Numeric features: every column with a NumPy numeric dtype
numeric_columns = list(train_inputs.select_dtypes(include=[np.number]).columns)

# Categorical features: every object-dtype column
categorical_columns = list(train_inputs.select_dtypes(include='object').columns)

In [30]:
# Show which columns were detected as numeric
numeric_columns

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'health',
 'absences']

In [31]:
# Show which columns were detected as categorical
categorical_columns

['gender']

In [32]:
# Input column(s) handed to the engineered-feature pipeline; new_col reads 'age'
feat_eng_columns = ['age']

# Pipeline

In [33]:
# Numeric features: replace missing values with the column median, then
# standardize each column.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [34]:
# Categorical features: replace missing values with a placeholder category,
# then one-hot encode. The placeholder is a string because these columns hold
# strings (e.g. 'M'/'F'); a numeric fill such as 0 would leave mixed str/int
# values in the column, which OneHotEncoder cannot encode.
# handle_unknown='ignore' encodes categories not seen during fit as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [35]:
# Wrap new_col in a FunctionTransformer so it can act as a pipeline step

my_new_column = Pipeline([('my_new_column', FunctionTransformer(func=new_col))])


In [37]:
# Route each group of columns through its own transformer and concatenate the
# results. Note that 'age' feeds both the 'num' branch (scaled value) and the
# 'trans' branch (derived 'minor' flag).
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('trans', my_new_column, feat_eng_columns)],
        remainder='passthrough')

# remainder='passthrough' is optional: it keeps any column not listed above unchanged instead of dropping it.

# Transform: fit_transform() for TRAIN

In [38]:
# Fit the preprocessor on the training inputs and transform them in the same call
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[ 0.66643886,  0.96597412,  0.90362635, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.66643886, -0.93881619, -1.68666277, ...,  0.        ,
         1.        ,  1.        ],
       [ 0.66643886,  0.33104402,  0.04019664, ...,  0.        ,
         1.        ,  1.        ],
       ...,
       [ 0.66643886, -2.20867639, -2.55009248, ...,  1.        ,
         0.        ,  1.        ],
       [ 1.6195814 , -0.30388608, -1.68666277, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.6195814 , -0.30388608, -2.55009248, ...,  0.        ,
         1.        ,  0.        ]])

In [39]:
# (rows, columns) of the preprocessed training matrix
train_x.shape

(23800, 14)

# Transform: transform() for TEST

In [40]:
# Apply the already-fitted preprocessor to the test inputs (transform only, no refitting)
test_x = preprocessor.transform(test_inputs)

test_x

array([[-1.23984621,  0.33104402,  1.76705606, ...,  1.        ,
         0.        ,  1.        ],
       [-1.23984621, -0.30388608,  0.04019664, ...,  0.        ,
         1.        ,  1.        ],
       [-0.28670367,  0.33104402,  0.04019664, ...,  0.        ,
         1.        ,  1.        ],
       ...,
       [ 0.66643886, -0.30388608,  0.04019664, ...,  1.        ,
         0.        ,  1.        ],
       [-1.23984621, -0.93881619,  0.04019664, ...,  0.        ,
         1.        ,  1.        ],
       [-1.23984621,  0.96597412,  0.04019664, ...,  1.        ,
         0.        ,  1.        ]])

In [41]:
# (rows, columns) of the preprocessed test matrix
test_x.shape

(10200, 14)

# Calculate the Baseline

In [42]:
from sklearn.dummy import DummyClassifier

# Baseline model: always predicts the most frequent class in the training targets
dummy_clf = DummyClassifier(strategy="most_frequent")

dummy_clf.fit(train_x, train_target)

In [43]:
from sklearn.metrics import accuracy_score

In [44]:
# Baseline: accuracy on the training set
dummy_train_pred = dummy_clf.predict(train_x)
baseline_train_acc = accuracy_score(y_true=train_target, y_pred=dummy_train_pred)
print('Baseline Train Accuracy: {}'.format(baseline_train_acc))

Baseline Train Accuracy: 0.5234873949579832


In [45]:
# Baseline: accuracy on the held-out test set
dummy_test_pred = dummy_clf.predict(test_x)
baseline_test_acc = accuracy_score(y_true=test_target, y_pred=dummy_test_pred)
print('Baseline Test Accuracy: {}'.format(baseline_test_acc))

Baseline Test Accuracy: 0.5194117647058824


# Train a voting classifier 

In [46]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.linear_model import SGDClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier


# Three different base models for the ensemble.
dtree_clf = DecisionTreeClassifier(max_depth=20)
# `multi_class` is deliberately not passed: the target is binary, and the
# parameter is deprecated in scikit-learn 1.5 (slated for removal), so
# supplying it triggers a FutureWarning now and an error once it is removed.
log_clf = LogisticRegression(solver='lbfgs', C=10, max_iter=1000)
sgd_clf = SGDClassifier(max_iter=10000, tol=1e-3)

# Hard voting: each base model casts one vote and the majority class wins.
voting_clf = VotingClassifier(
            estimators=[('dt', dtree_clf),
                        ('lr', log_clf),
                        ('sgd', sgd_clf)],
            voting='hard')

voting_clf.fit(train_x, train_target)

In [48]:
from sklearn.metrics import accuracy_score

In [50]:
# Voting classifier: accuracy on the training set
train_target_pred = voting_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_target_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8377731092436975


In [52]:
# Voting classifier: accuracy on the held-out test set
test_target_pred = voting_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_target_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8203921568627451


In [53]:
from sklearn.metrics import confusion_matrix

In [54]:
# Confusion matrix on the test set (rows: true class, columns: predicted class).
# Uses whatever test_target_pred currently holds -- the voting classifier's
# predictions when the cells are run top to bottom.
confusion_matrix(test_target, test_target_pred)

array([[4385,  913],
       [ 919, 3983]], dtype=int64)

# Train a bagging classifier

In [55]:
from sklearn.ensemble import BaggingClassifier 


# bootstrap=True samples each estimator's training rows with replacement (bagging);
# for pasting (sampling without replacement), change to "bootstrap=False".
# n_jobs=-1 means use all CPU cores.
# BaggingClassifier averages predicted probabilities (soft voting) only when the
# base estimator implements predict_proba. SGDClassifier() with its default
# hinge loss does not, so this ensemble falls back to majority (hard) voting.

bag_clf = BaggingClassifier( 
            SGDClassifier(), n_estimators=50, 
            max_samples=1000, bootstrap=True, n_jobs=-1) 

bag_clf.fit(train_x, train_target)


In [56]:
# Bagging classifier: accuracy on the training set
train_target_pred = bag_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_target_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8225210084033614


In [58]:
# Bagging classifier: accuracy on the held-out test set
test_target_pred = bag_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_target_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8227450980392157


# Train a random forest classifier

In [59]:
from sklearn.ensemble import RandomForestClassifier 

# Random forest: 500 trees, each limited to depth 10, trained on all CPU cores
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_depth=10,
    n_jobs=-1,
)

rnd_clf.fit(train_x, train_target)

In [60]:
# Random forest: accuracy on the training set
train_target_pred = rnd_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_target_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8422689075630252


In [61]:
# Random forest: accuracy on the held-out test set
test_target_pred = rnd_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_target_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.812843137254902


# Train an adaboost classifier

In [62]:
from sklearn.ensemble import AdaBoostClassifier 


# AdaBoost over depth-5 decision trees. The base estimator is passed
# positionally, exactly as in the original call.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=5),
    n_estimators=500,
    learning_rate=0.1,
)

ada_clf.fit(train_x, train_target)

In [63]:
# AdaBoost (depth-5 trees): accuracy on the training set
train_target_pred = ada_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_target_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8837394957983193


In [64]:
# AdaBoost (depth-5 trees): accuracy on the held-out test set
test_target_pred = ada_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_target_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8186274509803921


In [68]:
from sklearn.ensemble import AdaBoostClassifier 


# Second AdaBoost run with shallower (depth-3) base trees.
# NOTE: this rebinds ada_clf, replacing the depth-5 model fitted earlier.
ada_clf = AdaBoostClassifier( 
            DecisionTreeClassifier(max_depth=3), n_estimators=500, 
            learning_rate=0.1) 


ada_clf.fit(train_x, train_target)

In [69]:
# AdaBoost (depth-3 trees): accuracy on the training set
train_target_pred = ada_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_target_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.841890756302521


In [70]:
# AdaBoost (depth-3 trees): accuracy on the held-out test set
test_target_pred = ada_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_target_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8303921568627451


# Train a gradient boosting classifier

In [65]:
#Use GradientBoosting

from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 100 boosting stages of depth-2 trees, learning rate 0.1
gbclf = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=2,
    learning_rate=0.1,
)

gbclf.fit(train_x, train_target)

In [66]:
# Gradient boosting: accuracy on the training set
train_target_pred = gbclf.predict(train_x)
train_acc = accuracy_score(y_true=train_target, y_pred=train_target_pred)
print('Train acc: {}'.format(train_acc))

Train acc: 0.8190336134453782


In [67]:
# Gradient boosting: accuracy on the held-out test set
test_target_pred = gbclf.predict(test_x)
test_acc = accuracy_score(y_true=test_target, y_pred=test_target_pred)
print('Test acc: {}'.format(test_acc))

Test acc: 0.8116666666666666
