# Exercise - Ensemble

In this exercise, we will focus on underage drinking. The data set contains data about high school students. Each row represents a single student. The columns include the characteristics of deidentified students. This is a binary classification task: predict whether a student drinks alcohol or not (this is the **alc** column: 1=Yes, 0=No). This is an important prediction task to detect underage drinking and deploy intervention techniques. 

## Description of Variables

The descriptions of the variables are provided in "Alcohol - Data Dictionary.docx".

## Goal

Use the **alcohol.csv** data set and build a model to predict **alc**. 

# Read and Prepare the Data

In [1]:
# Common imports

import pandas as pd
import numpy as np

# Seed NumPy's global random number generator so NumPy-based randomness is repeatable across runs
np.random.seed(42)

# Get the data

In [2]:
# Load the student data; the target we will predict is the binary "alc" column (1 = drinks alcohol, 0 = does not)

alcohol = pd.read_csv("alcohol.csv")
alcohol.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,health,absences,gender,alc
0,18,2,1,4,2,0,5,4,2,5,2,M,1
1,18,4,3,1,0,0,4,4,2,3,9,M,1
2,15,4,3,2,3,0,5,3,4,5,0,F,0
3,15,3,3,1,4,0,4,3,3,3,10,F,0
4,17,3,2,1,2,0,5,3,5,5,2,M,1


In [3]:
## Identify any issues with data imbalance

# The two classes are only slightly imbalanced (see the counts in the output), so no rebalancing is applied here.
# If the imbalance were greater, use one of the balancing techniques discussed in data mining (sketched below).
alcohol['alc'].value_counts()


# If you have not seen how to address data imbalance, the following (commented-out) code shows how to balance the data.
# NOTE: X and y are only defined later in this notebook (after the feature/target split), so this sketch
# cannot be run at this point as-is.

# There are four main techniques to balance the data (see powerpoint presentation for more details on these techniques):
# 1. Random Over Sampling
# 2. Random Under Sampling
# 3. SMOTE (Synthetic Minority Over-sampling Technique)
# 4. ADASYN (Adaptive Synthetic Sampling)

# from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.over_sampling import SMOTE
# from imblearn.over_sampling import ADASYN

# ros = RandomOverSampler(random_state=0)
# rus = RandomUnderSampler(random_state=0)
# smote = SMOTE(random_state=0)
# adasyn = ADASYN(random_state=0)

# Pick ONE of the following (each line overwrites the previous result):
# X_resampled, y_resampled = ros.fit_resample(X, y)
# X_resampled, y_resampled = rus.fit_resample(X, y)
# X_resampled, y_resampled = smote.fit_resample(X, y)
# X_resampled, y_resampled = adasyn.fit_resample(X, y)



alc
0    17757
1    16243
Name: count, dtype: int64

## Feature Engineering: Derive a new column

Examples:
- Ratio of study time to travel time
- Student is younger than 18 or not
- Average of father's and mother's level of education
- (etc.)

In [4]:
# Derived features:
#   study_2_travel  - study time divided by travel time; infinite ratios (division by zero) become NaN
#   younger_than_18 - 1 when age is below 18, otherwise 0
#   avg_edu         - mean of the mother's and father's education levels
alcohol['study_2_travel'] = (
    alcohol['studytime']
    .div(alcohol['traveltime'])
    .replace([np.inf, -np.inf], np.nan)
)
alcohol['younger_than_18'] = alcohol['age'].lt(18).astype(int)
alcohol['avg_edu'] = alcohol['Medu'].add(alcohol['Fedu']).div(2)

alcohol.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,health,absences,gender,alc,study_2_travel,younger_than_18,avg_edu
0,18,2,1,4,2,0,5,4,2,5,2,M,1,0.5,0,1.5
1,18,4,3,1,0,0,4,4,2,3,9,M,1,0.0,0,3.5
2,15,4,3,2,3,0,5,3,4,5,0,F,0,1.5,1,3.5
3,15,3,3,1,4,0,4,3,3,3,10,F,0,4.0,1,3.0
4,17,3,2,1,2,0,5,3,5,5,2,M,1,2.0,1,2.5


In [5]:
# One-hot encode 'gender' and 'alc'. With drop_first=True each two-level column
# collapses to a single 0/1 indicator: gender_M (1 = M, 0 = F) and alc_1.

alcohol = pd.get_dummies(alcohol, columns=['gender', 'alc'], drop_first=True, dtype='int')

alcohol.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,health,absences,study_2_travel,younger_than_18,avg_edu,gender_M,alc_1
0,18,2,1,4,2,0,5,4,2,5,2,0.5,0,1.5,1,1
1,18,4,3,1,0,0,4,4,2,3,9,0.0,0,3.5,1,1
2,15,4,3,2,3,0,5,3,4,5,0,1.5,1,3.5,0,0
3,15,3,3,1,4,0,4,3,3,3,10,4.0,1,3.0,0,0
4,17,3,2,1,2,0,5,3,5,5,2,2.0,1,2.5,1,1


In [6]:
# Give the target indicator a descriptive name: 'alc_1' -> 'alc_use'
alcohol = alcohol.rename({'alc_1': 'alc_use'}, axis='columns')

alcohol.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,health,absences,study_2_travel,younger_than_18,avg_edu,gender_M,alc_use
0,18,2,1,4,2,0,5,4,2,5,2,0.5,0,1.5,1,1
1,18,4,3,1,0,0,4,4,2,3,9,0.0,0,3.5,1,1
2,15,4,3,2,3,0,5,3,4,5,0,1.5,1,3.5,0,0
3,15,3,3,1,4,0,4,3,3,3,10,4.0,1,3.0,0,0
4,17,3,2,1,2,0,5,3,5,5,2,2.0,1,2.5,1,1


# Data Prep

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

In [8]:
# Separate the predictors (X) from the target column 'alc_use' (y)

X = alcohol.drop(columns='alc_use')
y = alcohol['alc_use']


##  Identify the numeric, binary, and categorical columns

In [9]:
# Names of the columns with a numeric dtype
numeric_columns = list(X.select_dtypes(include='number').columns)

# Names of the columns with object dtype (treated as categorical)
categorical_columns = list(X.select_dtypes(include='object').columns)

In [10]:
# Display the numeric column names
numeric_columns

['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'health',
 'absences',
 'study_2_travel',
 'younger_than_18',
 'avg_edu',
 'gender_M']

In [11]:
# Display the categorical (object-dtype) column names
categorical_columns

[]

In [12]:
# A column counts as binary when it holds exactly two distinct (non-missing) values
binary_columns = [name for name, n_unique in X.nunique().items() if n_unique == 2]
binary_columns

['younger_than_18', 'gender_M']

In [13]:
# Binary indicators are handled by their own pipeline, so take them out of the
# numeric list. Filtering (instead of calling list.remove() in a loop) keeps this
# cell safe to re-run and does not raise ValueError when a binary column is not
# in the numeric list to begin with.
numeric_columns = [col for col in numeric_columns if col not in binary_columns]

numeric_columns


['age',
 'Medu',
 'Fedu',
 'traveltime',
 'studytime',
 'failures',
 'famrel',
 'freetime',
 'goout',
 'health',
 'absences',
 'study_2_travel',
 'avg_edu']

# Split data (train/test)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified on y -- consider stratify=y if class balance per split matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Pipeline

In [15]:
# Numeric features: fill missing values with the column median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [16]:
# Categorical features: fill missing values with a placeholder category, then one-hot encode.
# The placeholder is a string rather than 0 because the categorical columns are selected by
# object dtype: mixing an integer into string categories makes OneHotEncoder fail on
# non-uniform (str + int) input.
categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [17]:
# Binary features: only fill missing values with the most frequent value (no scaling or encoding)
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

In [18]:
# Route each column group through its own pipeline; columns not listed in any group are passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns),
    ],
    remainder='passthrough',
)

# Transform: fit_transform() for TRAIN

In [19]:
# Fit the preprocessor on the training data and transform it in one step.
# NOTE: this overwrites X_train with the transformed output of fit_transform().
X_train = preprocessor.fit_transform(X_train)

In [20]:
# Check the row/column counts of the transformed training set
X_train.shape

(23800, 15)

# Transform: transform() for TEST

In [21]:
# Transform the test data with the already-fitted preprocessor
# (transform() only -- the preprocessor is not re-fitted on the test set)
X_test = preprocessor.transform(X_test)

X_test

array([[-1.23984621,  0.33104402,  1.76705606, ...,  1.01168573,
         1.        ,  0.        ],
       [-1.23984621, -0.30388608,  0.04019664, ..., -0.1702172 ,
         1.        ,  1.        ],
       [-0.28670367,  0.33104402,  0.04019664, ...,  0.22375045,
         1.        ,  1.        ],
       ...,
       [ 0.66643886, -0.30388608,  0.04019664, ..., -0.1702172 ,
         1.        ,  0.        ],
       [-1.23984621, -0.93881619,  0.04019664, ..., -0.56418484,
         1.        ,  1.        ],
       [-1.23984621,  0.96597412,  0.04019664, ...,  0.61771809,
         1.        ,  0.        ]])

In [22]:
# Check the row/column counts of the transformed test set
X_test.shape

(10200, 15)

# Calculate the Baseline

## Train a Logistic Regression Classifier (use random search hyperparameter tuning)

## Train a random forest classifier (use random search hyperparameter tuning)

## Train an adaboost classifier (use random search hyperparameter tuning)

## Train a Voting Classifier using previous models (test both soft and hard voting)

## Train a StackingClassifier with the above models (minus the VotingClassifier)

## Discuss the results of the models and the best model based on F1 score results.