Kaggle Intermediate Machine Learning Tutorial

## Importing Libraries

In [2]:
import numpy as np 
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error # MAE metric
from sklearn.model_selection import train_test_split

### Random Forest Regressor MAE Scoring of test data

In [3]:
def score_dataset(X_train, X_valid, y_train, y_valid): 
    model = RandomForestRegressor(n_estimators=100)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

### Feature Selection
Only using numerical features to keep things simple

In [4]:
df = pd.read_csv('melb_data.csv')

# Target is house price
y = df.Price

# Use only numerical predictors
melb_predictors = df.drop(['Price'], axis=1) # get rid of target
X = melb_predictors.select_dtypes(exclude=['object']) # drop categorial features

X_train, X_test, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2)

### Dataset has some missing values
We can either drop them or impute between the missing values

In [5]:
df = pd.read_csv('melb_data.csv')
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


## MAE from Dropping Missing Numerical Data

In [6]:
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()] # these columns have missing data

reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_test.drop(cols_with_missing, axis=1)

print(f'MAE from dropping: {score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)}')

MAE from dropping: 184767.28635619025


## Simple Imputation

In [7]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.fit_transform(X_test))

# Imputation removes column names, so we have to add them back in
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_test.columns

# Score this method
print(f'MAE from imputation: {score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)}')

MAE from imputation: 185288.19211831124


### What happens if we keep track of which values were imputed?

In [8]:
# Copy the original data
X_train_plus = X_train.copy()
X_test_plus = X_test.copy()

# Make new columns to keep track of imputations
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_test_plus[col + '_was_missing'] = X_test_plus[col].isnull()

# Simple imputaiton
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_test_plus = pd.DataFrame(my_imputer.fit_transform(X_test_plus))

# Re-add column names
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_test_plus.columns = X_test_plus.columns

# Score this method
print(f'MAE of imputation + keeping track of columns: {score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_valid)}')

MAE of imputation + keeping track of columns: 186204.85520443227


## Handling Categorical Features

### Reload Data

In [12]:
# Data loading
X = melb_predictors.copy()
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=.8, test_size=.2)

# Drop columns with missing values
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full.drop(cols_with_missing, axis=1, inplace=True)
X_valid_full.drop(cols_with_missing, axis=1, inplace=True)

# Select categorical features with low cardinality (number of unique values in column)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == 'object']

# Select numerical features
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep only these features
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [13]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
11557,h,SP,Western Metropolitan,3,7.5,3040.0,3.0,2.0,402.0,-37.76027,144.88954,1543.0
6852,u,VB,Northern Metropolitan,2,1.8,3053.0,2.0,1.0,0.0,-37.8018,144.965,6786.0
7664,h,S,Southern Metropolitan,3,11.7,3125.0,3.0,2.0,697.0,-37.8592,145.0948,5678.0
1728,u,PI,Southern Metropolitan,2,11.4,3163.0,2.0,1.0,109.0,-37.8971,145.0597,7822.0
433,h,S,Southern Metropolitan,2,12.2,3147.0,2.0,1.0,586.0,-37.8683,145.1082,2894.0


In [14]:
# Get list of categorical features
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print(f'Categorical features: {object_cols}')

Categorical features: ['Type', 'Method', 'Regionname']


### Dropping Categorical Variables

In [15]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print(f'MAE from dropping categorical features: {score_dataset(drop_X_train, drop_X_valid, y_train, y_valid)}')

MAE from dropping categorical features: 180365.46799632357


### Ordinal encoding

In [17]:
from sklearn.preprocessing import OrdinalEncoder

# Copy original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to categorical features
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.fit_transform(X_valid[object_cols])

print(f'MAE from ordinal encoding: {score_dataset(label_X_train, label_X_valid, y_train, y_valid)}')

MAE from ordinal encoding: 168141.16600524232


### One-Hot Encoding
Typically one-hot encoding works best!

In [18]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to all categorical features
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False) # ignores new categorical features in validation data + returns columns as np arrays rather than sparse matrices
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.fit_transform(X_valid[object_cols]))

# encoder removed index; add them back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical data (replace with OH encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add OH encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# ensure all columns have string type
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

print(f'MAE from one-hot encoding: {score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)}')

MAE from one-hot encoding: 168308.07145206537


## Data Pipelining

In [19]:
# Reload data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=.8, test_size=.2)

# Feature selection
# low cardinality categorical features
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == 'object']
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep only these features
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [20]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
10093,u,PI,Northern Metropolitan,2,3.0,3067.0,2.0,1.0,1.0,0.0,38.0,1970.0,-37.80206,145.00015,4019.0
5381,u,PI,Northern Metropolitan,1,2.6,3121.0,1.0,1.0,1.0,0.0,,,-37.8331,144.9975,14949.0
13565,h,S,Eastern Metropolitan,4,17.2,3133.0,4.0,2.0,2.0,791.0,,,-37.84467,145.19276,4181.0
9572,h,S,Southern Metropolitan,3,13.8,3188.0,3.0,2.0,1.0,455.0,,1990.0,-37.93739,145.00513,5454.0
1385,h,S,Northern Metropolitan,3,5.2,3056.0,3.0,2.0,1.0,280.0,130.0,1890.0,-37.7719,144.9661,11918.0


### Defining the preprocessing steps
Here we use a simple imputer for numerical features and one-hot encoding for categorical features. We bundle them in a data pipeline using $\texttt{ColumnTransformer}$. 

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing for numerical features
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing steps into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
    ]
)