Kaggle Intermediate Machine Learning Tutorial

## Importing Libraries

In [28]:
import numpy as np 
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error # MAE metric
from sklearn.model_selection import train_test_split

### Random Forest Regressor MAE Scoring on Validation Data

In [74]:
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=10, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Features and target the model is fitted on.
    X_valid, y_valid
        Features the model predicts on, and the true target those
        predictions are compared against.
    n_estimators : int, default 10
        Number of trees in the forest; kept small so scoring is quick.
    random_state : int or None, default 0
        Seed passed to the forest so repeated calls on the same data return
        the same score. Pass ``None`` for an unseeded forest.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the predictions
    made for ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

### Feature Selection
Only numerical features are used, to keep things simple.

In [66]:
df = pd.read_csv('melb_data.csv')

# Target is house price
y = df.Price

# Use only numerical predictors
melb_predictors = df.drop(columns=['Price'])  # get rid of target
# The loaded frame shows an 'Unnamed: 0' column holding 0, 1, 2, ... which
# looks like a row index written out with the CSV rather than a property
# attribute (assumption -- confirm against the file). Drop it if present so
# the row number is not fed to the model as a numeric feature.
melb_predictors = melb_predictors.drop(columns=['Unnamed: 0'], errors='ignore')
X = melb_predictors.select_dtypes(exclude=['object'])  # drop categorical features

# Seed the split so Restart & Run All reproduces the same train/validation rows.
# NOTE: X_test is the validation split that pairs with y_valid.
X_train, X_test, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

### Dataset has some missing values
We can either drop the columns that contain missing values or impute (fill in) the missing entries.

In [67]:
# `df` already holds 'melb_data.csv' from the feature-selection cell, so the
# file is not read a second time; just preview the first rows.
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


## MAE from Dropping Columns with Missing Values

In [75]:
# Names of the training columns that contain at least one missing value
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both splits so they keep the same feature set
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_test.drop(columns=cols_with_missing)

print(
    f'MAE from dropping: {score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)}'
)

MAE from dropping: 190412.82104784698


## Simple Imputation