In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('data/melb_data.csv')
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [3]:
# Separate target from predictors
y = data.Price
X = data.drop('Price',axis=1)

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Divide data into training and validation subsets
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [6]:
col_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
col_with_missing

['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea']

In [7]:
# Drop columns with missing values (simplest approach)
X_train_full.drop(col_with_missing,axis=1,inplace=True)
X_test_full.drop(col_with_missing,axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


`"Cardinality"` means the number of unique values in a column

Select categorical columns with relatively low cardinality (convenient but arbitrary)

In [35]:
low_cardinality_cols = [cname for cname in list(X_train_full.select_dtypes(['object']).columns) if X_train_full[cname].nunique()<10]        
low_cardinality_cols

['Type', 'Method', 'Regionname']

In [36]:
numerical_cols = list(X_train_full.select_dtypes(exclude=['object']).columns)
numerical_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Landsize',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [38]:
# Keep selected columns only
my_cols = low_cardinality_cols+numerical_cols
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

In [11]:
object_col = list(X_train_full.select_dtypes(['object']).columns)
object_nunique = list(map(lambda col : X_train_full[col].nunique(),object_col))
d = dict(zip(object_col,object_nunique))
sorted(d.items(),key = lambda x:x[1])

[('Type', 3),
 ('Method', 5),
 ('Regionname', 8),
 ('Date', 58),
 ('SellerG', 251),
 ('Suburb', 308),
 ('Address', 10742)]

Define Function to Measure Quality of Each Approach

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_mae(X_train,X_test,y_train,y_test):
    rf_model =RandomForestRegressor(n_estimators=100, random_state=0)
    rf_model.fit(X_train,y_train)
    preds = rf_model.predict(X_test)
    return mean_absolute_error(y_test,preds)

**Approach 1 (Drop Categorical Variables)**

In [42]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_test = X_test.select_dtypes(exclude=['object'])

In [43]:
print('MAE Approach 1:')
print(score_mae(drop_X_train,drop_X_test,y_train,y_test))

MAE Approach 1:
175703.48185157913


**Approach 2 (Label Encoding)**

In [48]:
col_object = list(X_train.select_dtypes('object').columns)

In [49]:
from sklearn.preprocessing import LabelEncoder

In [50]:
label_X_train = X_train.copy()
label_X_test = X_test.copy()

In [52]:
label_encoder = LabelEncoder()

In [53]:
# Apply label encoder to each column with categorical data
for col in col_object:
    label_X_train[col]=label_encoder.fit_transform(X_train[col])
    label_X_test[col]=label_encoder.transform(X_test[col])

In [54]:
label_X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,2,1,5,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,0,2,6,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,0,1,6,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,2,3,2,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,0,1,6,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [65]:
print('MAE Approach 2:')
print(score_mae(label_X_train,label_X_test,y_train,y_test))

MAE Approach 2:
165936.40548390493


**Approach 3 (One-Hot Encoding)**

In [66]:
from sklearn.preprocessing import OneHotEncoder

In [67]:
onehotencoder = OneHotEncoder(sparse=False,handle_unknown='ignore')

In [69]:
OH_train_cols = pd.DataFrame(onehotencoder.fit_transform(X_train[col_object]))
OH_test_cols = pd.DataFrame(onehotencoder.transform(X_test[col_object]))

In [71]:
# One-hot encoding removed index; put it back
OH_train_cols.index = X_train.index
OH_test_cols.index = X_test.index

In [81]:
for col in col_object:
    print('Column Name:{} / No of Unique: {}'.format(col,X_train[col].nunique()))

Column Name:Type / No of Unique: 3
Column Name:Method / No of Unique: 5
Column Name:Regionname / No of Unique: 8


In [82]:
# Remove categorical columns (will replace with one-hot encoding)
num_col_train = X_train.drop(col_object,axis=1)
num_col_test = X_test.drop(col_object,axis=1)

In [83]:
# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_col_train,OH_train_cols],axis=1)
OH_X_test = pd.concat([num_col_test,OH_test_cols],axis=1)

In [85]:
OH_X_train.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,0,...,6,7,8,9,10,11,12,13,14,15
12167,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6524,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8413,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2919,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6043,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [86]:
print('MAE Approach 3:')
print(score_mae(OH_X_train,OH_X_test,y_train,y_test))

MAE Approach 3:
166089.4893009678
