In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the dataset from disk
data = pd.read_csv('melb_data.csv')

# Split the frame into the prediction target and the candidate predictors
y = data['Price']
X = data.drop(columns=['Price'])

# Hold out 20% of the rows for validation (fixed seed keeps the split repeatable)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Simplest missing-value strategy: discard every column that has a null in the
# training rows, and discard the same columns from the validation rows
cols_with_missing = X_train_full.columns[X_train_full.isnull().any()].tolist()
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

# "Cardinality" is the number of distinct values in a column.
# Keep object-dtype columns with fewer than 10 distinct values; the cutoff is
# convenient but arbitrary
low_cardinality_cols = [
    name for name in X_train_full.columns
    if X_train_full[name].dtype == "object" and X_train_full[name].nunique() < 10
]

# Columns stored as int64 or float64
numerical_cols = [
    name for name in X_train_full.columns
    if X_train_full[name].dtype in ['int64', 'float64']
]

# Restrict both splits to the selected columns, working on copies
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
print(low_cardinality_cols)

['Type', 'Method', 'Regionname']


In [4]:
# Boolean mask over the columns: True where the column has object dtype
s = X_train.dtypes.eq('object')
# Names of the columns flagged by the mask, as a plain list
object_cols = s.index[s].tolist()

print("Categorical variables:")
print(object_cols)

Categorical variables:
['Type', 'Method', 'Regionname']


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Used to compare different preprocessing approaches on equal terms: every
    call with the default arguments builds the same model configuration, so
    only the feature frames differ between runs.

    Parameters
    ----------
    X_train, X_valid : feature frames for the training and validation rows.
    y_train, y_valid : matching target values.
    n_estimators : number of trees passed to RandomForestRegressor
        (default 100).
    random_state : seed passed to RandomForestRegressor (default 0).

    Returns
    -------
    The mean absolute error of the model's predictions on X_valid.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [6]:
# Approach 1: discard every object-dtype column from both splits and score
# the model on whatever columns remain
drop_X_train, drop_X_valid = (
    frame.select_dtypes(exclude=['object']) for frame in (X_train, X_valid)
)
print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop categorical variables):
175703.48185157913


In [7]:
from sklearn.preprocessing import OrdinalEncoder

# Categorical (object-dtype) columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded: every value that appears in the
# validation rows also appears in the training rows, so the fitted encoder
# knows all of them
good_label_cols = [col for col in object_cols if
                   set(X_valid[col]).issubset(set(X_train[col]))]

# Problematic columns: validation holds categories the encoder never sees
# during fit, which makes transform() raise with the default settings.
# Built from object_cols so the order is deterministic.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

# Drop the problematic columns. drop() returns new frames, so the original
# data is left unchanged.
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Apply the ordinal encoder to the safely encodable columns only
ordinal_encoder = OrdinalEncoder()
if good_label_cols:  # nothing to fit when no categorical column qualifies
    label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
    label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])
print(label_X_train[good_label_cols])
print("MAE from Approach 2 (Ordinal Encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

       Type  Method  Regionname
12167   2.0     1.0         5.0
6524    0.0     2.0         6.0
8413    0.0     1.0         6.0
2919    2.0     3.0         2.0
6043    0.0     1.0         6.0
...     ...     ...         ...
13123   0.0     3.0         2.0
3264    0.0     1.0         0.0
9845    0.0     0.0         2.0
10799   0.0     1.0         2.0
2732    0.0     3.0         6.0

[10864 rows x 3 columns]
MAE from Approach 2 (Ordinal Encoding):
165936.40548390493


In [11]:
from sklearn.preprocessing import OneHotEncoder
# Number of unique entries in each column with categorical data
d = {col: X_train[col].nunique() for col in object_cols}
object_nunique = list(d.values())

# Print number of unique entries by column, in ascending order
print(sorted(d.items(), key=lambda x: x[1]))

# Columns that will be one-hot encoded (fewer than 10 distinct values)
low_cardinality_cols = [col for col in object_cols if d[col] < 10]
# Columns that will not be encoded (order follows object_cols)
high_cardinality_cols = [col for col in object_cols if col not in low_cardinality_cols]

# The dense-output switch was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the current
# spelling first and fall back to the old one on older releases.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# One-hot encode the low-cardinality columns. The encoder returns a bare
# array, so the row index and the generated feature names are attached when
# the DataFrame is built. fit_transform is evaluated before
# get_feature_names_out, so the encoder is already fitted when names are read.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[low_cardinality_cols]),
    index=X_train.index,
    columns=OH_encoder.get_feature_names_out(low_cardinality_cols),
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[low_cardinality_cols]),
    index=X_valid.index,
    columns=OH_encoder.get_feature_names_out(low_cardinality_cols),
)
print(OH_cols_valid)

[('Type', 3), ('Method', 5), ('Regionname', 8)]
       Type_h  Type_t  Type_u  Method_PI  Method_S  Method_SA  Method_SP   
8505      1.0     0.0     0.0        0.0       0.0        0.0        1.0  \
5523      1.0     0.0     0.0        0.0       1.0        0.0        0.0   
12852     1.0     0.0     0.0        0.0       0.0        0.0        1.0   
4818      0.0     1.0     0.0        1.0       0.0        0.0        0.0   
12812     1.0     0.0     0.0        0.0       1.0        0.0        0.0   
...       ...     ...     ...        ...       ...        ...        ...   
2664      0.0     0.0     1.0        1.0       0.0        0.0        0.0   
8513      1.0     0.0     0.0        0.0       0.0        0.0        1.0   
12922     1.0     0.0     0.0        0.0       1.0        0.0        0.0   
10761     1.0     0.0     0.0        0.0       1.0        0.0        0.0   
2110      1.0     0.0     0.0        0.0       0.0        0.0        1.0   

       Method_VB  Regionname_Eastern Me



In [9]:
# Numerical part of each split: the raw categorical columns are taken out
# because their one-hot encoded versions are added back below
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

# Place the one-hot encoded columns alongside the numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis="columns")
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis="columns")

# Convert every column label to a string
OH_X_train.columns = OH_X_train.columns.map(str)
OH_X_valid.columns = OH_X_valid.columns.map(str)

print("MAE from Approach 3 (One-Hot Encoding):")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))


MAE from Approach 3 (One-Hot Encoding):
166089.4893009678
