In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

## Encoding

### 1. LabelEncoder

Encode target values from strings to integer labels between 0 and n_classes - 1.

According to the scikit-learn documentation, this transformer should be used to encode the target values `y`, not the input `X`. `OneHotEncoder` now supports string input, so label encoding is no longer needed before one-hot encoding.

In [3]:
# Load the raw Ames data from CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer='ames_unprocessed_data.csv')
df.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Neighborhood,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,...,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,Fireplaces,GarageArea,PavedDrive,SalePrice
0,60,RL,65.0,8450,CollgCr,1Fam,2Story,7,5,2003,...,1710,1,0,2,1,3,0,548,Y,208500
1,20,RL,80.0,9600,Veenker,1Fam,1Story,6,8,1976,...,1262,0,1,2,0,3,1,460,Y,181500
2,60,RL,68.0,11250,CollgCr,1Fam,2Story,7,5,2001,...,1786,1,0,2,1,3,1,608,Y,223500
3,70,RL,60.0,9550,Crawfor,1Fam,2Story,7,5,1915,...,1717,1,0,1,0,3,1,642,Y,140000
4,60,RL,84.0,14260,NoRidge,1Fam,2Story,8,5,2000,...,2198,1,0,2,1,4,1,836,Y,250000


In [4]:
from sklearn.preprocessing import LabelEncoder

# Replace missing LotFrontage values with 0
df['LotFrontage'] = df['LotFrontage'].fillna(0)

# Boolean Series indexed by column name: True where the dtype is object
categorical_mask = (df.dtypes == 'object')

# Names of the object-dtype (categorical) columns
categorical_columns = categorical_mask[categorical_mask].index.tolist()

# A single encoder instance, refit on every column in turn
# (after the loop it only holds the classes of the last column)
le = LabelEncoder()

# Overwrite each categorical column with its integer label codes
for column_name in categorical_columns:
    df[column_name] = le.fit_transform(df[column_name])

print(df[categorical_columns].head())

   MSZoning  Neighborhood  BldgType  HouseStyle  PavedDrive
0         3             5         0           5           2
1         3            24         0           2           2
2         3             5         0           5           2
3         3             6         0           5           2
4         3            15         0           5           2


### 2. OneHotEncoder

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# Reload the raw data so this section starts from the unencoded frame
df = pd.read_csv('ames_unprocessed_data.csv')

# Replace missing LotFrontage values with 0
df['LotFrontage'] = df['LotFrontage'].fillna(0)

# Boolean Series indexed by column name: True where the dtype is object
categorical_mask = (df.dtypes == 'object')

# Names of the categorical columns, and of every remaining column
categorical_columns = categorical_mask[categorical_mask].index.tolist()
passthrough_columns = df.columns[~categorical_mask].tolist()

# For each categorical column, collect its distinct values in order of appearance
unique_list = []
for column_name in categorical_columns:
    unique_list.append(df[column_name].unique().tolist())

# One-hot encoder with the category lists fixed up front: ohe
ohe = OneHotEncoder(categories=unique_list)

# Column transformer: one-hot encode the categorical columns and
# pass all other columns through unchanged
encode_step = (ohe, categorical_columns)
passthrough_step = ('passthrough', passthrough_columns)
preprocess = make_column_transformer(encode_step, passthrough_step)

# Fit and apply the transformer - the result is no longer a DataFrame: df_encoded
df_encoded = preprocess.fit_transform(df)


# Print the shape of the original DataFrame
print(df.shape)

(1460, 21)


### 3. DictVectorizer

Using a **DictVectorizer** on a DataFrame that has been converted to a list of dictionaries performs label encoding and one-hot encoding in a single step.

In [8]:
from sklearn.feature_extraction import DictVectorizer

# Turn df into a list with one {column: value} dict per row: df_dict
df_dict = df.to_dict(orient="records")

# Vectorizer configured to return a dense array instead of a sparse matrix: dv
dv = DictVectorizer(sparse=False)

# Learn the feature mapping from the records, then encode them: df_encoded2
dv.fit(df_dict)
df_encoded2 = dv.transform(df_dict)

# Show the learned mapping from feature name to output column index
print(dv.vocabulary_)

{'MSSubClass': 23, 'MSZoning=RL': 27, 'LotFrontage': 22, 'LotArea': 21, 'Neighborhood=CollgCr': 34, 'BldgType=1Fam': 1, 'HouseStyle=2Story': 18, 'OverallQual': 55, 'OverallCond': 54, 'YearBuilt': 61, 'Remodeled': 59, 'GrLivArea': 11, 'BsmtFullBath': 6, 'BsmtHalfBath': 7, 'FullBath': 9, 'HalfBath': 12, 'BedroomAbvGr': 0, 'Fireplaces': 8, 'GarageArea': 10, 'PavedDrive=Y': 58, 'SalePrice': 60, 'Neighborhood=Veenker': 53, 'HouseStyle=1Story': 15, 'Neighborhood=Crawfor': 35, 'Neighborhood=NoRidge': 44, 'Neighborhood=Mitchel': 40, 'HouseStyle=1.5Fin': 13, 'Neighborhood=Somerst': 50, 'Neighborhood=NWAmes': 43, 'MSZoning=RM': 28, 'Neighborhood=OldTown': 46, 'Neighborhood=BrkSide': 32, 'BldgType=2fmCon': 2, 'HouseStyle=1.5Unf': 14, 'Neighborhood=Sawyer': 48, 'Neighborhood=NridgHt': 45, 'Neighborhood=NAmes': 41, 'BldgType=Duplex': 3, 'Neighborhood=SawyerW': 49, 'Neighborhood=IDOTRR': 38, 'PavedDrive=N': 56, 'Neighborhood=MeadowV': 39, 'BldgType=TwnhsE': 5, 'MSZoning=C (all)': 24, 'Neighborhood=E

### 4. Preprocessing within a pipeline

In [9]:
from sklearn.pipeline import Pipeline

# Split the frame into features (every column but the last) and the target
# (the last column). X is taken as an explicit copy so that the fill below
# modifies an independent frame rather than a slice of df; assigning into
# the slice is what triggers pandas' SettingWithCopyWarning.
X = df.iloc[:, :-1].copy()
y = df.iloc[:, -1]

# Fill LotFrontage missing values with 0
X["LotFrontage"] = X["LotFrontage"].fillna(0)

# Setup the pipeline steps: DictVectorizer encodes the row dicts in one step,
# then the XGBoost regressor is trained on the dense output
steps = [('ohe_onestep', DictVectorizer(sparse=False)),
         ('xgb_model', xgb.XGBRegressor())]

# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps)

# Fit the pipeline on the per-row dictionaries; as the last expression of the
# cell, the fitted pipeline is also displayed
xgb_pipeline.fit(X.to_dict("records"), y)



Pipeline(steps=[('ohe_onestep', DictVectorizer(sparse=False)),
                ('xgb_model', XGBRegressor())])

### 5. Cross-validating XGBoost model

In [10]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Fill LotFrontage missing values with 0. assign() returns a new frame, so X
# is rebound instead of being written to in place; this avoids the
# chained-assignment warning when X is still a slice of df.
X = X.assign(LotFrontage=X["LotFrontage"].fillna(0))

# Setup the pipeline steps: one-step dict encoding followed by a shallow
# (max_depth=2) XGBoost regressor with a squared-error objective
steps = [("ohe_onestep", DictVectorizer(sparse=False)),
         ("xgb_model", xgb.XGBRegressor(max_depth=2, objective='reg:squarederror'))]

# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps)

# Cross-validate the model: the scorer returns negated MSE, one value per fold
cross_val_scores = cross_val_score(
    xgb_pipeline,
    X.to_dict('records'),
    y,
    scoring='neg_mean_squared_error',
    cv=10,
)

# Undo the sign flip, take the square root per fold, and average the fold RMSEs
fold_rmse = np.sqrt(np.abs(cross_val_scores))

# Print the 10-fold RMSE
print("10-fold RMSE: ", np.mean(fold_rmse))

10-fold RMSE:  29867.603720688923
