# INTRO TO MACHINE LEARNING

In [1]:
import pandas as pd
import sys

In [2]:
def format_with_commas(x):
    """Render a number with comma thousands separators and two decimals."""
    return f"{x:,.2f}"


# Route every float that pandas displays through the formatter above.
pd.set_option("display.float_format", format_with_commas)

In [3]:
import platform

# Pick the dataset location that matches the current operating system.
def resolve_melbourne_path(system_name):
    """Return the melb_data.csv location configured for ``system_name``.

    Parameters
    ----------
    system_name : str
        Operating-system name as returned by ``platform.system()``.

    Returns
    -------
    str
        Path to the Melbourne housing CSV for that operating system.

    Raises
    ------
    RuntimeError
        If no path is configured for ``system_name``. Failing here keeps the
        problem visible instead of leaving ``melbourne_file_path`` undefined
        and crashing later in ``pd.read_csv`` with a NameError.
    """
    if system_name == "Windows":
        return r"D:\kaggle\machine_learning\data\melb_data.csv"
    if system_name in ("Darwin", "Linux"):
        return "/Volumes/Memory/kaggle/machine_learning/data/melb_data.csv"
    raise RuntimeError(f"Sistema operativo no compatible: {system_name!r}")


melbourne_file_path = resolve_melbourne_path(platform.system())

In [4]:
melbourne_data = pd.read_csv(melbourne_file_path)

In [5]:
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.94,1075684.08,10.14,3105.3,2.91,1.53,1.61,558.42,151.97,1964.68,-37.81,145.0,7454.42
std,0.96,639310.72,5.87,90.68,0.97,0.69,0.96,3990.67,541.01,37.27,0.08,0.1,4378.58
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18,144.43,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.86,144.93,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.8,145.0,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.76,145.06,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.41,145.53,21650.0


### Print the list of columns in the dataset

In [6]:
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [7]:
# Drop every row that has at least one missing value. The result is assigned
# back to melbourne_data, so the unfiltered frame is no longer available after
# this cell and re-running the cell operates on the already-filtered data.
melbourne_data = melbourne_data.dropna(axis=0)

In [8]:
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.93,1068828.2,9.75,3101.95,2.9,1.58,1.57,471.01,141.57,1964.08,-37.81,144.99,7435.49
std,0.97,675156.43,5.61,86.42,0.97,0.71,0.93,897.45,90.83,38.11,0.08,0.1,4337.7
min,1.0,131000.0,0.0,3000.0,0.0,1.0,0.0,0.0,0.0,1196.0,-38.16,144.54,389.0
25%,2.0,620000.0,5.9,3044.0,2.0,1.0,1.0,152.0,91.0,1940.0,-37.86,144.93,4383.75
50%,3.0,880000.0,9.0,3081.0,3.0,1.0,1.0,373.0,124.0,1970.0,-37.8,145.0,6567.0
75%,4.0,1325000.0,12.4,3147.0,3.0,2.0,2.0,628.0,170.0,2000.0,-37.76,145.05,10175.0
max,8.0,9000000.0,47.4,3977.0,9.0,8.0,10.0,37000.0,3112.0,2018.0,-37.46,145.53,21650.0


# Selecting The Prediction Target of the Model - By convention, the prediction target is called " y "

In [9]:
y = melbourne_data.Price

# Choosing the "Features" of the Model - By convention, this data is called " X "

In [10]:
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

In [11]:
X = melbourne_data[melbourne_features]

In [12]:
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.93,1.58,471.01,-37.81,144.99
std,0.97,0.71,897.45,0.08,0.1
min,1.0,1.0,0.0,-38.16,144.54
25%,2.0,1.0,152.0,-37.86,144.93
50%,3.0,1.0,373.0,-37.8,145.0
75%,4.0,2.0,628.0,-37.76,145.05
max,8.0,8.0,37000.0,-37.46,145.53


In [13]:
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1,156,-37.81,144.99
2,3,2,134,-37.81,144.99
4,4,1,120,-37.81,144.99
6,3,2,245,-37.8,145.0
7,2,1,256,-37.81,145.0


# Decision Tree Regressor Model

# Building the Model with Scikit-Learn Python Library

When coding, this library is written as sklearn

### The steps to building and using a model are:

Define: What type of model will it be? A decision tree? Some other type of model? Some other parameters of the model type are specified too.

Fit: Capture patterns from provided data. This is the heart of modeling.

Predict: Just what it sounds like

Evaluate: Determine how accurate the model's predictions are.

# Define and Fit the Model

In [14]:
from sklearn.tree import DecisionTreeRegressor

melbourne_model = DecisionTreeRegressor(random_state=1)

melbourne_model.fit(X, y)

### Making test predictions

In [15]:
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(melbourne_model.predict(X.head()))

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2         1       156     -37.81      144.99
2      3         2       134     -37.81      144.99
4      4         1       120     -37.81      144.99
6      3         2       245     -37.80      145.00
7      2         1       256     -37.81      145.00
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]


### Calculating the Mean Absolute Error MAE for the model

In [16]:
from sklearn.metrics import mean_absolute_error

# Predict on X, the same rows melbourne_model was fitted on, so the MAE
# printed below is an in-sample score rather than an estimate for unseen data.
predicted_home_prices = melbourne_model.predict(X)
print(mean_absolute_error(y, predicted_home_prices))

1115.7467183128902


### Split the data into two - one for training the model and another for validating the model MAE

In [17]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation parts (seeded, so repeatable)
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)
# Define the model; the seed keeps the fitted tree, and therefore the MAE
# printed below, the same on every re-run of the notebook
melbourne_model = DecisionTreeRegressor(random_state=1)
# Fit the model on the training part only
melbourne_model.fit(train_X, train_y)

# Get predicted prices on the validation (val) data the model has not seen
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

270507.6173875618


# Underfitting and Overfitting

In [18]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, tain_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes``.

    Parameters
    ----------
    max_leaf_nodes : int
        Upper bound on the number of leaves of the fitted tree.
    train_X, tain_y
        Training features and training targets. ``tain_y`` keeps its original
        (misspelled) name so existing keyword callers continue to work.
    val_X, val_y
        Validation features and the true targets to score against.

    Returns
    -------
    float
        Mean absolute error of the predictions on ``val_X``.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    # Fit on the targets that were passed in. The previous body used the
    # notebook-level ``train_y`` here and silently ignored this argument.
    model.fit(train_X, tain_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Reload the raw CSV; this replaces the melbourne_data frame that an earlier
# cell had already filtered.
melbourne_data = pd.read_csv(melbourne_file_path)

# Keep only the rows without any missing value, under a separate name.
filtered_melbourne_data = melbourne_data.dropna(axis=0)

# Feature list for this section: adds BuildingArea and YearBuilt to the five
# columns used in the first model.
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
y = filtered_melbourne_data.Price
X = filtered_melbourne_data[melbourne_features]

# Seeded split; overwrites the train_X/val_X/train_y/val_y from the earlier cell.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

### Compare the MAE with differing values of max_leaf_nodes through a for-loop

In [20]:
# Tree sizes to compare, and the validation MAE obtained for each of them
# (mae_values[i] corresponds to max_leaf_nodes_candidates[i]).
max_leaf_nodes_candidates = [5, 50, 500, 5000]
mae_values = []
for max_leaf_nodes in max_leaf_nodes_candidates:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    mae_values.append(my_mae)
    # %d truncates the MAE to an integer for display; mae_values keeps the full value
    print("Max leaf nodes: %d \t\t Mean Absolute Error: %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5 		 Mean Absolute Error: 347380
Max leaf nodes: 50 		 Mean Absolute Error: 258171
Max leaf nodes: 500 		 Mean Absolute Error: 243495
Max leaf nodes: 5000 		 Mean Absolute Error: 254983


In [21]:
import numpy as np
best_mae_index = np.argmin(mae_values)
best_mae_index

2

In [22]:
best_tree_size = max_leaf_nodes_candidates[best_mae_index]
best_tree_size

500

# Random Forest Model

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split

def format_with_commas(x):
    """Format ``x`` with comma thousands separators and two decimal places."""
    return format(x, ',.2f')
# Apply the formatter to every float pandas displays.
pd.set_option('display.float_format', format_with_commas)

melbourne_data = pd.read_csv(melbourne_file_path)

filtered_melbourne_data = melbourne_data.dropna(axis=0)

melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
y = filtered_melbourne_data.Price
X = filtered_melbourne_data[melbourne_features]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def get_forest_mae(train_X, val_X, tain_y, val_y):
    """Return the validation MAE of a random forest fitted on the given data.

    Parameters
    ----------
    train_X, tain_y
        Training features and training targets. ``tain_y`` keeps its original
        (misspelled) name so existing keyword callers continue to work.
    val_X, val_y
        Validation features and the true targets to score against.

    Returns
    -------
    float
        Mean absolute error of the forest's predictions on ``val_X``.
    """
    forest_model = RandomForestRegressor(random_state=1)
    # Fit on the targets that were passed in. The previous body used the
    # notebook-level ``train_y`` here and silently ignored this argument.
    forest_model.fit(train_X, tain_y)
    melb_preds = forest_model.predict(val_X)
    return mean_absolute_error(val_y, melb_preds)
    

In [25]:
result = get_forest_mae(train_X, val_X, train_y, val_y)
result_formatted = format_with_commas(result)
print(result_formatted)

191,669.75


Intro to Machine Learning Course ends here

# INTERMEDIATE MACHINE LEARNING

In [26]:
import platform

# Check the operating system and pick the matching dataset location
if platform.system() == "Windows":
    melbourne_file_path = r"D:\kaggle\machine_learning\data\melb_data.csv"
elif platform.system() in ("Darwin", "Linux"):
    melbourne_file_path = "/Volumes/Memory/kaggle/machine_learning/data/melb_data.csv"
else:
    # No path is configured for any other operating system. Stop here rather
    # than continue with melbourne_file_path undefined (or stale from an
    # earlier cell), which would only fail or mislead later in pd.read_csv.
    raise RuntimeError("Sistema operativo no compatible")

## Dealing with missing values (Three Approaches)
Drop Columns with Missing Values, Imputation, An Extension To Imputation

### 1-) A Simple Option: Drop Columns with Missing Values

In [27]:
import pandas as pd;
from sklearn.model_selection import train_test_split

data = pd.read_csv(melbourne_file_path)
y = data.Price

# To keep things simple, we'll use only numerical predictors
melb_predictors = data.drop(['Price'], axis = 1)
X = melb_predictors.select_dtypes(exclude = ['object'])
# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 0)

In [28]:
# Shape of training data (num_rows, num_columns)
print(X_train.shape)

# Numbers of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(10864, 12)
Car               49
BuildingArea    5156
YearBuilt       4307
dtype: int64


### Define Function to Measure Quality of Each Approach
We define a function score_dataset() to compare different approaches to dealing with missing values. This function reports the mean absolute error (MAE) from a random forest model.

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=10, random_state=0):
    """Fit a random forest and return its mean absolute error on validation data.

    Parameters
    ----------
    X_train, y_train
        Features and targets the forest is fitted on.
    X_valid, y_valid
        Features to predict on and the true targets to score against.
    n_estimators : int, default 10
        Number of trees; the default is the value this section has always used.
    random_state : int, default 0
        Seed passed to the forest so repeated calls give the same score.

    Returns
    -------
    float
        Mean absolute error of the predictions on ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

### Score from Approach 1 (Drop Columns with Missing Values)
Since we are working with both training and validation sets, we are careful to drop the same columns in both DataFrames.

In [30]:
# Names of the training columns that contain at least one missing value
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both splits so the two frames keep the same schema
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

In [31]:
print("MAE from approach 1 (Drop columns  with missing values):")
mae = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
formatted_mae = f"{mae:,.2f}"
print(formatted_mae)

MAE from approach 1 (Drop columns  with missing values):
183,550.22


### 2-) A Better Option: Imputation
Next, we use "SimpleImputer" to replace missing values with the mean value along each column.

In [32]:
from sklearn.impute import SimpleImputer

# Imputation: the fill values are learned from the training split only and
# then reused unchanged for the validation split.
my_imputer = SimpleImputer()

# The imputer returns bare arrays. Rebuild the DataFrames with the original
# column names AND the original row index, so the imputed frames stay aligned
# with y_train / y_valid (whose index still comes from the shuffled split).
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                               columns=X_train.columns, index=X_train.index)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                               columns=X_valid.columns, index=X_valid.index)

In [33]:
print("MAE from approach 2 (Imputation):")
mae = score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)
formatted_mae = f"{mae:,.2f}"
print(formatted_mae)

MAE from approach 2 (Imputation):
178,166.46


We see that Approach 2 has lower MAE than Approach 1, so Approach 2 performed better on this dataset.

### 3-) An Extension To Imputation

Next, we impute the missing values, while also keeping track of which values were imputed.

In [34]:
# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed. The column list comes from
# the training data, so both splits receive exactly the same indicator columns.
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Imputation: fit on the training split only, reuse for the validation split
my_imputer = SimpleImputer()

# The imputer returns bare arrays. Rebuild the DataFrames with the original
# column names AND the original row index, so the imputed frames stay aligned
# with y_train / y_valid.
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus),
                                    columns=X_train_plus.columns, index=X_train_plus.index)
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus),
                                    columns=X_valid_plus.columns, index=X_valid_plus.index)

In [35]:
print("MAE from approach 3 (An Extension to Imputation):")
mae = score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid)
formatted_mae = f"{mae:,.2f}"
print(formatted_mae)

MAE from approach 3 (An Extension to Imputation):
178,927.50


## Categorical Variable

## Dealing with categorical variables (Three Approaches)
Drop Categorical Variables, Ordinal Encoding, One-Hot Encoding

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv(melbourne_file_path)

# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis = 1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state = 0)

# Drop columns with missing values (simplest approach).
# The column list is taken from the training split and applied to both splits.
# The frames are reassigned rather than modified with inplace=True, so the
# objects returned by train_test_split are never mutated in place.
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

# Cardinality means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
low_cardinality_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [37]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182,1,1,0,-37.86,144.99,13240
6524,h,SA,Western Metropolitan,2,8.0,3016,2,2,193,-37.86,144.9,6380
8413,h,S,Western Metropolitan,3,12.6,3020,3,1,555,-37.8,144.82,3755
2919,u,SP,Northern Metropolitan,3,13.0,3046,3,1,265,-37.71,144.92,8870
6043,h,S,Western Metropolitan,3,13.3,3020,3,1,673,-37.76,144.83,4217


In [38]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['Type', 'Method', 'Regionname']


### Define Function to Measure Quality of Each Approach
We define a function score_dataset() to compare different approaches to dealing with categorical variables. This function reports the mean absolute error (MAE) from a random forest model.

In [39]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest and return its mean absolute error on validation data.

    Parameters
    ----------
    X_train, y_train
        Features and targets the forest is fitted on.
    X_valid, y_valid
        Features to predict on and the true targets to score against.
    n_estimators : int, default 100
        Number of trees; the default is the value this section has always used.
    random_state : int, default 0
        Seed passed to the forest so repeated calls give the same score.

    Returns
    -------
    float
        Mean absolute error of the predictions on ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

### 1-) Score from Approach 1 (Drop Categorical Variables)
The easiest approach to dealing with categorical variables is to simply remove them from the dataset. This approach will only work well if the columns did not contain useful information.

In [40]:
drop_X_train = X_train.select_dtypes(exclude = ['object'])
drop_X_valid = X_valid.select_dtypes(exclude = ['object'])

### Drop the problematic categorical columns

In [41]:
# Categorical columns in the training data
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded: every value seen in the
# validation split also occurs in the training split
good_label_cols = [col for col in object_cols if
                   set(X_valid[col]).issubset(set(X_train[col]))]

# Problematic columns that will be dropped from the dataset.
# Built with a comprehension (not a set difference) so the columns keep the
# order they have in X_train and the printed list is the same on every run.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be ordinal encoded: ['Type', 'Method', 'Regionname']

Categorical columns that will be dropped from the dataset: []


In [42]:
print("MAE from approach 1 (Drop categorical variables):")
mae = score_dataset(drop_X_train, drop_X_valid, y_train, y_valid)
formatted_mae = f"{mae:,.2f}"
print(formatted_mae)

MAE from approach 1 (Drop categorical variables):
175,703.48


### 2-) Score from Approach 2 (Ordinal Encoding)
Ordinal encoding assigns each unique value to a different integer. Scikit-learn has an OrdinalEncoder class that can be used to get ordinal encodings. We fit a single encoder on all the categorical columns of the training data at once, and reuse the fitted encoder to transform the validation data.

In [43]:
from sklearn.preprocessing import OrdinalEncoder

# Work on new frames so the original data stays unchanged. Dropping
# bad_label_cols here removes the categorical columns that cannot be safely
# encoded, so no un-encoded text column is handed to the model. (drop returns
# a new DataFrame; when bad_label_cols is empty this is simply a copy.)
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

# Apply ordinal encoder to the categorical columns: the category-to-integer
# mapping is learned on the training data and reused for the validation data
ordinal_encoder = OrdinalEncoder()
label_X_train[good_label_cols] = ordinal_encoder.fit_transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])

In [44]:
label_X_train[object_cols].head()

Unnamed: 0,Type,Method,Regionname
12167,2.0,1.0,5.0
6524,0.0,2.0,6.0
8413,0.0,1.0,6.0
2919,2.0,3.0,2.0
6043,0.0,1.0,6.0


In [45]:
print("MAE from approach 2 (Ordinal Encoding):")
mae = score_dataset(label_X_train, label_X_valid, y_train, y_valid)
formatted_mae = f"{mae:,.2f}"
print(formatted_mae)

MAE from approach 2 (Ordinal Encoding):
165,936.41


### 3-) One-Hot Encoding

One-hot encoding creates new columns indicating the presence (or absence) of each possible value in the original data.

One-hot encoding generally does not perform well if the categorical variable takes on a large number of values (i.e., you generally won't use it for variables taking more than 15 different values).

We use the OneHotEncoder class from scikit-learn to get one-hot encodings. There are a number of parameters that can be used to customize its behavior.

We set handle_unknown='ignore' to avoid errors when the validation data contains classes that aren't represented in the training data, and setting sparse_output=False (named sparse in scikit-learn versions before 1.2) ensures that the encoded columns are returned as a numpy array (instead of a sparse matrix).

In [46]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to the categorical columns of X_train. The same list
# (object_cols) is used for encoding and for dropping below, so the two steps
# cannot drift apart.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output = False)
OH_train_values = OH_encoder.fit_transform(X_train[object_cols])
OH_valid_values = OH_encoder.transform(X_valid[object_cols])

# The encoder returns bare arrays: restore the row index so the concat below
# aligns row by row, and give the new columns descriptive string names taken
# from the fitted encoder instead of anonymous integers.
OH_feature_names = OH_encoder.get_feature_names_out()
OH_cols_train = pd.DataFrame(OH_train_values, columns=OH_feature_names, index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_valid_values, columns=OH_feature_names, index=X_valid.index)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features. Every column label is
# already a string here, so no extra conversion of the column names is needed.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [47]:
print("MAE from approach 3 (One-Hot Encoding):")
mae = score_dataset(OH_X_train, OH_X_valid, y_train, y_valid)
formatted_mae = f"{mae:,.2f}"
print(formatted_mae)

MAE from approach 3 (One-Hot Encoding):
166,089.49


## Pipelines

### Reading and Splitting the Source Data, Selecting Low-Cardinality Columns, and Dividing It into Numerical & Categorical Columns

In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
data = pd.read_csv(melbourne_file_path)

# Separate target from predictors
y = data.Price
X = data.drop(['Price'], axis=1)

# Divide data into training and validation subsets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].nunique() < 10 and 
                        X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

### Sneak data preview

In [49]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182,1,1,1.0,0,,1940.0,-37.86,144.99,13240
6524,h,SA,Western Metropolitan,2,8.0,3016,2,2,1.0,193,,,-37.86,144.9,6380
8413,h,S,Western Metropolitan,3,12.6,3020,3,1,1.0,555,,,-37.8,144.82,3755
2919,u,SP,Northern Metropolitan,3,13.0,3046,3,1,1.0,265,,1995.0,-37.71,144.92,8870
6043,h,S,Western Metropolitan,3,13.3,3020,3,1,2.0,673,673.0,1970.0,-37.76,144.83,4217


### Step 1: Define Preprocessing Steps

Similar to how a pipeline bundles together preprocessing and modeling steps, we use the ColumnTransformer class to bundle together different preprocessing steps. The code below:

imputes missing values in numerical data, and

imputes missing values and applies a one-hot encoding to categorical data.

In [56]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data: fill missing values with a constant.
# NOTE(review): no fill_value is passed, so the imputer's default constant is
# used (presumably 0 for numeric columns, per the scikit-learn docs) — confirm
# that this is the intended replacement value.
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: two steps run in order — first fill
# missing values with the most frequent category, then one-hot encode.
# handle_unknown='ignore' keeps categories that were not present when fitting
# from raising an error at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data: each transformer is
# applied only to the column list given next to it (numerical_cols and
# categorical_cols come from the data-preparation cell above).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

## Step 2: Define the Model

### Define the Random Forest Model
We define the model that will be the final step of the pipeline: a random forest regressor (RandomForestRegressor). Its quality is measured further below with the mean absolute error (MAE) on the validation data.

In [65]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)

## Step 3: Create and Evaluate the Pipeline

Finally, we use the Pipeline class to define a pipeline that bundles the preprocessing and modeling steps. There are a few important things to notice:

With the pipeline, we preprocess the training data and fit the model in a single line of code. (In contrast, without a pipeline, we have to do imputation, one-hot encoding, and model training in separate steps. This becomes especially messy if we have to deal with both numerical and categorical variables!)

With the pipeline, we supply the unprocessed features in X_valid to the predict() command, and the pipeline automatically preprocesses the features before generating predictions. (However, without a pipeline, we have to remember to preprocess the validation data before making predictions.)

In [67]:
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model: mean absolute error of the pipeline's validation predictions
score = mean_absolute_error(y_valid, preds)
# The previous f-string embedded the quotes and comma of a print('MAE:', ...)
# call inside the text itself, so the output read "'MAE:', 160,679.19".
print(f"MAE: {score:,.2f}")

'MAE:', 160,679.19
