# Tutorial 2 - SVM Regression

We will predict the price (`price` column) of an AirBNB listing in Boston given a number of features about the listing.

**Therefore, our unit of analysis is an AIRBNB LISTING**

# Setup

In [None]:
# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so results that depend on it
# are repeatable when the notebook is run from top to bottom.
np.random.seed(42)


# Get the data

In [None]:
# Load the listings; the "price" column is the value we will predict.

airbnb = pd.read_csv("airbnb.csv")
# Preview the first rows to sanity-check the load.
airbnb.head()

# Split the data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the listings as the test set.
# random_state pins the shuffle to this call, so the partition stays the same
# when the cell is re-run or when cells are executed out of order (relying on
# the global NumPy seed alone only works for a single top-to-bottom run).
train_set, test_set = train_test_split(airbnb, test_size=0.3, random_state=42)

## Drop the variables we can't use in this tutorial

In [None]:
# 'price_gte_150' and 'price_category' are targets for classification tasks,
# so they are removed from both splits before this regression tutorial.
# NOTE(review): presumably both are derived from price — confirm against the data dictionary.

train = train_set.drop(columns=['price_gte_150', 'price_category'])
test = test_set.drop(columns=['price_gte_150', 'price_category'])

### Be careful: we haven't separated the target column yet

## Check the missing values

In [None]:
# Count missing values per column in the training split
# (this inspects train_set, i.e. the frame before the classification columns were dropped).
train_set.isna().sum()

In [None]:
# Count missing values per column in the test split
# (this inspects test_set, i.e. the frame before the classification columns were dropped).
test_set.isna().sum()

# Data Prep

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Separate the target variable (we don't want to transform it)

In [None]:
# Keep the target as a 1-D Series. scikit-learn regressors expect y with shape
# (n_samples,); a one-column DataFrame makes fit() emit a DataConversionWarning.
train_y = train['price']
test_y = test['price']

# Every remaining column is a model input.
train_inputs = train.drop(columns=['price'])
test_inputs = test.drop(columns=['price'])

##  Identify the numerical and categorical columns

### Option 1: Manually

### Option 2: Programmatically

In [None]:
# Show each input column's dtype; the dtype-based column selection below relies on these.
train_inputs.dtypes

In [None]:
# Numerical inputs: every column with a numeric dtype
numeric_columns = train_inputs.select_dtypes(include='number').columns.tolist()

# Categorical inputs: every column stored with the object dtype
categorical_columns = train_inputs.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Identify the binary columns so they can be handled separately from the other
# numeric columns (they get their own transformer in the pipeline section below,
# which imputes missing values but applies no scaling).
binary_columns = ['host_is_superhost', 'host_identity_verified']

In [None]:
# Be careful: the numerical columns already include the binary columns,
# so we need to take the binary columns out of the numerical list.
# Filtering (instead of list.remove) keeps this cell safe to re-run: a column
# that has already been removed, or was never numeric, no longer raises ValueError.

numeric_columns = [col for col in numeric_columns if col not in binary_columns]

In [None]:
# Display the binary column list for a visual check.
binary_columns

In [None]:
# Display the numeric column list (binary columns should no longer appear here).
numeric_columns

In [None]:
# Display the categorical column list for a visual check.
categorical_columns

# Pipeline

In [None]:
# Numeric inputs: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [None]:
# Categorical inputs: label missing values as 'unknown', then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories that were not seen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Binary inputs: only fill gaps with the most frequent value; no scaling or encoding.
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

In [None]:
# Route each group of columns to its own transformer.
# remainder='passthrough' is optional: it keeps any column that is not listed
# in one of the three groups unchanged instead of dropping it.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns),
    ],
    remainder='passthrough',
)

# Transform: fit_transform() for TRAIN

In [None]:
# Fit the preprocessor on the training inputs and transform them in one step.
# The fitted preprocessor is reused for the test data, which is only transformed.
train_x = preprocessor.fit_transform(train_inputs)

# Display the transformed training matrix.
train_x

In [None]:
# Dimensions of the transformed training inputs.
train_x.shape

# Transform: transform() for TEST

In [None]:
# Transform the test data with the preprocessor that was fitted on the training data
# (transform only — no fitting on the test set).
test_x = preprocessor.transform(test_inputs)

# Display the transformed test matrix.
test_x

In [None]:
# Dimensions of the transformed test inputs.
test_x.shape

# Calculate the baseline

In [None]:
from sklearn.dummy import DummyRegressor

# Baseline model: uses the "mean" strategy, giving a reference RMSE
# that the SVR models below can be compared against.
dummy_regr = DummyRegressor(strategy="mean")

dummy_regr.fit(train_x, train_y)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Baseline Train RMSE: score the dummy model on the data it was fitted on
dummy_train_pred = dummy_regr.predict(train_x)
baseline_train_mse = mean_squared_error(y_true=train_y, y_pred=dummy_train_pred)
baseline_train_rmse = np.sqrt(baseline_train_mse)  # square root of the MSE

print('Baseline Train RMSE: {}'.format(baseline_train_rmse))

In [None]:
# Baseline Test RMSE: score the dummy model on the held-out test data
dummy_test_pred = dummy_regr.predict(test_x)
baseline_test_mse = mean_squared_error(y_true=test_y, y_pred=dummy_test_pred)
baseline_test_rmse = np.sqrt(baseline_test_mse)  # square root of the MSE

print('Baseline Test RMSE: {}'.format(baseline_test_rmse))

# SVR(kernel='linear')

In [None]:
from sklearn.svm import SVR

# Linear-kernel SVR. max_iter puts a hard limit on solver iterations;
# the poly and rbf models later in this notebook do not set it.
svm_reg = SVR(kernel="linear", C=100, epsilon=0.5, max_iter=10000)

svm_reg.fit(train_x, train_y)

In [None]:
# Train RMSE for the linear-kernel SVR
svm_train_pred = svm_reg.predict(train_x)
train_mse = mean_squared_error(y_true=train_y, y_pred=svm_train_pred)
train_rmse = np.sqrt(train_mse)  # square root of the MSE

print('Train RMSE: {}'.format(train_rmse))

In [None]:
# Test RMSE for the linear-kernel SVR
svm_test_pred = svm_reg.predict(test_x)
test_mse = mean_squared_error(y_true=test_y, y_pred=svm_test_pred)
test_rmse = np.sqrt(test_mse)  # square root of the MSE

print('Test RMSE: {}'.format(test_rmse))

# SVR(kernel='poly')

In [None]:
# Degree-3 polynomial-kernel SVR, with the same C and epsilon as the linear model.
svm_poly_reg = SVR(kernel="poly", degree=3, C=100, epsilon=0.5)

svm_poly_reg.fit(train_x, train_y)

In [None]:
# Train RMSE for the polynomial-kernel SVR
svm_train_pred = svm_poly_reg.predict(train_x)
train_mse = mean_squared_error(y_true=train_y, y_pred=svm_train_pred)
train_rmse = np.sqrt(train_mse)  # square root of the MSE

print('Train RMSE: {}'.format(train_rmse))

In [None]:
# Test RMSE for the polynomial-kernel SVR
svm_test_pred = svm_poly_reg.predict(test_x)
test_mse = mean_squared_error(y_true=test_y, y_pred=svm_test_pred)
test_rmse = np.sqrt(test_mse)  # square root of the MSE

print('Test RMSE: {}'.format(test_rmse))

# SVR(kernel='rbf')

In [None]:
# RBF-kernel SVR. Uses a much smaller epsilon (0.01) than the linear and poly
# models (0.5) and gamma='scale'.
svm_rbf_reg = SVR(kernel="rbf", C=100, epsilon=0.01, gamma='scale')

svm_rbf_reg.fit(train_x, train_y)

In [None]:
# Train RMSE for the RBF-kernel SVR
svm_train_pred = svm_rbf_reg.predict(train_x)
train_mse = mean_squared_error(y_true=train_y, y_pred=svm_train_pred)
train_rmse = np.sqrt(train_mse)  # square root of the MSE

print('Train RMSE: {}'.format(train_rmse))

In [None]:
# Test RMSE for the RBF-kernel SVR
svm_test_pred = svm_rbf_reg.predict(test_x)
test_mse = mean_squared_error(y_true=test_y, y_pred=svm_test_pred)
test_rmse = np.sqrt(test_mse)  # square root of the MSE

print('Test RMSE: {}'.format(test_rmse))

# Cross Validation

We perform k-fold cross-validation.<br>
See this link for more info: https://scikit-learn.org/stable/modules/cross_validation.html

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# 5-fold cross-validation of the RBF SVR on the training data only.
# The scorer is *negative* MSE, so the sign is flipped before taking the square root below.
# return_estimator=True keeps the estimator fitted on each fold in scores['estimator'];
# those are used later to predict on the test set.
scores = cross_validate(svm_rbf_reg, train_x, 
                        train_y, cv=5, scoring="neg_mean_squared_error",
                        return_estimator=True)

In [None]:
# Display the full cross-validation result (includes 'test_score' and 'estimator').
scores

In [None]:
# One score per fold. Because scoring="neg_mean_squared_error",
# each value is the negated MSE (negative, and in squared units).

scores['test_score']

In [None]:
# Flip the sign to recover the MSE, then take the square root to get an RMSE per fold.
rmse_scores = np.sqrt(-scores['test_score'])

rmse_scores

In [None]:
# Average RMSE across the folds and its standard deviation.
rmse_scores.mean(), rmse_scores.std()

In [None]:
# How can you make predictions: use each estimator kept by cross_validate.
# One column per fold, named estimator_0, estimator_1, ... Iterating over
# scores['estimator'] keeps this in step with whatever `cv` value was used,
# instead of hard-coding five folds.

test_predictions = pd.DataFrame({
    f'estimator_{fold}': fold_estimator.predict(test_x)
    for fold, fold_estimator in enumerate(scores['estimator'])
})

In [None]:
# Display the per-estimator test predictions.
test_predictions

In [None]:
# Row-wise mean across the columns of the frame: one averaged prediction per test listing.

test_predictions['avg_prediction'] = test_predictions.mean(axis=1)

test_predictions

In [None]:
# Test RMSE of the averaged cross-validation predictions

test_mse = mean_squared_error(y_true=test_y, y_pred=test_predictions['avg_prediction'])
test_rmse = np.sqrt(test_mse)  # square root of the MSE

print('Test RMSE: {}'.format(test_rmse))