# Tutorial 1 - SVM Classification


We will perform two prediction tasks:
1) Whether the price of an AIRBNB listing is greater than or equal to $150 (`price_gte_150` column),<br>
2) What is the price category, among 4 categories, of an AIRBNB listing (`price_category` column)

**The unit of analysis is an AIRBNB LISTING**

# Setup

In [None]:
# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so that steps drawing from it
# (for example a split made without an explicit random_state) are repeatable
# when the notebook is run from top to bottom.
np.random.seed(42)


# Get the data

In [None]:
# Load the listings data. The binary task predicts the "price_gte_150" column.

airbnb = pd.read_csv("airbnb.csv")
# Preview the first rows
airbnb.head()

In [None]:
# Display the DataFrame (the notebook renders the value of a cell's last expression)
airbnb

# Split the data into train and test

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the listings for testing.
# An explicit random_state keeps the split identical when this cell is re-run
# on its own, instead of depending on how much of NumPy's global random stream
# (seeded in the Setup cell) has already been consumed.
train_set, test_set = train_test_split(airbnb, test_size=0.3, random_state=42)

## Drop the variables we can't use for the binary task

In [None]:
# 'price' and 'price_category' must not be used as inputs: the binary target
# (price_gte_150) is derived from the price, so keeping them would leak the answer.
price_derived_columns = ['price', 'price_category']

train = train_set.drop(columns=price_derived_columns)
test = test_set.drop(columns=price_derived_columns)

### Be careful: we haven't separated the target column yet

## Check the missing values

In [None]:
# Number of missing values per column in the training split
# (train_set is the full split, so it still contains 'price' and 'price_category')
train_set.isna().sum()

In [None]:
# Number of missing values per column in the test split
# (test_set is the full split, so it still contains 'price' and 'price_category')
test_set.isna().sum()

# Data Prep

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Separate the target variable (we don't want to transform it)

In [None]:
# Pull the target out as a 1-D Series. scikit-learn classifiers expect y with
# shape (n_samples,); a single-column DataFrame makes SVC.fit emit a
# DataConversionWarning ("A column-vector y was passed when a 1d array was expected").
target_column = 'price_gte_150'

train_y = train[target_column]
test_y = test[target_column]

# Everything except the target is a model input
train_inputs = train.drop(columns=[target_column])
test_inputs = test.drop(columns=[target_column])

##  Identify the numerical and categorical columns

### Option 1: Manually

### Option 2: Programmatically

In [None]:
# Inspect each input column's dtype to see which are numeric and which are text (object)
train_inputs.dtypes

In [None]:
# Names of the inputs stored with any NumPy numeric dtype
numeric_frame = train_inputs.select_dtypes(include=[np.number])
numeric_columns = list(numeric_frame.columns)

# Names of the inputs stored with object dtype (treated as categorical)
categorical_frame = train_inputs.select_dtypes(include='object')
categorical_columns = list(categorical_frame.columns)

In [None]:
# Identify the binary columns so they can skip scaling and one-hot encoding.
# (They are not passed through completely untouched: the pipeline below still
# imputes their missing values with the most frequent value.)
binary_columns = ['host_is_superhost', 'host_identity_verified']

In [None]:
# Be careful: numeric_columns was built from dtypes, so it also contains the
# binary columns. Filter them out so they are not scaled as well.
#
# Rebuilding the list (instead of calling list.remove) keeps this cell safe to
# re-run: list.remove raises ValueError once a column has already been removed.
numeric_columns = [col for col in numeric_columns if col not in binary_columns]

In [None]:
# Show the binary column names
binary_columns

In [None]:
# Show the numeric column names (binary columns should no longer appear here)
numeric_columns

In [None]:
# Show the categorical (object dtype) column names
categorical_columns

# Pipeline

In [None]:
# Numeric inputs: fill missing values with the column median, then standardize
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [None]:
# Categorical inputs: replace missing values with the literal 'unknown', then
# one-hot encode. handle_unknown='ignore' lets transform() accept categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [None]:
# Binary inputs: only fill missing values (with the most frequent value);
# no scaling or encoding is applied.
binary_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ]
)

In [None]:
# Route each group of columns to its own transformer.
# remainder='passthrough' is optional: it keeps any column that is not listed
# in one of the groups unchanged (the default, 'drop', would discard it).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns),
    ],
    remainder='passthrough',
)

# Transform: fit_transform() for TRAIN

In [None]:
# Fit the preprocessor on the training inputs and transform them in one step.
# Imputation values, scaling statistics and one-hot categories are learned here,
# from the training data only.
train_x = preprocessor.fit_transform(train_inputs)

# Display the transformed training matrix
train_x

In [None]:
# (rows, columns) of the transformed training matrix
train_x.shape

# Transform: transform() for TEST

In [None]:
# Transform the test data with the already-fitted preprocessor.
# Use transform(), not fit_transform(), so nothing is re-learned from the test set.
test_x = preprocessor.transform(test_inputs)

# Display the transformed test matrix
test_x

In [None]:
# (rows, columns) of the transformed test matrix
test_x.shape

# SVM - Binary classification

## Baseline Accuracy

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline for the binary task: always predict the most frequent training class
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X=train_x, y=train_y)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the majority-class baseline on the training data
dummy_train_pred = dummy_clf.predict(train_x)
baseline_train_acc = accuracy_score(y_true=train_y, y_pred=dummy_train_pred)

print('Baseline Train Accuracy: {}'.format(baseline_train_acc))

In [None]:
# Accuracy of the majority-class baseline on the test data
dummy_test_pred = dummy_clf.predict(test_x)
baseline_test_acc = accuracy_score(y_true=test_y, y_pred=dummy_test_pred)

print('Baseline Test Accuracy: {}'.format(baseline_test_acc))

## SVC(kernel='linear')


In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, fitted on the binary target
lin_svm = SVC(kernel="linear")
lin_svm.fit(X=train_x, y=train_y)

## Accuracy

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Linear SVM predictions for the training inputs
train_y_pred = lin_svm.predict(train_x)

# Fraction of training listings classified correctly
accuracy_score(y_true=train_y, y_pred=train_y_pred)

In [None]:
# Linear SVM predictions for the test inputs
test_y_pred = lin_svm.predict(test_x)

# Fraction of test listings classified correctly
accuracy_score(y_true=test_y, y_pred=test_y_pred)

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the test set: rows are the true classes,
# columns are the classes predicted by the linear SVM.
confusion_matrix(y_true=test_y, y_pred=test_y_pred)

## Classification Report

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test set
test_report = classification_report(y_true=test_y, y_pred=test_y_pred)
print(test_report)

## SVC(kernel='poly') 



In [None]:
from sklearn.svm import SVC

# Polynomial kernel: degree sets the polynomial degree, coef0 is the kernel's independent term.
# gamma is not passed here, so SVC uses its default (gamma='scale' since scikit-learn 0.22).

pol_svm = SVC(kernel="poly", degree=3, coef0=1, C=10)

pol_svm.fit(train_x, train_y)

In [None]:
# Polynomial SVM predictions for the training inputs
train_y_pred = pol_svm.predict(train_x)

# Fraction of training listings classified correctly
accuracy_score(y_true=train_y, y_pred=train_y_pred)

In [None]:
# Polynomial SVM predictions for the test inputs
test_y_pred = pol_svm.predict(test_x)

# Fraction of test listings classified correctly
accuracy_score(y_true=test_y, y_pred=test_y_pred)

## SVC(kernel='rbf')



In [None]:
# RBF-kernel SVM for the binary task; gamma='scale' is set explicitly
rbf_svm = SVC(
    kernel="rbf",
    C=10,
    gamma='scale',
)
rbf_svm.fit(X=train_x, y=train_y)

In [None]:
# RBF SVM predictions for the training inputs
train_y_pred = rbf_svm.predict(train_x)

# Fraction of training listings classified correctly
accuracy_score(y_true=train_y, y_pred=train_y_pred)

In [None]:
# RBF SVM predictions for the test inputs
test_y_pred = rbf_svm.predict(test_x)

# Fraction of test listings classified correctly
accuracy_score(y_true=test_y, y_pred=test_y_pred)

# Multi Class Classification


In [None]:
# Peek at the first 10 values of the multi-class target
train_set[['price_category']].head(10)

In [None]:
# Assign the new (multi-class) target variable.
# Select it as a 1-D Series: scikit-learn classifiers expect y with shape
# (n_samples,), and a single-column DataFrame makes SVC.fit emit a
# DataConversionWarning.
train_y_multiclass = train_set['price_category']
test_y_multiclass = test_set['price_category']

## Baseline

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline for the multi-class task: always predict the most frequent price category
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X=train_x, y=train_y_multiclass)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the majority-class baseline on the training data (multi-class target)
dummy_train_pred = dummy_clf.predict(train_x)
baseline_train_acc = accuracy_score(y_true=train_y_multiclass, y_pred=dummy_train_pred)

print('Baseline Train Accuracy: {}'.format(baseline_train_acc))

In [None]:
# Accuracy of the majority-class baseline on the test data (multi-class target)
dummy_test_pred = dummy_clf.predict(test_x)
baseline_test_acc = accuracy_score(y_true=test_y_multiclass, y_pred=dummy_test_pred)

print('Baseline Test Accuracy: {}'.format(baseline_test_acc))

## SVC(kernel='linear')

In [None]:
# Linear-kernel SVM for the multi-class target, with a one-vs-rest shaped decision function
svm_clf = SVC(
    kernel="linear",
    C=10,
    decision_function_shape='ovr',
)
svm_clf.fit(X=train_x, y=train_y_multiclass)

In [None]:
# Linear multi-class SVM predictions for the training inputs
train_y_pred = svm_clf.predict(train_x)

# Fraction of training listings assigned the correct price category
accuracy_score(y_true=train_y_multiclass, y_pred=train_y_pred)

In [None]:
# Linear multi-class SVM predictions for the test inputs
test_y_pred = svm_clf.predict(test_x)

# Fraction of test listings assigned the correct price category
accuracy_score(y_true=test_y_multiclass, y_pred=test_y_pred)

In [None]:
# Test-set confusion matrix for the linear multi-class SVM
# (rows: true price category, columns: predicted price category)
confusion_matrix(y_true=test_y_multiclass, y_pred=test_y_pred)

## SVC(kernel='poly')

In [None]:
# Degree-3 polynomial-kernel SVM for the multi-class target
pol_svm2 = SVC(
    kernel="poly",
    degree=3,
    coef0=1,
    C=5,
    decision_function_shape='ovr',
)
pol_svm2.fit(X=train_x, y=train_y_multiclass)

In [None]:
# Polynomial multi-class SVM predictions for the training inputs
train_y_pred = pol_svm2.predict(train_x)

# Fraction of training listings assigned the correct price category
accuracy_score(y_true=train_y_multiclass, y_pred=train_y_pred)

In [None]:
# Polynomial multi-class SVM predictions for the test inputs
test_y_pred = pol_svm2.predict(test_x)

# Fraction of test listings assigned the correct price category
accuracy_score(y_true=test_y_multiclass, y_pred=test_y_pred)

In [None]:
# Test-set confusion matrix for the polynomial multi-class SVM
# (rows: true price category, columns: predicted price category)
confusion_matrix(y_true=test_y_multiclass, y_pred=test_y_pred)

## SVC(kernel='rbf')

In [None]:
# RBF-kernel SVM for the multi-class target with a fixed gamma of 0.1
rbf_svm = SVC(
    kernel="rbf",
    C=10,
    gamma=0.1,
    decision_function_shape='ovr',
)
rbf_svm.fit(X=train_x, y=train_y_multiclass)

In [None]:
# RBF multi-class SVM predictions for the training inputs
train_y_pred = rbf_svm.predict(train_x)

# Fraction of training listings assigned the correct price category
accuracy_score(y_true=train_y_multiclass, y_pred=train_y_pred)

In [None]:
# RBF multi-class SVM predictions for the test inputs
test_y_pred = rbf_svm.predict(test_x)

# Fraction of test listings assigned the correct price category
accuracy_score(y_true=test_y_multiclass, y_pred=test_y_pred)

In [None]:
# Test-set confusion matrix for the RBF multi-class SVM
# (rows: true price category, columns: predicted price category)
confusion_matrix(y_true=test_y_multiclass, y_pred=test_y_pred)

# Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values: 2 settings of C x 2 settings of gamma = 4 combinations
param_grid = [
    {
        'C': [5, 15],
        'gamma': [0.1, 0.2],
    },
]

rbf_svm = SVC(kernel="rbf", decision_function_shape='ovr')

# 5-fold cross-validation over the 4 combinations: 4 * 5 = 20 rounds of training
grid_search = GridSearchCV(
    estimator=rbf_svm,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
)

grid_search.fit(train_x, train_y_multiclass)

The best hyperparameter combination found:

In [None]:
# Hyperparameter combination with the best mean cross-validated accuracy
grid_search.best_params_

In [None]:
# The SVC configured with the best combination found by the grid search
grid_search.best_estimator_

Let's look at the score of each hyperparameter combination tested during the grid search:

In [None]:
# Print the mean cross-validated accuracy next to each combination that was tried
cvres = grid_search.cv_results_
scored_candidates = zip(cvres["mean_test_score"], cvres["params"])
for cv_accuracy, candidate in scored_candidates:
    print(cv_accuracy, candidate)

In [None]:
# Full cross-validation results as a table, one row per hyperparameter combination
pd.DataFrame(grid_search.cv_results_)

# Grid Search: randomized

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import random

# Sampling distributions for the random search:
#   C     - integers from 5 up to 49 (randint's upper bound is exclusive)
#   gamma - uniform on [0.1, 0.6] (scipy's uniform takes loc and scale, not low and high)
param_distribs = {
    'C': randint(low=5, high=50),
    'gamma': uniform(loc=0.1, scale=0.5),
}

rbf_svm = SVC(kernel="rbf", decision_function_shape='ovr')

# Draw 5 random combinations and score each with 5-fold cross-validation
rbf_search = RandomizedSearchCV(
    estimator=rbf_svm,
    param_distributions=param_distribs,
    n_iter=5,
    cv=5,
    scoring='accuracy',
    random_state=42,
)

rbf_search.fit(train_x, train_y_multiclass)

In [None]:
# Print the mean cross-validated accuracy next to each randomly sampled combination
cvres = rbf_search.cv_results_
scored_candidates = zip(cvres["mean_test_score"], cvres["params"])
for cv_accuracy, candidate in scored_candidates:
    print(cv_accuracy, candidate)

## Run the final model on the Test Set

In [None]:
# Use the best estimator from the exhaustive grid search as the final model.
# NOTE(review): the randomized search (rbf_search) is not considered here;
# compare its best_score_ with grid_search.best_score_ if it should compete.
final_model = grid_search.best_estimator_
test_predictions = final_model.predict(test_x)

# Accuracy of the final model on the held-out test set
accuracy_score(y_true=test_y_multiclass, y_pred=test_predictions)