# Library Import¶

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy import stats
np.random.seed(1)

# Data Load

In [2]:
X_train = pd.read_csv('C:/Users/mssur/Downloads/airbnb_train_X_price_gte_150.csv') 
y_train = pd.read_csv('C:/Users/mssur/Downloads/airbnb_train_y_price_gte_150.csv') 
X_test = pd.read_csv('C:/Users/mssur/Downloads/airbnb_test_X_price_gte_150.csv') 
y_test = pd.read_csv('C:/Users/mssur/Downloads/airbnb_test_y_price_gte_150.csv') 

# SVM Classification by using Random Search CV

In [3]:
score_measure = "precision"
kfolds = 5

param_grid = {
     'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
    'kernel': ['poly']
}

RS_Out = SVC()
rand_search = RandomizedSearchCV(estimator = RS_Out, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestPrecisionTree = rand_search.best_estimator_



Fitting 5 folds for each of 16 candidates, totalling 80 fits


  y = column_or_1d(y, warn=True)


The best precision score is 0.9370404920282969
... with parameters: {'kernel': 'poly', 'gamma': 0.01, 'C': 0.1}


In [4]:
c_matrix = confusion_matrix(y_test, rand_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.6073102 Precision=0.9117647 Recall=0.2335217 F1=0.3718141


# SVM Classification by using Grid Search CV

In [5]:
score_measure = "precision"
kfolds = 5

param_grid = {
     'C': [0.1,1, 10, 100], 
     'gamma': [1,0.1,0.01,0.001],
    'kernel': ['poly']
}

GS_out = SVC()
grid_search = GridSearchCV(estimator = GS_out, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestPrecisionTree = grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


  y = column_or_1d(y, warn=True)


The best precision score is 0.9370404920282969
... with parameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'poly'}


In [6]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.6073102 Precision=0.9117647 Recall=0.2335217 F1=0.3718141


# Decision tree by using Random search cv

In [7]:
score_measure = "precision"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,100),  
    'min_samples_leaf': np.arange(1,100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 100), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestPrecisionTree = rand_search.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best precision score is 0.8553701168468061
... with parameters: {'min_samples_split': 30, 'min_samples_leaf': 23, 'min_impurity_decrease': 0.0021, 'max_leaf_nodes': 91, 'max_depth': 35, 'criterion': 'gini'}


In [8]:
c_matrix = confusion_matrix(y_test, rand_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8472352 Precision=0.8333333 Recall=0.8662900 F1=0.8494922


# Decision tree by using Grid search cv

In [9]:
score_measure = "precision"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(30,36),  
    'min_samples_leaf': np.arange(6,12),
    'min_impurity_decrease': np.arange(0.0048, 0.0054, 0.0001),
    'max_leaf_nodes': np.arange(162,168), 
    'max_depth': np.arange(15,21), 
    'criterion': ['entropy'],
}

Dout_GS = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = Dout_GS, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestPrecisionTree = grid_search.best_estimator_

Fitting 5 folds for each of 9072 candidates, totalling 45360 fits
The best precision score is 0.8470330066484271
... with parameters: {'criterion': 'entropy', 'max_depth': 15, 'max_leaf_nodes': 162, 'min_impurity_decrease': 0.0048, 'min_samples_leaf': 10, 'min_samples_split': 30}


In [10]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8462980 Precision=0.8379374 Recall=0.8568738 F1=0.8472998


# Inference

1) Based on the data above, we can see that random search and grid search are the most effective methods for Support Vector Machine classifier (93.7)

2) The best grid search cv precision score, after taking into account the decision tree findings, is 85.53. 

3) Decision trees are functioning best when employing random search cv, which is 85.53 Overall, the svm model outperforms decision trees in terms of greatest precision score.