###  Subash Chandra Biswal (U77884251) ###

## Set Up  ##

In [40]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier 
from sklearn.svm import SVC
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# set random seed to ensure that results are repeatable
np.random.seed(1)

## Data Load ##

In [2]:
# load data
airbnb = pd.read_csv("./data/airbnb.csv")

airbnb.head(3)

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_$75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_$75


## Initial Analysis of Data ##

In [3]:
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3555 entries, 0 to 3554
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   host_is_superhost                  3555 non-null   int64  
 1   host_identity_verified             3555 non-null   int64  
 2   neighbourhood_cleansed             3555 non-null   object 
 3   latitude                           3555 non-null   float64
 4   longitude                          3555 non-null   float64
 5   property_type                      3552 non-null   object 
 6   room_type                          3555 non-null   object 
 7   accommodates                       3555 non-null   int64  
 8   bathrooms                          3541 non-null   float64
 9   bedrooms                           3545 non-null   float64
 10  beds                               3546 non-null   float64
 11  bed_type                           3555 non-null   objec

In [4]:
# generate a statistical summary of the numeric value in the data
airbnb.describe()

Unnamed: 0,host_is_superhost,host_identity_verified,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,price,price_gte_150
count,3555.0,3555.0,3555.0,3555.0,3555.0,3541.0,3545.0,3546.0,3555.0,3555.0,3555.0,3555.0,3555.0,3555.0,2755.0,3555.0,3555.0
mean,0.11308,0.727989,42.339973,-71.084874,3.023629,1.215899,1.246544,1.597293,14.85879,1.427004,10.886639,3.116737,19.126582,279.052602,91.89147,166.060478,0.500422
std,0.316735,0.445058,0.024464,0.031614,1.754808,0.492656,0.73844,0.995467,4.82126,1.050204,19.092755,8.273949,35.666178,408.686952,9.548381,103.378456,0.50007
min,0.0,0.0,42.235942,-71.171789,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,20.0,10.0,0.0
25%,0.0,0.0,42.329875,-71.105183,2.0,1.0,1.0,1.0,12.0,1.0,0.0,1.0,1.0,0.0,89.0,85.0,0.0
50%,0.0,1.0,42.345191,-71.078487,2.0,1.0,1.0,1.0,15.0,1.0,0.0,2.0,5.0,92.0,94.0,150.0,1.0
75%,0.0,1.0,42.354672,-71.062142,4.0,1.0,2.0,2.0,18.0,1.0,20.0,3.0,21.0,402.0,98.0,219.0,1.0
max,1.0,1.0,42.389982,-71.0001,16.0,6.0,5.0,16.0,30.0,14.0,200.0,300.0,404.0,2680.0,100.0,650.0,1.0


In [5]:
# Check the missing values by summing the total na's for each variable
airbnb.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          3
room_type                              0
accommodates                           0
bathrooms                             14
bedrooms                              10
beds                                   9
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 800
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

In [6]:
# create a list of these catagorical variables
category_var_list = list(airbnb.select_dtypes(include='object').columns)
category_var_list

['neighbourhood_cleansed',
 'property_type',
 'room_type',
 'bed_type',
 'cancellation_policy',
 'price_category']

In [7]:
# explore the categorical variable values - often there are typos here that need to be fixed.
for cat in category_var_list: # generally, we want to avoid for loops and use a functional style (i.e. list comprehension)
    print(f"Category: {cat} Values: {airbnb[cat].unique()}")

Category: neighbourhood_cleansed Values: ['Roslindale' 'Jamaica Plain' 'Mission Hill' 'Longwood Medical Area'
 'Bay Village' 'Leather District' 'Chinatown' 'North End' 'Roxbury'
 'South End' 'Back Bay' 'East Boston' 'Charlestown' 'West End'
 'Beacon Hill' 'Downtown' 'Fenway' 'Brighton' 'West Roxbury' 'Hyde Park'
 'Mattapan' 'Dorchester' 'South Boston Waterfront' 'South Boston'
 'Allston']
Category: property_type Values: ['House' 'Apartment' 'Condominium' 'Villa' 'Bed & Breakfast' 'Townhouse'
 'Entire Floor' 'Loft' 'Guesthouse' 'Boat' 'Dorm' 'Other' nan 'Camper/RV']
Category: room_type Values: ['Entire home/apt' 'Private room' 'Shared room']
Category: bed_type Values: ['Real Bed' 'Pull-out Sofa' 'Futon' 'Airbed' 'Couch']
Category: cancellation_policy Values: ['moderate' 'flexible' 'strict' 'super_strict_30']
Category: price_category Values: ['gte_226' 'lte_$75' 'btw_$75-$150' 'btw_$151-$225']


## Drop Unnecessary Variables and fill missing categorical values ##

In [8]:
# Our target is price; but there are three related price variableds - price, price_gte_150, 
# and price_category. We need to drop price_gte_150, and price_category
airbnb.drop(['price', 'price_category'], axis=1, inplace = True) # keep price_gte_150 for our target

In [9]:
airbnb['property_type'].isna().sum()

3

In [10]:
airbnb["property_type"].fillna("unkown", inplace = True)

In [11]:
airbnb['neighbourhood_cleansed'].isna().sum() 

0

In [12]:
airbnb['room_type'].isna().sum() # can see by the results below, no missing values

0

In [13]:
airbnb['bed_type'].isna().sum() # can see by the results below, no missing values

0

In [14]:
airbnb['cancellation_policy'].isna().sum() # can see by the results below, no missing values

0

## Encode the categorical variables ##

In [15]:
dummies_df = pd.get_dummies(airbnb['neighbourhood_cleansed'], prefix='neighbourhood_cleansed', drop_first=True)

In [16]:
airbnb = airbnb.join(dummies_df)
airbnb.drop('neighbourhood_cleansed', axis=1, inplace = True)

In [17]:
airbnb = airbnb.join(pd.get_dummies(airbnb['property_type'], prefix='property_type', drop_first=True))
airbnb.drop('property_type', axis=1, inplace = True)

In [18]:
labelencoder = LabelEncoder()
airbnb['room_type'] = labelencoder.fit_transform(airbnb['room_type'])
airbnb['bed_type'] = labelencoder.fit_transform(airbnb['bed_type'])
airbnb['cancellation_policy'] = labelencoder.fit_transform(airbnb['cancellation_policy'])

In [19]:
# explore the dataframe columns to verify encoding and dropped columns
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3555 entries, 0 to 3554
Data columns (total 56 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   host_is_superhost                               3555 non-null   int64  
 1   host_identity_verified                          3555 non-null   int64  
 2   latitude                                        3555 non-null   float64
 3   longitude                                       3555 non-null   float64
 4   room_type                                       3555 non-null   int32  
 5   accommodates                                    3555 non-null   int64  
 6   bathrooms                                       3541 non-null   float64
 7   bedrooms                                        3545 non-null   float64
 8   beds                                            3546 non-null   float64
 9   bed_type                                 

In [26]:
airbnb.to_csv('./Data/airbnb_presplit_processed.csv', index=False)


## Train Test Split ##

In [27]:
# split the data into validation and training set
train_df, test_df = train_test_split(airbnb, test_size=0.3)

# to reduce repetition in later code, create variables to represent the columns
# that are our predictors and target
target = 'price_gte_150'
predictors = list(airbnb.columns)
predictors.remove(target)

## Impute missing values of numerical variables ##

In [28]:
numeric_cols_with_nas = list(train_df.isna().sum()[train_df.isna().sum() > 0].index)
numeric_cols_with_nas

['bathrooms', 'bedrooms', 'beds', 'review_scores_rating']

In [29]:
imputer = SimpleImputer(strategy="median")

train_df[numeric_cols_with_nas] = imputer.fit_transform(train_df[numeric_cols_with_nas])
test_df[numeric_cols_with_nas] = imputer.transform(test_df[numeric_cols_with_nas])

## Standardize the numeric variables ##

In [30]:
# create a standard scaler and fit it to the training set of predictors
scaler = preprocessing.StandardScaler()
cols_to_stdize = ['latitude', 'longitude', 'accommodates', 
                   'bathrooms', 'bedrooms', 'beds', 'Number of amenities', 
                   'guests_included', 'price_per_extra_person', 'minimum_nights', 
                   'number_of_reviews', 'number_days_btw_first_last_review', 
                   'review_scores_rating']                
               
# Transform the predictors of training and validation sets
train_df[cols_to_stdize] = scaler.fit_transform(train_df[cols_to_stdize]) # train_predictors is not a numpy array


test_df[cols_to_stdize] = scaler.transform(test_df[cols_to_stdize]) # validation_target is now a series object

In [31]:
train_X = train_df[predictors]
train_y = train_df[target] # train_target is now a series objecttrain_df.to_csv('airbnb_train_df.csv', index=False)
test_X = test_df[predictors]
test_y = test_df[target] # validation_target is now a series object

#train_df.to_csv('./data/airbnb_train_df_price_gte_150.csv', index=False)
#train_X.to_csv('./data/airbnb_train_X_price_gte_150.csv', index=False)
#train_y.to_csv('./data/airbnb_train_y_price_gte_150.csv', index=False)
#test_df.to_csv('./data/airbnb_test_df_price_gte_150.csv', index=False)
#test_X.to_csv('./data/airbnb_test_X_price_gte_150.csv', index=False)
#test_y.to_csv('./data/airbnb_test_y_price_gte_150.csv', index=False)

## Random search of parameters grid ##

In [41]:
score_measure = "precision"
kfolds = 5

# Grid for decision tree
param_grid_tree = {
    'min_samples_split': np.arange(1,100),  
    'min_samples_leaf': np.arange(1,100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 50), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

# Grid for SVM
param_grid_svm = {
    'degree': np.arange(1,4),
    'coef0': np.arange(1,4),
    'C': np.arange(1,15),
    'kernel': ['poly'],   
}

dtree = DecisionTreeClassifier()
svmpoly = SVC()

rand_search_tree = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid_tree, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)
rand_search_svm = RandomizedSearchCV(estimator = svmpoly, param_distributions=param_grid_svm, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

# Decision tree model fit for grid search
_ = rand_search_tree.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search_tree.best_score_}")
print(f"... with parameters: {rand_search_tree.best_params_}")

bestRecallTree = rand_search_tree.best_estimator_

# SVM model fit for grid search
_ = rand_search_svm.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search_svm.best_score_}")
print(f"... with parameters: {rand_search_svm.best_params_}")

bestRecallTree = rand_search_svm.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


30 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\scbis\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\scbis\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\scbis\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

 0.83942426 0.82488631 0.82507291 0.82488631 0.82487201 0.82488631
 0.824886

The best precision score is 0.8558751673518566
... with parameters: {'min_samples_split': 44, 'min_samples_leaf': 21, 'min_impurity_decrease': 0.0021, 'max_leaf_nodes': 18, 'max_depth': 25, 'criterion': 'gini'}
Fitting 5 folds for each of 126 candidates, totalling 630 fits
The best precision score is 0.8559629990754983
... with parameters: {'kernel': 'poly', 'degree': 2, 'coef0': 2, 'C': 13}


## Confusion matrix of Models ##

In [43]:
## Decision Tree
c_matrix = confusion_matrix(test_y, rand_search_tree.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8472352 Precision=0.8333333 Recall=0.8662900 F1=0.8494922


In [44]:
## SVM
c_matrix = confusion_matrix(test_y, rand_search_svm.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8631678 Precision=0.8493648 Recall=0.8813559 F1=0.8650647


## Final grid search with smaller range ##

In [51]:
score_measure = "precision"
kfolds = 5

# Decision tree grid
param_grid_tree = {
    'min_samples_split': np.arange(41,47),  
    'min_samples_leaf': np.arange(18,24),
    'min_impurity_decrease': np.arange(0.0018, 0.0024, 0.0001),
    'max_leaf_nodes': np.arange(15,21), 
    'max_depth': np.arange(22,28), 
    'criterion': ['gini'],
}

# SVM grid
param_grid_svm = {
    'degree': np.arange(1,3),
    'coef0': np.arange(1,3),
    'C': np.arange(10,16),
    'kernel': ['poly'],   
}

dtree = DecisionTreeClassifier()
svmpoly = SVC()

grid_search_tree = GridSearchCV(estimator = dtree, param_grid=param_grid_tree, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

grid_search_svm = GridSearchCV(estimator = svmpoly, param_grid=param_grid_svm, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

# Decision tree fit
_ = grid_search_tree.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search_tree.best_score_}")
print(f"... with parameters: {grid_search_tree.best_params_}")

bestRecallTree = grid_search_tree.best_estimator_


# SVM fit
_ = grid_search_svm.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search_svm.best_score_}")
print(f"... with parameters: {grid_search_svm.best_params_}")

bestRecallTree = grid_search_svm.best_estimator_

Fitting 5 folds for each of 7776 candidates, totalling 38880 fits
The best precision score is 0.8584728930092839
... with parameters: {'criterion': 'gini', 'max_depth': 22, 'max_leaf_nodes': 15, 'min_impurity_decrease': 0.0021000000000000003, 'min_samples_leaf': 19, 'min_samples_split': 45}
Fitting 5 folds for each of 24 candidates, totalling 120 fits
The best precision score is 0.8559629990754983
... with parameters: {'C': 13, 'coef0': 2, 'degree': 2, 'kernel': 'poly'}


In [52]:
# Confusion matrix of Decision tree
c_matrix = confusion_matrix(test_y, grid_search_tree.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8472352 Precision=0.8333333 Recall=0.8662900 F1=0.8494922


In [53]:
# Confusion matrix of SVM
c_matrix = confusion_matrix(test_y, grid_search_svm.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8631678 Precision=0.8493648 Recall=0.8813559 F1=0.8650647


## Important features of Decision tree model ##

In [54]:
np.round(grid_search_tree.best_estimator_.feature_importances_,2)

array([0.  , 0.  , 0.03, 0.1 , 0.78, 0.  , 0.  , 0.05, 0.01, 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.03, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

## Analysis of models ## 

By doing grid search of model parameters, we have got the optimum values of parameters to get best precision. The precision score of decision tree model is 0.833 and that of SVM model is .849. From precision score its clear that SVM model with kernel poly, degree 2, coef0 2 and C of 13 is the best model for the given dataset. 