# SVM

In [20]:
import os
import time

#to process data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Seed NumPy's global random number generator so NumPy-driven randomness repeats across runs.
np.random.seed(42)

In [21]:
# Load the train and test CSVs; the flag columns listed below are converted to
# integers while reading.
def bool_converter(x):
    """Return 1 when the raw cell text is 't', otherwise 0."""
    return 1 if x == 't' else 0


# One converter mapping shared by both files so they are parsed identically.
bool_converters = {
    flag_column: bool_converter
    for flag_column in (
        'Host_is_superhost',
        'Host_has_profile_pic',
        'Host_identity_verified',
        'Instant_bookable',
    )
}

train_df = pd.read_csv("train.csv", converters=bool_converters)
test_df = pd.read_csv("test.csv", converters=bool_converters)

In [22]:
# Drop four features from both frames: 'Bedrooms', 'Beds', 'Month', 'Host_has_profile_pic'.
dropped_features = ['Bedrooms', 'Beds', 'Month', 'Host_has_profile_pic']
train_df = train_df.drop(columns=dropped_features)
test_df = test_df.drop(columns=dropped_features)

In [23]:
# Reference list of column names. It still contains the four columns that the
# previous cell dropped from the frames.
columns = [
    'id',
    'Decision',
    'Host_response_time',
    'Host_is_superhost',
    'Host_has_profile_pic',
    'Host_identity_verified',
    'Neighbourhood',
    'Property_type',
    'Room_type',
    'Accommodates',
    'Bathrooms_text',
    'Bedrooms',
    'Beds',
    'Essentials',
    'Cooking',
    'Balcony',
    'Parking',
    'Price',
    'Number_of_reviews',
    'Review_scores_rating',
    'Instant_bookable',
    'Month',
]

# The remaining features are grouped by how they are preprocessed later on.
# Categorical features are one-hot encoded.
categorical = [
    'Neighbourhood',
    'Host_response_time',
    'Property_type',
    'Room_type',
    'Bathrooms_text',
]
# Continuous features are standardised.
continuous = [
    'Accommodates',
    'Balcony',
    'Parking',
    'Essentials',
    'Cooking',
    'Price',
    'Review_scores_rating',
    'Number_of_reviews',
]
# Binary features were already converted to 0/1 while loading the CSVs.
binary = [
    'Host_is_superhost',
    'Host_identity_verified',
    'Instant_bookable',
]

In [24]:
# Count the missing values in each training column to see which ones need filling.
train_df.isna().sum()

id                          0
Decision                    0
Host_response_time        858
Host_is_superhost           0
Host_identity_verified      0
Neighbourhood               0
Property_type               0
Room_type                   0
Accommodates                0
Bathrooms_text              0
Essentials                  0
Cooking                     0
Balcony                     0
Parking                     0
Price                       0
Number_of_reviews           0
Review_scores_rating      395
Instant_bookable            0
dtype: int64

In [25]:
# Count the missing values in each test column.
test_df.isna().sum()

id                          0
Host_response_time        293
Host_is_superhost           0
Host_identity_verified      0
Neighbourhood               0
Property_type               0
Room_type                   0
Accommodates                0
Bathrooms_text              0
Essentials                  0
Cooking                     0
Balcony                     0
Parking                     0
Price                       0
Number_of_reviews           0
Review_scores_rating      274
Instant_bookable            0
dtype: int64

In [26]:
# A missing host response time becomes its own category, "missing", in both frames.
for frame in (train_df, test_df):
    frame["Host_response_time"] = frame["Host_response_time"].fillna("missing")

In [27]:
# One feature still has NaN values: look at its training median, which is the fill value.
train_df["Review_scores_rating"].median()

4.93

In [28]:
# Fill missing review scores with the median of the training column.
# The median is computed here rather than typed in by hand, so it cannot go stale
# when the data changes. The training value is reused for the test frame so both
# frames are filled with the same number.
rating_median = train_df["Review_scores_rating"].median()
train_df["Review_scores_rating"] = train_df["Review_scores_rating"].fillna(rating_median)
test_df["Review_scores_rating"] = test_df["Review_scores_rating"].fillna(rating_median)

In [29]:
# Price is stored as text: remove every '$' and ',' and convert the result to float.
# The '$' pattern is a raw string because '\$' inside a normal string literal is an
# invalid escape sequence, which Python reports with a warning.
price_cleanup = {r'\$': '', ',': ''}
for frame in (train_df, test_df):
    frame['Price'] = frame['Price'].replace(price_cleanup, regex=True).astype('float')

In [30]:
# Standardise the continuous features to zero mean and unit variance.
# The scaler is fitted on the training frame only; the fitted transformation is then
# applied to the training and the test frame alike.
scaler = StandardScaler().fit(train_df[continuous])
train_df[continuous] = scaler.transform(train_df[continuous])
test_df[continuous] = scaler.transform(test_df[continuous])

In [31]:
# Separate the 'Decision' target column from the feature columns.
X_train_full = train_df.drop(columns=['Decision'])
y_train_full = train_df['Decision']

In [32]:
# Convert the categorical features to one-hot representations.
# Training and test rows are stacked first so that both receive exactly the same
# dummy columns; afterwards they are separated again by row position.
len_train = len(X_train_full)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
total_X = pd.concat([X_train_full, test_df], ignore_index=True)
one_hot_X = pd.get_dummies(total_X, columns=categorical)

# The first len_train rows are the training rows, everything after is the test set.
X_train_full = one_hot_X.iloc[:len_train]
X_test = one_hot_X.iloc[len_train:]

In [33]:
# Display the one-hot encoded test features as a sanity check.
X_test

Unnamed: 0,id,Host_is_superhost,Host_identity_verified,Accommodates,Essentials,Cooking,Balcony,Parking,Price,Number_of_reviews,...,Bathrooms_text_4.5 baths,Bathrooms_text_5 baths,Bathrooms_text_5.5 baths,Bathrooms_text_6 baths,Bathrooms_text_6.5 baths,Bathrooms_text_7 baths,Bathrooms_text_7.5 baths,Bathrooms_text_9 baths,Bathrooms_text_Half-bath,Bathrooms_text_Private half-bath
7471,1,1,1,-1.192110,-1.171512,-0.385235,0.763000,0.208063,-0.724997,0.462257,...,0,0,0,0,0,0,0,0,0,0
7472,2,1,1,-0.830126,0.447498,0.843931,0.763000,0.208063,-0.621800,4.532375,...,0,0,0,0,0,0,0,0,0,0
7473,3,1,1,-0.830126,0.447498,0.843931,0.763000,0.208063,-0.621800,4.691072,...,0,0,0,0,0,0,0,0,0,0
7474,4,1,1,-0.830126,0.447498,0.843931,0.763000,0.208063,-0.621800,4.784424,...,0,0,0,0,0,0,0,0,0,0
7475,5,1,1,-0.830126,0.447498,0.843931,0.763000,0.208063,-0.621800,4.607056,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9906,2436,0,0,0.617810,-6.028541,-2.228985,-1.310616,0.208063,-0.398208,-0.816656,...,0,0,0,0,0,0,0,0,0,0
9907,2437,1,1,1.703762,0.447498,0.843931,0.763000,0.208063,-0.352343,-0.816656,...,0,0,0,0,0,0,0,0,0,0
9908,2438,1,1,3.513681,-1.171512,-1.614402,-1.310616,0.208063,2.846753,-0.816656,...,0,0,0,0,0,0,0,0,0,0
9909,2439,1,1,-0.830126,0.447498,-0.999819,-1.310616,0.208063,-0.747930,-0.816656,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Keep the test ids (they index the exported predictions later), then remove the
# 'id' column from both feature matrices.
test_index = X_test['id']
X_train_full = X_train_full.drop(columns=['id'])
X_test = X_test.drop(columns=['id'])

In [35]:
# Hold out 10% of the training rows as a validation set; the fixed random_state
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.1,
    random_state=42,
)

### SVM

In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [45]:
# Hyperparameter selection: 3-fold cross-validated grid search over the C parameter
# of an RBF-kernel SVM, scored by accuracy.
# NOTE(review): the literals 10e-3, 10e-2 and 10e-1 equal 0.01, 0.1 and 1.0, so C=1
# is evaluated twice and 0.001 is never tried. If 1e-3, 1e-2, 1e-1 was intended,
# update this grid together with anything else that repeats these values.
param_test1 = {'C': [10e-3, 10e-2, 10e-1, 1, 10, 100]}
svm_rbf = SVC(kernel='rbf', random_state=42)
gsearch1 = GridSearchCV(
    estimator=svm_rbf,
    param_grid=param_test1,
    scoring='accuracy',
    n_jobs=4,
    cv=3,
)
gsearch1.fit(X_train_full, y_train_full)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

{'C': 100}
0.7736568111063282


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the mean cross-validated accuracy against C, with a band of one standard
# deviation on either side, and save the figure as a PDF.
cv_results = gsearch1.cv_results_
# Read the C values from the search results instead of retyping the grid, so the
# x-axis always matches the candidates that were actually evaluated, in order.
params = [candidate["C"] for candidate in cv_results["params"]]
mean_test_score = cv_results["mean_test_score"]
std_test_score = cv_results["std_test_score"]

sns.set(font_scale=1.25)
ax = sns.lineplot(x=params, y=mean_test_score)
ax.fill_between(
    params,
    y1=mean_test_score - std_test_score,
    y2=mean_test_score + std_test_score,
    alpha=.5,
)
ax.set_xscale("log")  # the C values are searched on a logarithmic scale
ax.set_xlabel("$C$")
ax.set_ylabel("accuracy")
plt.savefig("svm_c.pdf", bbox_inches="tight")

In [61]:
# Train the final RBF SVM on the full training set, time the fit, and write the
# test-set predictions to results/svm.csv.
# C=100 is the value the grid search reported as best.
classifer = SVC(
    C=100,
    kernel='rbf',
    verbose=False,
    random_state=42
)
t_start = time.time()
classifer.fit(X_train_full, y_train_full)
t_end = time.time()

# Accuracy on the same rows the model was fitted on, so this is an optimistic figure.
acc_train = classifer.score(X_train_full, y_train_full)

# NOTE(review): every prediction is shifted by +1 before export. Presumably the
# expected output labels are one higher than the training labels; confirm.
preds = classifer.predict(X_test) + 1
results = pd.Series(preds, index=test_index)

# Create the output directory first; to_csv fails if "results/" does not exist.
os.makedirs("results", exist_ok=True)
# header=False is the documented way to omit the header row.
results.to_csv("results/svm.csv", header=False, index=True)

print("train acc: {:.4f}".format(acc_train))
print("training_time: {:.2f}sec".format(t_end - t_start))

train acc: 0.8921
training_time: 8.44sec
