### Import libraries

In [285]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor

### Define useful functions

In [1076]:
def extract_zip_code(hotel_address):
    """Return the three-character ZIP prefix parsed from a hotel address.

    The second-to-last comma-separated field of the address is taken and the
    text after its last space is treated as the ZIP token.  When that token
    begins with ``'NY1'`` (state code fused onto the ZIP, e.g. ``'NY12866'``)
    characters 2-4 are returned; otherwise the first three characters are.

    Raises:
        IndexError: if the address contains fewer than two comma-separated
            fields.
    """
    region_field = hotel_address.split(',')[-2].strip(' ')
    # Text after the last space (the whole field when it has no space).
    zip_token = region_field.rpartition(' ')[2]
    prefix = zip_token[:3]
    return zip_token[2:5] if prefix == 'NY1' else prefix

def extract_price(hotel_room_price):
    """Parse a scraped room price such as ``'US$1,234'`` into a float.

    Args:
        hotel_room_price: Raw price cell.  Usually a string that may carry
            surrounding whitespace (including a trailing carriage return),
            a ``US$`` currency marker and thousands separators.  Missing
            values (``None`` / NaN) and already-numeric values are accepted
            as well.

    Returns:
        The price as a float, or ``None`` when the cell is empty or missing.

    Raises:
        ValueError: if a non-empty string cannot be parsed as a number.
    """
    if hotel_room_price is None:
        return None
    if not isinstance(hotel_room_price, str):
        # Numeric cell (e.g. pandas NaN for a missing value): pass real
        # numbers through and map NaN (the only value != itself) to None.
        numeric_price = float(hotel_room_price)
        return None if numeric_price != numeric_price else numeric_price

    # str.strip('US$') removes any of the characters U, S, $ from both ends;
    # that is harmless here because the remaining payload is numeric.
    room_price_tmp = hotel_room_price.strip().strip('US$').strip()
    if room_price_tmp == '':
        return None
    return float(room_price_tmp.replace(',', ''))
    
def extract_capacity(hotel_room_capacity):
    """Return the integer that ends a room-capacity string.

    All trailing decimal digits are used, so a capacity of 10 or more is not
    truncated to its last digit.
    NOTE(review): assumes the capacity is the number at the very end of the
    scraped text - confirm against the raw data.

    Args:
        hotel_room_capacity: Capacity text ending in the number of guests.

    Returns:
        The trailing number as an int.

    Raises:
        ValueError: if the string does not end with a digit.
        IndexError: if the string is empty.
    """
    digits_start = len(hotel_room_capacity.rstrip('0123456789'))
    trailing_digits = hotel_room_capacity[digits_start:]
    # With no trailing ASCII digits, fall back to the last character so the
    # failure mode (ValueError / IndexError) stays what callers already saw.
    return int(trailing_digits or hotel_room_capacity[-1])

# Ordered (keyword, label) pairs.  The first keyword found in the lower-cased
# hotel name wins, so specific sub-brands must precede their parent brand.
# NOTE(review): names such as "Hampton Inn by Hilton" or "Embassy Suites by
# Hilton" hit the generic 'hilton' entry before 'hampton' / 'embassy' and are
# therefore labelled 'Hilton' - confirm that this is intended.
_CHAIN_KEYWORDS = (
    ('doubletree by hilton', 'Doubletree Hilton'),
    ('homewood suites by hilton', 'Homewood Suites Hilton'),
    ('hilton garden', 'Hilton Garden'),
    ('home2', 'Home2 Hilton'),
    # Full brand phrase: a bare 'tru' also matched unrelated names
    # (e.g. "Trump ...") and mislabelled them as Tru Hilton.
    ('tru by hilton', 'Tru Hilton'),
    ('hilton', 'Hilton'),
    ('courtyard by marriott', 'Courtyard Marriott'),
    ('fairfield inn', 'Fairfield Inn Marriott'),
    ('residence inn by marriott', 'Residence Inn Marriott'),
    ('marriott', 'Marriott'),
    ('hyatt', 'Hyatt'),
    ('wyndham', 'Wyndham'),
    ('embassy', 'Embassy'),
    ('sheraton', 'Sheraton'),
    ('radisson', 'Radisson'),
    ('best western', 'Best Western'),
    ('hampton', 'Hampton'),
    ('holiday inn', 'Holiday Inn'),
    ('westin', 'Westin'),
    ('red roof', 'Red Roof'),
)


def extract_chain(hotel_name):
    """Map a hotel name to a chain label via case-insensitive keyword match.

    Args:
        hotel_name: Hotel name as scraped.

    Returns:
        The label of the first entry in ``_CHAIN_KEYWORDS`` whose keyword is
        a substring of the lower-cased name, or ``'No chain'`` if none match.
    """
    hotel_name_lower = hotel_name.lower()
    for keyword, chain_label in _CHAIN_KEYWORDS:
        if keyword in hotel_name_lower:
            return chain_label
    return 'No chain'

def extract_hotel_stars(hotel_stars):
    """Convert a star-rating label into its numeric value.

    Two spellings are recognised per rating: ``'<n> stars'`` for every value,
    plus ``'<n>-star hotel'`` for whole ratings and ``'<n> star hotel'`` for
    half ratings.  Anything else yields ``None``.
    """
    stars_by_label = {
        '1 stars': 1.0, '1-star hotel': 1.0,
        '1.5 stars': 1.5, '1.5 star hotel': 1.5,
        '2 stars': 2.0, '2-star hotel': 2.0,
        '2.5 stars': 2.5, '2.5 star hotel': 2.5,
        '3 stars': 3.0, '3-star hotel': 3.0,
        '3.5 stars': 3.5, '3.5 star hotel': 3.5,
        '4 stars': 4.0, '4-star hotel': 4.0,
        '4.5 stars': 4.5, '4.5 star hotel': 4.5,
        '5 stars': 5.0, '5-star hotel': 5.0,
    }
    # dict.get returns None for unknown labels (and for NaN cells).
    return stars_by_label.get(hotel_stars)
    
def extract_cleanliness_rating(hotel_rating_breakdown):
    """Return the 'Cleanliness' score from a stringified rating breakdown.

    The breakdown is text shaped like ``"[['Cleanliness', '9.8'], ...]"``.
    Only the first entry is inspected; if its label is not ``'Cleanliness'``
    the result is ``None``.
    """
    first_entry = hotel_rating_breakdown.strip('[]').split('], [')[0]
    fields = first_entry.split(',')
    if fields[0] != "'Cleanliness'":
        return None
    # Drop the space after the comma, then the surrounding single quotes.
    return float(fields[1].strip(' ').strip("''"))
    
def extract_comfort_rating(hotel_rating_breakdown):
    """Return the 'Comfort' score from a stringified rating breakdown.

    The breakdown is text shaped like ``"[['Cleanliness', '9.8'], ...]"`` and
    the 'Comfort' entry is expected in the second slot.

    Returns:
        The score as a float, or ``None`` when the breakdown has fewer than
        two entries or the second entry is not a ``'Comfort'`` label/score
        pair (previously a short breakdown raised IndexError).
    """
    entries = hotel_rating_breakdown.strip('[]').split('], [')
    if len(entries) < 2:
        return None
    fields = entries[1].split(',')
    if len(fields) < 2 or fields[0] != "'Comfort'":
        return None
    # Drop the space after the comma, then the surrounding single quotes.
    return float(fields[1].strip(' ').strip("''"))
    
def extract_location_rating(hotel_rating_breakdown):
    """Return the 'Location' score from a stringified rating breakdown.

    The breakdown is text shaped like ``"[['Cleanliness', '9.8'], ...]"`` and
    the 'Location' entry is expected in the third slot.

    Returns:
        The score as a float, or ``None`` when the breakdown has fewer than
        three entries or the third entry is not a ``'Location'`` label/score
        pair (previously a short breakdown raised IndexError).
    """
    entries = hotel_rating_breakdown.strip('[]').split('], [')
    if len(entries) < 3:
        return None
    fields = entries[2].split(',')
    if len(fields) < 2 or fields[0] != "'Location'":
        return None
    # Drop the space after the comma, then the surrounding single quotes.
    return float(fields[1].strip(' ').strip("''"))
    
def extract_facilities_rating(hotel_rating_breakdown):
    """Return the 'Facilities' score from a stringified rating breakdown.

    The breakdown is text shaped like ``"[['Cleanliness', '9.8'], ...]"`` and
    the 'Facilities' entry is expected in the fourth slot.

    Returns:
        The score as a float, or ``None`` when the breakdown has fewer than
        four entries or the fourth entry is not a ``'Facilities'``
        label/score pair (previously a short breakdown raised IndexError).
    """
    entries = hotel_rating_breakdown.strip('[]').split('], [')
    if len(entries) < 4:
        return None
    fields = entries[3].split(',')
    if len(fields) < 2 or fields[0] != "'Facilities'":
        return None
    # Drop the space after the comma, then the surrounding single quotes.
    return float(fields[1].strip(' ').strip("''"))
    
def extract_staff_rating(hotel_rating_breakdown):
    """Return the 'Staff' score from a stringified rating breakdown.

    The breakdown is text shaped like ``"[['Cleanliness', '9.8'], ...]"`` and
    the 'Staff' entry is expected in the fifth slot.

    Returns:
        The score as a float, or ``None`` when the breakdown has fewer than
        five entries or the fifth entry is not a ``'Staff'`` label/score
        pair (previously a short breakdown raised IndexError).
    """
    entries = hotel_rating_breakdown.strip('[]').split('], [')
    if len(entries) < 5:
        return None
    fields = entries[4].split(',')
    if len(fields) < 2 or fields[0] != "'Staff'":
        return None
    # Drop the space after the comma, then the surrounding single quotes.
    return float(fields[1].strip(' ').strip("''"))
    
def extract_value_rating(hotel_rating_breakdown):
    """Return the 'Value for money' score from a stringified breakdown.

    The breakdown is text shaped like ``"[['Cleanliness', '9.8'], ...]"`` and
    the 'Value for money' entry is expected in the sixth slot.

    Returns:
        The score as a float, or ``None`` when the breakdown has fewer than
        six entries or the sixth entry is not a ``'Value for money'``
        label/score pair (previously a short breakdown raised IndexError).
    """
    entries = hotel_rating_breakdown.strip('[]').split('], [')
    if len(entries) < 6:
        return None
    fields = entries[5].split(',')
    if len(fields) < 2 or fields[0] != "'Value for money'":
        return None
    # Drop the space after the comma, then the surrounding single quotes.
    return float(fields[1].strip(' ').strip("''"))
    
def extract_wifi_rating(hotel_rating_breakdown):
    """Return the 'Free WiFi' score from a stringified rating breakdown.

    The WiFi entry is looked up in the seventh slot.  ``None`` is returned
    when the breakdown has fewer than seven entries or when the seventh
    entry carries a different label.
    """
    entries = hotel_rating_breakdown.strip('[]').split('], [')
    if len(entries) >= 7:
        fields = entries[6].split(',')
        if fields[0] == "'Free WiFi'":
            # Drop the space after the comma, then the single quotes.
            return float(fields[1].strip(' ').strip("''"))
    return None
    
def filter_zip_code(hotel_address_zip_code):
    """Keep a ZIP code only if its first three characters are whitelisted.

    Returns the input unchanged when its three-character prefix is in the
    allowed list, otherwise ``None``.
    """
    allowed_prefixes = (
        '100', '101', '102', '103', '104', '107',
        '110', '111', '112', '113', '114', '116',
    )
    if hotel_address_zip_code[:3] in allowed_prefixes:
        return hotel_address_zip_code
    return None

### Import dataset

In [1077]:
# Load the scraped hotel listing.  Only '\n' is treated as a line terminator,
# so any '\r' before it stays attached to the last field of each row.
df_hotel_detailed_info_list = pd.read_csv(
    './datasets/nys_hotel_detailed_info_list.csv',
    lineterminator='\n',
)

### Cleaning and pre-processing

In [1078]:
# The last header carries a trailing carriage return; give it a clean name.
df_hotel_detailed_info_list = df_hotel_detailed_info_list.rename(
    columns={'hotel_room_price\r': 'hotel_room_price'},
)

In [1079]:
# Derive the three-character ZIP prefix from the free-text address.
df_hotel_detailed_info_list['hotel_address_zip_code'] = (
    df_hotel_detailed_info_list['hotel_address'].apply(extract_zip_code)
)

In [1080]:
# Parse the raw price strings into floats (None where no price was scraped).
df_hotel_detailed_info_list['hotel_room_price'] = (
    df_hotel_detailed_info_list['hotel_room_price'].apply(extract_price)
)
# Drop rows without a price.  The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
df_hotel_detailed_info_list = df_hotel_detailed_info_list[
    df_hotel_detailed_info_list['hotel_room_price'].notna()
].copy()

In [1081]:
# Replace the capacity text with the integer parsed by extract_capacity.
df_hotel_detailed_info_list['hotel_room_capacity'] = (
    df_hotel_detailed_info_list['hotel_room_capacity'].apply(extract_capacity)
)

In [1082]:
# Label each hotel with a chain (or 'No chain') based on its name.
df_hotel_detailed_info_list['hotel_chain'] = (
    df_hotel_detailed_info_list['hotel_name'].apply(extract_chain)
)

In [1083]:
# One new column per rating category, each parsed out of the stringified
# 'hotel_rating_breakdown' cell.  Dict order fixes the column order.
rating_extractors = {
    'hotel_cleanliness_rating': extract_cleanliness_rating,
    'hotel_comfort_rating': extract_comfort_rating,
    'hotel_location_rating': extract_location_rating,
    'hotel_facilities_rating': extract_facilities_rating,
    'hotel_staff_rating': extract_staff_rating,
    'hotel_value_rating': extract_value_rating,
    'hotel_wifi_rating': extract_wifi_rating,
}
for rating_column, rating_extractor in rating_extractors.items():
    df_hotel_detailed_info_list[rating_column] = (
        df_hotel_detailed_info_list['hotel_rating_breakdown'].apply(rating_extractor)
    )

In [1084]:
# Median imputation for hotels without a WiFi score.
hotel_wifi_median = df_hotel_detailed_info_list['hotel_wifi_rating'].median()
# Assign the filled column back instead of calling fillna(inplace=True) on
# the selected column: the chained in-place form is deprecated in pandas and
# does not update the frame when Copy-on-Write is enabled.
df_hotel_detailed_info_list['hotel_wifi_rating'] = (
    df_hotel_detailed_info_list['hotel_wifi_rating'].fillna(hotel_wifi_median)
)

In [1085]:
# Convert star labels to floats; unrecognised labels become missing values.
df_hotel_detailed_info_list['hotel_stars'] = (
    df_hotel_detailed_info_list['hotel_stars'].apply(extract_hotel_stars)
)

### Create copy of dataframe

In [1086]:
# Independent (deep) copy, so the filtering and imputation below leave the
# cleaned source frame untouched.
df_hotel_detailed_info_list_filtered = df_hotel_detailed_info_list.copy(deep=True)

### Filter on zip codes

In [1087]:
#df_hotel_detailed_info_list_filtered['hotel_address_zip_code'] = df_hotel_detailed_info_list_filtered['hotel_address_zip_code'].apply(filter_zip_code)
#df_hotel_detailed_info_list_filtered = df_hotel_detailed_info_list_filtered[~pd.isnull(df_hotel_detailed_info_list_filtered['hotel_address_zip_code'])]

### Filter on hotel stars

In [1088]:
#df_hotel_detailed_info_list_filtered = df_hotel_detailed_info_list_filtered[(df_hotel_detailed_info_list_filtered['hotel_stars']==3.0) | (df_hotel_detailed_info_list_filtered['hotel_stars']==4.0)]

### Median imputation for hotel stars (if not filtered)

In [1089]:
# Median imputation for hotels whose star label could not be parsed.
hotel_stars_median = df_hotel_detailed_info_list_filtered['hotel_stars'].median()
# Assign the filled column back instead of calling fillna(inplace=True) on
# the selected column: the chained in-place form is deprecated in pandas and
# does not update the frame when Copy-on-Write is enabled.
df_hotel_detailed_info_list_filtered['hotel_stars'] = (
    df_hotel_detailed_info_list_filtered['hotel_stars'].fillna(hotel_stars_median)
)

### Create price per person column

In [1090]:
# Modelling target: room price divided by room capacity.
df_hotel_detailed_info_list_filtered['hotel_room_price_per_person'] = (
    df_hotel_detailed_info_list_filtered['hotel_room_price']
    / df_hotel_detailed_info_list_filtered['hotel_room_capacity']
)

### Remove outlier

In [1091]:
# Exclude the one hotel treated as an outlier, matched by exact name.
is_outlier = (
    df_hotel_detailed_info_list_filtered['hotel_name'] == 'Del Lago Resort & Casino'
)
df_hotel_detailed_info_list_filtered = df_hotel_detailed_info_list_filtered[~is_outlier]

In [918]:
# Preview the first rows of the cleaned and filtered frame.
df_hotel_detailed_info_list_filtered.head()

Unnamed: 0,hotel_name,hotel_stars,hotel_address,hotel_overall_rating,hotel_rating_breakdown,hotel_room_name,hotel_room_capacity,hotel_room_price,hotel_address_zip_code,hotel_chain,hotel_cleanliness_rating,hotel_comfort_rating,hotel_location_rating,hotel_facilities_rating,hotel_staff_rating,hotel_value_rating,hotel_wifi_rating,hotel_room_price_per_person
0,Adelphi Hotel,4.0,"365 Broadway, Saratoga Springs, NY 12866, Unit...",9.8,"[['Cleanliness', '10.0'], ['Comfort', '10.0'],...",Premier King Room,2,235.0,128,No chain,10.0,10.0,9.9,10.0,9.6,9.4,10.0,117.5
2,Homewood Suites By Hilton Saratoga Springs,3.0,"3368 South Broadway , Saratoga Springs, NY 1...",9.6,"[['Cleanliness', '9.8'], ['Comfort', '9.8'], [...",One-Bedroom King Suite with Balcony - Non-Smoking,3,194.0,128,Hilton,9.8,9.8,9.4,9.8,9.7,9.3,8.9,64.666667
3,Aqualina Inn Montauk,3.0,"20 S Elmwood Avenue, Montauk, NY 11954, United...",9.5,"[['Cleanliness', '9.8'], ['Comfort', '9.6'], [...",Queen Room with Courtyard,2,159.0,119,No chain,9.8,9.6,9.7,9.4,9.7,9.0,9.8,79.5
4,Watkins Glen Harbor Hotel,3.0,"16 N. Franklin St., Watkins Glen, NY 14891, Un...",9.5,"[['Cleanliness', '9.8'], ['Comfort', '9.8'], [...",King Room with Village View,2,249.0,148,No chain,9.8,9.8,9.7,9.6,9.6,8.8,9.0,124.5
5,The Whitby Hotel,5.0,"18 West 56th Street, New York, NY 10019, Unite...",9.5,"[['Cleanliness', '9.9'], ['Comfort', '9.8'], [...",Superior Room,2,770.0,100,No chain,9.9,9.8,9.6,9.6,9.7,8.4,9.3,385.0


In [1013]:
# List the distinct values present in the hotel_chain column.
df_hotel_detailed_info_list_filtered['hotel_chain'].unique()

array(['No chain', 'Homewood Suites Hilton', 'Embassy', 'Hampton',
       'Marriott', 'Courtyard Marriott', 'Holiday Inn',
       'Fairfield Inn Marriott', 'Wyndham', 'Best Western', 'Hyatt',
       'Doubletree Hilton', 'Radisson', 'Westin', 'Red Roof', 'Sheraton'],
      dtype=object)

In [1048]:
# Print, lower-cased, every hotel name that mentions 'hilton'.
for hotel_name in df_hotel_detailed_info_list['hotel_name']:
    lowered_name = hotel_name.lower()
    if 'hilton' in lowered_name:
        print(lowered_name)

homewood suites by hilton saratoga springs
homewood suites by hilton clifton park
hilton garden inn corning downtown
homewood suites by hilton poughkeepsie
hampton inn by hilton new paltz, ny
home2 suites by hilton queensbury glens falls
tru by hilton syracuse north airport area
tru by hilton syracuse-camillus
home2 suites by hilton oswego
home2 suites by hilton saratoga malta
tru by hilton williamsville buffalo airport
doubletree by hilton hotel utica
tailwater lodge altmar, tapestry collection by hilton
hilton garden inn clifton park
homewood suites by hilton new hartford utica
embassy suites by hilton syracuse destiny usa
doubletree by hilton nanuet
hilton garden inn auburn
homewood suites by hilton syracuse - carrier circle
hotel saranac, curio collection by hilton
home2 suites by hilton williamsville buffalo airport
hilton garden inn nyc financial center/manhattan downtown
homewood suites by hilton albany crossgates mall
hilton garden inn watertown
hampton inn & suites by hilton -

## Modeling

### Create features and target

In [1092]:
# Model inputs: star rating, overall and per-category review scores, plus
# the two categorical columns (ZIP prefix and chain).
feature_columns = [
    'hotel_stars',
    'hotel_overall_rating',
    'hotel_address_zip_code',
    'hotel_chain',
    'hotel_cleanliness_rating',
    'hotel_comfort_rating',
    'hotel_location_rating',
    'hotel_facilities_rating',
    'hotel_staff_rating',
    'hotel_value_rating',
    'hotel_wifi_rating',
]
df_features = df_hotel_detailed_info_list_filtered[feature_columns]
# Target kept as a one-column DataFrame (double brackets).
df_target = df_hotel_detailed_info_list_filtered[['hotel_room_price_per_person']]

In [1093]:
# Preview the first rows of the feature frame.
df_features.head()

Unnamed: 0,hotel_stars,hotel_overall_rating,hotel_address_zip_code,hotel_chain,hotel_cleanliness_rating,hotel_comfort_rating,hotel_location_rating,hotel_facilities_rating,hotel_staff_rating,hotel_value_rating,hotel_wifi_rating
0,4.0,9.8,128,No chain,10.0,10.0,9.9,10.0,9.6,9.4,10.0
2,3.0,9.6,128,Homewood Suites Hilton,9.8,9.8,9.4,9.8,9.7,9.3,8.9
3,3.0,9.5,119,No chain,9.8,9.6,9.7,9.4,9.7,9.0,9.8
4,3.0,9.5,148,No chain,9.8,9.8,9.7,9.6,9.6,8.8,9.0
5,5.0,9.5,100,No chain,9.9,9.8,9.6,9.6,9.7,8.4,9.3


### Standardize numerical features & create dummy variables for categorical features

In [1094]:
# Columns that are one-hot encoded rather than standardised.
categorical_features = [
    'hotel_address_zip_code',
    'hotel_chain',
]

In [1095]:
# Work on a copy so df_features keeps the unscaled values.
train_test_scaled_inputs = df_features.copy()

# Every column that is not listed as categorical is treated as numeric.
train_test_num_features = train_test_scaled_inputs.drop(columns=categorical_features)
train_test_num_features_col = train_test_num_features.columns.values

# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/validation split, so validation rows influence the scaling
# statistics - consider fitting on the training split only.
scaler = StandardScaler()
train_test_num_features = scaler.fit_transform(train_test_num_features.values)

# Write the standardised values back over the original numeric columns.
train_test_scaled_inputs[train_test_num_features_col] = train_test_num_features

In [1096]:
# One-hot encode the categorical columns; drop_first removes the first level
# of each so the dummies are not perfectly collinear.
train_test_scaled_inputs = pd.get_dummies(
    train_test_scaled_inputs,
    columns=categorical_features,
    drop_first=True,
)

### Train-validation split

In [1097]:
# Hold out 20% of the rows for validation; fixed seed for reproducibility.
x_train, x_val, y_train, y_val = train_test_split(
    train_test_scaled_inputs,
    df_target,
    test_size=0.2,
    random_state=42,
)

### Random forest

In [1098]:
# 200 trees, at most 10 candidate features per split, all cores, fixed seed.
rf = RandomForestRegressor(
    n_estimators=200,
    max_features=10,
    random_state=42,
    n_jobs=-1,
)
# The target is a one-column DataFrame; ravel() flattens it to 1-D for fit.
rf.fit(x_train, y_train.values.ravel())

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features=10, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [1099]:
# Predict the per-person price for both splits with the fitted forest.
y_train_rf_pred = rf.predict(x_train)
y_val_rf_pred = rf.predict(x_val)

In [1100]:
# R^2 and adjusted R^2 on each split.  Adjusted R^2 rescales (1 - R^2) by
# (n - 1) / (n - p - 1), with n rows and p feature columns.
n_train, n_features = x_train.shape
n_val = x_val.shape[0]

R2_train = rf.score(x_train, y_train)
adj_R2_train = 1 - (1 - R2_train) * (n_train - 1) / (n_train - n_features - 1)

R2_val = rf.score(x_val, y_val)
adj_R2_val = 1 - (1 - R2_val) * (n_val - 1) / (n_val - x_val.shape[1] - 1)

print(R2_train, adj_R2_train, sep='\n')
print()
print(R2_val, adj_R2_val, sep='\n')

0.9476565308354119
0.9442399696681836

0.612415638605408
0.4864507211521657


In [1101]:
# First ten actual validation targets.
y_val[:10]

Unnamed: 0,hotel_room_price_per_person
1022,249.5
1257,174.5
1444,40.5
398,175.0
1809,147.5
108,499.5
1361,55.0
615,238.0
1214,30.0
759,80.5


In [1102]:
# First ten random-forest predictions on the validation split.
y_val_rf_pred[:10]

array([177.63166667, 152.43583333,  45.92625   , 120.73233333,
       108.515     , 125.27      ,  57.9625    , 164.40841667,
        45.22833333,  71.63125   ])

### LinearRegression

In [672]:
# Linear-regression baseline on the same design matrix.  y_train is passed
# as a one-column DataFrame here (it is not ravelled as for the forest).
reg = LinearRegression()
reg.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [673]:
# Intercept of the fitted linear model.
const = reg.intercept_
print ('intercept = ',const)

# Pair every design-matrix column with its fitted coefficient; coef_ is 2-D
# here, so row 0 holds the weights for the single target.
reg_summary = pd.DataFrame({
    'Features': train_test_scaled_inputs.columns.values,
    'Weights': reg.coef_[0],
})
reg_summary

intercept =  [141.29120285]


Unnamed: 0,Features,Weights
0,hotel_stars,14.455863
1,hotel_overall_rating,20.145971
2,hotel_cleanliness_rating,-20.179418
3,hotel_comfort_rating,53.219789
4,hotel_location_rating,4.068554
...,...,...
64,hotel_chain_Radisson,-0.868696
65,hotel_chain_Red Roof,24.729815
66,hotel_chain_Sheraton,-6.732327
67,hotel_chain_Westin,-2.144409


In [674]:
# Validation predictions from the linear model.
y_hat_val = reg.predict(x_val)

# R^2 and adjusted R^2 (n rows, p feature columns) on the validation split.
R2_val=reg.score(x_val,y_val)
adj_R2_val=1-(1-R2_val)*(x_val.shape[0]-1)/(x_val.shape[0]-x_val.shape[1]-1)
print ('R2_val = ',R2_val)
print ('adj_R2_val = ',adj_R2_val)

# RMSE as the square root of the MSE.  The `squared=False` keyword of
# mean_squared_error is deprecated and removed in recent scikit-learn
# releases; taking the root explicitly works on every version.
print ('RMSE_val = ',float(np.sqrt(mean_squared_error(y_val, y_hat_val))))

R2_val =  -1.483398550668115e+22
adj_R2_val =  -1.894460799648436e+22
RMSE_val =  9940511876796.78


In [283]:
# Display the linear model's validation predictions.
y_hat_val

array([[ 3.16972421e+01],
       [ 3.16331939e+02],
       [ 1.81883549e+02],
       [ 3.14903971e+02],
       [ 4.92801503e+02],
       [ 2.23342205e+02],
       [ 2.01695220e+02],
       [ 3.30194093e+02],
       [ 3.37569372e+02],
       [ 6.75847899e+01],
       [ 7.52152020e+01],
       [ 7.09263241e+01],
       [ 1.36765691e+02],
       [ 1.34418889e+02],
       [ 2.64710123e+02],
       [ 7.99653062e+01],
       [ 4.16272908e+02],
       [ 5.05386440e+02],
       [ 5.00535356e+01],
       [ 1.54768680e+02],
       [ 3.04287479e+01],
       [ 1.51090899e+02],
       [ 3.97930510e+01],
       [ 1.27131527e+02],
       [ 4.05865590e+02],
       [ 4.59347164e+02],
       [ 1.35454270e+02],
       [ 7.11582682e+00],
       [ 2.42036715e+02],
       [ 3.45421159e+02],
       [ 1.39279192e+02],
       [ 2.08145003e+02],
       [ 2.56388260e+01],
       [ 2.10702914e+02],
       [ 6.64939123e+01],
       [ 2.30753616e+02],
       [ 8.98974995e+01],
       [ 3.54672337e+01],
       [ 5.1