In [122]:

#All required import
#We will take X1 as our price variable
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(color_codes=True)
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score
#from sklearn.metrics import average_precision_score
#from sklearn.metrics import f1_score

In [127]:
#Read data

def read_data(file_name):
    df = pd.read_csv(file_name)    
    return df


def explore_data(df):
    print("Null check : ")
    null_values = df.isnull().sum()
    print(null_values)      
    print('-----------------------------------------------------------------------')
    print('Duplicate rows in data : ')
    duplicates = df[df.duplicated()]
    print(duplicates.shape)
    
    
def unique_categorical_vals(df, col_name):
    count_vals = df[col_name].nunique()
    col_vals = df[col_name].unique()    
    print('-----------------------------------------------------------------------')
    print('Number of unique values : ', count_vals)
    print('-----------------------------------------------------------------------')
    print("Unique values in column : ", col_vals)
    
    
def encode_data(df, col_names):
    print("Encode column : ", col_names)    
    enc = pd.get_dummies(data=df, prefix='', prefix_sep='', columns = col_names, drop_first=False)
    return enc


def data_cleaning(df, col_name, init_string, final_string):
    df[col_name] = df[col_name].str.replace(init_string, final_string)
    df[col_name] = df[col_name].apply(pd.to_numeric)    
    return df

def drop_cols(df, col_name):
    df = df.drop(col_name, axis=1)
    return df

def rename_columns(df, old_name, new_name):
    df.rename(columns = {old_name:new_name}, inplace=True)
    return df

def train_model(x,y,x_test, y_test, model_name):
    model = model_name
    model.fit(x,y)
    pred_y_test = predict_y(model,x_test)
    accuracy_model(y_test, pred_y_test)
    return model

def predict_y(model, x):
    predicted_y = model.predict(x)
    return predicted_y

def accuracy_model(y_test,y_pred):
    rmse_rf= (mean_squared_error(y_test,y_pred))**(1/2)
    print('RMSE test: %.3f' % rmse_rf)
    print('R^2 test: %.3f' % (r2_score(y_test, y_pred)))
    print('Explained variance score : ', explained_variance_score(y_test, y_pred))
    #print('Average precision score : ' % (average_precision_score(y_test, y_pred)))
    #print('f1 score : ' % (f1_score(y_test, y_pred)))
    
    
    

In [53]:
df = read_data('price_train.csv')
df.head(5)

Unnamed: 0,X1,id,host_is_superhost,host_response_rate,host_response_time,host_listings_count,host_identity_verified,accommodates,neighbourhood_group_cleansed,property_type,...,bedrooms,beds,bed_type,amenities,cleaning_fee,minimum_nights,maximum_nights,instant_bookable,cancellation_policy,price
0,5460,22146017,False,99%,within an hour,521,False,5,Downtown,Apartment,...,2,2,Real Bed,"{TV,Internet,Wifi,Kitchen,Elevator,Heating,""Fa...",$129.00,2,1125,True,strict,$279.00
1,4143,18638163,True,100%,within an hour,1,True,2,Queen Anne,House,...,0,1,Real Bed,"{TV,""Cable TV"",Internet,Wifi,""Air conditioning...",$50.00,2,30,True,moderate,$99.00
2,5802,22734110,True,100%,within an hour,1,False,4,Rainier Valley,Guest suite,...,1,2,Real Bed,"{TV,Wifi,""Air conditioning"",Kitchen,""Free stre...",$25.00,2,28,False,moderate,$75.00
3,776,3946674,True,90%,within a few hours,1,True,2,Central Area,House,...,1,1,Real Bed,"{Internet,Wifi,Kitchen,""Pets live on this prop...",$0.00,2,30,False,flexible,$70.00
4,6064,23610186,True,100%,within an hour,4,True,8,Beacon Hill,House,...,3,3,Real Bed,"{TV,Wifi,Kitchen,""Free parking on premises"",""S...",$99.00,2,12,False,flexible,$242.00


In [54]:
explore_data(df)

Null check : 
X1                              0
id                              0
host_is_superhost               0
host_response_rate              0
host_response_time              0
host_listings_count             0
host_identity_verified          0
accommodates                    0
neighbourhood_group_cleansed    0
property_type                   0
room_type                       0
latitude                        0
longitude                       0
guests_included                 0
bathrooms                       0
bedrooms                        0
beds                            0
bed_type                        0
amenities                       0
cleaning_fee                    0
minimum_nights                  0
maximum_nights                  0
instant_bookable                0
cancellation_policy             0
price                           0
dtype: int64
-----------------------------------------------------------------------
Duplicate rows in data : 
(0, 25)


In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3466 entries, 0 to 3465
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   X1                            3466 non-null   int64  
 1   id                            3466 non-null   int64  
 2   host_is_superhost             3466 non-null   bool   
 3   host_response_rate            3466 non-null   object 
 4   host_response_time            3466 non-null   object 
 5   host_listings_count           3466 non-null   int64  
 6   host_identity_verified        3466 non-null   bool   
 7   accommodates                  3466 non-null   int64  
 8   neighbourhood_group_cleansed  3466 non-null   object 
 9   property_type                 3466 non-null   object 
 10  room_type                     3466 non-null   object 
 11  latitude                      3466 non-null   float64
 12  longitude                     3466 non-null   float64
 13  gue

In [56]:
unique_categorical_vals(df, 'host_response_rate')

-----------------------------------------------------------------------
Number of unique values :  32
-----------------------------------------------------------------------
Unique values in column :  ['99%' '100%' '90%' '75%' '86%' '92%' '70%' '98%' '67%' '91%' '84%' '0%'
 '80%' '56%' '85%' '50%' '57%' '71%' '93%' '97%' '95%' '96%' '88%' '78%'
 '40%' '94%' '83%' '89%' '33%' '60%' '87%' '62%']


In [67]:
unique_categorical_vals(df, 'price')

-----------------------------------------------------------------------
Number of unique values :  306
-----------------------------------------------------------------------
Unique values in column :  ['$279.00' '$99.00' '$75.00' '$70.00' '$242.00' '$430.00' '$60.00'
 '$125.00' '$225.00' '$219.00' '$50.00' '$200.00' '$95.00' '$180.00'
 '$53.00' '$100.00' '$190.00' '$420.00' '$65.00' '$146.00' '$109.00'
 '$45.00' '$119.00' '$105.00' '$350.00' '$280.00' '$400.00' '$595.00'
 '$55.00' '$150.00' '$500.00' '$40.00' '$195.00' '$220.00' '$269.00'
 '$88.00' '$79.00' '$120.00' '$185.00' '$550.00' '$80.00' '$495.00'
 '$90.00' '$175.00' '$148.00' '$10.00' '$85.00' '$295.00' '$89.00'
 '$229.00' '$999.00' '$199.00' '$1,002.00' '$141.00' '$599.00' '$650.00'
 '$145.00' '$39.00' '$159.00' '$149.00' '$163.00' '$46.00' '$250.00'
 '$386.00' '$527.00' '$47.00' '$179.00' '$375.00' '$189.00' '$152.00'
 '$315.00' '$450.00' '$299.00' '$78.00' '$110.00' '$59.00' '$103.00'
 '$750.00' '$98.00' '$130.00' '$302.00

In [57]:
unique_categorical_vals(df, 'host_response_time')

-----------------------------------------------------------------------
Number of unique values :  4
-----------------------------------------------------------------------
Unique values in column :  ['within an hour' 'within a few hours' 'within a day' 'a few days or more']


In [58]:
unique_categorical_vals(df, 'neighbourhood_group_cleansed')

-----------------------------------------------------------------------
Number of unique values :  17
-----------------------------------------------------------------------
Unique values in column :  ['Downtown' 'Queen Anne' 'Rainier Valley' 'Central Area' 'Beacon Hill'
 'Magnolia' 'Other neighborhoods' 'Northgate' 'Cascade'
 'University District' 'West Seattle' 'Capitol Hill' 'Lake City' 'Ballard'
 'Delridge' 'Seward Park' 'Interbay']


In [59]:
unique_categorical_vals(df, 'property_type')

-----------------------------------------------------------------------
Number of unique values :  26
-----------------------------------------------------------------------
Unique values in column :  ['Apartment' 'House' 'Guest suite' 'Townhouse' 'Bungalow' 'Condominium'
 'Loft' 'Serviced apartment' 'Guesthouse' 'Houseboat' 'Aparthotel'
 'Bed and breakfast' 'Camper/RV' 'Tiny house' 'Cabin' 'Boat' 'Cottage'
 'Other' 'Farm stay' 'Hostel' 'Tent' 'Resort' 'Yurt' 'In-law'
 'Boutique hotel' 'Villa']


In [60]:
unique_categorical_vals(df, 'room_type')

-----------------------------------------------------------------------
Number of unique values :  3
-----------------------------------------------------------------------
Unique values in column :  ['Entire home/apt' 'Private room' 'Shared room']


In [61]:
unique_categorical_vals(df, 'bed_type')

-----------------------------------------------------------------------
Number of unique values :  4
-----------------------------------------------------------------------
Unique values in column :  ['Real Bed' 'Futon' 'Pull-out Sofa' 'Airbed']


In [62]:
unique_categorical_vals(df, 'amenities')

-----------------------------------------------------------------------
Number of unique values :  3061
-----------------------------------------------------------------------
Unique values in column :  ['{TV,Internet,Wifi,Kitchen,Elevator,Heating,"Family/kid friendly",Washer,Dryer,"Smoke detector","Carbon monoxide detector","First aid kit","Safety card","Fire extinguisher",Essentials,Shampoo,Hangers,"Hair dryer",Iron,"Laptop friendly workspace","Self check-in","Smart lock",Keypad,"Private living room"}'
 '{TV,"Cable TV",Internet,Wifi,"Air conditioning",Kitchen,"Pets live on this property","Free street parking","Indoor fireplace",Heating,Washer,Dryer,"Smoke detector","Carbon monoxide detector","First aid kit","Safety card",Essentials,Shampoo,Hangers,"Hair dryer",Iron,"Laptop friendly workspace","translation missing: en.hosting_amenity_49","translation missing: en.hosting_amenity_50","Self check-in",Keypad,"Private entrance","Hot water","Bed linens",Microwave,"Coffee maker",Refrigerator

In [63]:
unique_categorical_vals(df, 'cleaning_fee')

-----------------------------------------------------------------------
Number of unique values :  154
-----------------------------------------------------------------------
Unique values in column :  ['$129.00' '$50.00' '$25.00' '$0.00' '$99.00' '$100.00' '$20.00' '$110.00'
 '$89.00' '$35.00' '$10.00' '$60.00' '$30.00' '$75.00' '$55.00' '$159.00'
 '$40.00' '$80.00' '0' '$200.00' '$95.00' '$165.00' '$275.00' '$325.00'
 '$150.00' '$71.00' '$130.00' '$15.00' '$77.00' '$250.00' '$300.00'
 '$85.00' '$70.00' '$185.00' '$175.00' '$135.00' '$125.00' '$45.00'
 '$82.00' '$225.00' '$65.00' '$140.00' '$5.00' '$350.00' '$160.00'
 '$115.00' '$61.00' '$47.00' '$49.00' '$109.00' '$9.00' '$149.00'
 '$145.00' '$98.00' '$353.00' '$29.00' '$90.00' '$78.00' '$84.00'
 '$230.00' '$107.00' '$59.00' '$169.00' '$72.00' '$88.00' '$120.00'
 '$179.00' '$170.00' '$114.00' '$195.00' '$64.00' '$400.00' '$323.00'
 '$83.00' '$74.00' '$96.00' '$16.00' '$319.00' '$19.00' '$119.00'
 '$105.00' '$69.00' '$249.00' '$24.00'

In [64]:
unique_categorical_vals(df, 'instant_bookable')

-----------------------------------------------------------------------
Number of unique values :  2
-----------------------------------------------------------------------
Unique values in column :  [ True False]


In [65]:
unique_categorical_vals(df, 'cancellation_policy')

-----------------------------------------------------------------------
Number of unique values :  6
-----------------------------------------------------------------------
Unique values in column :  ['strict' 'moderate' 'flexible' 'strict_14_with_grace_period'
 'super_strict_60' 'super_strict_30']


In [66]:
#Encode the data 
col_names = ['host_response_time', 'room_type', 'bed_type', 'cancellation_policy']

df = encode_data(df, col_names)
df.info()

Encode column :  ['host_response_time', 'room_type', 'bed_type', 'cancellation_policy']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3466 entries, 0 to 3465
Data columns (total 38 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   X1                            3466 non-null   int64  
 1   id                            3466 non-null   int64  
 2   host_is_superhost             3466 non-null   bool   
 3   host_response_rate            3466 non-null   object 
 4   host_listings_count           3466 non-null   int64  
 5   host_identity_verified        3466 non-null   bool   
 6   accommodates                  3466 non-null   int64  
 7   neighbourhood_group_cleansed  3466 non-null   object 
 8   property_type                 3466 non-null   object 
 9   latitude                      3466 non-null   float64
 10  longitude                     3466 non-null   float64
 11  guests_included               346

In [68]:
df.head(10)

Unnamed: 0,X1,id,host_is_superhost,host_response_rate,host_listings_count,host_identity_verified,accommodates,neighbourhood_group_cleansed,property_type,latitude,...,Airbed,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,strict_14_with_grace_period,super_strict_30,super_strict_60
0,5460,22146017,False,99%,521,False,5,Downtown,Apartment,47.612163,...,0,0,0,1,0,0,1,0,0,0
1,4143,18638163,True,100%,1,True,2,Queen Anne,House,47.640827,...,0,0,0,1,0,1,0,0,0,0
2,5802,22734110,True,100%,1,False,4,Rainier Valley,Guest suite,47.549007,...,0,0,0,1,0,1,0,0,0,0
3,776,3946674,True,90%,1,True,2,Central Area,House,47.601922,...,0,0,0,1,1,0,0,0,0,0
4,6064,23610186,True,100%,4,True,8,Beacon Hill,House,47.552311,...,0,0,0,1,1,0,0,0,0,0
5,2757,13533608,False,100%,4,True,8,Magnolia,Townhouse,47.649938,...,0,0,0,1,0,0,0,1,0,0
6,7786,29396738,True,100%,1,False,3,Other neighborhoods,Bungalow,47.678508,...,0,0,0,1,0,1,0,0,0,0
7,7012,26667492,False,100%,9,False,4,Downtown,Condominium,47.612819,...,0,0,0,1,0,0,0,1,0,0
8,6019,23417458,True,100%,2,False,2,Downtown,Apartment,47.606281,...,0,0,0,1,0,0,0,1,0,0
9,3766,17465443,False,99%,152,False,2,Downtown,Loft,47.612551,...,0,0,0,1,0,0,0,1,0,0


In [70]:
df = data_cleaning(df, 'host_response_rate', '%', '')
df.head(10)

Unnamed: 0,X1,id,host_is_superhost,host_response_rate,host_listings_count,host_identity_verified,accommodates,neighbourhood_group_cleansed,property_type,latitude,...,Airbed,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,strict_14_with_grace_period,super_strict_30,super_strict_60
0,5460,22146017,False,99,521,False,5,Downtown,Apartment,47.612163,...,0,0,0,1,0,0,1,0,0,0
1,4143,18638163,True,100,1,True,2,Queen Anne,House,47.640827,...,0,0,0,1,0,1,0,0,0,0
2,5802,22734110,True,100,1,False,4,Rainier Valley,Guest suite,47.549007,...,0,0,0,1,0,1,0,0,0,0
3,776,3946674,True,90,1,True,2,Central Area,House,47.601922,...,0,0,0,1,1,0,0,0,0,0
4,6064,23610186,True,100,4,True,8,Beacon Hill,House,47.552311,...,0,0,0,1,1,0,0,0,0,0
5,2757,13533608,False,100,4,True,8,Magnolia,Townhouse,47.649938,...,0,0,0,1,0,0,0,1,0,0
6,7786,29396738,True,100,1,False,3,Other neighborhoods,Bungalow,47.678508,...,0,0,0,1,0,1,0,0,0,0
7,7012,26667492,False,100,9,False,4,Downtown,Condominium,47.612819,...,0,0,0,1,0,0,0,1,0,0
8,6019,23417458,True,100,2,False,2,Downtown,Apartment,47.606281,...,0,0,0,1,0,0,0,1,0,0
9,3766,17465443,False,99,152,False,2,Downtown,Loft,47.612551,...,0,0,0,1,0,0,0,1,0,0


In [80]:
df = data_cleaning(df, 'cleaning_fee', '$', '')

AttributeError: Can only use .str accessor with string values!

In [81]:
df = data_cleaning(df, 'price', '$', '')

ValueError: Unable to parse string "1,002.00" at position 0

In [82]:
df = data_cleaning(df, 'price', ',', '')

In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3466 entries, 0 to 3465
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   X1                           3466 non-null   int64  
 1   id                           3466 non-null   int64  
 2   host_is_superhost            3466 non-null   bool   
 3   host_response_rate           3466 non-null   int64  
 4   host_listings_count          3466 non-null   int64  
 5   host_identity_verified       3466 non-null   bool   
 6   accommodates                 3466 non-null   int64  
 7   latitude                     3466 non-null   float64
 8   longitude                    3466 non-null   float64
 9   guests_included              3466 non-null   int64  
 10  bathrooms                    3466 non-null   float64
 11  bedrooms                     3466 non-null   int64  
 12  beds                         3466 non-null   int64  
 13  cleaning_fee      

In [85]:
df = drop_cols(df, ['property_type', 'neighbourhood_group_cleansed', 'amenities'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3466 entries, 0 to 3465
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   X1                           3466 non-null   int64  
 1   id                           3466 non-null   int64  
 2   host_is_superhost            3466 non-null   bool   
 3   host_response_rate           3466 non-null   int64  
 4   host_listings_count          3466 non-null   int64  
 5   host_identity_verified       3466 non-null   bool   
 6   accommodates                 3466 non-null   int64  
 7   latitude                     3466 non-null   float64
 8   longitude                    3466 non-null   float64
 9   guests_included              3466 non-null   int64  
 10  bathrooms                    3466 non-null   float64
 11  bedrooms                     3466 non-null   int64  
 12  beds                         3466 non-null   int64  
 13  cleaning_fee      

In [87]:
df= rename_columns(df, 'a few days or more', 'a_few_days_or_more')

In [89]:
df= rename_columns(df, 'within a day', 'within_a_day')
df= rename_columns(df, 'within a few hours', 'within_a_few_hours')
df= rename_columns(df, 'within an hour', 'within_an_hour')
df= rename_columns(df, 'Entire home/apt', 'Entire_home_apt')
df= rename_columns(df, 'Private room', 'Private_room')
df= rename_columns(df, 'Shared room', 'Shared_room')
df= rename_columns(df, 'Pull-out Sofa', 'Pull_out_Sofa')
df= rename_columns(df, 'Real Bed', 'Real_Bed')

In [91]:
df = drop_cols(df, ['X1'])

In [93]:
df = drop_cols(df, ['id'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3466 entries, 0 to 3465
Data columns (total 33 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   host_is_superhost            3466 non-null   bool   
 1   host_response_rate           3466 non-null   int64  
 2   host_listings_count          3466 non-null   int64  
 3   host_identity_verified       3466 non-null   bool   
 4   accommodates                 3466 non-null   int64  
 5   latitude                     3466 non-null   float64
 6   longitude                    3466 non-null   float64
 7   guests_included              3466 non-null   int64  
 8   bathrooms                    3466 non-null   float64
 9   bedrooms                     3466 non-null   int64  
 10  beds                         3466 non-null   int64  
 11  cleaning_fee                 3466 non-null   float64
 12  minimum_nights               3466 non-null   int64  
 13  maximum_nights    

In [96]:
y = df['price']
x = drop_cols(df, ['price'])

In [97]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=1)

In [128]:
#Random forest regressor
rf_reg = train_model(x_train, y_train,x_test, y_test, RandomForestRegressor())

RMSE test: 73.547
R^2 test: 0.733
Explained variance score :  0.7329832000403642


In [129]:
#Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor
decTree_reg = train_model(x_train, y_train,x_test, y_test, DecisionTreeRegressor(criterion='squared_error', splitter='best', max_depth=3))

RMSE test: 90.736
R^2 test: 0.593
Explained variance score :  0.594370546006766


In [121]:
from sklearn.svm import SVR
svm_reg = train_model(x_train, y_train,x_test, y_test, SVR())

RMSE test: 138.798
R^2 test: 0.049
Explained variance score :  0.11124407880255716


In [132]:
from sklearn.linear_model import SGDRegressor

sgd_reg = train_model(x_train, y_train,x_test, y_test, SGDRegressor())

RMSE test: 43762345995524.438
R^2 test: -94578661594497376321536.000
Explained variance score :  -4.494184213265828e+22


In [136]:
from sklearn.ensemble import BaggingRegressor
bagging_reg = train_model(x_train, y_train, x_test, y_test, BaggingRegressor(n_estimators=200, max_features = 20, random_state = 10))

RMSE test: 71.507
R^2 test: 0.747
Explained variance score :  0.7478873600373961


In [145]:
from sklearn.ensemble import GradientBoostingRegressor
gb_reg = train_model(x_train, y_train, x_test, y_test, GradientBoostingRegressor(loss='squared_error', \
                        learning_rate = 0.2, n_estimators=500, criterion='squared_error' ))

RMSE test: 74.773
R^2 test: 0.724
Explained variance score :  0.7239440699843112
