In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
from geopy.geocoders import Nominatim


In [5]:
# Import our input dataset
ny_df = pd.read_csv('column_listings_df.csv', nrows=2000)
ny_df.head()

Unnamed: 0,listing_id,listing_url,name,host_id,host_url,host_name,host_since,host_is_superhost,host_listings_count,host_total_listings_count,...,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,reviews_per_month
0,52702018,https://www.airbnb.com/rooms/52702018,Rental unit in Queens · ★4.90 · 1 bedroom · 2 ...,93231383,https://www.airbnb.com/users/show/93231383,Daniel,2016-09-03,t,3,3,...,2023-08-21,4.9,4.95,4.9,4.95,4.94,4.85,4.87,3,8.53
1,784969376930125242,https://www.airbnb.com/rooms/784969376930125242,Home in Queens · ★4.97 · 3 bedrooms · 4 beds ·...,93231383,https://www.airbnb.com/users/show/93231383,Daniel,2016-09-03,t,3,3,...,2023-08-15,4.97,4.97,4.97,4.95,4.95,4.84,4.87,3,4.47
2,9919728,https://www.airbnb.com/rooms/9919728,Townhouse in Queens · ★4.33 · 3 bedrooms · 7 b...,50997424,https://www.airbnb.com/users/show/50997424,Mark,2015-12-10,f,10,10,...,2020-02-18,4.33,4.17,4.33,4.83,4.67,4.83,4.33,7,0.09
3,53696990,https://www.airbnb.com/rooms/53696990,Vacation home in Queens · Studio · 2 beds · 1 ...,50997424,https://www.airbnb.com/users/show/50997424,Mark,2015-12-10,f,10,10,...,2022-09-16,4.0,4.0,3.0,5.0,4.0,5.0,3.0,7,0.08
4,9920363,https://www.airbnb.com/rooms/9920363,Townhouse in Queens · ★4.25 · 5 bedrooms · 11 ...,50997424,https://www.airbnb.com/users/show/50997424,Mark,2015-12-10,f,10,10,...,2020-01-01,4.25,4.25,3.75,4.75,4.0,4.75,4.5,7,0.09


In [6]:
ny_df.columns

Index(['listing_id', 'listing_url', 'name', 'host_id', 'host_url', 'host_name',
       'host_since', 'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'latitude', 'longitude', 'room_type',
       'accommodates', 'bathrooms_text', 'bedrooms', 'beds', 'amenities',
       'price', 'minimum_nights', 'maximum_nights', 'has_availability',
       'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
       'first_review', 'last_review', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value',
       'calculated_host_listings_count', 'reviews_per_month'],
      dtype='object')

In [7]:
ny_df=ny_df[['room_type','accommodates', 'bedrooms','neighbourhood_cleansed','price']]

ny_df['bedrooms']= ny_df['bedrooms'].fillna(1)
ny_df['price']= ny_df['price'].str.extract('(\d+\.\d+|\d+)').astype(float)

ny_df.head(3)

Unnamed: 0,room_type,accommodates,bedrooms,neighbourhood_cleansed,price
0,Entire home/apt,4,1.0,East Elmhurst,171.0
1,Entire home/apt,7,3.0,East Elmhurst,311.0
2,Entire home/apt,12,3.0,Middle Village,328.0


In [8]:
# Encode data
encode_df = pd.get_dummies(ny_df, columns=['room_type', 'neighbourhood_cleansed'], prefix=['room_type', 'neighbourhood_cleansed'])
encode_df.head(3)

Unnamed: 0,accommodates,bedrooms,price,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,neighbourhood_cleansed_Arverne,neighbourhood_cleansed_Astoria,neighbourhood_cleansed_Bath Beach,neighbourhood_cleansed_Battery Park City,...,neighbourhood_cleansed_Tribeca,neighbourhood_cleansed_Two Bridges,neighbourhood_cleansed_Upper East Side,neighbourhood_cleansed_Upper West Side,neighbourhood_cleansed_Washington Heights,neighbourhood_cleansed_Whitestone,neighbourhood_cleansed_Williamsburg,neighbourhood_cleansed_Windsor Terrace,neighbourhood_cleansed_Woodhaven,neighbourhood_cleansed_Woodside
0,4,1.0,171.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,3.0,311.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12,3.0,328.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
encode_df.dtypes

accommodates                                int64
bedrooms                                  float64
price                                     float64
room_type_Entire home/apt                   uint8
room_type_Private room                      uint8
                                           ...   
neighbourhood_cleansed_Whitestone           uint8
neighbourhood_cleansed_Williamsburg         uint8
neighbourhood_cleansed_Windsor Terrace      uint8
neighbourhood_cleansed_Woodhaven            uint8
neighbourhood_cleansed_Woodside             uint8
Length: 105, dtype: object

In [10]:
encode_df = encode_df.dropna()

In [11]:
y = encode_df.price.values
X = encode_df.drop(columns="price").values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

LogisticRegression

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score


In [13]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

In [14]:
y_pred = regr.predict(X_test)

In [22]:
# Assuming you already have predictions for the test dataset
y_pred = regr.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 15039081306.734959


In [16]:
# Score the model
print(f"Training Data Score: {regr.score(X_train, y_train)}")
print(f"Testing Data Score: {regr.score(X_test, y_test)}")

Training Data Score: 0.6688299633974548
Testing Data Score: -9917188980211504.0


#Deep Learning

In [17]:
nn = tf.keras.models.Sequential()
nn.add(tf.keras.layers.Dense(units=104, activation="relu", input_dim=104))
nn.add(tf.keras.layers.Dense(units=80, activation="relu"))
nn.add(tf.keras.layers.Dense(units=80, activation="relu"))
nn.add(tf.keras.layers.Dense(units=80, activation="relu"))
nn.add(tf.keras.layers.Dense(units=1))

nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 104)               10920     
                                                                 
 dense_1 (Dense)             (None, 80)                8400      
                                                                 
 dense_2 (Dense)             (None, 80)                6480      
                                                                 
 dense_3 (Dense)             (None, 80)                6480      
                                                                 
 dense_4 (Dense)             (None, 1)                 81        
                                                                 
Total params: 32361 (126.41 KB)
Trainable params: 32361 (126.41 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
nn.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error'])

In [19]:
#fit_model = nn.fit(X_train,y_train,epochs=30)

fit_model = nn.fit(X_train, y_train, epochs=100, validation_split=0.2) 

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [20]:
test_loss, test_mae = nn.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

Test Loss: 9122.388671875, Test MAE: 50.0784912109375


In [21]:
# Calculate predictions using the deep learning model
y_pred_nn = nn.predict(X_test)

# Calculate Mean Squared Error (MSE) for the deep learning model
mse_nn = mean_squared_error(y_test, y_pred_nn)

# Calculate Root Mean Squared Error (RMSE) for the deep learning model
rmse_nn = np.sqrt(mse_nn)

# Calculate R-squared for the deep learning model
r2_nn = r2_score(y_test, y_pred_nn)

print("Deep Learning Model - Root Mean Squared Error (RMSE):", rmse_nn)
print("Deep Learning Model - R-squared:", r2_nn)

Deep Learning Model - Root Mean Squared Error (RMSE): 95.51120739320184
Deep Learning Model - R-squared: 0.6000049241876837
