In [91]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
import pandas as pd
import tensorflow as tf
from geopy.geocoders import Nominatim


In [93]:
# Import our input dataset
ny_df = pd.read_csv('../Resources/merged_cleaned_ny.csv', nrows=2000)
ny_df.head()

Unnamed: 0,primary_key,listing_id,listing_url,name,host_id,host_url,host_name,host_since,host_is_superhost,host_listings_count,...,number_of_reviews_l30d,first_review,last_review,review_scores_rating,calculated_host_listings_count,reviews_per_month,date,available,price,adjusted_price
0,1,2595,https://www.airbnb.com/rooms/2595,Rental unit in New York · ★4.68 · Studio · 1 b...,2845,https://www.airbnb.com/users/show/2845,Jennifer,9/9/2008,f,7,...,0,11/21/2009,6/21/2022,4.68,3,0.3,7/31/2023,f,$240.00,$240.00
1,2,2595,https://www.airbnb.com/rooms/2595,Rental unit in New York · ★4.68 · Studio · 1 b...,2845,https://www.airbnb.com/users/show/2845,Jennifer,9/9/2008,f,7,...,0,11/21/2009,6/21/2022,4.68,3,0.3,8/1/2023,f,$240.00,$240.00
2,3,2595,https://www.airbnb.com/rooms/2595,Rental unit in New York · ★4.68 · Studio · 1 b...,2845,https://www.airbnb.com/users/show/2845,Jennifer,9/9/2008,f,7,...,0,11/21/2009,6/21/2022,4.68,3,0.3,8/2/2023,f,$240.00,$240.00
3,4,2595,https://www.airbnb.com/rooms/2595,Rental unit in New York · ★4.68 · Studio · 1 b...,2845,https://www.airbnb.com/users/show/2845,Jennifer,9/9/2008,f,7,...,0,11/21/2009,6/21/2022,4.68,3,0.3,8/3/2023,t,$240.00,$240.00
4,5,2595,https://www.airbnb.com/rooms/2595,Rental unit in New York · ★4.68 · Studio · 1 b...,2845,https://www.airbnb.com/users/show/2845,Jennifer,9/9/2008,f,7,...,0,11/21/2009,6/21/2022,4.68,3,0.3,8/4/2023,t,$240.00,$240.00


In [94]:
ny_df.columns

Index(['primary_key', 'listing_id', 'listing_url', 'name', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_is_superhost',
       'host_listings_count', 'host_total_listings_count', 'latitude',
       'longitude', 'room_type', 'accommodates', 'bedrooms', 'beds',
       'minimum_nights', 'maximum_nights', 'has_availability',
       'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
       'first_review', 'last_review', 'review_scores_rating',
       'calculated_host_listings_count', 'reviews_per_month', 'date',
       'available', 'price', 'adjusted_price'],
      dtype='object')

In [95]:
ny_df=ny_df[['room_type','accommodates', 'bedrooms','latitude',
       'longitude','price']]

ny_df['bedrooms']= ny_df['bedrooms'].fillna(1)
ny_df['price']= ny_df['price'].str.extract('(\d+\.\d+|\d+)').astype(float)

ny_df.head(3)

Unnamed: 0,room_type,accommodates,bedrooms,latitude,longitude,price
0,Entire home/apt,1,1.0,40.75356,-73.98559,240.0
1,Entire home/apt,1,1.0,40.75356,-73.98559,240.0
2,Entire home/apt,1,1.0,40.75356,-73.98559,240.0


In [98]:
ny_df_zip = ny_df.sample(n=200, random_state=42)

In [99]:

# Initialize the geocoder (you can choose other geocoders as well)
def get_zip_code(latitude, longitude):
    geolocator = Nominatim(user_agent="my_geocoder")
    location = geolocator.reverse((latitude, longitude))
    if location and 'postcode' in location.raw['address']:
        return location.raw['address']['postcode']
    else:
        return None

ny_df_zip['zip_code'] = ny_df_zip.apply(lambda row: get_zip_code(row['latitude'], row['longitude']), axis=1)

#Display
ny_df_zip.head(3)

Unnamed: 0,room_type,accommodates,bedrooms,latitude,longitude,price,zip_code
1860,Private room,1,1.0,40.82782,-73.9473,99.0,10031
353,Entire home/apt,3,1.0,40.73072,-73.99275,250.0,10003
1333,Entire home/apt,2,1.0,40.68253,-73.94295,135.0,11216


In [100]:
df_reduced = ny_df_zip[['room_type', 'accommodates', 'bedrooms', 'price', 'zip_code']]
df_reduced.head(3)

Unnamed: 0,room_type,accommodates,bedrooms,price,zip_code
1860,Private room,1,1.0,99.0,10031
353,Entire home/apt,3,1.0,250.0,10003
1333,Entire home/apt,2,1.0,135.0,11216


In [101]:
# Encode data
encode_df = pd.get_dummies(df_reduced, columns=['room_type'], prefix=['room_type'])
encode_df.head(3)

Unnamed: 0,accommodates,bedrooms,price,zip_code,room_type_Entire home/apt,room_type_Private room
1860,1,1.0,99.0,10031,0,1
353,3,1.0,250.0,10003,1,0
1333,2,1.0,135.0,11216,1,0


In [102]:
encode_df['zip_code']=encode_df['zip_code'].astype(int)

In [103]:
encode_df.dtypes

accommodates                   int64
bedrooms                     float64
price                        float64
zip_code                       int32
room_type_Entire home/apt      uint8
room_type_Private room         uint8
dtype: object

In [104]:
encode_df = encode_df.dropna()

In [105]:
y = encode_df.price.values
X = encode_df.drop(columns="price").values

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [109]:
nn = tf.keras.models.Sequential()
nn.add(tf.keras.layers.Dense(units=8, activation="relu", input_dim=5))
nn.add(tf.keras.layers.Dense(units=8, activation="relu"))
nn.add(tf.keras.layers.Dense(units=1))

nn.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_27 (Dense)            (None, 8)                 48        
                                                                 
 dense_28 (Dense)            (None, 8)                 72        
                                                                 
 dense_29 (Dense)            (None, 1)                 9         
                                                                 
Total params: 129 (516.00 Byte)
Trainable params: 129 (516.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [110]:
nn.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error'])

In [111]:
#fit_model = nn.fit(X_train,y_train,epochs=30)

fit_model = nn.fit(X_train, y_train, epochs=100, validation_split=0.2) 

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [112]:
test_loss, test_mae = nn.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

Test Loss: 8692.5966796875, Test MAE: 68.17477416992188
