In [142]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error 
from sklearn.preprocessing import StandardScaler

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Data preprocessing

<ul>
    <b>Removing columns</b>:
    <li>id, host_id - ids</li>
    <li>name - name of listing (offer)</li>
    <li>host_name - name of user hosting a listing</li>
    <li>latitude, longitude - we're going to use neighbourhood and neighberhood group as geogrephical regions rather then exact latitude ang longitude </li>
    <li>last_review - contains a lot of nulls which are really tough to replace</li>
    <li>calculated_host_listings_count - amount of listings hosted by one user</li>
</ul>

<br><br>

Replacing column names to make it easier to manipualate
<ul>
<b>Replaced names</b>
    <li>neighbourhood_group -> nghb_group</li>
    <li>neighbourhood -> nghb </li>
</ul>

In [236]:
df = pd.read_csv('AB_NYC_2019.csv')
df_org = df

cols_to_drop = ['id', 'name', 'host_id', 'host_name', 
                'last_review', 'calculated_host_listings_count']
df = df.drop(axis=1, columns=cols_to_drop)

cols_replace = {'neighbourhood_group': 'nghb_group',
                'neighbourhood': 'nghb'}
df = df.rename(mapper=cols_replace, axis=1)

<b>Removing NaN</b><br>
Replacing NaN values from reviews_per_month with 0, as NaN is effectt of no reviews. There are no other NaN values in any other columns.

In [237]:
df[np.isnan(df.reviews_per_month)]

Unnamed: 0,nghb_group,nghb,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,availability_365
2,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,365
19,Manhattan,East Harlem,40.79685,-73.94872,Entire home/apt,190,7,0,,249
26,Manhattan,Inwood,40.86754,-73.92639,Private room,80,4,0,,0
36,Brooklyn,Bedford-Stuyvesant,40.68876,-73.94312,Private room,35,60,0,,365
38,Brooklyn,Flatbush,40.63702,-73.96327,Private room,150,1,0,,365
...,...,...,...,...,...,...,...,...,...,...
48890,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,9
48891,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,36
48892,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,27
48893,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,2


In [238]:
df[df.reviews_per_month == 0]

Unnamed: 0,nghb_group,nghb,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,availability_365


In [239]:
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)
df = df.drop(df.index[df.price==0])

df.isna().sum()

nghb_group           0
nghb                 0
latitude             0
longitude            0
room_type            0
price                0
minimum_nights       0
number_of_reviews    0
reviews_per_month    0
availability_365     0
dtype: int64

Dropping rows with prices bigger than 500, as AirBnB is intended for rather cheap and one night rent. Higher prices will significantly decrease r^2 (from about 0.5 to 0.18). Furthermore, when looking at value counts for more than 500 USD it is only slightly more than 400-500 range.

In [240]:
bins = np.arange(0, 500, 100)
bins = np.append(bins, [500, 1000, 10000])
df_org['price'].value_counts(bins=bins).sort_index()

(-0.001, 100.0]      23928
(100.0, 200.0]       16583
(200.0, 300.0]        5027
(300.0, 400.0]        1594
(400.0, 500.0]         719
(500.0, 1000.0]        805
(1000.0, 10000.0]      239
Name: price, dtype: int64

In [241]:
df = df.drop(df[df.price > 300].index, axis=0)

In [None]:
pd.DataFrame(df.corr()["price"])

Factorizing categorical values, so that there are no strings but rather each category has assigned value from 0...n, where n is number of unique categories.

In [242]:
df['nghb_group'] = pd.factorize(df['nghb_group'])[0]
df['nghb'] = pd.factorize(df['nghb'])[0]
df['room_type'] = pd.factorize(df['room_type'])[0]

Dividing dataset into train and test

In [243]:
X = df.drop("price", axis=1)
y = df["price"]

scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=df.drop("price", axis=1).columns)

X_train, X_test, y_train, y_test = ms.train_test_split(X, y, random_state=42)
# no stratify as it is regression problem not classification 

In [248]:
model_v1 = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=X_train.shape[1]),
    keras.layers.Dense(500, activation="relu"),
    keras.layers.Dense(250, activation="relu"),
    keras.layers.Dense(125, activation="relu"),
    keras.layers.Dense(62, activation="relu"),
    keras.layers.Dense(1),
])

model_v1.compile(loss='mean_absolute_error',
             optimizer='adam',
             metrics=['MeanSquaredError',
                      'RootMeanSquaredError',
                      'MeanAbsoluteError'])
    
model_v1.summary()



model_v2 = keras.models.Sequential([
    keras.layers.InputLayer(input_shape=X_train.shape[1]),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(1),
])

model_v2.compile(loss='mean_absolute_error',
             optimizer='adam',
             metrics=['MeanSquaredError',
                      'RootMeanSquaredError',
                      'MeanAbsoluteError'])

model_v2.summary()

Model: "sequential_35"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_120 (Dense)            (None, 500)               5000      
_________________________________________________________________
dense_121 (Dense)            (None, 250)               125250    
_________________________________________________________________
dense_122 (Dense)            (None, 125)               31375     
_________________________________________________________________
dense_123 (Dense)            (None, 62)                7812      
_________________________________________________________________
dense_124 (Dense)            (None, 1)                 63        
Total params: 169,500
Trainable params: 169,500
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_36"
_________________________________________________________________
Layer (type)            

Using normalizer which in practice does the standarization of all values.

In [250]:
history_v1 = model_v1.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100


Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100


Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [251]:
history_v2 = model_v2.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100


Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100


Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [253]:
metrics = {}
metrics_v1_test = model_v1.evaluate(X_test, y_test)
y_pred_v1_test = model_v1.predict(X_test)

metrics_v1_train = model_v1.evaluate(X_train, y_train)
y_pred_v1_train = model_v1.predict(X_train)

metrics_v2_test = model_v2.evaluate(X_test, y_test)
y_pred_v2_test = model_v2.predict(X_test)

metrics_v2_train = model_v2.evaluate(X_train, y_train)
y_pred_v2_train = model_v2.predict(X_train)

metrics["Nr. architektury"] = ["1", "1", "2", "2"]
metrics["Typ danych"] = ["trenujace", "testowe", "trenujace", "testowe"]
metrics["MSE"] = [metrics_v1_train[1], metrics_v1_test[1], metrics_v2_train[1], metrics_v2_test[1]]
metrics["RMSE"] = [metrics_v1_train[2], metrics_v1_test[2], metrics_v2_train[2], metrics_v2_test[2]]
metrics["MAE"] = [metrics_v1_train[3], metrics_v1_test[3], metrics_v2_train[3], metrics_v2_test[3]]
metrics["R2"] = [r2_score(y_train,y_pred_v1_train),
                 r2_score(y_test,y_pred_v1_test),
                 r2_score(y_train,y_pred_v2_train),
                 r2_score(y_test,y_pred_v2_test)]


pd.DataFrame(metrics)



Unnamed: 0,Nr. architektury,Typ danych,MSE,RMSE,MAE,R2
0,1,trenujace,1703.270142,41.270695,27.996891,0.606598
1,1,testowe,2000.362671,44.725414,31.458178,0.542939
2,2,trenujace,2030.783813,45.064217,31.365456,0.530952
3,2,testowe,2041.399048,45.181843,31.603842,0.533563
