In [87]:
!pip install pyspark

import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow as tf




In [103]:
# Import the column_listings_df Parquet file
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("ReadParquet").getOrCreate()

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Load the Parquet file
listings_df = spark.read.parquet('/content/drive/MyDrive/column_listings_df.parquet')
listings_df.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+------------------+--------------------+--------------------+---------+--------------------+---------------+----------+-----------------+-------------------+-------------------------+----------------------+----------------------------+------------------+------------------+---------------+------------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+------------------------------+-----------------+----+-----------+---------------------+-------+----------------+---+----+----------+-------+-------+------------+----------+---------------------+---------+-------

In [89]:
print(listings_df.columns)

['listing_id', 'listing_url', 'name', 'host_id', 'host_url', 'host_name', 'host_since', 'host_is_superhost', 'host_listings_count', 'host_total_listings_count', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'room_type', 'accommodates', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'has_availability', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'calculated_host_listings_count', 'reviews_per_month', 'Wifi', 'Smoke alarm', 'Carbon monoxide alarm', 'Kitchen', 'Air conditioning', 'TV', 'Iron', 'Essentials', 'Hangers', 'Shampoo', 'Refrigerator', 'Hair dryer', 'Dishes and silverware', 'Hot water', 'Cooking basics', 'Heating', 'Bed linens', 'Microwave', 'Oven', 'Fire ex

In [90]:
# Select only the desired columns
cleanup_df = listings_df[['room_type','accommodates','bathrooms_text','bedrooms','beds','neighbourhood_cleansed','price','Wifi','Smoke alarm','Carbon monoxide alarm','Kitchen','Air conditioning',
                   'TV','Iron','Essentials','Hangers','Shampoo','Refrigerator','Hair dryer','Dishes and silverware','Hot water','Cooking basics','Heating','Bed linens','Microwave','Oven',
                   'Fire extinguisher','Coffee maker','Free street parking','First aid kit','Self check-in','Dedicated workspace']]

#Rename columns
cleanup_df = cleanup_df.withColumnRenamed('neighbourhood_cleansed', 'neighbourhood')

cleanup_df.show()

+---------------+------------+--------------+--------+----+--------------------+-------+----+-----------+---------------------+-------+----------------+---+----+----------+-------+-------+------------+----------+---------------------+---------+--------------+-------+----------+---------+----+-----------------+------------+-------------------+-------------+-------------+-------------------+
|      room_type|accommodates|bathrooms_text|bedrooms|beds|       neighbourhood|  price|Wifi|Smoke alarm|Carbon monoxide alarm|Kitchen|Air conditioning| TV|Iron|Essentials|Hangers|Shampoo|Refrigerator|Hair dryer|Dishes and silverware|Hot water|Cooking basics|Heating|Bed linens|Microwave|Oven|Fire extinguisher|Coffee maker|Free street parking|First aid kit|Self check-in|Dedicated workspace|
+---------------+------------+--------------+--------+----+--------------------+-------+----+-----------+---------------------+-------+----------------+---+----+----------+-------+-------+------------+----------+--

In [91]:
cleanup_df.printSchema()

root
 |-- room_type: string (nullable = true)
 |-- accommodates: string (nullable = true)
 |-- bathrooms_text: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- beds: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- price: string (nullable = true)
 |-- Wifi: integer (nullable = true)
 |-- Smoke alarm: integer (nullable = true)
 |-- Carbon monoxide alarm: integer (nullable = true)
 |-- Kitchen: integer (nullable = true)
 |-- Air conditioning: integer (nullable = true)
 |-- TV: integer (nullable = true)
 |-- Iron: integer (nullable = true)
 |-- Essentials: integer (nullable = true)
 |-- Hangers: integer (nullable = true)
 |-- Shampoo: integer (nullable = true)
 |-- Refrigerator: integer (nullable = true)
 |-- Hair dryer: integer (nullable = true)
 |-- Dishes and silverware: integer (nullable = true)
 |-- Hot water: integer (nullable = true)
 |-- Cooking basics: integer (nullable = true)
 |-- Heating: integer (nullable = true)
 |-- Bed linens:

In [92]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Fill missing values in 'bedrooms' with 1
cleanup_df = cleanup_df.withColumn('bedrooms', col('bedrooms').cast('int')).fillna(1, subset=['bedrooms'])

# Remove the dollar sign and convert to float
cleanup_df = cleanup_df.withColumn('price', cleanup_df['price'].substr(2, 100).cast('float'))

# Convert string columns to integer
cleanup_df = cleanup_df.withColumn('accommodates', col('accommodates').cast('integer'))
cleanup_df = cleanup_df.withColumn('beds', col('beds').cast('integer'))

cleanup_df.printSchema()

root
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- bathrooms_text: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- beds: integer (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- price: float (nullable = true)
 |-- Wifi: integer (nullable = true)
 |-- Smoke alarm: integer (nullable = true)
 |-- Carbon monoxide alarm: integer (nullable = true)
 |-- Kitchen: integer (nullable = true)
 |-- Air conditioning: integer (nullable = true)
 |-- TV: integer (nullable = true)
 |-- Iron: integer (nullable = true)
 |-- Essentials: integer (nullable = true)
 |-- Hangers: integer (nullable = true)
 |-- Shampoo: integer (nullable = true)
 |-- Refrigerator: integer (nullable = true)
 |-- Hair dryer: integer (nullable = true)
 |-- Dishes and silverware: integer (nullable = true)
 |-- Hot water: integer (nullable = true)
 |-- Cooking basics: integer (nullable = true)
 |-- Heating: integer (nullable = true)
 |-- Bed linen

In [93]:
from pyspark.sql.functions import split, regexp_extract, when

# Extract the number of bathrooms
cleanup_df = cleanup_df.withColumn('bathrooms', regexp_extract('bathrooms_text', r'(\d+(\.\d+)?)', 1).cast('float'))

# Extract the bathroom type
cleanup_df = cleanup_df.withColumn('bathroom_type',
                          when(cleanup_df['bathrooms_text'].contains('private'), 'private')
                          .when(cleanup_df['bathrooms_text'].contains('shared'), 'shared')
                          .otherwise('private')
                         )

# Reorder columns
cleanup_df = cleanup_df.select('room_type','accommodates','bathrooms_text','bedrooms','beds','bathrooms','bathroom_type','neighbourhood','price','Wifi','Smoke alarm','Carbon monoxide alarm','Kitchen','Air conditioning',
                   'TV','Iron','Essentials','Hangers','Shampoo','Refrigerator','Hair dryer','Dishes and silverware','Hot water','Cooking basics','Heating','Bed linens','Microwave','Oven',
                   'Fire extinguisher','Coffee maker','Free street parking','First aid kit','Self check-in','Dedicated workspace')

# Drop the 'bathrooms_text' column
cleanup_df = cleanup_df.drop('bathrooms_text')

cleanup_df.show()

+---------------+------------+--------+----+---------+-------------+--------------------+-----+----+-----------+---------------------+-------+----------------+---+----+----------+-------+-------+------------+----------+---------------------+---------+--------------+-------+----------+---------+----+-----------------+------------+-------------------+-------------+-------------+-------------------+
|      room_type|accommodates|bedrooms|beds|bathrooms|bathroom_type|       neighbourhood|price|Wifi|Smoke alarm|Carbon monoxide alarm|Kitchen|Air conditioning| TV|Iron|Essentials|Hangers|Shampoo|Refrigerator|Hair dryer|Dishes and silverware|Hot water|Cooking basics|Heating|Bed linens|Microwave|Oven|Fire extinguisher|Coffee maker|Free street parking|First aid kit|Self check-in|Dedicated workspace|
+---------------+------------+--------+----+---------+-------------+--------------------+-----+----+-----------+---------------------+-------+----------------+---+----+----------+-------+-------+-----

In [94]:
pandas_df = cleanup_df.toPandas()

In [95]:
pandas_df.dropna()
pandas_df.head()

Unnamed: 0,room_type,accommodates,bedrooms,beds,bathrooms,bathroom_type,neighbourhood,price,Wifi,Smoke alarm,...,Heating,Bed linens,Microwave,Oven,Fire extinguisher,Coffee maker,Free street parking,First aid kit,Self check-in,Dedicated workspace
0,Entire home/apt,4,1,2.0,1.0,private,East Elmhurst,171.0,1,1,...,0,1,1,0,0,1,1,0,1,0
1,Entire home/apt,7,3,4.0,1.0,private,East Elmhurst,311.0,1,1,...,0,1,1,0,0,0,0,1,1,0
2,Entire home/apt,12,3,7.0,1.0,private,Middle Village,328.0,1,1,...,1,0,0,0,0,0,0,0,0,0
3,Entire home/apt,4,1,2.0,1.0,private,Middle Village,139.0,1,1,...,0,1,0,0,0,1,0,0,0,0
4,Entire home/apt,16,5,11.0,2.5,private,Ridgewood,485.0,1,1,...,0,1,0,0,1,0,0,0,1,0


In [96]:
pandas_df['bedrooms']= pandas_df['bedrooms'].fillna(1)
pandas_df = pandas_df.dropna(subset=['beds'])
pandas_df = pandas_df[~pandas_df['beds'].isin([float('inf')])]
pandas_df['beds'] = pandas_df['beds'].astype('int')
pandas_df.head()

Unnamed: 0,room_type,accommodates,bedrooms,beds,bathrooms,bathroom_type,neighbourhood,price,Wifi,Smoke alarm,...,Heating,Bed linens,Microwave,Oven,Fire extinguisher,Coffee maker,Free street parking,First aid kit,Self check-in,Dedicated workspace
0,Entire home/apt,4,1,2,1.0,private,East Elmhurst,171.0,1,1,...,0,1,1,0,0,1,1,0,1,0
1,Entire home/apt,7,3,4,1.0,private,East Elmhurst,311.0,1,1,...,0,1,1,0,0,0,0,1,1,0
2,Entire home/apt,12,3,7,1.0,private,Middle Village,328.0,1,1,...,1,0,0,0,0,0,0,0,0,0
3,Entire home/apt,4,1,2,1.0,private,Middle Village,139.0,1,1,...,0,1,0,0,0,1,0,0,0,0
4,Entire home/apt,16,5,11,2.5,private,Ridgewood,485.0,1,1,...,0,1,0,0,1,0,0,0,1,0


In [97]:
print(pandas_df.dtypes)

room_type                 object
accommodates               int32
bedrooms                   int32
beds                       int64
bathrooms                float32
bathroom_type             object
neighbourhood             object
price                    float32
Wifi                       int32
Smoke alarm                int32
Carbon monoxide alarm      int32
Kitchen                    int32
Air conditioning           int32
TV                         int32
Iron                       int32
Essentials                 int32
Hangers                    int32
Shampoo                    int32
Refrigerator               int32
Hair dryer                 int32
Dishes and silverware      int32
Hot water                  int32
Cooking basics             int32
Heating                    int32
Bed linens                 int32
Microwave                  int32
Oven                       int32
Fire extinguisher          int32
Coffee maker               int32
Free street parking        int32
First aid 

In [98]:
cols_to_uint = ['Wifi','Smoke alarm','Carbon monoxide alarm','Kitchen','Air conditioning','TV','Iron','Essentials','Hangers','Shampoo','Refrigerator','Hair dryer',
                'Dishes and silverware','Hot water','Cooking basics','Heating','Bed linens','Microwave','Oven','Fire extinguisher','Coffee maker','Free street parking',
                'First aid kit','Self check-in','Dedicated workspace']

pandas_df[cols_to_uint] = pandas_df[cols_to_uint].astype('uint8')
print(pandas_df.dtypes)

room_type                 object
accommodates               int32
bedrooms                   int32
beds                       int64
bathrooms                float32
bathroom_type             object
neighbourhood             object
price                    float32
Wifi                       uint8
Smoke alarm                uint8
Carbon monoxide alarm      uint8
Kitchen                    uint8
Air conditioning           uint8
TV                         uint8
Iron                       uint8
Essentials                 uint8
Hangers                    uint8
Shampoo                    uint8
Refrigerator               uint8
Hair dryer                 uint8
Dishes and silverware      uint8
Hot water                  uint8
Cooking basics             uint8
Heating                    uint8
Bed linens                 uint8
Microwave                  uint8
Oven                       uint8
Fire extinguisher          uint8
Coffee maker               uint8
Free street parking        uint8
First aid 

In [99]:
# Encode data
encoded_df = pd.get_dummies(pandas_df, columns=['room_type', 'bathroom_type', 'neighbourhood'], prefix=['room_type', 'bathroom_type', 'neighbourhood'])
encoded_df.head()

Unnamed: 0,accommodates,bedrooms,beds,bathrooms,price,Wifi,Smoke alarm,Carbon monoxide alarm,Kitchen,Air conditioning,...,neighbourhood_Upper East Side,neighbourhood_Upper West Side,neighbourhood_Vinegar Hill,neighbourhood_Washington Heights,neighbourhood_West Village,neighbourhood_Whitestone,neighbourhood_Williamsburg,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodside
0,4,1,2,1.0,171.0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,7,3,4,1.0,311.0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,12,3,7,1.0,328.0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,4,1,2,1.0,139.0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,16,5,11,2.5,485.0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [100]:
encoded_df.dropna()
model_df = encoded_df.copy()

LogisticRegression

In [101]:
y = model_df["price"]
X = model_df.drop(columns="price")

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [102]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

ValueError: ignored

In [None]:
y_pred = regr.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 15039081306.734959


In [None]:
# Score the model
print(f"Training Data Score: {regr.score(X_train, y_train)}")
print(f"Testing Data Score: {regr.score(X_test, y_test)}")

Training Data Score: 0.6688299633974548
Testing Data Score: -9917188980211504.0


#Deep Learning

In [None]:
nn = tf.keras.models.Sequential()
nn.add(tf.keras.layers.Dense(units=104, activation="relu", input_dim=104))
nn.add(tf.keras.layers.Dense(units=80, activation="relu"))
nn.add(tf.keras.layers.Dense(units=80, activation="relu"))
nn.add(tf.keras.layers.Dense(units=80, activation="relu"))
nn.add(tf.keras.layers.Dense(units=1))

nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 104)               10920     
                                                                 
 dense_1 (Dense)             (None, 80)                8400      
                                                                 
 dense_2 (Dense)             (None, 80)                6480      
                                                                 
 dense_3 (Dense)             (None, 80)                6480      
                                                                 
 dense_4 (Dense)             (None, 1)                 81        
                                                                 
Total params: 32361 (126.41 KB)
Trainable params: 32361 (126.41 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
nn.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error'])

In [None]:
#fit_model = nn.fit(X_train,y_train,epochs=30)

fit_model = nn.fit(X_train, y_train, epochs=100, validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
test_loss, test_mae = nn.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")

Test Loss: 9122.388671875, Test MAE: 50.0784912109375


In [None]:
# Calculate predictions using the deep learning model
y_pred_nn = nn.predict(X_test)

# Calculate Mean Squared Error (MSE) for the deep learning model
mse_nn = mean_squared_error(y_test, y_pred_nn)

# Calculate Root Mean Squared Error (RMSE) for the deep learning model
rmse_nn = np.sqrt(mse_nn)

# Calculate R-squared for the deep learning model
r2_nn = r2_score(y_test, y_pred_nn)

print("Deep Learning Model - Root Mean Squared Error (RMSE):", rmse_nn)
print("Deep Learning Model - R-squared:", r2_nn)

Deep Learning Model - Root Mean Squared Error (RMSE): 95.51120739320184
Deep Learning Model - R-squared: 0.6000049241876837


In [None]:
# Stop the Spark session when you're done
spark.stop()