In [115]:
!pip install pyspark

import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow as tf




In [116]:
# Read the listings Parquet file from Google Drive into a Spark DataFrame.
from google.colab import drive
from pyspark.sql import SparkSession

LISTINGS_PARQUET_PATH = '/content/drive/MyDrive/column_listings_df.parquet'

# Reuse the active Spark session if one exists, otherwise start one.
session_builder = SparkSession.builder.appName("ReadParquet")
spark = session_builder.getOrCreate()

# Make the Drive contents visible under /content/drive.
drive.mount('/content/drive')

# Load the listings and preview the first rows.
listings_df = spark.read.parquet(LISTINGS_PARQUET_PATH)
listings_df.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+------------------+--------------------+--------------------+---------+--------------------+---------------+----------+-----------------+-------------------+-------------------------+----------------------+----------------------------+------------------+------------------+---------------+------------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+------------------------------+-----------------+----+-----------+---------------------+-------+----------------+---+----+----------+-------+-------+------------+----------+---------------------+---------+-------

In [117]:
# List every column name available in the loaded listings DataFrame.
print(listings_df.columns)

['listing_id', 'listing_url', 'name', 'host_id', 'host_url', 'host_name', 'host_since', 'host_is_superhost', 'host_listings_count', 'host_total_listings_count', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'room_type', 'accommodates', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'has_availability', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'calculated_host_listings_count', 'reviews_per_month', 'Wifi', 'Smoke alarm', 'Carbon monoxide alarm', 'Kitchen', 'Air conditioning', 'TV', 'Iron', 'Essentials', 'Hangers', 'Shampoo', 'Refrigerator', 'Hair dryer', 'Dishes and silverware', 'Hot water', 'Cooking basics', 'Heating', 'Bed linens', 'Microwave', 'Oven', 'Fire ex

In [118]:
# Keep only the listing attributes, amenity flags and rating used downstream.
listing_columns = [
    'listing_id', 'listing_url', 'name', 'room_type', 'accommodates',
    'bathrooms_text', 'bedrooms', 'beds',
    'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
    'latitude', 'longitude', 'price',
]
amenity_columns = [
    'Wifi', 'Smoke alarm', 'Carbon monoxide alarm', 'Kitchen', 'Air conditioning',
    'TV', 'Iron', 'Essentials', 'Hangers', 'Shampoo', 'Refrigerator', 'Hair dryer',
    'Dishes and silverware', 'Hot water', 'Cooking basics', 'Heating', 'Bed linens',
    'Microwave', 'Oven', 'Fire extinguisher', 'Coffee maker', 'Free street parking',
    'First aid kit', 'Self check-in', 'Dedicated workspace',
]

cleanup_df = (
    listings_df
    .select(*listing_columns, *amenity_columns, 'review_scores_rating')
    # Shorter name for the neighbourhood column.
    .withColumnRenamed('neighbourhood_cleansed', 'neighbourhood')
)

cleanup_df.show()

+------------------+--------------------+--------------------+---------------+------------+--------------+--------+----+--------------------+----------------------------+------------------+------------------+-------+----+-----------+---------------------+-------+----------------+---+----+----------+-------+-------+------------+----------+---------------------+---------+--------------+-------+----------+---------+----+-----------------+------------+-------------------+-------------+-------------+-------------------+--------------------+
|        listing_id|         listing_url|                name|      room_type|accommodates|bathrooms_text|bedrooms|beds|       neighbourhood|neighbourhood_group_cleansed|          latitude|         longitude|  price|Wifi|Smoke alarm|Carbon monoxide alarm|Kitchen|Air conditioning| TV|Iron|Essentials|Hangers|Shampoo|Refrigerator|Hair dryer|Dishes and silverware|Hot water|Cooking basics|Heating|Bed linens|Microwave|Oven|Fire extinguisher|Coffee maker|Free s

In [119]:
# Print the column names and types of the selected subset.
cleanup_df.printSchema()

root
 |-- listing_id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: string (nullable = true)
 |-- bathrooms_text: string (nullable = true)
 |-- bedrooms: string (nullable = true)
 |-- beds: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- neighbourhood_group_cleansed: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- price: string (nullable = true)
 |-- Wifi: integer (nullable = true)
 |-- Smoke alarm: integer (nullable = true)
 |-- Carbon monoxide alarm: integer (nullable = true)
 |-- Kitchen: integer (nullable = true)
 |-- Air conditioning: integer (nullable = true)
 |-- TV: integer (nullable = true)
 |-- Iron: integer (nullable = true)
 |-- Essentials: integer (nullable = true)
 |-- Hangers: integer (nullable = true)
 |-- Shampoo: integer (nullable = true)
 |-- Refrigerator: intege

In [120]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace

# Cast 'bedrooms' to integer and default missing values to 1.
cleanup_df = cleanup_df.withColumn('bedrooms', col('bedrooms').cast('int')).fillna(1, subset=['bedrooms'])

# Parse the price string into a float.
# Strip both the dollar sign and thousands separators before casting: a value
# such as "$1,200.00" would otherwise become "1,200.00", which casts to NULL.
# Removing the characters by pattern (rather than by position) also keeps the
# first digit intact if a value happens to have no leading "$".
cleanup_df = cleanup_df.withColumn('price', regexp_replace(col('price'), r'[$,]', '').cast('float'))

# Convert the remaining count columns from string to integer.
cleanup_df = cleanup_df.withColumn('accommodates', col('accommodates').cast('integer'))
cleanup_df = cleanup_df.withColumn('beds', col('beds').cast('integer'))

cleanup_df.printSchema()

root
 |-- listing_id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- name: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- bathrooms_text: string (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- beds: integer (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- neighbourhood_group_cleansed: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- price: float (nullable = true)
 |-- Wifi: integer (nullable = true)
 |-- Smoke alarm: integer (nullable = true)
 |-- Carbon monoxide alarm: integer (nullable = true)
 |-- Kitchen: integer (nullable = true)
 |-- Air conditioning: integer (nullable = true)
 |-- TV: integer (nullable = true)
 |-- Iron: integer (nullable = true)
 |-- Essentials: integer (nullable = true)
 |-- Hangers: integer (nullable = true)
 |-- Shampoo: integer (nullable = true)
 |-- Refrigerator: inte

In [121]:
from pyspark.sql.functions import col, lower, split, regexp_extract, when

# Lower-cased copy of the free-text column so keyword matching is
# case-insensitive (e.g. "Shared half-bath" must count as shared).
bathrooms_text_lc = lower(col('bathrooms_text'))

# Extract the number of bathrooms.
# regexp_extract returns '' when the text has no digits (e.g. "Half-bath");
# in that case fall back to 0.5 for half-baths, otherwise leave the value NULL.
bathroom_count_text = regexp_extract(col('bathrooms_text'), r'(\d+(\.\d+)?)', 1)
cleanup_df = cleanup_df.withColumn(
    'bathrooms',
    when(bathroom_count_text != '', bathroom_count_text)
    .when(bathrooms_text_lc.contains('half'), '0.5')
    .cast('float')
)

# Extract the bathroom type; anything not explicitly marked shared is treated as private.
cleanup_df = cleanup_df.withColumn(
    'bathroom_type',
    when(bathrooms_text_lc.contains('private'), 'private')
    .when(bathrooms_text_lc.contains('shared'), 'shared')
    .otherwise('private')
)

# Reorder columns. 'bathrooms_text' is intentionally left out: it has been
# replaced by the derived 'bathrooms' and 'bathroom_type' columns.
cleanup_df = cleanup_df.select(
    'listing_id', 'listing_url', 'name', 'room_type', 'accommodates',
    'bedrooms', 'beds', 'bathrooms', 'bathroom_type',
    'neighbourhood', 'neighbourhood_group_cleansed', 'longitude', 'latitude', 'price',
    'Wifi', 'Smoke alarm', 'Carbon monoxide alarm', 'Kitchen', 'Air conditioning',
    'TV', 'Iron', 'Essentials', 'Hangers', 'Shampoo', 'Refrigerator', 'Hair dryer',
    'Dishes and silverware', 'Hot water', 'Cooking basics', 'Heating', 'Bed linens',
    'Microwave', 'Oven', 'Fire extinguisher', 'Coffee maker', 'Free street parking',
    'First aid kit', 'Self check-in', 'Dedicated workspace', 'review_scores_rating',
)

cleanup_df.show()

+------------------+--------------------+--------------------+---------------+------------+--------+----+---------+-------------+--------------------+----------------------------+------------------+------------------+-----+----+-----------+---------------------+-------+----------------+---+----+----------+-------+-------+------------+----------+---------------------+---------+--------------+-------+----------+---------+----+-----------------+------------+-------------------+-------------+-------------+-------------------+--------------------+
|        listing_id|         listing_url|                name|      room_type|accommodates|bedrooms|beds|bathrooms|bathroom_type|       neighbourhood|neighbourhood_group_cleansed|         longitude|          latitude|price|Wifi|Smoke alarm|Carbon monoxide alarm|Kitchen|Air conditioning| TV|Iron|Essentials|Hangers|Shampoo|Refrigerator|Hair dryer|Dishes and silverware|Hot water|Cooking basics|Heating|Bed linens|Microwave|Oven|Fire extinguisher|Coffe

In [122]:
# Convert the cleaned Spark DataFrame into a pandas DataFrame for the pandas-based steps that follow.
pandas_df = cleanup_df.toPandas()

In [123]:
# Preview the converted frame.
# A bare `pandas_df.dropna()` call was removed from this cell: DataFrame.dropna()
# returns a new frame and its result was discarded, so it never changed
# `pandas_df`. Missing values are handled explicitly per column in the next cell.
pandas_df.head()

Unnamed: 0,listing_id,listing_url,name,room_type,accommodates,bedrooms,beds,bathrooms,bathroom_type,neighbourhood,...,Bed linens,Microwave,Oven,Fire extinguisher,Coffee maker,Free street parking,First aid kit,Self check-in,Dedicated workspace,review_scores_rating
0,52702018,https://www.airbnb.com/rooms/52702018,Rental unit in Queens · ★4.90 · 1 bedroom · 2 ...,Entire home/apt,4,1,2.0,1.0,private,East Elmhurst,...,1,1,0,0,1,1,0,1,0,4.9
1,784969376930125242,https://www.airbnb.com/rooms/784969376930125242,Home in Queens · ★4.97 · 3 bedrooms · 4 beds ·...,Entire home/apt,7,3,4.0,1.0,private,East Elmhurst,...,1,1,0,0,0,0,1,1,0,4.97
2,9919728,https://www.airbnb.com/rooms/9919728,Townhouse in Queens · ★4.33 · 3 bedrooms · 7 b...,Entire home/apt,12,3,7.0,1.0,private,Middle Village,...,0,0,0,0,0,0,0,0,0,4.33
3,53696990,https://www.airbnb.com/rooms/53696990,Vacation home in Queens · Studio · 2 beds · 1 ...,Entire home/apt,4,1,2.0,1.0,private,Middle Village,...,1,0,0,0,1,0,0,0,0,4.0
4,9920363,https://www.airbnb.com/rooms/9920363,Townhouse in Queens · ★4.25 · 5 bedrooms · 11 ...,Entire home/apt,16,5,11.0,2.5,private,Ridgewood,...,1,0,0,1,0,0,0,1,0,4.25


In [124]:
# Rows are usable only when the bed count is a real, finite number.
# Both +inf and -inf are excluded because neither can be converted to an integer.
beds_is_usable = pandas_df['beds'].notna() & ~pandas_df['beds'].isin([float('inf'), float('-inf')])

# Build the cleaned frame in one step instead of assigning columns onto a
# filtered slice (which can trigger SettingWithCopyWarning).
pandas_df = (
    pandas_df.loc[beds_is_usable]
    .assign(
        # Default missing bedroom counts to 1.
        bedrooms=lambda frame: frame['bedrooms'].fillna(1),
        # Bed counts arrive as floats when the column contained NaN; store them as integers.
        beds=lambda frame: frame['beds'].astype('int'),
    )
)
pandas_df.head()

Unnamed: 0,listing_id,listing_url,name,room_type,accommodates,bedrooms,beds,bathrooms,bathroom_type,neighbourhood,...,Bed linens,Microwave,Oven,Fire extinguisher,Coffee maker,Free street parking,First aid kit,Self check-in,Dedicated workspace,review_scores_rating
0,52702018,https://www.airbnb.com/rooms/52702018,Rental unit in Queens · ★4.90 · 1 bedroom · 2 ...,Entire home/apt,4,1,2,1.0,private,East Elmhurst,...,1,1,0,0,1,1,0,1,0,4.9
1,784969376930125242,https://www.airbnb.com/rooms/784969376930125242,Home in Queens · ★4.97 · 3 bedrooms · 4 beds ·...,Entire home/apt,7,3,4,1.0,private,East Elmhurst,...,1,1,0,0,0,0,1,1,0,4.97
2,9919728,https://www.airbnb.com/rooms/9919728,Townhouse in Queens · ★4.33 · 3 bedrooms · 7 b...,Entire home/apt,12,3,7,1.0,private,Middle Village,...,0,0,0,0,0,0,0,0,0,4.33
3,53696990,https://www.airbnb.com/rooms/53696990,Vacation home in Queens · Studio · 2 beds · 1 ...,Entire home/apt,4,1,2,1.0,private,Middle Village,...,1,0,0,0,1,0,0,0,0,4.0
4,9920363,https://www.airbnb.com/rooms/9920363,Townhouse in Queens · ★4.25 · 5 bedrooms · 11 ...,Entire home/apt,16,5,11,2.5,private,Ridgewood,...,1,0,0,1,0,0,0,1,0,4.25


In [125]:
# Show the pandas dtype of each column after the cleanup above.
print(pandas_df.dtypes)

listing_id                       object
listing_url                      object
name                             object
room_type                        object
accommodates                      int32
bedrooms                          int32
beds                              int64
bathrooms                       float32
bathroom_type                    object
neighbourhood                    object
neighbourhood_group_cleansed     object
longitude                        object
latitude                         object
price                           float32
Wifi                              int32
Smoke alarm                       int32
Carbon monoxide alarm             int32
Kitchen                           int32
Air conditioning                  int32
TV                                int32
Iron                              int32
Essentials                        int32
Hangers                           int32
Shampoo                           int32
Refrigerator                      int32


In [126]:
# Amenity indicator columns to be stored as unsigned 8-bit integers.
cols_to_uint = [
    'Wifi', 'Smoke alarm', 'Carbon monoxide alarm', 'Kitchen', 'Air conditioning',
    'TV', 'Iron', 'Essentials', 'Hangers', 'Shampoo', 'Refrigerator', 'Hair dryer',
    'Dishes and silverware', 'Hot water', 'Cooking basics', 'Heating', 'Bed linens',
    'Microwave', 'Oven', 'Fire extinguisher', 'Coffee maker', 'Free street parking',
    'First aid kit', 'Self check-in', 'Dedicated workspace',
]

# Cast all amenity columns in one go, then write them back onto the frame.
amenity_flags_u8 = pandas_df[cols_to_uint].astype('uint8')
pandas_df[cols_to_uint] = amenity_flags_u8

print(pandas_df.dtypes)

listing_id                       object
listing_url                      object
name                             object
room_type                        object
accommodates                      int32
bedrooms                          int32
beds                              int64
bathrooms                       float32
bathroom_type                    object
neighbourhood                    object
neighbourhood_group_cleansed     object
longitude                        object
latitude                         object
price                           float32
Wifi                              uint8
Smoke alarm                       uint8
Carbon monoxide alarm             uint8
Kitchen                           uint8
Air conditioning                  uint8
TV                                uint8
Iron                              uint8
Essentials                        uint8
Hangers                           uint8
Shampoo                           uint8
Refrigerator                      uint8


In [127]:
# Encode data
encoded_df = pd.get_dummies(pandas_df, columns=['bathroom_type'], prefix=['bathroom_type'])
encoded_df.head()

Unnamed: 0,listing_id,listing_url,name,room_type,accommodates,bedrooms,beds,bathrooms,neighbourhood,neighbourhood_group_cleansed,...,Oven,Fire extinguisher,Coffee maker,Free street parking,First aid kit,Self check-in,Dedicated workspace,review_scores_rating,bathroom_type_private,bathroom_type_shared
0,52702018,https://www.airbnb.com/rooms/52702018,Rental unit in Queens · ★4.90 · 1 bedroom · 2 ...,Entire home/apt,4,1,2,1.0,East Elmhurst,Queens,...,0,0,1,1,0,1,0,4.9,1,0
1,784969376930125242,https://www.airbnb.com/rooms/784969376930125242,Home in Queens · ★4.97 · 3 bedrooms · 4 beds ·...,Entire home/apt,7,3,4,1.0,East Elmhurst,Queens,...,0,0,0,0,1,1,0,4.97,1,0
2,9919728,https://www.airbnb.com/rooms/9919728,Townhouse in Queens · ★4.33 · 3 bedrooms · 7 b...,Entire home/apt,12,3,7,1.0,Middle Village,Queens,...,0,0,0,0,0,0,0,4.33,1,0
3,53696990,https://www.airbnb.com/rooms/53696990,Vacation home in Queens · Studio · 2 beds · 1 ...,Entire home/apt,4,1,2,1.0,Middle Village,Queens,...,0,0,1,0,0,0,0,4.0,1,0
4,9920363,https://www.airbnb.com/rooms/9920363,Townhouse in Queens · ★4.25 · 5 bedrooms · 11 ...,Entire home/apt,16,5,11,2.5,Ridgewood,Queens,...,0,1,0,0,0,1,0,4.25,1,0


In [128]:
# Build the modelling frame from the encoded data without rows that contain
# missing values. DataFrame.dropna() returns a new frame rather than modifying
# `encoded_df`, so its result has to be used; previously it was discarded and
# `model_df` still contained the NaN rows.
model_df = encoded_df.dropna().copy()

In [129]:
import pandas as pd

# Write the modelling frame to a CSV file in the working directory.
# index=False keeps the pandas row index out of the file.
model_df.to_csv(path_or_buf='model_data.csv', index=False)


In [130]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is a deprecated path.
from IPython.display import display, HTML

# Render a link to the exported CSV file.
display(HTML('<a href="./model_data.csv" target="_blank">Download CSV File</a>'))


In [132]:
from google.colab import files

# Trigger a browser download of the exported CSV from the Colab runtime.
exported_csv_name = 'model_data.csv'
files.download(exported_csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>