In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

'apt-get' is not recognized as an internal or external command,
operable program or batch file.


The system cannot find the path specified.
'wget' is not recognized as an internal or external command,
operable program or batch file.
tar: Error opening archive: Failed to open '$SPARK_VERSION-bin-hadoop3.tgz'


Exception: Unable to find py4j in /content/spark-3.4.0-bin-hadoop3\python, your SPARK_HOME may not be configured correctly

In [149]:
# Import packages
from pyspark.sql import SparkSession
import time

# Build the session used by the rest of the notebook: app name "SparkSQL",
# a raised maxToStringFields limit, and 2g of driver memory.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Shuffle partition count (4 or 8 are the suggested values; 8 is used here).
spark.conf.set("spark.sql.shuffle.partitions", 8)

In [150]:
# Load the three Inside Airbnb New York City files (2023-09-05 snapshot) over HTTP.
# Each gzipped CSV is registered with addFile() and then read from the local path
# that SparkFiles.get() reports for it.
from pyspark import SparkFiles

# Parser settings shared by all three reads.
csv_options = dict(sep=",", header=True, quote='"', multiLine=True, escape='"')

# Listings: one row per listing.
url_listings = "http://data.insideairbnb.com/united-states/ny/new-york-city/2023-09-05/data/listings.csv.gz"
spark.sparkContext.addFile(url_listings)
listings_df = spark.read.csv(SparkFiles.get("listings.csv.gz"), **csv_options)

# Calendar: per-listing, per-date availability and price.
url_calendar="http://data.insideairbnb.com/united-states/ny/new-york-city/2023-09-05/data/calendar.csv.gz"
spark.sparkContext.addFile(url_calendar)
calendar_df = spark.read.csv(SparkFiles.get("calendar.csv.gz"), **csv_options)

# Reviews: guest review comments keyed by listing_id.
url_reviews ="http://data.insideairbnb.com/united-states/ny/new-york-city/2023-09-05/data/reviews.csv.gz"
spark.sparkContext.addFile(url_reviews)
reviews_df = spark.read.csv(SparkFiles.get("reviews.csv.gz"), **csv_options)


In [151]:
# Look over the listings data.
# Prints a text preview of the first rows of the raw listings DataFrame.
listings_df.show()

+------------------+--------------------+--------------+------------+-----------+--------------------+--------------------+---------------------+--------------------+---------+--------------------+---------+----------+--------------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-------------------+-------------------------+------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+------------------+------------------+--------------------+---------------+------------+---------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+--------------

In [152]:
# Look over the data for calendar.
# Prints a text preview of the first rows of the raw calendar DataFrame.
calendar_df.show()

+----------+----------+---------+-------+--------------+--------------+--------------+
|listing_id|      date|available|  price|adjusted_price|minimum_nights|maximum_nights|
+----------+----------+---------+-------+--------------+--------------+--------------+
|      2595|2023-09-05|        t|$240.00|       $240.00|            30|          1125|
|      2595|2023-09-06|        t|$240.00|       $240.00|            30|          1125|
|      2595|2023-09-07|        t|$240.00|       $240.00|            30|          1125|
|      2595|2023-09-08|        t|$240.00|       $240.00|            30|          1125|
|      2595|2023-09-09|        t|$240.00|       $240.00|            30|          1125|
|      2595|2023-09-10|        t|$240.00|       $240.00|            30|          1125|
|      2595|2023-09-11|        t|$240.00|       $240.00|            30|          1125|
|      2595|2023-09-12|        t|$240.00|       $240.00|            30|          1125|
|      2595|2023-09-13|        t|$240.00|  

In [153]:
# Look over the review data.
# Prints a text preview of the first rows of the raw reviews DataFrame.
reviews_df.show()

+----------+--------+----------+-----------+-------------+--------------------+
|listing_id|      id|      date|reviewer_id|reviewer_name|            comments|
+----------+--------+----------+-----------+-------------+--------------------+
|      2595|   17857|2009-11-21|      50679|         Jean|Notre séjour de t...|
|      2595|   19176|2009-12-05|      53267|         Cate|   Great experience.|
|      2595|   19760|2009-12-10|      38960|        Anita|I've stayed with ...|
|      2595|   34320|2010-04-09|      71130|      Kai-Uwe|We've been stayin...|
|      2595|   46312|2010-05-25|     117113|       Alicia|We had a wonderfu...|
|      2595| 1238204|2012-05-07|    1783688|       Sergey|Hi to everyone!\r...|
|      2595| 1293632|2012-05-17|    1870771|         Loïc|Jennifer was very...|
|      2595| 2022498|2012-08-18|    2124102|      Melanie|This apartment is...|
|      2595| 4682989|2013-05-20|     496053|         Eric|Jennifer's place ...|
|      2595|13193832|2014-05-21|   13685

In [154]:
import pandas as pd

In [155]:
# Row count of the raw listings DataFrame.
listings_df.count()

39453

In [156]:
# Row count of the raw calendar DataFrame.
calendar_df.count()

14399996

In [157]:
# Row count of the raw reviews DataFrame.
reviews_df.count()

1019573

In [158]:
# Count how many distinct values appear in the 'neighbourhood_cleansed' column.
unique_neighbourhoods = listings_df.select('neighbourhood_cleansed').distinct()
unique_neighbourhoods_count = unique_neighbourhoods.count()
print("Count of unique neighbourhoods:", unique_neighbourhoods_count)

Count of unique neighbourhoods: 223


In [159]:
# Values to keep when filtering listings. Despite the name, these are matched
# against the 'neighbourhood_group_cleansed' column in the filter that follows.
accepted_neighborhoods = ["Manhattan", "Queens", "Brooklyn"]

In [160]:
# Keep only listings whose 'neighbourhood_group_cleansed' is one of the accepted values.
filtered_listings_df = listings_df.filter(
    listings_df['neighbourhood_group_cleansed'].isin(accepted_neighborhoods)
)


In [161]:
# Preview the listings that passed the filter.
filtered_listings_df.show()

+------------------+--------------------+--------------+------------+-----------+--------------------+--------------------+---------------------+--------------------+---------+--------------------+---------------+----------+--------------------+--------------------+------------------+------------------+--------------------+-----------------+--------------------+--------------------+------------------+-------------------+-------------------------+--------------------+--------------------+----------------------+--------------------+----------------------+----------------------------+------------------+------------------+--------------------+---------------+------------+---------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+----------------+----------------+---------------+---------------+---------------+------

In [162]:
# Row count after the filter.
filtered_listings_df.count()

37609

In [163]:
# Columns to retain from each DataFrame in the projection step that follows.
# Listings: identity, host, location, room details, amenities, price and review fields.
listing_columns = ['id','listing_url','name','host_id','host_url','host_name','host_since','host_is_superhost','host_listings_count','host_total_listings_count','neighbourhood_cleansed','neighbourhood_group_cleansed','latitude','longitude','room_type','accommodates','bathrooms_text','bedrooms','beds','amenities','price','minimum_nights','maximum_nights','has_availability','number_of_reviews','number_of_reviews_ltm','number_of_reviews_l30d','first_review','last_review','review_scores_rating','review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value','calculated_host_listings_count','reviews_per_month']
# Calendar: omits minimum_nights / maximum_nights, which the raw calendar preview shows.
calendars_columns = ['listing_id','date','available','price','adjusted_price']
# Reviews: the same six columns the raw reviews preview shows.
reviews_columns = ['listing_id','id','date','reviewer_id','reviewer_name','comments']

In [164]:
# Project each DataFrame down to the columns chosen above.
column_listings_df = filtered_listings_df.select(*listing_columns)
column_calendars_df = calendar_df.select(*calendars_columns)
column_reviews_df = reviews_df.select(*reviews_columns)

In [165]:
# Rename 'id' to 'listing_id' so the listings key has the same name as the
# 'listing_id' column in the calendar and reviews DataFrames.
column_listings_df = column_listings_df.withColumnRenamed('id', 'listing_id')

In [166]:
# Preview the projected listings with the renamed key column.
column_listings_df.show()

+------------------+--------------------+--------------------+---------+--------------------+---------------+----------+-----------------+-------------------+-------------------------+----------------------+----------------------------+------------------+------------------+---------------+------------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+------------------------------+-----------------+
|        listing_id|         listing_url|                name|  host_id|            host_url|      host_name|host_since|host_is_superhost|host_listings_count|host_total_listings_count|neighbourhood_cleansed|neighbourhood_group_cleansed|          latitude|         longitude|      room_type|a

In [167]:
from pyspark.sql import SparkSession


In [168]:
# Get the Spark session via getOrCreate() under the app name "example"
spark = SparkSession.builder.appName("example").getOrCreate()

# Drop listings that have a null value in the 'first_review' column
# (the subset below is 'first_review', not 'listing_id').
column_listings_df = column_listings_df.dropna(subset=['first_review'])

# Show the DataFrame after dropping null values
column_listings_df.show()

+------------------+--------------------+--------------------+---------+--------------------+---------------+----------+-----------------+-------------------+-------------------------+----------------------+----------------------------+------------------+------------------+---------------+------------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+------------------------------+-----------------+
|        listing_id|         listing_url|                name|  host_id|            host_url|      host_name|host_since|host_is_superhost|host_listings_count|host_total_listings_count|neighbourhood_cleansed|neighbourhood_group_cleansed|          latitude|         longitude|      room_type|a

In [169]:
# Row count after dropping listings without a 'first_review' value.
column_listings_df.count()

27708

In [170]:
# Drop reviews whose 'comments' value is null, then preview the result.
column_reviews_df = column_reviews_df.dropna(subset=['comments'])
column_reviews_df.show()

+----------+--------+----------+-----------+-------------+--------------------+
|listing_id|      id|      date|reviewer_id|reviewer_name|            comments|
+----------+--------+----------+-----------+-------------+--------------------+
|      2595|   17857|2009-11-21|      50679|         Jean|Notre séjour de t...|
|      2595|   19176|2009-12-05|      53267|         Cate|   Great experience.|
|      2595|   19760|2009-12-10|      38960|        Anita|I've stayed with ...|
|      2595|   34320|2010-04-09|      71130|      Kai-Uwe|We've been stayin...|
|      2595|   46312|2010-05-25|     117113|       Alicia|We had a wonderfu...|
|      2595| 1238204|2012-05-07|    1783688|       Sergey|Hi to everyone!\r...|
|      2595| 1293632|2012-05-17|    1870771|         Loïc|Jennifer was very...|
|      2595| 2022498|2012-08-18|    2124102|      Melanie|This apartment is...|
|      2595| 4682989|2013-05-20|     496053|         Eric|Jennifer's place ...|
|      2595|13193832|2014-05-21|   13685

In [171]:
# Row count after dropping reviews with null comments.
column_reviews_df.count()

1019573

# Clean lists of amenities

In [172]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import json

# Get the Spark session via getOrCreate() under the app name "DataFrameCleanAmenities"
spark = SparkSession.builder.appName("DataFrameCleanAmenities").getOrCreate()

# Start the amenities clean-up from a full projection of 'column_listings_df'
amenities_replaced_df = column_listings_df.select('*')

# Canonical amenity names: any amenity entry that contains one of these
# (case-insensitively) is replaced by it in replace_amenities().
amenities_to_replace = ["Wifi", "TV", "Oven", "Stove", "Soap", "Shampoo", "Conditioner", "Sound system", "Refrigerator", "Backyard", "Patio", "BBQ grill",
                        "Free parking", "Paid parking", "Free street parking", "Paid street parking"]

In [173]:
# Define a UDF to perform the amenities replacement
def replace_amenities(amenities):
    """Collapse verbose amenity labels to their canonical names.

    Args:
        amenities: JSON-encoded list of amenity strings, or None for a null cell.

    Returns:
        A JSON-encoded list in which every entry that contains
        (case-insensitively) one of the names in the module-level
        ``amenities_to_replace`` is replaced by that name. Returns None when
        the input is None.
    """
    # Null cells reach a Python UDF as None; json.loads(None) would raise
    # TypeError and fail the whole job, so pass nulls through unchanged.
    if amenities is None:
        return None

    amenities_list = json.loads(amenities)
    for i, amenity in enumerate(amenities_list):
        amenity_lower = amenity.lower()
        for amenity_to_replace in amenities_to_replace:
            # No break on purpose: every name is tested against the original
            # entry, so when several names match, the last one in the list wins.
            if amenity_to_replace.lower() in amenity_lower:
                amenities_list[i] = amenity_to_replace
    return json.dumps(amenities_list)

# Register the UDF
# Wraps replace_amenities as a Spark UDF declared to return a string.
replace_amenities_udf = udf(replace_amenities, StringType())

# Apply the UDF to replace amenities
# Overwrites the 'amenities' column with the UDF's result.
amenities_replaced_df = amenities_replaced_df.withColumn('amenities', replace_amenities_udf('amenities'))

# Print or display the modified DataFrame
amenities_replaced_df.show()

+------------------+--------------------+--------------------+---------+--------------------+---------------+----------+-----------------+-------------------+-------------------------+----------------------+----------------------------+------------------+------------------+---------------+------------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+------------------------------+-----------------+
|        listing_id|         listing_url|                name|  host_id|            host_url|      host_name|host_since|host_is_superhost|host_listings_count|host_total_listings_count|neighbourhood_cleansed|neighbourhood_group_cleansed|          latitude|         longitude|      room_type|a

In [174]:
# Define the string to find and the string to replace it with
# The trailing space in "AC " is part of the search text; replace_string()
# lower-cases both sides before matching.
string_to_find = "AC "
replacement_string = "Air conditioning"

# Define a UDF to perform the string replacement
def replace_string(amenities):
    """Replace every amenity entry that contains the search text.

    Args:
        amenities: JSON-encoded list of amenity strings, or None for a null cell.

    Returns:
        A JSON-encoded list in which each entry containing the module-level
        ``string_to_find`` (case-insensitive) is replaced wholesale by
        ``replacement_string``. Returns None when the input is None.
    """
    # Null cells reach a Python UDF as None; json.loads(None) would raise
    # TypeError and fail the whole job, so pass nulls through unchanged.
    if amenities is None:
        return None

    needle = string_to_find.lower()
    return json.dumps([
        replacement_string if needle in amenity.lower() else amenity
        for amenity in json.loads(amenities)
    ])

# Register the UDF
# Wraps replace_string as a Spark UDF declared to return a string.
replace_string_udf = udf(replace_string, StringType())

# Apply the UDF to replace strings
# Overwrites the 'amenities' column with the UDF's result.
amenities_replaced_df = amenities_replaced_df.withColumn('amenities', replace_string_udf('amenities'))

# Print or display the modified DataFrame
amenities_replaced_df.show()

+------------------+--------------------+--------------------+---------+--------------------+---------------+----------+-----------------+-------------------+-------------------------+----------------------+----------------------------+------------------+------------------+---------------+------------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+------------------------------+-----------------+
|        listing_id|         listing_url|                name|  host_id|            host_url|      host_name|host_since|host_is_superhost|host_listings_count|host_total_listings_count|neighbourhood_cleansed|neighbourhood_group_cleansed|          latitude|         longitude|      room_type|a

In [175]:
# Create a copy of the DataFrame
# select('*') projects every column of amenities_replaced_df into a new DataFrame.
amenities_cleaned_df = amenities_replaced_df.select('*')

# Define a UDF to perform the amenities cleaning
def clean_amenities(amenities):
    """Trim each amenity to the text before its first colon.

    Args:
        amenities: JSON-encoded list of amenity strings, or None for a null cell.

    Returns:
        A JSON-encoded list in which each entry is cut at its first ':' (when
        present) and stripped of surrounding whitespace. Returns None when the
        input is None.
    """
    # Null cells reach a Python UDF as None; json.loads(None) would raise
    # TypeError and fail the whole job, so pass nulls through unchanged.
    if amenities is None:
        return None

    return json.dumps([
        # Keep only the part before the first colon, e.g. "BBQ grill: gas" -> "BBQ grill".
        amenity.split(':')[0].strip()
        for amenity in json.loads(amenities)
    ])

# Register the UDF
# Wraps clean_amenities as a Spark UDF declared to return a string.
clean_amenities_udf = udf(clean_amenities, StringType())

# Apply the UDF to clean amenities
# Overwrites the 'amenities' column with the UDF's result.
amenities_cleaned_df = amenities_cleaned_df.withColumn('amenities', clean_amenities_udf('amenities'))

# Print or display the modified DataFrame
amenities_cleaned_df.show()

+------------------+--------------------+--------------------+---------+--------------------+---------------+----------+-----------------+-------------------+-------------------------+----------------------+----------------------------+------------------+------------------+---------------+------------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+------------------------------+-----------------+
|        listing_id|         listing_url|                name|  host_id|            host_url|      host_name|host_since|host_is_superhost|host_listings_count|host_total_listings_count|neighbourhood_cleansed|neighbourhood_group_cleansed|          latitude|         longitude|      room_type|a

In [176]:
# From here on, column_listings_df refers to the listings with cleaned amenities.
column_listings_df = amenities_cleaned_df
column_listings_df.show()

+------------------+--------------------+--------------------+---------+--------------------+---------------+----------+-----------------+-------------------+-------------------------+----------------------+----------------------------+------------------+------------------+---------------+------------+--------------+--------+----+--------------------+-------+--------------+--------------+----------------+-----------------+---------------------+----------------------+------------+-----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+------------------------------+-----------------+
|        listing_id|         listing_url|                name|  host_id|            host_url|      host_name|host_since|host_is_superhost|host_listings_count|host_total_listings_count|neighbourhood_cleansed|neighbourhood_group_cleansed|          latitude|         longitude|      room_type|a

In [177]:
# Preview the projected calendar rows; prices are still "$"-prefixed strings here.
column_calendars_df.show()

+----------+----------+---------+-------+--------------+
|listing_id|      date|available|  price|adjusted_price|
+----------+----------+---------+-------+--------------+
|      2595|2023-09-05|        t|$240.00|       $240.00|
|      2595|2023-09-06|        t|$240.00|       $240.00|
|      2595|2023-09-07|        t|$240.00|       $240.00|
|      2595|2023-09-08|        t|$240.00|       $240.00|
|      2595|2023-09-09|        t|$240.00|       $240.00|
|      2595|2023-09-10|        t|$240.00|       $240.00|
|      2595|2023-09-11|        t|$240.00|       $240.00|
|      2595|2023-09-12|        t|$240.00|       $240.00|
|      2595|2023-09-13|        t|$240.00|       $240.00|
|      2595|2023-09-14|        t|$240.00|       $240.00|
|      2595|2023-09-15|        t|$240.00|       $240.00|
|      2595|2023-09-16|        t|$240.00|       $240.00|
|      2595|2023-09-17|        t|$240.00|       $240.00|
|      2595|2023-09-18|        t|$240.00|       $240.00|
|      2595|2023-09-19|        

In [187]:
from pyspark.sql import functions as F

# Turn the currency strings in column_calendars_df into floats: strip every
# "$" and "," character from 'price' and 'adjusted_price', then cast.
clean_column_calendars_df = (
    column_calendars_df
    .withColumn("price", F.regexp_replace(F.col("price"), "[$,]", "").cast("float"))
    .withColumn("adjusted_price", F.regexp_replace(F.col("adjusted_price"), "[$,]", "").cast("float"))
)

# Preview the result to confirm both columns are now numeric.
clean_column_calendars_df.show()


+----------+----------+---------+-----+--------------+
|listing_id|      date|available|price|adjusted_price|
+----------+----------+---------+-----+--------------+
|      2595|2023-09-05|        t|240.0|         240.0|
|      2595|2023-09-06|        t|240.0|         240.0|
|      2595|2023-09-07|        t|240.0|         240.0|
|      2595|2023-09-08|        t|240.0|         240.0|
|      2595|2023-09-09|        t|240.0|         240.0|
|      2595|2023-09-10|        t|240.0|         240.0|
|      2595|2023-09-11|        t|240.0|         240.0|
|      2595|2023-09-12|        t|240.0|         240.0|
|      2595|2023-09-13|        t|240.0|         240.0|
|      2595|2023-09-14|        t|240.0|         240.0|
|      2595|2023-09-15|        t|240.0|         240.0|
|      2595|2023-09-16|        t|240.0|         240.0|
|      2595|2023-09-17|        t|240.0|         240.0|
|      2595|2023-09-18|        t|240.0|         240.0|
|      2595|2023-09-19|        t|240.0|         240.0|
|      259

In [188]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, dayofweek
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

In [189]:
# Initialize Spark Session
spark = SparkSession.builder \
    .appName("AirbnbPricePrediction") \
    .getOrCreate()


In [190]:
# Prepare the calendar rows for modelling, as one chained transformation.
clean_column_calendars_df = (
    clean_column_calendars_df
    # Price columns as float
    .withColumn("price", col("price").cast("float"))
    .withColumn("adjusted_price", col("adjusted_price").cast("float"))
    # Discard rows where either price is null
    .dropna(subset=['price', 'adjusted_price'])
    # 'available' as a number (1 for 't', 0 for 'f')
    .withColumn("available", (col("available") == "t").cast("integer"))
    # 'date' from string to date type
    .withColumn("date", col("date").cast("date"))
    # Calendar features derived from the date
    .withColumn("month", month("date"))
    .withColumn("day_of_week", dayofweek("date"))
)


In [191]:
# Features and Assembler
# The three numeric inputs are packed into a single 'features' vector column.
feature_columns = ['month', 'day_of_week', 'available']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Model
# Random forest predicting 'price'. The seed is fixed so that the forest's
# random sampling, and therefore the fitted model, is repeatable across runs.
rf = RandomForestRegressor(labelCol="price", featuresCol="features", seed=42)

# Pipeline
# Assemble the features first, then fit/apply the regressor.
pipeline = Pipeline(stages=[assembler, rf])


In [194]:
# Split data into training and test sets
# 80/20 split with a fixed seed so the same rows land in each set on every
# run; without it the reported RMSE changes from run to run.
train_data, test_data = clean_column_calendars_df.randomSplit([0.8, 0.2], seed=42)

# Fit model to training data
model = pipeline.fit(train_data)

# Make predictions on test data
# Adds the 'features' and 'prediction' columns to the test rows.
predictions = model.transform(test_data)

# Evaluate the model
# RMSE between the actual 'price' and the model's 'prediction'.
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Root Mean Squared Error (RMSE) on test data = 560.792


In [211]:
# Print up to 400 rows of the prepared calendar data (includes 'month' and 'day_of_week').
clean_column_calendars_df.show(400)

+----------+----------+---------+-----+--------------+-----+-----------+
|listing_id|      date|available|price|adjusted_price|month|day_of_week|
+----------+----------+---------+-----+--------------+-----+-----------+
|      2595|2023-09-05|        1|240.0|         240.0|    9|          3|
|      2595|2023-09-06|        1|240.0|         240.0|    9|          4|
|      2595|2023-09-07|        1|240.0|         240.0|    9|          5|
|      2595|2023-09-08|        1|240.0|         240.0|    9|          6|
|      2595|2023-09-09|        1|240.0|         240.0|    9|          7|
|      2595|2023-09-10|        1|240.0|         240.0|    9|          1|
|      2595|2023-09-11|        1|240.0|         240.0|    9|          2|
|      2595|2023-09-12|        1|240.0|         240.0|    9|          3|
|      2595|2023-09-13|        1|240.0|         240.0|    9|          4|
|      2595|2023-09-14|        1|240.0|         240.0|    9|          5|
|      2595|2023-09-15|        1|240.0|         240

In [212]:
# Print up to 400 test rows with their 'features' vector and predicted price.
predictions.show(400)

+----------+----------+---------+-----+--------------+-----+-----------+--------------+------------------+
|listing_id|      date|available|price|adjusted_price|month|day_of_week|      features|        prediction|
+----------+----------+---------+-----+--------------+-----+-----------+--------------+------------------+
|  10000070|2023-09-06|        0| 85.0|          85.0|    9|          4| [9.0,4.0,0.0]|207.34916592608914|
|  10000070|2023-09-07|        0| 85.0|          85.0|    9|          5| [9.0,5.0,0.0]|207.46949479551216|
|  10000070|2023-09-12|        0| 85.0|          85.0|    9|          3| [9.0,3.0,0.0]| 207.4552619380791|
|  10000070|2023-09-21|        0| 85.0|          85.0|    9|          5| [9.0,5.0,0.0]|207.46949479551216|
|  10000070|2023-09-23|        0| 85.0|          85.0|    9|          7| [9.0,7.0,0.0]|211.99827253752068|
|  10000070|2023-10-04|        0| 85.0|          85.0|   10|          4|[10.0,4.0,0.0]|202.67368802204996|
|  10000070|2023-10-05|        0| 85.

In [203]:
# List the contents of /content.
!ls /content/

model			     spark-3.4.0-bin-hadoop3.tgz.1
sample_data		     spark-3.4.0-bin-hadoop3.tgz.2
spark-3.4.0-bin-hadoop3      spark-3.4.0-bin-hadoop3.tgz.3
spark-3.4.0-bin-hadoop3.tgz  spark-3.4.0-bin-hadoop3.tgz.4


In [209]:
!zip -r /content/model.zip /content/model


  adding: content/model/ (stored 0%)
  adding: content/model/stages/ (stored 0%)
  adding: content/model/stages/0_VectorAssembler_c5aebd56572a/ (stored 0%)
  adding: content/model/stages/0_VectorAssembler_c5aebd56572a/metadata/ (stored 0%)
  adding: content/model/stages/0_VectorAssembler_c5aebd56572a/metadata/part-00000 (deflated 35%)
  adding: content/model/stages/0_VectorAssembler_c5aebd56572a/metadata/.part-00000.crc (stored 0%)
  adding: content/model/stages/0_VectorAssembler_c5aebd56572a/metadata/_SUCCESS (stored 0%)
  adding: content/model/stages/0_VectorAssembler_c5aebd56572a/metadata/._SUCCESS.crc (stored 0%)
  adding: content/model/stages/1_RandomForestRegressor_9b4d875bf9f9/ (stored 0%)
  adding: content/model/stages/1_RandomForestRegressor_9b4d875bf9f9/data/ (stored 0%)
  adding: content/model/stages/1_RandomForestRegressor_9b4d875bf9f9/data/_SUCCESS (stored 0%)
  adding: content/model/stages/1_RandomForestRegressor_9b4d875bf9f9/data/.part-00000-f912abf2-407e-422f-add8-a2733

In [210]:
# Download the zipped model through the Colab files helper.
# Depends on google.colab, so this cell presumably only works on Colab.
from google.colab import files
files.download("/content/model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [80]:
# End the Spark session
# After this call, 'spark' can no longer be used to run the cells above.
spark.stop()

In [None]:
# Don't forget to stop the Spark session when you're done
# NOTE(review): this repeats the spark.stop() call of the preceding cell.
spark.stop()