In [None]:
pip install pyspark

In [None]:
pip install handyspark

In [None]:
import os
import pandas as pd
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.regression import RandomForestRegressor, LinearRegressionModel
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [None]:
# Visualization and display configuration for the notebook.
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Widen pandas' display limits so wide frames and long strings are not truncated.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

from matplotlib import rcParams
import seaborn as sns 
# Apply the seaborn theme and set the default figure size (set again via rcParams below).
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (18,4)})
rcParams['figure.figsize'] = 18,4

# Render figures inline, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Create (or reuse) a local Spark session that runs with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Linear-Regression-Airbnb_House_price")
    .getOrCreate()
)
spark

In [None]:
# Keep a handle on the SparkContext that backs the session, and display it.
sc = spark.sparkContext
sc

In [None]:
# Relative location of the listings CSV that is loaded below.
PATH = '../input/listingsold-datasetcsv/listings.csv'

In [None]:
# Load the listings CSV into a pandas DataFrame and preview the first rows.
# (pandas is already imported at the top of the notebook; this import is redundant.)
import pandas as pd
housing_df = pd.read_csv(PATH)
housing_df.head()

# Fill Missing Values with Pandas

In [None]:
# Names of all columns that contain at least one missing value.
null = [
    column
    for column in housing_df.columns
    if housing_df[column].isna().any()
]
null

In [None]:
# Fill gaps in each affected column: forward-fill first, then back-fill
# whatever is still missing at the start of the column.
for column in null:
    forward_filled = housing_df[column].ffill()
    housing_df[column] = forward_filled.bfill()

# Prepare Amenities Column

In [None]:
# Inspect the raw amenities column before cleaning it.
housing_df.amenities


In [None]:
# Remove the leading '{' and trailing '}' that wrap each amenities string.
# The vectorised .str accessor passes missing values through instead of
# raising like a per-element lambda would, and explicit column indexing avoids
# the pitfalls of attribute-style assignment.
housing_df['amenities'] = housing_df['amenities'].str.strip('{').str.strip('}')
# Show the first cleaned value (positional, so it does not depend on index labels).
housing_df['amenities'].iloc[0]

In [None]:
# Drop the double quotes around individual amenity names. A literal
# (non-regex) vectorised replace is used so missing values are passed through
# rather than raising inside a per-element lambda.
housing_df['amenities'] = housing_df['amenities'].str.replace('"', '', regex=False)
housing_df['amenities'].head()

In [None]:
# Count occurrences of each distinct full row of the frame.
# NOTE(review): this section is about the amenities column, so a per-column
# count may have been intended here - confirm.
housing_df.value_counts()

In [None]:
# Convert the cleaned pandas frame into a Spark DataFrame. From here on
# `housing_df` refers to the Spark DataFrame, not the pandas one.
# Alternative that reads the CSV directly with Spark (kept for reference):
#housing_df = spark.read.csv(path=PATH,header=True)
housing_df = spark.createDataFrame(housing_df)

In [None]:
# Print the column names and types of the Spark DataFrame.
housing_df.printSchema()

In [None]:
# Preview the first 10 values of the price column.
housing_df.select('price').show(10)

In [None]:
# Preview the first 5 rows across all columns.
housing_df.show(5)

In [None]:
# Columns to drop: these are considered irrelevant for predicting price, and
# square_feet consists of roughly 98% missing values.
lst = ['id','host_since','listing_url','host_id','square_feet']

In [None]:
# Drop every column named in `lst` and show the remaining data.
housing_df = housing_df.drop(*lst)
housing_df.show()


In [None]:
# Print the schema again to confirm which columns remain.
housing_df.printSchema()

In [None]:
# List the remaining column names.
housing_df.columns

In [None]:
# Display the Spark DataFrame object itself (its repr, not its rows).
housing_df

In [None]:
# Collect the Spark DataFrame back into pandas; `housing_df` is a pandas
# DataFrame again until it is converted back to Spark in a later cell.
housing_df = housing_df.toPandas()
housing_df.head()

In [None]:
# Fraction of missing values per column.
housing_df.isna().mean()

In [None]:
# Convert the pandas frame back into a Spark DataFrame and print its schema.
housing_df = spark.createDataFrame(housing_df)
housing_df.printSchema()

In [None]:
# Preview price next to property type for the first 10 rows.
housing_df.select('price','property_type').show(10)

In [None]:
import seaborn as sns

In [None]:
# Count listings per property_type, ordered by property_type in descending order.
result_df = housing_df.groupBy("property_type").count().sort("property_type", ascending=False)
result_df

In [None]:
# Bar chart of the per-property_type counts computed in `result_df`.
result_df.toPandas().plot.bar(x='property_type',figsize=(14, 6))

In [None]:
# Distribution of listing prices. `sns.distplot` is deprecated, so the
# equivalent histogram with a KDE overlay on a density scale is drawn with
# `sns.histplot` instead.
sns.histplot(x=housing_df.toPandas()['price'], kde=True, stat='density')

In [None]:
# Summary statistics for price, rounded to four decimal places.
price_summary = housing_df.describe().select(
    'summary',
    F.round("price", 4).alias("price"),
)
price_summary.show()

In [None]:
# Box plot of listing prices. The series is passed by keyword because the
# meaning of the first positional argument differs between seaborn versions.
sns.boxplot(x=housing_df.toPandas()['price'])

In [None]:
import matplotlib.pyplot as plt


In [None]:
# Price as a function of the overall review score.
plt.figure(figsize=(12,9))
# Collect only the two plotted columns, and only once, instead of converting
# the whole Spark DataFrame to pandas twice.
rating_price_pdf = housing_df.select('review_scores_rating', 'price').toPandas()
# x/y are passed by keyword: recent seaborn releases reject positional vectors.
sns.lineplot(x='review_scores_rating', y='price', data=rating_price_pdf)

In [None]:
# Distinct neighbourhood values. `.show()` returns None, so the DataFrame is
# bound first and shown afterwards; otherwise `nbc` would end up as None.
nbc = housing_df.select('neighbourhood').distinct()
nbc.show()

In [None]:
# The distinct neighbourhood values as an array, computed on the pandas side.
housing_df.toPandas().neighbourhood.unique()

In [None]:
# pandas copy of the Spark DataFrame, used by the plotting cells below.
df = housing_df.toPandas()
df.head()

In [None]:
# Neighbourhood names to plot individually, in plotting order.
area = [
    'Roslindale',
    'Jamaica Plain',
    'Mission Hill',
    'Fenway/Kenmore',
    'Back Bay',
    'Leather District',
    'Chinatown',
    'Hyde Park',
    'North End',
    'Roxbury',
    'South End',
    'Mattapan',
    'East Boston',
    'South Boston',
    'Charlestown',
    'West End',
    'Beacon Hill',
    'Theater District',
    'Downtown Crossing',
    'Downtown',
    'Financial District',
    'Government Center',
    'Allston-Brighton',
    'West Roxbury',
    'Chestnut Hill',
    'Dorchester',
    'Brookline',
    'Cambridge',
    'Somerville',
    'Harvard Square',
]

In [None]:
# One line plot per neighbourhood: price against the review accuracy score.
for neighbourhood_name in area:
    neighbourhood_pdf = df[df['neighbourhood'] == neighbourhood_name]
    # Create the figure inside the loop: plt.show() finishes the current
    # figure, so a single figure created before the loop would only size the
    # first plot.
    plt.figure(figsize=(12,9))
    # x/y are passed by keyword: recent seaborn releases reject positional vectors.
    sns.lineplot(x="review_scores_accuracy", y="price", data=neighbourhood_pdf)
    plt.title(neighbourhood_name)
    plt.show()

In [None]:
# Mean price by instant_bookable flag.
plt.figure(figsize=(12,9))
# x/y are passed by keyword: recent seaborn releases reject positional vectors.
sns.barplot(x="instant_bookable", y="price", data=df)

In [None]:

# Mean price by cancellation policy, split by review accuracy score.
plt.figure(figsize=(12,9))
# x/y/hue are passed by keyword: recent seaborn releases reject positional vectors.
sns.barplot(x="cancellation_policy", y="price", hue="review_scores_accuracy", data=df)

In [None]:
# Heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(15,10))
# Restrict to numeric/boolean columns explicitly: DataFrame.corr() no longer
# drops string columns silently in pandas 2.x and raises on them instead.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True)

In [None]:
# Look at a subset of the review-score columns side by side.
df[['review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication','review_scores_value']]

In [None]:
# Horizontal bar chart of each numeric feature's correlation with price,
# sorted ascending.
sns.set_style("darkgrid")
# Restrict to numeric/boolean columns explicitly: DataFrame.corr() no longer
# drops string columns silently in pandas 2.x and raises on them instead.
numeric_df = df.select_dtypes(include=["number", "bool"])
numeric_df.corr()["price"].drop('price').sort_values().plot.barh(figsize=(10,6), color="red")
plt.title(" Correlations With Price", {"color":"blue", "fontsize":15});

In [None]:
# Print the Spark schema again before feature preparation.
housing_df.printSchema()

In [None]:
# List all column names.
housing_df.columns

In [None]:
# All column names except the last one.
housing_df.columns[:-1]


In [None]:
# Names of every column whose Spark data type is StringType.
str_cols = [
    field.name
    for field in housing_df.schema.fields
    if isinstance(field.dataType, StringType)
]
str_cols

# Get Dummies for 'neighbourhood','property_type'

In [None]:
# Build one 0/1 indicator column per distinct neighbourhood value.
distinct_neighbourhoods = housing_df.select("neighbourhood").distinct()
categories = distinct_neighbourhoods.rdd.flatMap(lambda row: row).collect()

neighbourhood_col = F.col("neighbourhood")
exprs = [
    F.when(neighbourhood_col == category, 1).otherwise(0).alias(category)
    for category in categories
]

nbc = housing_df.select(*exprs)
nbc.show()

In [None]:
# Build one 0/1 indicator column per distinct property_type value.
distinct_property_types = housing_df.select("property_type").distinct()
categories = distinct_property_types.rdd.flatMap(lambda row: row).collect()

property_type_col = F.col("property_type")
exprs = [
    F.when(property_type_col == category, 1).otherwise(0).alias(category)
    for category in categories
]

pt = housing_df.select(*exprs)
pt.show()

# Modelling

In [None]:
# Column names available to the model.
housing_df.columns

In [None]:
# Replace empty-string values with the placeholder 'other'.
housing_df = housing_df.na.replace('','other')

In [None]:
# Cast the label column `price` to an integer type.
from pyspark.sql.types import IntegerType

price_as_int = F.col("price").cast(IntegerType())
housing_df = housing_df.withColumn("price", price_as_int)

In [None]:
# Feature pipeline: index the string columns, one-hot encode the indices,
# assemble everything into one feature vector and fit a random forest on it.

# String columns to encode. The intermediate (_ix) and encoded (_enc) column
# names are derived from this single list so the three lists cannot drift apart.
# NOTE(review): 'amenities' is a free-text list per listing, so one-hot
# encoding it may create a very wide, sparse feature - confirm this is intended.
categorical_cols = [
    'neighbourhood',
    'property_type',
    'room_type',
    'amenities',
    'instant_bookable',
    'cancellation_policy',
]
index_cols = [name + "_ix" for name in categorical_cols]
encoded_cols = [name + "_enc" for name in categorical_cols]

# handleInvalid="keep" puts unseen labels in an extra bucket instead of failing.
string_indexer = StringIndexer(inputCols=categorical_cols,
                               outputCols=index_cols,
                               handleInvalid="keep")

ohe_indexer = OneHotEncoder(inputCols=index_cols,
                            outputCols=encoded_cols,
                            handleInvalid="keep")

# Order of the inputs determines the layout of the assembled feature vector.
feature_cols = [
    'latitude',
    'longitude',
    'neighbourhood_enc',
    'property_type_enc',
    'room_type_enc',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'amenities_enc',
    'minimum_nights',
    'number_of_reviews',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'instant_bookable_enc',
    'cancellation_policy_enc',
    'calculated_host_listings_count',
    'reviews_per_month',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="Attributes")

# Fixed seed so the forest's random sampling is reproducible across runs.
regressor = RandomForestRegressor(featuresCol="Attributes", labelCol="price", seed=42)

pipeline = Pipeline(stages=[string_indexer, ohe_indexer, assembler, regressor])

# Creating Cross Validator

In [None]:
# Hyper-parameter search over the number of trees, scored with 3-fold
# cross-validation on the whole pipeline.
paramgrid = (
    ParamGridBuilder()
    .addGrid(regressor.numTrees, [50, 100, 500, 1000])
    .build()
)

cross_validator = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramgrid,
    # RMSE is the evaluator's default metric; it is spelled out so the
    # selection criterion is visible.
    evaluator=RegressionEvaluator(labelCol="price", metricName="rmse"),
    numFolds=3,
    # Fixed seed so the fold assignment is reproducible across runs.
    seed=42,
)

In [None]:
# 75/25 train/test split. A fixed seed keeps the split, and therefore the
# reported metrics, reproducible across runs.
train_data, test_data = housing_df.randomSplit([0.75, 0.25], seed=42)


In [None]:
# Run the cross-validated grid search on the training split.
cvmodel = cross_validator.fit(train_data)

# ASSESSMENT

In [None]:
# Print every stage of the best pipeline found by cross-validation.
bestmodel = cvmodel.bestModel
for stage in bestmodel.stages:
    print(stage)

In [None]:
# Score the held-out test split and compare actual price with the prediction.
pred = cvmodel.transform(test_data)
pred.select("price", "prediction").show()

In [None]:
# Evaluate the test-set predictions with several regression metrics.
# The evaluator is named `evaluator` rather than `eval` so it does not shadow
# Python's builtin eval() for the rest of the session.
evaluator = RegressionEvaluator(labelCol="price")
# Each metric is requested explicitly instead of relying on the default for RMSE.
rmse = evaluator.evaluate(pred, {evaluator.metricName: "rmse"})
mse = evaluator.evaluate(pred, {evaluator.metricName: "mse"})
mae = evaluator.evaluate(pred, {evaluator.metricName: "mae"})
r2 = evaluator.evaluate(pred, {evaluator.metricName: "r2"})

print("RMSE: %.3f" %rmse)
print("MSE : %.3f" %mse)
print("MAE : %.3f" %mae)
print("R2  : %.3f" %r2)