In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyspark

In [None]:
!pip show pyspark

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

In [None]:
# Start (or reuse) a Spark session running locally; the "local[*]" master asks
# for one worker thread per available core.
# Note: despite its name, `sc` holds a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
# Load the car dataset: the first line of the file supplies the column names
# and Spark inspects the values to infer each column's type.
data = sc.read.csv(
    '/kaggle/input/cardataset/data.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Print the column names, inferred types and nullability of the loaded frame.
data.printSchema()

In [None]:
# Preview a few rows only. Limiting in Spark first means just these rows are
# collected to the driver, instead of converting the entire dataset to pandas
# merely to render a truncated table.
data.limit(10).toPandas()

In [None]:
# Missing values per column, computed on the pandas side and listed with the
# most incomplete columns first. This collects the whole dataset to the driver.
(
    data.toPandas()
    .isnull()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
from pyspark.sql.functions import isnan, when, count, col

# Count missing values per column with Spark itself.
# NaN is a floating-point concept, so isnan() is applied only to float/double
# columns. Calling it on string or integer columns depends on an implicit cast
# to double, which can be rejected (e.g. under ANSI SQL mode) and is
# meaningless for text. Every other column is checked for NULL only.
float_cols = {name for name, dtype in data.dtypes if dtype in ('float', 'double')}

data.select([
    count(
        when(
            (isnan(col(c)) | col(c).isNull()) if c in float_cols else col(c).isNull(),
            c,
        )
    ).alias(c)
    for c in data.columns
]).show()

In [None]:
# Remove the 'Market Category' column, then discard every row that still
# contains a null in any remaining column.
data = data.drop('Market Category').dropna()

# Report the resulting (row count, column count).
print((
    data.count(),
    len(data.columns),
))

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

In [None]:
# Pack the numeric predictor columns into a single vector column, 'Attributes'.
assembler = VectorAssembler(
    inputCols=[
        'Year',
        'Engine HP',
        'Engine Cylinders',
        'Number of Doors',
        'highway MPG',
        'city mpg',
        'Popularity',
    ],
    outputCol='Attributes',
)

# Random forest that predicts MSRP from the assembled 'Attributes' vector.
regressor = RandomForestRegressor(labelCol='MSRP', featuresCol='Attributes')

In [None]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[assembler, regressor])
pipeline.write().overwrite().save("pipeline")
!ls

In [None]:
# Read the saved pipeline definition back from ./pipeline.
# Note: despite its name, `pipelineModel` is an unfitted Pipeline (estimator);
# it has to be fitted before it can produce predictions.
pipelineModel = Pipeline.read().load("pipeline")

In [None]:
# Randomly assign rows to training and test sets with 0.8 / 0.2 weights;
# the fixed seed is there to keep the split repeatable.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=123)

# Fit the pipeline on the training rows, which yields the fitted model ...
model = pipelineModel.fit(train_data)

# ... and use it to add a 'prediction' column to the held-out rows.
predictions = model.transform(test_data)

In [None]:
# Display the true price (MSRP) next to the model's prediction for a sample of
# test rows; show() prints only the first rows of the result (20 by default).
predictions.select("MSRP", "prediction").show()

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Score the test-set predictions against the true MSRP values. "rmse" is this
# evaluator's default metric; it is spelled out so the printed label and the
# computed metric visibly agree.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="MSRP",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

print("RMSE ", rmse)