In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install pyspark

In [None]:
## Switch off the Jedi-based completer (Completer.use_jedi = False).
## NOTE(review): presumably done so tab completion ("intellisense") responds
## in this environment; confirm it is still needed.
%config Completer.use_jedi = False

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

from matplotlib import rcParams
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (18,4)})
rcParams['figure.figsize'] = 18,4

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
# The seed function must be *called*: assigning to np.random.seed (or
# np.random.set_state) would replace the function with an int and seed nothing.
rnd_seed = 23
np.random.seed(rnd_seed)

In [None]:
# Start (or reuse) a Spark session running locally with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Used-cars-data-wrangling")
    .getOrCreate()
)
spark

Load The Data From a File Into a Dataframe

In [None]:
USED_CAR_DATA = '/kaggle/input/craigslist-carstrucks-data/vehicles.csv'

# Schema describing one line of the CSV data file.
# Field names match the column names used by the rest of the notebook
# ('region', 'price', 'year', ...); the previous names carried stray prefixes
# such as "text_format", "grid_3x3" and "link" fused onto the real names.
# lat/long are coordinates, so both are typed as doubles.
# NOTE(review): the CSV load in the next cell does not pass this schema to the
# reader, so it currently serves as documentation only.
schema = StructType([
    StructField("id", LongType(), nullable=True),
    StructField("url", StringType(), nullable=True),
    StructField("region", StringType(), nullable=True),
    StructField("region_url", StringType(), nullable=True),
    StructField("price", FloatType(), nullable=True),
    StructField("year", IntegerType(), nullable=True),
    StructField("manufacturer", StringType(), nullable=True),
    StructField("model", StringType(), nullable=True),
    StructField("condition", StringType(), nullable=True),
    StructField("cylinders", StringType(), nullable=True),
    StructField("fuel", StringType(), nullable=True),
    StructField("odometer", IntegerType(), nullable=True),
    StructField("title_status", StringType(), nullable=True),
    StructField("transmission", StringType(), nullable=True),
    StructField("VIN", StringType(), nullable=True),
    StructField("drive", StringType(), nullable=True),
    StructField("size", StringType(), nullable=True),
    StructField("type", StringType(), nullable=True),
    StructField("paint_color", StringType(), nullable=True),
    StructField("image_url", StringType(), nullable=True),
    StructField("description", StringType(), nullable=True),
    StructField("state", StringType(), nullable=True),
    StructField("lat", DoubleType(), nullable=True),
    StructField("long", DoubleType(), nullable=True),
    StructField("posting_date", StringType(), nullable=True),
])

In [None]:
# Read the listings CSV (header row, comma separated) and cache the frame.
# NOTE(review): no multiLine/escape options are set; if the free-text columns
# contain embedded newlines or quotes, rows will be split incorrectly - confirm.
csv_reader = spark.read.format('csv').options(header='true', delimiter=',')
cars_df = csv_reader.load(USED_CAR_DATA).cache()

# Quick look at the raw rows and the schema Spark assigned
cars_df.show(20)
cars_df.printSchema()

# Narrower preview restricted to the columns of interest
preview_columns = [
    'region', 'price', 'year', 'manufacturer', 'model', 'condition', 'cylinders',
    'fuel', 'odometer', 'title_status', 'transmission', 'drive', 'size', 'state',
    'type', 'paint_color', 'posting_date', 'image_url',
]
cars_df.select(*preview_columns).show(10)
cars_df.count()

In [None]:
# Count the listings for each `condition` value, ordered by condition descending
condition_counts = cars_df.groupBy("condition").count()
result_df = condition_counts.orderBy("condition", ascending=False)
result_df.show(101)

In [None]:
# Keep only the rows whose condition is one of the recognised labels
conditions = ["good", "fair", "excellent", "new", "like new", "salvage"]
has_known_condition = cars_df["condition"].isin(conditions)
condition_result_not_null_df = cars_df.filter(has_known_condition)
condition_result_not_null_df.show(10)
condition_result_not_null_df.count()


In [None]:
# Rows where no condition was recorded
condition_result_null_df = cars_df.where(cars_df["condition"].isNull())
condition_result_null_df.count()

# Missing-condition rows followed by the rows that have a recognised condition
condition_result_df = condition_result_null_df.union(condition_result_not_null_df)
condition_result_df.count()

In [None]:
# Manufacturer names accepted as valid; rows with any other value are dropped.
manufacturer_list = [
    'alfa-romeo', 'aston-martin', 'audi', 'bmw', 'buick', 'cadillac',
    'chevrolet', 'chrysler', 'datsun', 'dodge', 'ferrari', 'fiat', 'ford',
    'gmc', 'harley-davidson', 'hennessey', 'honda', 'hyundai', 'infiniti',
    'jaguar', 'jeep', 'kia', 'land rover', 'lexus', 'lincoln', 'mazda',
    'mercedes-benz', 'mercury', 'mini', 'mitsubishi', 'morgan', 'nissan',
    'pontiac', 'porsche', 'ram', 'rover', 'saturn', 'subaru', 'tesla',
    'toyota', 'volkswagen', 'volvo',
]

# 'price', 'year' - Numeric
# 'manufacturer', 'condition', 'cylinders','fuel', 'odometer', 'transmission','drive', 'type', 'paint_color' - Categorical

# Restrict to listings with a recognised manufacturer and cache the result
is_known_manufacturer = cars_df["manufacturer"].isin(manufacturer_list)
manufacturer_df = cars_df.filter(is_known_manufacturer).cache()

manufacturer_df.show(10)
manufacturer_df.count()

In [None]:
# Listing count per manufacturer, in alphabetical order of manufacturer
per_manufacturer = manufacturer_df.groupBy("manufacturer").count()
manufacturer_df_list = per_manufacturer.orderBy("manufacturer", ascending=True)
manufacturer_df_list.show()

Which manufacturer is most popular?

In [None]:
# Bar chart of the listing count per manufacturer
manufacturer_counts_pd = manufacturer_df_list.toPandas()
manufacturer_counts_pd.plot.bar(x='manufacturer', figsize=(18, 6))

In [None]:
# Summary statistics from describe(), rounded for display:
# price and lat/long to 2 decimals, year and odometer to whole numbers.
summary_df = manufacturer_df.describe()
rounded_columns = [
    F.round("price", 2).alias("price"),
    F.round("year", 0).alias("year"),
    F.round("odometer", 0).alias("odometer"),
    F.round("lat", 2).alias("lat"),
    F.round("long", 2).alias("long"),
]
summary_df.select("summary", *rounded_columns).show()

In [None]:
# Listing count per cylinder value; a missing value is labelled "None"
cylinder_counts = manufacturer_df.groupBy("cylinders").count()
cylinders_df = cylinder_counts.orderBy("cylinders", ascending=False).fillna("None")
cylinders_pd = cylinders_df.toPandas()
cylinders_pd.plot.bar(x='cylinders', figsize=(14, 6))

In [None]:
# Listing count per fuel type; a missing value is labelled "None"
fuel_df = manufacturer_df.groupBy("fuel").count().sort("fuel", ascending=False)
fuel_df = fuel_df.na.fill("None")
fuel_df.show()
fuel_df.count()
fuel_df.toPandas().plot.bar(x='fuel',figsize=(14, 6))

# Listing count per transmission type; a missing value is labelled "None"
transmission_df = manufacturer_df.groupBy("transmission").count().sort("transmission", ascending=False)
transmission_df = transmission_df.na.fill("None")
transmission_df.show()
transmission_df.count()
transmission_df.toPandas().plot.bar(x='transmission',figsize=(14, 6))

# Listing count per paint colour; a missing value is labelled "None"
paint_color_df = manufacturer_df.groupBy("paint_color").count().sort("paint_color", ascending=False)
paint_color_df = paint_color_df.na.fill("None")
paint_color_df.show()
paint_color_df.count()
# Plot against this frame's own category column; it has no 'transmission' column.
paint_color_df.toPandas().plot.bar(x='paint_color',figsize=(14, 6))

In [None]:
# Listing count per title status; a missing value is labelled "None"
title_status_counts = manufacturer_df.groupBy("title_status").count()
title_df = title_status_counts.orderBy("title_status", ascending=False).fillna("None")
title_df.show()
title_pd = title_df.toPandas()
title_pd.plot.bar(x='title_status', figsize=(14, 6))

In [None]:
# Keep only the listings whose title status is 'clean', and cache that subset.
# Note: `title_df` is rebound here from the per-status counts to the filtered rows.
has_clean_title = manufacturer_df["title_status"].isin('clean')
title_df = manufacturer_df.filter(has_clean_title).cache()
title_df.count()

# Drop the cached copy of the wider frame
manufacturer_df.unpersist()

In [None]:
# Listing count per drive type among clean-title cars; missing labelled "None"
drive_counts = title_df.groupBy("drive").count()
drive_df = drive_counts.orderBy("drive", ascending=False).fillna("None")
drive_df.show()
drive_pd = drive_df.toPandas()
drive_pd.plot.bar(x='drive', figsize=(14, 6))

In [None]:
# Listing count per vehicle size among clean-title cars; missing labelled "None"
size_counts = title_df.groupBy("size").count()
size_df = size_counts.orderBy("size", ascending=False).fillna("None")
size_df.show()
size_pd = size_df.toPandas()
size_pd.plot.bar(x='size', figsize=(14, 6))

# Listing count per vehicle type among clean-title cars; missing labelled "None"
type_counts = title_df.groupBy("type").count()
type_df = type_counts.orderBy("type", ascending=False).fillna("None")
type_df.show()
type_pd = type_df.toPandas()
type_pd.plot.bar(x='type', figsize=(14, 6))


In [None]:
# Listing count per region among clean-title cars, largest count first;
# a missing region is labelled "None".
region_counts = title_df.groupBy("region").count()
region_df = region_counts.orderBy("count", ascending=False).fillna("None")
region_df.show(405)
# region_df.count() -- 405

# Chart only the first 100 regions of that ordering
top_regions_pd = region_df.limit(100).toPandas()
top_regions_pd.plot.bar(x='region', figsize=(38, 6))

In [None]:
# Listing count per distinct price value among clean-title cars, price descending
price_df = title_df.groupBy("price").count().sort("price", ascending=False)

# Label a missing price "None", as the other count tables in this notebook do.
# (na.replace("None") supplied no replacement value and raised instead.)
price_df = price_df.na.fill("None")

# Show how many listings have no price; indexing with `price_df == "None"`
# is a plain Python comparison and does not filter rows.
price_df.filter(price_df["price"] == "None").show()



In [None]:
# List the column names of the manufacturer-filtered frame
manufacturer_df.columns

In [None]:
#. Modelling

# Linear regression - hand code []
# 'region' , 'price', 'year', 'manufacturer','model', 'condition', 'cylinders','fuel', 'odometer', |title_status|transmission|              VIN|drive|     size|  type|paint_color|           image_url|         description|state|      lat|       long|        posting_date|

# Pick only these :
# 'price', 'year', 'manufacturer', 'condition', 'cylinders','fuel', 'odometer', 'transmission','drive', 'type', 'paint_color'

# Accepted values per categorical column. "None" is the placeholder written
# into missing values below, so rows lacking one of these attributes are kept.
manufacturer_list = ['alfa-romeo', 'aston-martin', 'audi', 'bmw', 'buick', 'cadillac', 'chevrolet', 'chrysler', 'datsun','dodge','ferrari','fiat','ford','gmc', 'harley-davidson','hennessey',
                    'honda','hyundai','infiniti','jaguar','jeep', 'kia','land rover', 'lexus','lincoln','mazda', 'mercedes-benz','mercury','mini','mitsubishi','morgan',
                    'nissan','pontiac','porsche','ram','rover','saturn','subaru','tesla','toyota','volkswagen','volvo']
conditions        = ["good", "fair", "excellent", "new", "like new", "salvage", "None"]
fuels             = ["other", "hybrid", "gas","electric", "diesel", "None"]
transmissions     = ["other", "manual", "automatic", "None"]
drives            = [ "rwd", "fwd", "4wd", "None"]
vehicle_types     = ["wagon", "van", "truck", "sedan","pickup","other", "offroad", "mini-van", "hatchback", "coupe", "convertible", "bus", "SUV", "None"]
# NOTE(review): 'black' is not listed, so black cars (if present in the data)
# are excluded by the filter below - confirm this is intended.
paint_colors      = ["yellow", "white","silver","red","purple","orange", "grey","green","custom","brown","blue","None"]

# 'price', 'year' - Numeric
# 'manufacturer', 'condition', 'cylinders','fuel', 'odometer', 'transmission','drive', 'type', 'paint_color' - Categorical

# Write the "None" placeholder into missing values of the categorical columns
# whose accepted-value lists contain "None", and filter the *filled* frame.
# Previously the filled frame was stored in an unrelated variable and never
# used, so the "None" list entries could never match and such rows were lost.
columns_allowing_missing = ['condition', 'fuel', 'transmission', 'drive', 'type', 'paint_color']
filled_df = manufacturer_df.na.fill("None", subset=columns_allowing_missing)

features_df = filled_df.where(
                    filled_df["manufacturer"].isin(manufacturer_list) &
                    filled_df["condition"].isin(conditions) &
                    filled_df["fuel"].isin(fuels) &
                    filled_df["transmission"].isin(transmissions) &
                    filled_df["drive"].isin(drives) &
                    filled_df["type"].isin(vehicle_types) &
                    filled_df["paint_color"].isin(paint_colors)
                ).cache()

#features_df.show(10)
features_df.count()

#Picking only the relevant columns for Analysis
selected_df = features_df.select('price', 'year', 'manufacturer', 'condition', 'cylinders','fuel', 'odometer', 'transmission','drive', 'type', 'paint_color')
# Rows still missing any selected value (e.g. price, year, cylinders, odometer) are dropped
selected_df = selected_df.dropna()
selected_df.show(10)
selected_df.count()


In [None]:

# Column types before casting (printSchema must be called to print anything)
selected_df.printSchema()

from pyspark.sql.types import IntegerType

# Cast price and year to integers; values that cannot be parsed become null
selected_int_df = (
    selected_df
    .withColumn("price", F.col("price").cast(IntegerType()))
    .withColumn("year", F.col("year").cast(IntegerType()))
)
selected_int_df.printSchema()
selected_int_df.describe().show()

# Row count before and after restricting price to the open range (3000, 60000)
selected_int_df.count()
selected_int_df = selected_int_df.filter("price is not null and price > 3000 and price < 60000 ")
selected_int_df.count()

# Report both extremes. A dict with the key 'price' twice keeps only the last
# entry, so the dict form of agg() computed the max alone.
selected_int_df.agg(F.min('price').alias('min(price)'), F.max('price').alias('max(price)')).show()

In [None]:
# Turn the categorical columns into numeric vectors in two steps:
# (1) StringIndexer maps each string value to an index,
# (2) OneHotEncoder expands each index column into a vector column.
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# NOTE(review): 'odometer' is indexed and one-hot encoded here like a category,
# although it looks like a continuous reading - confirm this is intended.
categorical_cols = ['manufacturer', 'condition', 'cylinders','fuel', 'odometer', 'transmission','drive', 'type', 'paint_color']
index_cols = [column + '_index' for column in categorical_cols]
vector_cols = [column + '_vec' for column in categorical_cols]

indexer = StringIndexer(inputCols=categorical_cols, outputCols=index_cols)
indexer_model = indexer.fit(selected_int_df)
indexed = indexer_model.transform(selected_int_df)
indexed.show(3)

encoder = OneHotEncoder(inputCols=index_cols, outputCols=vector_cols)
model = encoder.fit(indexed)
encoded_df = model.transform(indexed)
encoded_df.show(3)


In [None]:
# Assemble the model features into a single vector column and split train/test.
# (No scaling is applied in this cell.)

from pyspark.ml.feature import VectorAssembler

# Use odometer as a numeric feature rather than its one-hot vector, and include
# year, which was selected and cast earlier but never fed to the model.
# Rows whose year or odometer is missing/unparseable are dropped because
# VectorAssembler rejects nulls by default.
numeric_ready_df = (
    encoded_df
    .withColumn("odometer_num", col("odometer").cast("double"))
    .dropna(subset=["year", "odometer_num"])
)

feature_cols = ['year', 'odometer_num',
                'manufacturer_vec', 'condition_vec', 'cylinders_vec', 'fuel_vec',
                'transmission_vec', 'drive_vec', 'type_vec', 'paint_color_vec']
vectorAssembler = VectorAssembler(inputCols = feature_cols, outputCol = 'features')
v_selected_df = vectorAssembler.transform(numeric_ready_df)
v_selected_df = v_selected_df.select(['features', 'price'])
v_selected_df.show(3, False)
v_selected_df.count()

# 70/30 split, seeded with the notebook-wide seed so reruns give the same split
splits = v_selected_df.randomSplit([0.7, 0.3], seed=rnd_seed)
train_df = splits[0]
test_df = splits[1]

test_df.columns

In [None]:




from pyspark.ml.regression import LinearRegression

# Linear regression of price on the assembled feature vector, with the
# regularisation settings given below.
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(train_df)

# Fitted parameters
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

# y= m*x + b
# Price = slope1* manufacturer + slope2 * paint + slope3

In [None]:
# Fit quality on the training data, followed by summary stats of the training frame
trainingSummary = lr_model.summary
training_metrics = (
    ("RMSE: %f", trainingSummary.rootMeanSquaredError),
    ("r2: %f", trainingSummary.r2),
)
for line_format, metric_value in training_metrics:
    print(line_format % metric_value)
train_df.describe().show()

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out rows with the fitted linear model and preview the result
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","price","features").show(15)

# R-squared of the predictions against the actual price
lr_evaluator = RegressionEvaluator(predictionCol="prediction",  labelCol="price", metricName="r2")
lr_test_r2 = lr_evaluator.evaluate(lr_predictions)
print("R Squared (R2) on test data = %g" % lr_test_r2)


In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

# Fit a decision-tree regressor on the training rows and score the held-out rows
dt = DecisionTreeRegressor(featuresCol ='features', labelCol = 'price')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_predictions.select("prediction","price","features").show(5)

# RMSE on the held-out rows
dt_evaluator = RegressionEvaluator(
    labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# R2 needs its own evaluator: reusing the RMSE evaluator would print the RMSE
# a second time under the "R Squared" label.
dt_r2_evaluator = RegressionEvaluator(
    labelCol="price", predictionCol="prediction", metricName="r2")
print("R Squared (R2) on test data = %g" % dt_r2_evaluator.evaluate(dt_predictions))


In [None]:
# Future enhancements
# Imported before first use so the cell runs on a fresh kernel.
from pyspark.ml.feature import Bucketizer

# --- Toy example: bucket a handful of values with open-ended outer buckets ---
splits = [-float("inf"), -1, -0.5, 0.0, 0.5, float("inf")]

data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
dataFrame = spark.createDataFrame(data, ["features"])

bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()

# --- Bucket the car prices into ten bands of 10,000 ---
# Bucketizer requires strictly increasing split points, so the boundaries are
# listed in ascending order.
splits = [0.0, 10000.0, 20000.0, 30000.0, 40000.0, 50000.0,
          60000.0, 70000.0, 80000.0, 90000.0, 100000.0]
bucketizer = Bucketizer(splits=splits, inputCol="price", outputCol="bucketedPrice")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(encoded_df)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()

In [None]:
# Preview the first 10 rows of the columns selected for modelling
selected_df.show(10)

In [None]:
# Create a model using pure python
# Array for features, predict the price.

def mean(values):
    """Return the arithmetic mean of ``values`` as a float."""
    total = sum(values)
    return total / float(len(values))


def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    The sum is returned as is; it is not divided by the number of values.
    """
    return sum((value - mean) ** 2 for value in values)

# Bring one numeric feature and the target into plain Python lists, because
# the helpers above work on Python sequences, not on Spark columns.
# NOTE(review): 'year' is used as the x feature here as an assumption - swap in
# whichever single feature the hand-coded regression should use.
xy_rows = (
    selected_df
    .select(col("year").cast("double").alias("x"),
            col("price").cast("double").alias("y"))
    .dropna()
    .collect()
)
x_values = [row["x"] for row in xy_rows]
y_values = [row["y"] for row in xy_rows]

mean_x, mean_y = mean(x_values), mean(y_values)
var_x, var_y = variance(x_values, mean_x), variance(y_values, mean_y)
print('x stats: mean=%.3f variance=%.3f' % (mean_x, var_x))
print('y stats: mean=%.3f variance=%.3f' % (mean_y, var_y))