In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# This notebook uses Spark to load data and perform data transformations, and combines interesting sections from these notebooks : 
#
# https://www.kaggle.com/mohaiminul101/car-price-prediction
# https://www.kaggle.com/aishu2218/do-you-wanna-predict-price-of-car-you-wanna-buy
# https://www.kaggle.com/udit1907/linear-advanced-regression-guided-car-purchase
#  
# These notebooks use 'Pandas' and 'scikit-learn'. This notebook here uses Spark to do the ETL and 
# then converts the Spark dataframe to a 'Pandas' dataframe (at the last minute) to do the visualization.
# Note that we are using regular 'Pandas' and not the newly available 'pyspark.pandas' in this notebook.


# Start Spark Session

In [None]:
# The simulated cluster environment is configured with : 
#   - 3 workers
#   - With 3GB of memory for each worker (Total memory is 9GB)
#   - Each worker has 2 cores (total cores is 6)

# Start up Spark session. Let's be greedy and ask for all available resources (We'll be explicit).

# We could also leave out the 'spark.executor.cores', 'spark.cores.max' and 'spark.executor.memory' configurations: 
# Spark would give us all available resources by default (if fair sharing between jobs is not configured). 

# Request : 
#   - A maximum of 6 cores with 
#   - 2 cores per executor
#   - 3 GB of memory per executor


In [None]:
# Build (or reuse) a session named "etl_jupyter" against the standalone master,
# asking for 2 cores and 3G per executor, at most 6 cores overall, and 2G for the driver.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("etl_jupyter")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "6")
    .config("spark.executor.memory", "3G")
    .config("spark.driver.memory", "2G")
    .getOrCreate()
)


# Read Raw Car Sales Data

In [None]:
# Read in the data. We don't ask Spark to infer the column data types, 
# because schema inference can add time to the job. We can give it a schema in the read command or we can cast these once loaded. 
# Here, we'll cast the columns to the appropriate types once loaded.


In [None]:
# Reference : https://spark.apache.org/docs/latest/sql-data-sources-csv.html

# Load the comma-delimited file, treating the first line as the header.
# Schema inference is switched off; the columns are cast in a later cell.
car_df = (
    spark.read
    .options(header=True, delimiter=",", inferSchema=False)
    .csv("/data/car_data.csv")
)


In [None]:
# Report the dimensions of the dataframe: row count, then column count.

print(f'Rows: {car_df.count()}, Columns: {len(car_df.columns)}')


In [None]:
# Print the dataframe schema.
# The CSV was read with inferSchema disabled, so the types shown here are the
# ones Spark assigned without inference; selected columns are cast in a later cell.

# Column descriptions are : 
# Ref : https://www.kaggle.com/aishu2218/do-you-wanna-predict-price-of-car-you-wanna-buy/data?select=car+data.csv


# Car_Name :      The name of the car.
# Year :          The year in which the car was bought.
# Selling_Price : The price the owner wants to sell the car at.
# Present_Price : The current ex-showroom price of the car.
# Kms_Driven :    The distance completed by the car in km.
# Fuel_Type :     Fuel type of the car.
# Seller_Type :   Whether the seller is a dealer or an individual.
# Transmission:   Whether the car is manual or automatic.
# Owner:          The number of owners the car has previously had.

car_df.printSchema()


In [None]:
# Display the first 10 records without truncating the cell values.

# Note that prices are expressed in lakh units. 
# https://en.wikipedia.org/wiki/Lakh ... In the Indian numbering system one lakh equals one hundred thousand. 
# For example, in India 150,000 rupees becomes 1.5 lakh rupees.

car_df.show(n=10, truncate=False)


# Partitioning

In [None]:
# RDDs in Apache Spark are collections of partitions. Spark automatically partitions RDDs and distributes the partitions 
# across different nodes. A partition in spark is an atomic chunk of data (logical division of data) stored on a node in the cluster. 
# Partitions are basic units of parallelism in Apache Spark. 

In [None]:
# When processing, Spark assigns one task for each partition and each worker thread can only process one task at a time. 
# Thus, with too few partitions, the application won’t utilize all the cores available in the cluster and it 
# can cause data skewing problem; with too many partitions, it will bring overhead for Spark to manage too 
# many small tasks.

In [None]:
# In Spark, by default, a partition is created for every HDFS block (64MB in older Hadoop versions; 128MB by default since Hadoop 2). 
# RDDs are automatically partitioned in spark without human intervention, however, at times the 
# programmers would like to change the partitioning scheme by changing the size of the partitions 
# and number of partitions based on the requirements of the application.

![partitions](media/partitioning.png)

In [None]:
# To help with parallelism, shuffle the data so that we have as many partitions as the number of available cores
# (e.g. 6 in this case because our cluster is configured with 6 cores).

print(f'Number of partitions for car_df dataframe. Before repartition : {car_df.rdd.getNumPartitions()}')


# Count the records held by each partition before the shuffle, smallest partition first.
# Note that there is a cost to shuffling data.

(
    car_df
    .withColumn("partitionId", F.spark_partition_id())
    .groupBy("partitionId")
    .count()
    .orderBy(F.col("count").asc())
    .show()
)


In [None]:
# Redistribute the data across 6 partitions.

car_df = car_df.repartition(numPartitions=6)


# Count the records held by each partition after the shuffle, ordered by partition id.

print(f'Number of partitions for car_df dataframe. After repartition : {car_df.rdd.getNumPartitions()}')

(
    car_df
    .withColumn("partitionId", F.spark_partition_id())
    .groupBy("partitionId")
    .count()
    .orderBy(F.col("partitionId").asc())
    .show()
)


# Data Transformations

In [None]:
# Assign proper types to certain column elements.
# Each entry maps a column name to the Spark type it is cast to; the casts are
# applied one column at a time, replacing the column in place.

column_types = {
    'Year': T.IntegerType(),
    'Kms_Driven': T.IntegerType(),
    'Owner': T.IntegerType(),
    'Selling_Price': T.DoubleType(),
    'Present_Price': T.DoubleType(),
}

for column_name, target_type in column_types.items():
    car_df = car_df.withColumn(column_name, F.col(column_name).cast(target_type))


In [None]:
# Check the schema to see the changes: Year, Kms_Driven and Owner were cast to integer,
# Selling_Price and Present_Price to double.

car_df.printSchema()


In [None]:
# Check to make sure that we have data in every column of every row.
# The query below counts all instances of null or NaN in each column of the car_df dataframe.
#
# NaN is only a valid value for floating point columns, so F.isnan is applied to
# float/double columns only; every other column is checked for null alone. Calling
# isnan on a string column relies on an implicit string -> double cast, which is
# fragile (it is expected to fail on non-numeric strings when ANSI mode is enabled).

float_columns = {
    field.name
    for field in car_df.schema.fields
    if isinstance(field.dataType, (T.FloatType, T.DoubleType))
}

car_df.select(
    [
        F.count(
            F.when(F.col(c).isNull() | F.isnan(F.col(c)), c)
            if c in float_columns
            else F.when(F.col(c).isNull(), c)
        ).alias(c)
        for c in car_df.columns
    ]
).show()


In [None]:
# A function to calculate the age of a vehicle given year

def get_age(i_year):
    """Return the age of a vehicle in whole years, relative to the current calendar year.

    Args:
        i_year: The year the car was bought (int), or None when the value is missing.

    Returns:
        ``current_year - i_year``, or None when ``i_year`` is None.

    Raises:
        ValueError: If ``i_year`` is later than the current year.
    """
    # This function is wrapped in a Spark UDF, where a null column value arrives
    # as None. Comparing None with an int raises TypeError, so a missing year is
    # propagated as a missing age instead of failing the whole job.
    if i_year is None:
        return None

    current_year = date.today().year

    if current_year < i_year:
        raise ValueError('Problem with years')

    return current_year - i_year


# Wrap get_age as a user defined function (i.e. a user-programmable routine that acts on one row),
# declared to return an integer.

calc_age_udf = F.udf(get_age, returnType=T.IntegerType())



# Add a Car_Age column to the car_df dataframe, computed from the Year column

car_df = car_df.withColumn('Car_Age', calc_age_udf(car_df['Year']))

# Check our work

car_df.select('Year', 'Car_Age').show(n=10, truncate=False)


# Data Exploration

In [None]:
# Summary statistics for the numeric columns listed below.

numeric_columns = ['Selling_Price', 'Present_Price', 'Kms_Driven', 'Car_Age']

car_df.select(*numeric_columns).summary().show()


In [None]:
# Convert the Spark dataframe to a native Pandas dataframe (for visualizations). 
# Note that we are NOT using the NEW Pandas API on Spark, which allows 
# you to scale your Pandas workload out. It's just straight Pandas.
# The resulting car_pdf is the dataframe the plotting cells below pass to seaborn.

car_pdf = car_df.toPandas()


In [None]:
# Draw one boxplot per numerical column on a 2x2 grid of subplots.

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(12, 7))
fig.suptitle('Numerical Data Box Plots')

# Columns are listed in panel order: top-left, top-right, bottom-left, bottom-right.
boxplot_columns = ('Selling_Price', 'Kms_Driven', 'Present_Price', 'Car_Age')

# Each name is rebound to whatever sns.boxplot returns for its panel.
ax1, ax2, ax3, ax4 = [
    sns.boxplot(x=column, data=car_pdf, ax=axis)
    for axis, column in zip((ax1, ax2, ax3, ax4), boxplot_columns)
]


In [None]:
# Create bar charts for categorical data.

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 11))
fig.suptitle('Counts by Categories')

# One (axes, column, title) entry per panel, in grid order.
count_panels = [
    (ax1, 'Seller_Type', 'Sale Count by Seller Type'),
    (ax2, 'Fuel_Type', 'Sale Count by Fuel Type'),
    (ax3, 'Transmission', 'Sale Count by Transmission'),
    (ax4, 'Owner', 'Sale Count by Previous Owner Count'),
]

# The title is set in a separate statement so that ax1..ax4 keep pointing at the
# subplot axes rather than being rebound to the value returned by set_title.
for panel_ax, column, title in count_panels:
    sns.countplot(x=column, data=car_pdf, ax=panel_ax)
    panel_ax.set_title(title)


In [None]:
# Create correlation heatmap.

# car_pdf also holds text columns (e.g. Car_Name, Fuel_Type). Recent pandas versions
# no longer drop non-numeric columns silently in corr() and raise instead, so the
# numeric columns are selected explicitly before computing the correlation matrix.
numeric_pdf = car_pdf.select_dtypes(include='number')

sns.heatmap(numeric_pdf.corr(), annot=True, cmap="RdBu")
plt.show()


In [None]:
# Plot selling_price relative to present_price.

# sns.lmplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) only produces an empty extra figure and does not size the
# plot. The size is set through lmplot's own height argument (aspect defaults to 1,
# giving a square 10x10 figure).
sns.lmplot(x='Present_Price', y='Selling_Price', data=car_pdf, height=10)
plt.show()


# Save the Spark Dataframe as a Parquet File for Downstream Use

In [None]:
# Show the schema of the dataframe we are saving; it includes the columns cast
# earlier and the derived Car_Age column.

car_df.printSchema()


In [None]:
# Persist the transformed dataframe as Parquet, replacing any previous output at this path.
car_df.write.parquet("/data/car_data.parquet", mode="overwrite")

In [None]:
# Stop the Spark session now that the notebook is done with it.
spark.stop()