In [None]:
# This notebook uses Spark to import and manipulate the car_data.csv data file. 
# It combines interesting sections from these 3 notebooks : 
#
# https://www.kaggle.com/mohaiminul101/car-price-prediction
# https://www.kaggle.com/aishu2218/do-you-wanna-predict-price-of-car-you-wanna-buy
# https://www.kaggle.com/udit1907/linear-advanced-regression-guided-car-purchase

# These notebooks use 'Pandas' and 'scikit-learn'. I primarily use SparkSQL and Spark MLlib.

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

<span style="color:blue">
    TODO : 
    <ul>
      <li>Highlight pyspark.sql SparkSession import.</li>
      <li>Spark canned set of functions and types.</li>
      <li>Other imports are for date manipulations and plotting.</li>
    </ul>
</span>

# Start Spark Session

In [None]:
# The simulated cluster environment is configured with : 
#   - 3 workers
#   - With 3GB of memory for each worker (Total memory is 9GB)
#   - Each worker has 2 cores (total cores is 6)

# Start up Spark session. Let's be greedy and ask for all available resources (We'll be explicit).

# Request : 
#   - A maximum of 6 cores with 
#   - 2 cores per executor
#   - 3 GB of memory per executor

# We also want to tell Spark about a specific java .jar file which contains a user defined function 
# we want to use later.

In [None]:
# Build (or reuse) the Spark session on the standalone master, requesting the
# resources described above and shipping the jar that contains the Java UDF.
udf_jar_path = '/src/java/spark-jobs/helloworld/target/jv_helloworld-1.0-SNAPSHOT.jar'

session_builder = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("1_car_data_ETL_jupyter")
    .config("spark.executor.cores", "2")      # 2 cores per executor
    .config("spark.cores.max", "6")           # at most 6 cores in total
    .config("spark.executor.memory", "3G")    # 3 GB of memory per executor
    .config("spark.driver.memory", "2G")
    .config('spark.jars', udf_jar_path)
    .config('spark.executor.extraClassPath', udf_jar_path)
)

spark = session_builder.getOrCreate()


In [None]:
# Dump every (key, value) pair of the active Spark configuration between two '===' rulers.

print("Spark Session configuration : ")
print('===')

for conf_entry in spark.sparkContext.getConf().getAll():
    print(conf_entry)

print('===')

<span style="color:blue">
    TODO : 
    <ul>
      <li>Browse *http://spark-master:8080*.</li>
      <li>Look at running applications names.</li>
      <li>Look at worker core and memory usage.</li>
      <li>Highlight external '.jar' usage in Session configuration.</li>
      <li>Show StringLengthUDF class code in GitHub Project Repos.</li>
    </ul>
</span>

In [None]:
# Add jar to java spark context. It has a UDF that I want to use later.
# NOTE(review): `_jsc` is an underscore-prefixed (non-public) attribute of the session, and the
# same jar path is already passed through 'spark.jars' when the session is built. Confirm
# whether this extra call is still needed.

spark._jsc.addJar("/src/java/spark-jobs/helloworld/target/jv_helloworld-1.0-SNAPSHOT.jar")


In [None]:
# Register the java function. It will be available as StringLengthUDF (in the spark.sql command).
# Arguments: the SQL-visible function name, the fully qualified Java class implementing it,
# and the Spark return type (an integer).

spark.udf.registerJavaFunction("StringLengthUDF", "ca.nrc.udf.StringLengthUDF", T.IntegerType())


In [None]:
# Tell Spark to use Apache Arrow.
# This flag is toggled off and back on later in the notebook to compare how long
# spark.createDataFrame(<pandas dataframe>) takes with and without Arrow.

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# About the Data ... 
The **car_data.csv** file contains information about used cars. We'll use this data for the purposes of vehicle price prediction.

### Columns and Descriptions
* **Car_Name** :      The name of the car.
* **Year** :          The year in which the car was bought.
* **Selling_Price** : The price the owner wants to sell the car at.
* **Present_Price** : The current ex-showroom price of the car.
* **Kms_Driven** :    The distance completed by the car in km.
* **Fuel_Type** :     Fuel type of the car.
* **Seller_Type** :   Whether the seller is a dealer or an individual.
* **Transmission** :   Whether the car is manual or automatic.
* **Owner** :          The number of owners the car has previously had.

# Read Raw Car Sales Data

In [None]:
# Read in the data. We don't ask Spark to determine the column data types. 
# This can add time to the job. We can give it a schema in the read command 
# or we can cast these once loaded. 

# Here, we'll cast the columns to the appropriate types once loaded.


In [None]:
# CSV reader options are documented at:
# https://spark.apache.org/docs/latest/sql-data-sources-csv.html

# Read the raw file with a header row and a comma delimiter. Schema inference is
# switched off; the columns are cast to their proper types further down.
csv_reader = (
    spark.read
    .option("header", True)
    .option("delimiter", ",")
    .option("inferSchema", False)
)

car_df = csv_reader.csv("/data/car_data.csv")


In [None]:
# Row and column counts of the dataframe: the Spark counterpart of
# "pandas.DataFrame.shape". Note that count() runs a Spark job.

row_count = car_df.count()
column_count = len(car_df.columns)

print('Rows: {}, Columns: {}'.format(row_count, column_count))


In [None]:
# Print the dataframe schema.
# Similar to Pandas "pandas.DataFrame.dtypes" ...
# The file was read with inferSchema disabled, so no column has been given a
# numeric type yet; the casts happen in the transformation section below.

car_df.printSchema()


In [None]:
# Peek at the first few records, without truncating long values.

# Prices are expressed in lakh (https://en.wikipedia.org/wiki/Lakh).
# In the Indian numbering system one lakh equals one hundred thousand,
# so 150,000 rupees is written as 1.5 lakh rupees.

car_df.show(n=5, truncate=False)


# Partitioning

In [None]:
# Partitions are basic units of parallelism in Apache Spark. 
# With too few partitions, the application won’t utilize all the cores available in the cluster.

In [None]:
# By default, Spark creates partitions that are equal to the number of CPU cores in the machine (spark.default.parallelism).

In [None]:
# When Spark reads a CSV file into a DataFrame, it splits up the data into multiple partitions based on 
# the configuration "spark.sql.files.maxPartitionBytes" which defaults to 128MB.

# RDDs are automatically partitioned in spark without human intervention. 
# However, we can change the partitioning scheme if we want. 
# Note that there is a cost to shuffling data.

![partitions](media/partitioning.png)

In [None]:
# Current partitioning scheme of the freshly loaded dataframe.

partition_count_before = car_df.rdd.getNumPartitions()
print('Number of partitions for car_df dataframe. Before repartition : {}'.format(partition_count_before))


# Records per partition before the shuffle, smallest partition first.

rows_per_partition_before = (
    car_df
    .withColumn("partitionId", F.spark_partition_id())
    .groupBy("partitionId")
    .count()
    .orderBy(F.asc("count"))
)
rows_per_partition_before.show()


In [None]:
# Repartition the data for demonstration purposes: shuffle it into as many partitions
# as there are available cores (6, because the cluster is configured with 6 cores).

car_df = car_df.repartition(6)


# Records per partition after the shuffle, listed by partition id.

partition_count_after = car_df.rdd.getNumPartitions()
print('Number of partitions for car_df dataframe. After repartition : {}'.format(partition_count_after))

rows_per_partition_after = (
    car_df
    .withColumn("partitionId", F.spark_partition_id())
    .groupBy("partitionId")
    .count()
    .orderBy(F.asc("partitionId"))
)
rows_per_partition_after.show()


# DataFrames API - Selection, Aggregation, Filtering

In [None]:
# The DataFrames API supports any number of chained transformations.
# This cell gives a feel for what working with dataframes looks like.

# Selection + aggregation: which fuels do the vehicles run on, most common first?

fuels_df = (
    car_df
    .select('Fuel_Type')
    .groupBy('Fuel_Type')
    .agg(F.count("Fuel_Type").alias('number_of_vehicles'))
    .orderBy(F.desc('number_of_vehicles'))
)

fuels_df.show()


In [None]:
# The same question answered with a temporary view and a SQL query.

# Expose the car_df DataFrame to SQL under the name CAR_TABLE.
car_df.createOrReplaceTempView("CAR_TABLE")


# Selection + aggregation: which fuels do the vehicles run on, most common first?

fuel_counts_query = '''
    SELECT
        Fuel_Type,
        COUNT(1) AS number_of_vehicles
    FROM 
        CAR_TABLE
    GROUP BY
        Fuel_Type
    ORDER BY
        number_of_vehicles
    DESC
'''

# Build the fuels_df DataFrame from the standard SQL query.
fuels_df = spark.sql(fuel_counts_query)

fuels_df.show()


In [None]:
# Filtering example :

# Cars selling for 30 lakh rupees or more.

# Selling_Price is still a string column at this point, so it is cast to a double first.
# The threshold is a numeric literal (not the string '30.0') so the comparison is purely
# numeric and does not rely on an implicit string-to-number conversion.
# The cast is made permanent in the next section.

(
    car_df
    .filter(F.col('Selling_Price').cast(T.DoubleType()) >= 30.0)
    .show()
)


In [None]:
# A point for the section above is this : 


# PySpark APIs for working with data are quite easy to use and maybe even easier to work with than other APIs.. (Looking at you Pandas).


# !! HOWEVER !! 

# If you still want to use something like the Pandas API but benefit from Spark's distributed architecture, 
# check out the Pandas API on Spark (formerly Koalas). This is new in Spark 3.2.

# https://databricks.com/blog/2021/10/04/pandas-api-on-upcoming-apache-spark-3-2.html

In [None]:
# More transformation examples can be found in the common_ops jupyter notebook included in the current GitHub project directory.

# Data Transformations for Downstream MLlib Example

In [None]:
# Give the numeric columns their proper types (everything else stays as loaded).

target_types = [
    ('Year', T.IntegerType()),
    ('Kms_Driven', T.IntegerType()),
    ('Owner', T.IntegerType()),
    ('Selling_Price', T.DoubleType()),
    ('Present_Price', T.DoubleType()),
]

# withColumn with an existing name replaces that column in place, so column order is kept.
for column_name, spark_type in target_types:
    car_df = car_df.withColumn(column_name, F.col(column_name).cast(spark_type))


In [None]:
# Check the schema to see the changes:
# Year, Kms_Driven and Owner were cast to integer, Selling_Price and Present_Price to double.

car_df.printSchema()


<span style="color:blue">
    TODO : 
    <ul>
      <li>Highlight type changes in table schema.</li>
    </ul>
</span>

In [None]:
# Check to make sure that we have data in every column of every row.
# The query below counts, for each column of the car_df dataframe, the rows that are null
# (and, for floating-point columns, NaN).

# NaN only exists for float/double columns, so isnan() is applied to those alone. Calling it
# on string columns relies on an implicit string-to-double cast, which is unnecessary here.
missing_counts = []
for column_name, column_type in car_df.dtypes:
    is_missing = F.col(column_name).isNull()
    if column_type in ('float', 'double'):
        is_missing = is_missing | F.isnan(F.col(column_name))
    # when() yields a non-null marker only for missing rows; count() then tallies them.
    missing_counts.append(F.count(F.when(is_missing, 1)).alias(column_name))

car_df.select(missing_counts).show()


In [None]:
# Write and call a User Defined Function (UDF) written in Python 


# Write a function to calculate the age of a vehicle given year
def get_age(i_year):
    """Return the age of a vehicle in whole years, relative to the current calendar year.

    Args:
        i_year: The vehicle's year as an integer, or None when the value is
            missing (e.g. a null produced by casting the raw column).

    Returns:
        ``current_year - i_year`` as an int, or None when ``i_year`` is None.

    Raises:
        ValueError: If ``i_year`` lies in the future.
    """
    # Missing years stay missing rather than crashing the whole Spark job with a TypeError.
    if i_year is None:
        return None

    current_year = date.today().year

    if current_year < i_year:
        raise ValueError('Problem with years')

    return current_year - i_year


# Wrap get_age as a Spark user defined function (a user-programmable routine that
# acts on one row at a time) returning an integer.

calc_age_udf = F.udf(get_age, T.IntegerType())


# Derive a 'Car_Age' column from 'Year' and append it to the car_df dataframe.

car_df = car_df.withColumn('Car_Age', calc_age_udf(F.col('Year')))

# The 'Car_Age' column should now appear at the end of the dataframe.

car_df.show(n=5, truncate=False)


<span style="color:blue">
    TODO : 
    <ul>
      <li>Mention get_age() function could be part of larger shared/reusable library with unit testing and "all the trimmings".</li>  
    </ul>
</span>

In [None]:
# Call a UDF written in Java

# Add the car name length column (car_name_length) by calling the Java function
# registered earlier as StringLengthUDF. It lives in the jar we added to Spark.
name_length_expr = F.expr("StringLengthUDF(Car_Name)")

car_df = car_df.withColumn("car_name_length", name_length_expr)

car_df.show(n=5, truncate=True)


In [None]:
# The Java UDF can also be called from a SQL query.
# Here we only display what the transformation would produce; car_df is not modified.

# Re-register the car_df DataFrame as a SQL temporary view so that the view
# reflects the updated column types.
car_df.createOrReplaceTempView("CAR_TABLE")

name_length_query = '''
    SELECT
        Car_Name,
        Year,
        Selling_Price,
        Present_Price,
        Kms_Driven,
        Fuel_Type,
        Seller_Type,
        Transmission,
        Owner,
        Car_Age,
        StringLengthUDF(Car_Name) as car_name_length
    FROM 
        CAR_TABLE
'''

spark.sql(name_length_query).show(5, True)

In [None]:
# One last simple transformation example :

# Add a calculated column. Create 'Inflated_Price', which is the ('Selling_Price' + 30 percent).
# The column is created with the same spelling that is used to select it below (and to drop it
# later), so the code does not depend on Spark resolving column names case-insensitively.

car_df = car_df.withColumn('Inflated_Price', F.col('Selling_Price') * F.lit(1.30))

(
    car_df
    .select('Car_Name', 'Selling_Price', 'Inflated_Price')
    .show(5, False)
)


In [None]:
# Remove the columns we no longer need: the 'Inflated_Price' and
# 'car_name_length' columns that were just added.

car_df = car_df.drop('Inflated_Price', 'car_name_length')

car_df.show(n=5, truncate=False)


# Data Exploration

In [None]:
# Summary statistics for the numeric columns.

numeric_columns = ['Selling_Price', 'Present_Price', 'Kms_Driven', 'Car_Age']

car_df.select(*numeric_columns).summary().show()


In [None]:
# Convert the Spark dataframe to a native Pandas dataframe (for visualizations). 
# Note that we are NOT using the NEW Pandas API on Spark, which allows 
# you to scale your Pandas workload out. It's just straight Pandas.
# car_pdf is the input of every plotting cell below.

car_pdf = car_df.toPandas()


In [None]:
# A SIDE NOTE ... APACHE ARROW

# If using Apache Arrow, creating a Spark dataframe from a pandas dataframe is much quicker.
# We are already using Apache Arrow. This line was executed above:
# spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

# How long does it take to create a Spark dataframe from a pandas dataframe if using Arrow?
%time df = spark.createDataFrame(car_pdf)
print()
print()
# If NOT using Apache Arrow, creating a Spark dataframe from a pandas dataframe should be slower.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "false")
%time df = spark.createDataFrame(car_pdf)

# Reset to use Apache Arrow
# NOTE(review): if the createDataFrame call above raises, this reset never runs and Arrow stays disabled.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

<span style="color:blue">
    TODO : 
    <ul>
      <li>Highlight dataframe creation speed difference (with and without arrow)</li>
    </ul>
</span>

In [None]:
# Box plots of the numerical columns, one per panel of a 2x2 grid.

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 7))
fig.suptitle('Numerical Data Box Plots')

# Each pair is (target axes, column to plot); boxplot draws onto the axes it is given.
boxplot_panels = (
    (ax1, 'Selling_Price'),
    (ax2, 'Kms_Driven'),
    (ax3, 'Present_Price'),
    (ax4, 'Car_Age'),
)

for panel_axes, column_name in boxplot_panels:
    sns.boxplot(x=column_name, data=car_pdf, ax=panel_axes)


In [None]:
# Create bar charts for categorical data, one per panel of a 2x2 grid.

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 11))
fig.suptitle('Counts by Categories')

# Each entry is (target axes, column to count, panel title).
countplot_panels = (
    (ax1, 'Seller_Type', 'Sale Count by Seller Type'),
    (ax2, 'Fuel_Type', 'Sale Count by Fuel Type'),
    (ax3, 'Transmission', 'Sale Count by Transmission'),
    (ax4, 'Owner', 'Sale Count by Previous Owner Count'),
)

# Titles are set on the axes without rebinding ax1..ax4: set_title() returns a Text object,
# so assigning its result back would leave the ax* names no longer pointing at axes.
for panel_axes, column_name, panel_title in countplot_panels:
    sns.countplot(x=column_name, data=car_pdf, ax=panel_axes)
    panel_axes.set_title(panel_title)


In [None]:
# Create correlation heatmap.

# car_pdf also holds text columns (Car_Name, Fuel_Type, Seller_Type, Transmission).
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict the
# correlation to the numeric columns explicitly.
numeric_pdf = car_pdf.select_dtypes(include='number')

sns.heatmap(numeric_pdf.corr(), annot=True, cmap="RdBu")
plt.show()


In [None]:
# Plot selling_price relative to present_price (scatter plot with a fitted regression line).

# lmplot is a figure-level seaborn function: it creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an empty figure behind. The 10x10 size is requested
# through lmplot's own height/aspect parameters instead.
sns.lmplot(x='Present_Price', y='Selling_Price', data=car_pdf, height=10, aspect=1)


# Save the Spark Dataframe as a Parquet File for Downstream Use

In [None]:
# Show the schema of the dataframe we are saving
# (the original columns with their cast types, plus the derived Car_Age column).

car_df.printSchema()


In [None]:
# Save the dataframe in Parquet format; "overwrite" mode replaces any existing output at this path.
car_df.write.mode("overwrite").parquet("/data/car_data.parquet")

In [None]:
%%bash

# Print the user the shell commands of this notebook run as.
echo 'Current Linux User : (Should be ds...)'
whoami

echo ''
echo ''


# We can see the parquet file (6 partitions) was saved in the '/data' directory : 
# (the path is a directory; its contents are the individual part files)

echo 'List parquet file parts :'

ls /data/car_data.parquet


In [None]:
# Stop the Spark session now that the notebook is done with it.
spark.stop()