## Create Spark session and context

In [1]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Build (or reuse) a local session named "Analysis" that runs on two local
# threads and uses two shuffle partitions.
spark = (
    SparkSession.builder
    .appName("Analysis")
    .master("local[2]")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

# YARN variant, kept for reference (note .master("yarn") and Hive support):
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# SparkContext that backs the session.
sc = spark.sparkContext



In [2]:
# Execute the shared include files before any later cell runs.
# NOTE(review): raw_folder_path and processed_folder_path are used later in this
# notebook but never assigned in it; presumably they come from the
# configuration include. Confirm.
%run "../includes/configuration"
%run "../includes/common_functions"

## Define schema

In [3]:
# Schema of the nested "name" object: two nullable string fields.
name_schema = StructType(
    [StructField(part, StringType(), True) for part in ("forename", "surname")]
)

In [4]:
# Explicit schema for drivers.json; "name" is a nested struct (name_schema).
# NOTE: driverId is declared non-nullable here, yet the schema printed after
# the read reports it as nullable = true.
drivers_schema = StructType([
    StructField("driverId", IntegerType(), nullable=False),
    StructField("driverRef", StringType(), nullable=True),
    StructField("number", IntegerType(), nullable=True),
    StructField("code", StringType(), nullable=True),
    StructField("name", name_schema),
    StructField("dob", DateType(), nullable=True),
    StructField("nationality", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
])

## Read the JSON file from HDFS & apply schema

In [5]:
# Load drivers.json from the raw folder with the explicit schema applied.
drivers_df = spark.read.json(
    f"{raw_folder_path}/drivers.json",
    schema=drivers_schema,
)

In [6]:
# Print the schema of the loaded DataFrame to check the column types.
drivers_df.printSchema()

root
 |-- driverId: integer (nullable = true)
 |-- driverRef: string (nullable = true)
 |-- number: integer (nullable = true)
 |-- code: string (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- forename: string (nullable = true)
 |    |-- surname: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- nationality: string (nullable = true)
 |-- url: string (nullable = true)



In [7]:
# Preview the first five rows of the loaded drivers data.
drivers_df.show(5)

+--------+----------+------+----+--------------------+----------+-----------+--------------------+
|driverId| driverRef|number|code|                name|       dob|nationality|                 url|
+--------+----------+------+----+--------------------+----------+-----------+--------------------+
|       1|  hamilton|    44| HAM|   {Lewis, Hamilton}|1985-01-07|    British|http://en.wikiped...|
|       2|  heidfeld|  null| HEI|    {Nick, Heidfeld}|1977-05-10|     German|http://en.wikiped...|
|       3|   rosberg|     6| ROS|     {Nico, Rosberg}|1985-06-27|     German|http://en.wikiped...|
|       4|    alonso|    14| ALO|  {Fernando, Alonso}|1981-07-29|    Spanish|http://en.wikiped...|
|       5|kovalainen|  null| KOV|{Heikki, Kovalainen}|1981-10-19|    Finnish|http://en.wikiped...|
+--------+----------+------+----+--------------------+----------+-----------+--------------------+
only showing top 5 rows



## Rename columns, add ingestion date & concatenate forename and surname

In [8]:
# Derive the cleaned frame in four steps:
#   1-2. rename the camelCase id/ref columns to snake_case,
#   3.   stamp each row with the ingestion timestamp,
#   4.   replace the nested name struct with "<forename> <surname>".
# NOTE(review): concat is documented to return NULL when any input is NULL,
# so a missing forename or surname would blank the whole name; confirm the
# source data never has that.
drivers_with_columns_df = (
    drivers_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "name",
        concat(col("name.forename"), lit(" "), col("name.surname")),
    )
)

In [9]:
# Preview the first five rows after the rename / derive step.
drivers_with_columns_df.show(5)

+---------+----------+------+----+-----------------+----------+-----------+--------------------+--------------------+
|driver_id|driver_ref|number|code|             name|       dob|nationality|                 url|      ingestion_date|
+---------+----------+------+----+-----------------+----------+-----------+--------------------+--------------------+
|        1|  hamilton|    44| HAM|   Lewis Hamilton|1985-01-07|    British|http://en.wikiped...|2023-08-19 22:44:...|
|        2|  heidfeld|  null| HEI|    Nick Heidfeld|1977-05-10|     German|http://en.wikiped...|2023-08-19 22:44:...|
|        3|   rosberg|     6| ROS|     Nico Rosberg|1985-06-27|     German|http://en.wikiped...|2023-08-19 22:44:...|
|        4|    alonso|    14| ALO|  Fernando Alonso|1981-07-29|    Spanish|http://en.wikiped...|2023-08-19 22:44:...|
|        5|kovalainen|  null| KOV|Heikki Kovalainen|1981-10-19|    Finnish|http://en.wikiped...|2023-08-19 22:44:...|
+---------+----------+------+----+-----------------+----

## Drop unwanted columns

In [10]:
# Final frame: everything from the previous step except the url column.
drivers_final_df = drivers_with_columns_df.drop("url")

## Write output to parquet file

In [11]:
# Persist the final frame as parquet under the processed folder, replacing
# any previous output at that location.
drivers_final_df.write.parquet(
    f"{processed_folder_path}/drivers",
    mode='overwrite',
)

In [12]:
# Also export the final frame as CSV (with a header row).
# mode("overwrite") keeps this cell re-runnable: without an explicit save mode
# the writer uses its default, which raises an error once the target directory
# already exists, so a second "Run All" would fail here. This also matches the
# parquet write in this notebook, which already overwrites.
# NOTE(review): the target is an absolute, machine-specific path; consider
# moving it into the shared configuration alongside the other folder paths.
drivers_final_df.write.mode("overwrite").csv(
    "/home/sunbeam/Desktop/FastLaneForecast/combinedCsv/drivers",
    header=True,
)