## Create Spark session and context

In [1]:
#Entrypoint 2.x
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Build (or reuse) a local SparkSession for this analysis: master "local[2]"
# with the number of SQL shuffle partitions set to 2.
spark = (
    SparkSession.builder
    .config("spark.sql.shuffle.partitions", "2")
    .appName("Analysis")
    .master("local[2]")
    .getOrCreate()
)

# To run on YARN instead, replace the builder above with the line below
# (note the .master("yarn") setting):
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext



In [2]:
# Execute the shared helper notebooks inside this kernel so the names they
# define become available to the cells below.
# NOTE(review): raw_folder_path and processed_folder_path, used by later cells,
# are presumably defined by "configuration" — confirm.
%run "../includes/configuration"
%run "../includes/common_functions"

## Define schema

In [3]:
# Explicit schema applied when reading races.csv.
# Only raceId is declared non-nullable; every other column may be null.
races_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), nullable=False),
        StructField("year", IntegerType(), nullable=True),
        StructField("round", IntegerType(), nullable=True),
        StructField("circuitId", IntegerType(), nullable=True),
        StructField("name", StringType(), nullable=True),
        StructField("date", DateType(), nullable=True),
        StructField("time", StringType(), nullable=True),
        StructField("url", StringType(), nullable=True),
    ]
)

## Read the CSV file from HDFS & apply schema

In [4]:
# Load races.csv from the raw folder, treating the first line as a header and
# applying races_schema rather than inferring the column types.
races_df = (
    spark.read
    .option("header", True)
    .schema(races_schema)
    .csv(f"{raw_folder_path}/races.csv")
)

## Add ingestion date & race_timestamp to the dataframe

In [5]:
# Add two derived columns:
#   ingestion_date - the current timestamp at load time
#   race_timestamp - "date" and "time" joined with a space and parsed as
#                    'yyyy-MM-dd HH:mm:ss'
races_with_timestamp_df = (
    races_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn(
        "race_timestamp",
        to_timestamp(
            concat(col('date'), lit(' '), col('time')),
            'yyyy-MM-dd HH:mm:ss',
        ),
    )
)

In [6]:
# Preview the first five rows to check the two added columns.
races_with_timestamp_df.show(n=5)

+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|      ingestion_date|     race_timestamp|
+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|2023-08-19 21:43:...|2009-03-29 06:00:00|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|2023-08-19 21:43:...|2009-04-05 09:00:00|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|2023-08-19 21:43:...|2009-04-19 07:00:00|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|2023-08-19 21:43:...|2009-04-26 12:00:00|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00

In [7]:
# Project only the columns kept for the processed output, renaming the
# camelCase ids and "year" to snake_case names.
races_selected_df = races_with_timestamp_df.select(
    col("raceId").alias('race_id'),
    col("year").alias('race_year'),
    col("round"),
    col('circuitId').alias('circuit_id'),
    col("name"),
    col("ingestion_date"),
    col("race_timestamp"),
)

## Select only columns required & rename as required

In [8]:
# Preview the first five rows of the trimmed, renamed frame.
races_selected_df.show(n=5)

+-------+---------+-----+----------+--------------------+--------------------+-------------------+
|race_id|race_year|round|circuit_id|                name|      ingestion_date|     race_timestamp|
+-------+---------+-----+----------+--------------------+--------------------+-------------------+
|      1|     2009|    1|         1|Australian Grand ...|2023-08-19 21:43:...|2009-03-29 06:00:00|
|      2|     2009|    2|         2|Malaysian Grand Prix|2023-08-19 21:43:...|2009-04-05 09:00:00|
|      3|     2009|    3|        17|  Chinese Grand Prix|2023-08-19 21:43:...|2009-04-19 07:00:00|
|      4|     2009|    4|         3|  Bahrain Grand Prix|2023-08-19 21:43:...|2009-04-26 12:00:00|
|      5|     2009|    5|         4|  Spanish Grand Prix|2023-08-19 21:43:...|2009-05-10 12:00:00|
+-------+---------+-----+----------+--------------------+--------------------+-------------------+
only showing top 5 rows



## Write the output to the processed container in Parquet format

In [10]:
# Save the result as Parquet under the processed folder, partitioned by
# race_year; 'overwrite' mode replaces any output from a previous run.
(
    races_selected_df.write
    .mode('overwrite')
    .partitionBy('race_year')
    .parquet(f"{processed_folder_path}/races")
)

In [12]:
# Also export the selected columns as CSV with a header row.
# mode('overwrite') keeps this cell re-runnable: without an explicit mode the
# writer uses its default save mode, which raises an error once the target
# directory already exists. This also matches the Parquet write in this notebook.
# NOTE(review): the destination is an absolute, machine-specific path — consider
# moving it into the shared configuration.
races_selected_df.write.mode('overwrite').csv("/home/sunbeam/Desktop/FastLaneForecast/combinedCsv/races", header=True)