In [1]:
import findspark
# Per findspark's documentation, init() locates the local Spark installation
# and puts its pyspark package on sys.path, which is why it is called before
# the `import pyspark` in the next cell.
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Get (or lazily create) the Spark session used by every cell below.
# The application is registered under the name "NNS1".
spark = (
    SparkSession.builder
    .appName("NNS1")
    .getOrCreate()
)

In [5]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,DateType

In [6]:
# Explicit schema for races.csv, declared as (column name, Spark type,
# nullable) triples in file order. Only raceId is declared non-nullable.
# `time` stays a string here; it is combined with `date` in a later cell.
race_schema = StructType(
    fields=[
        StructField(column_name, spark_type, is_nullable)
        for column_name, spark_type, is_nullable in (
            ("raceId", IntegerType(), False),
            ("year", IntegerType(), True),
            ("round", IntegerType(), True),
            ("circuitId", IntegerType(), True),
            ("name", StringType(), True),
            ("date", DateType(), True),
            ("time", StringType(), True),
            ("url", StringType(), True),
        )
    ]
)
                               

In [7]:
# Load the raw races file: the first line is a header row and the columns are
# typed by the explicit race_schema rather than inferred.
races_df = spark.read.csv(
    "./data/raw/races.csv",
    schema=race_schema,
    header=True,
)


In [8]:
# Preview the parsed races. These are show()'s defaults written out:
# the first 20 rows, with long cell values truncated.
races_df.show(n=20, truncate=True)

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|
|     6|2009|    6|        6|   Monaco Grand Prix|2009-05-24|12:00:00|http://en.wikiped...|
|     7|2009|    7|        5|  Turkish Grand Prix|2009-06-07|12:00:00|http://en.wikiped...|
|     8|2009|    8|        9|  British Grand Prix|2009-06-21|12:00:00|http://en.

In [9]:
from pyspark.sql.functions import current_timestamp

In [10]:
# Derive a new DataFrame that stamps every row with the time of ingestion;
# races_df itself is left unchanged.
races_df_with_ingestion_date = races_df.withColumn(
    colName="IngestionDate",
    col=current_timestamp(),
)

In [11]:
# Preview the rows with the added IngestionDate column (show() defaults
# written out: first 20 rows, long values truncated).
races_df_with_ingestion_date.show(n=20, truncate=True)

+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|       IngestionDate|
+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|2023-03-01 14:11:...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|2023-03-01 14:11:...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|2023-03-01 14:11:...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|2023-03-01 14:11:...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|2023-03-01 14:11:...|
|     6|2009|    6|        6|   Monaco Grand Prix|2009-05-24|12:00:00|http://en.wikiped...|2023-

In [12]:
from pyspark.sql.functions import to_timestamp,concat,col,lit
                                        

In [13]:
# Build the race start timestamp by joining the `date` and `time` columns
# with a single space and parsing the result.
#
# The seconds field must be lowercase `ss` (second-of-minute). Uppercase `S`
# is fraction-of-second in Spark's datetime patterns, so the previous
# "HH:mm:SS" pattern would mis-parse any race time with non-zero seconds.
# It only appeared correct because the previewed times all end in ":00".
#
# NOTE(review): the column name "race_timesyamp" looks like a typo for
# "race_timestamp". It is kept unchanged here because it is part of the
# DataFrame's schema and other code may reference it -- rename deliberately.
races_df_with_ingestion_date = races_df_with_ingestion_date.withColumn(
    "race_timesyamp",
    to_timestamp(
        concat(col("date"), lit(" "), col("time")),
        "yyyy-MM-dd HH:mm:ss",
    ),
)

In [14]:
# Preview the rows including the combined race timestamp column (show()
# defaults written out: first 20 rows, long values truncated).
races_df_with_ingestion_date.show(n=20, truncate=True)

+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|       IngestionDate|     race_timesyamp|
+------+----+-----+---------+--------------------+----------+--------+--------------------+--------------------+-------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|2023-03-01 14:11:...|2009-03-29 06:00:00|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|2023-03-01 14:11:...|2009-04-05 09:00:00|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|2023-03-01 14:11:...|2009-04-19 07:00:00|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|2023-03-01 14:11:...|2009-04-26 12:00:00|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00

In [15]:
# Report how many partitions currently back the DataFrame.
partition_count = races_df_with_ingestion_date.rdd.getNumPartitions()
print(partition_count)

1


In [16]:
# List the notebook's working directory through IPython's `ls` line magic.
# This is the explicit form of what a bare `ls` resolves to via automagic;
# automagic only applies to single-line cells, so the explicit call is what
# allows this comment to live in the same cell.
get_ipython().run_line_magic("ls", "")

 Volume in drive C is Windows
 Volume Serial Number is B263-7A3E

 Directory of C:\jupytarlab

03/01/2023  02:01 PM    <DIR>          .
03/01/2023  02:01 PM    <DIR>          ..
03/01/2023  01:53 PM    <DIR>          .ipynb_checkpoints
02/28/2023  08:06 PM            27,619 py_spark_practice_1.ipynb
03/01/2023  01:53 PM            17,325 pyspark_practice_2.ipynb
02/22/2023  12:30 PM    <DIR>          spark-warehouse
03/01/2023  02:01 PM             3,318 Untitled.ipynb
               3 File(s)         48,262 bytes
               4 Dir(s)  73,478,778,880 bytes free


In [None]:
# DataFrames are immutable: repartition() returns a NEW DataFrame and leaves
# the one it was called on untouched, so the result has to be captured for
# the call to have any effect.
races_df_repartitioned = races_df_with_ingestion_date.repartition(60)

# Display the partition count of the repartitioned DataFrame to confirm the
# change (the source DataFrame reported 1 partition).
races_df_repartitioned.rdd.getNumPartitions()