In [None]:
# Baseline read: only the `header` option is set, so column names come from
# the file's first line. No schema is supplied and no inference is requested;
# the schema printed below reports every column as string.
df = spark.read.csv(
    '/Volumes/workspace/default/raw_data/orders.csv',
    header=True,
)

df.printSchema()
display(df)

root
 |-- order_date: string (nullable = true)
 |-- country: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- qty: string (nullable = true)
 |-- price: string (nullable = true)



order_date,country,order_id,product,qty,price
2024-02-16,IN,1000,Shoes,2,54.37
2024-02-01,CA,1001,Backpack,4,77.01
2024-02-03,AU,1002,Jacket,1,95.07
2024-03-01,UK,1003,Jeans,1,42.03
2024-01-31,IN,1004,T-Shirt,3,15.94
2024-01-17,IN,1005,Watch,2,131.49
2024-01-15,UK,1006,Shoes,3,50.83
2024-01-18,IN,1007,Backpack,3,85.85
2024-01-25,AU,1008,T-Shirt,2,16.94
2024-01-06,IN,1009,Jeans,4,37.45


### 2. Infer Schema

In [None]:
# Same file, this time with `inferSchema` enabled alongside `header`, so Spark
# derives the column types from the data instead of leaving them as strings.
# The printed schema below shows the resulting date / integer / double columns.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/Volumes/workspace/default/raw_data/orders.csv')
)

df.printSchema()
display(df)

root
 |-- order_date: date (nullable = true)
 |-- country: string (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- product: string (nullable = true)
 |-- qty: integer (nullable = true)
 |-- price: double (nullable = true)



order_date,country,order_id,product,qty,price
2024-02-16,IN,1000,Shoes,2,54.37
2024-02-01,CA,1001,Backpack,4,77.01
2024-02-03,AU,1002,Jacket,1,95.07
2024-03-01,UK,1003,Jeans,1,42.03
2024-01-31,IN,1004,T-Shirt,3,15.94
2024-01-17,IN,1005,Watch,2,131.49
2024-01-15,UK,1006,Shoes,3,50.83
2024-01-18,IN,1007,Backpack,3,85.85
2024-01-25,AU,1008,T-Shirt,2,16.94
2024-01-06,IN,1009,Jeans,4,37.45


### 3. Enforce a strict schema on read

In [None]:
from pyspark.sql import functions as F, types as T
# Explicit schema for orders.csv, listed as (column name, Spark type) pairs in
# the order the columns are declared. Every field is marked nullable (True).
csv_schema = T.StructType([
    T.StructField(column_name, column_type, True)
    for column_name, column_type in [
        ("order_date", T.DateType()),
        ("country",    T.StringType()),
        ("order_id",   T.IntegerType()),
        ("product",    T.StringType()),
        ("qty",        T.IntegerType()),
        ("price",      T.DoubleType()),
    ]
])

In [None]:
# Read the CSV against the hand-written `csv_schema` rather than letting Spark
# infer the types; the header row is still consumed as column names.
# NOTE(review): no parse `mode` option is set here, so Spark's default handling
# of values that do not fit the schema applies — confirm whether a stricter
# mode (e.g. FAILFAST) is wanted for "strict" enforcement.
df = spark.read.csv(
    '/Volumes/workspace/default/raw_data/orders.csv',
    schema=csv_schema,
    header=True,
)

df.printSchema()
display(df)

root
 |-- order_date: date (nullable = true)
 |-- country: string (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- product: string (nullable = true)
 |-- qty: integer (nullable = true)
 |-- price: double (nullable = true)



order_date,country,order_id,product,qty,price
2024-02-16,IN,1000,Shoes,2,54.37
2024-02-01,CA,1001,Backpack,4,77.01
2024-02-03,AU,1002,Jacket,1,95.07
2024-03-01,UK,1003,Jeans,1,42.03
2024-01-31,IN,1004,T-Shirt,3,15.94
2024-01-17,IN,1005,Watch,2,131.49
2024-01-15,UK,1006,Shoes,3,50.83
2024-01-18,IN,1007,Backpack,3,85.85
2024-01-25,AU,1008,T-Shirt,2,16.94
2024-01-06,IN,1009,Jeans,4,37.45


In [None]:
# Schema-driven read with the date pattern spelled out: `dateFormat` is set to
# "yyyy-MM-dd" explicitly, alongside the `header` option and `csv_schema`.
df = (
    spark.read
    .schema(csv_schema)
    .option("dateFormat", "yyyy-MM-dd")
    .option("header", True)
    .csv('/Volumes/workspace/default/raw_data/orders.csv')
)

df.printSchema()
display(df)

root
 |-- order_date: date (nullable = true)
 |-- country: string (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- product: string (nullable = true)
 |-- qty: integer (nullable = true)
 |-- price: double (nullable = true)



order_date,country,order_id,product,qty,price
2024-02-16,IN,1000,Shoes,2,54.37
2024-02-01,CA,1001,Backpack,4,77.01
2024-02-03,AU,1002,Jacket,1,95.07
2024-03-01,UK,1003,Jeans,1,42.03
2024-01-31,IN,1004,T-Shirt,3,15.94
2024-01-17,IN,1005,Watch,2,131.49
2024-01-15,UK,1006,Shoes,3,50.83
2024-01-18,IN,1007,Backpack,3,85.85
2024-01-25,AU,1008,T-Shirt,2,16.94
2024-01-06,IN,1009,Jeans,4,37.45


In [None]:
# Alternative spelling of the same read, using the generic loader instead of
# the `.csv(...)` shortcut: the source format, schema and CSV options are all
# handed to `load()` as keyword arguments.
df = spark.read.load(
    "/Volumes/workspace/default/raw_data/orders.csv",
    format="csv",
    schema=csv_schema,
    header=True,
    dateFormat="yyyy-MM-dd",
)

df.printSchema()
display(df)


root
 |-- order_date: date (nullable = true)
 |-- country: string (nullable = true)
 |-- order_id: integer (nullable = true)
 |-- product: string (nullable = true)
 |-- qty: integer (nullable = true)
 |-- price: double (nullable = true)



order_date,country,order_id,product,qty,price
2024-02-16,IN,1000,Shoes,2,54.37
2024-02-01,CA,1001,Backpack,4,77.01
2024-02-03,AU,1002,Jacket,1,95.07
2024-03-01,UK,1003,Jeans,1,42.03
2024-01-31,IN,1004,T-Shirt,3,15.94
2024-01-17,IN,1005,Watch,2,131.49
2024-01-15,UK,1006,Shoes,3,50.83
2024-01-18,IN,1007,Backpack,3,85.85
2024-01-25,AU,1008,T-Shirt,2,16.94
2024-01-06,IN,1009,Jeans,4,37.45


In [None]:
# Persist the current `df` as Parquet using save mode "overwrite".
# NOTE(review): `df` is reassigned by every read cell above, so what gets
# written is whichever of those cells ran last.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .save('/Volumes/workspace/default/raw_data/orders')
)