# Ex-2210 - Reading CSV files with PySpark


In [1]:
# Create the comma-delimited sample orders file.
# One header line plus five order rows; order 103 has the text "MISSING"
# in its Price field. Rows are joined with "\n" and no trailing newline.
csv_content = "\n".join([
    "OrderID,Date,Customer,Category,Product,Price,Quantity,TotalSales",
    "101,2025-05-01,John Doe,Electronics,Laptop,1200,1,1200",
    "102,2025-05-02,Jane Smith,Furniture,Chair,150,4,600",
    "103,2025-05-02,Michael Lee,Electronics,Smartphone,MISSING,2,1600",
    "104,2025-05-03,Susan Adams,Furniture,Table,300,1,300",
    "105,2025-05-04,John Doe,Clothing,Jacket,120,3,360",
])

with open("data-comma.csv", mode="w") as csv_file:
    csv_file.write(csv_content)

In [2]:
# Create the colon-delimited variant of the sample orders file.
# Same header and five rows as the comma file, but fields are separated
# by ":"; order 103 again has the text "MISSING" in its Price field.
csv_content = "\n".join([
    "OrderID:Date:Customer:Category:Product:Price:Quantity:TotalSales",
    "101:2025-05-01:John Doe:Electronics:Laptop:1200:1:1200",
    "102:2025-05-02:Jane Smith:Furniture:Chair:150:4:600",
    "103:2025-05-02:Michael Lee:Electronics:Smartphone:MISSING:2:1600",
    "104:2025-05-03:Susan Adams:Furniture:Table:300:1:300",
    "105:2025-05-04:John Doe:Clothing:Jacket:120:3:360",
])

with open("data-colon.csv", mode="w") as csv_file:
    csv_file.write(csv_content)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse, if one is already running) the Spark session used by the
# CSV read cells in this notebook. The application name is what Spark shows
# in its web UI, so it is named after this notebook's topic (reading CSV
# files) instead of the unrelated "JoinExamples" label.
spark = SparkSession.builder.appName("ReadCsvExamples").getOrCreate()

In [4]:
# Read the comma file with header handling and schema inference both off:
# the column-name line comes back as an ordinary data row and the columns
# get the positional names _c0 ... _c7.
df_no_header = (
    spark.read
    .options(header=False, inferSchema=False)
    .csv("data-comma.csv")
)
df_no_header.show()

+-------+----------+-----------+-----------+----------+-------+--------+----------+
|    _c0|       _c1|        _c2|        _c3|       _c4|    _c5|     _c6|       _c7|
+-------+----------+-----------+-----------+----------+-------+--------+----------+
|OrderID|      Date|   Customer|   Category|   Product|  Price|Quantity|TotalSales|
|    101|2025-05-01|   John Doe|Electronics|    Laptop|   1200|       1|      1200|
|    102|2025-05-02| Jane Smith|  Furniture|     Chair|    150|       4|       600|
|    103|2025-05-02|Michael Lee|Electronics|Smartphone|MISSING|       2|      1600|
|    104|2025-05-03|Susan Adams|  Furniture|     Table|    300|       1|       300|
|    105|2025-05-04|   John Doe|   Clothing|    Jacket|    120|       3|       360|
+-------+----------+-----------+-----------+----------+-------+--------+----------+



In [5]:
# Same file, now with header=True so the first line supplies the column
# names instead of appearing as data. Schema inference is still off.
df_with_header = (
    spark.read
    .options(header=True, inferSchema=False)
    .csv("data-comma.csv")
)
df_with_header.show()

+-------+----------+-----------+-----------+----------+-------+--------+----------+
|OrderID|      Date|   Customer|   Category|   Product|  Price|Quantity|TotalSales|
+-------+----------+-----------+-----------+----------+-------+--------+----------+
|    101|2025-05-01|   John Doe|Electronics|    Laptop|   1200|       1|      1200|
|    102|2025-05-02| Jane Smith|  Furniture|     Chair|    150|       4|       600|
|    103|2025-05-02|Michael Lee|Electronics|Smartphone|MISSING|       2|      1600|
|    104|2025-05-03|Susan Adams|  Furniture|     Table|    300|       1|       300|
|    105|2025-05-04|   John Doe|   Clothing|    Jacket|    120|       3|       360|
+-------+----------+-----------+-----------+----------+-------+--------+----------+



In [6]:
# Enable inferSchema so Spark derives column types from the data rather
# than leaving them untyped. Price is still reported as a string here:
# order 103 holds the text "MISSING" in that column.
df_infer_schema = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data-comma.csv")
)
df_infer_schema.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- TotalSales: integer (nullable = true)



In [7]:
# Deliberate failure case: the colon-delimited file is read with the default
# separator, so nothing is split and every line lands in one single column
# whose name is the entire header line.
# The result is stored under its own name so this malformed frame does not
# overwrite the correctly parsed `df_infer_schema` from the previous read.
df_default_sep = spark.read.csv("data-colon.csv", header=True, inferSchema=True)
df_default_sep.show()

+----------------------------------------------------------------+
|OrderID:Date:Customer:Category:Product:Price:Quantity:TotalSales|
+----------------------------------------------------------------+
|                                            101:2025-05-01:Jo...|
|                                            102:2025-05-02:Ja...|
|                                            103:2025-05-02:Mi...|
|                                            104:2025-05-03:Su...|
|                                            105:2025-05-04:Jo...|
+----------------------------------------------------------------+



In [8]:
# Tell the reader that fields are separated by ":" so the colon file is
# split into its eight columns. Price still shows the raw "MISSING" text.
df_infer_schema = (
    spark.read
    .options(header=True, inferSchema=True, sep=":")
    .csv("data-colon.csv")
)
df_infer_schema.show()

+-------+----------+-----------+-----------+----------+-------+--------+----------+
|OrderID|      Date|   Customer|   Category|   Product|  Price|Quantity|TotalSales|
+-------+----------+-----------+-----------+----------+-------+--------+----------+
|    101|2025-05-01|   John Doe|Electronics|    Laptop|   1200|       1|      1200|
|    102|2025-05-02| Jane Smith|  Furniture|     Chair|    150|       4|       600|
|    103|2025-05-02|Michael Lee|Electronics|Smartphone|MISSING|       2|      1600|
|    104|2025-05-03|Susan Adams|  Furniture|     Table|    300|       1|       300|
|    105|2025-05-04|   John Doe|   Clothing|    Jacket|    120|       3|       360|
+-------+----------+-----------+-----------+----------+-------+--------+----------+



In [9]:
# Additionally declare "MISSING" as the null marker: the Price of order 103
# is now shown as NULL instead of the literal text.
df_infer_schema = (
    spark.read
    .options(header=True, inferSchema=True, sep=":", nullValue="MISSING")
    .csv("data-colon.csv")
)
df_infer_schema.show()

+-------+----------+-----------+-----------+----------+-----+--------+----------+
|OrderID|      Date|   Customer|   Category|   Product|Price|Quantity|TotalSales|
+-------+----------+-----------+-----------+----------+-----+--------+----------+
|    101|2025-05-01|   John Doe|Electronics|    Laptop| 1200|       1|      1200|
|    102|2025-05-02| Jane Smith|  Furniture|     Chair|  150|       4|       600|
|    103|2025-05-02|Michael Lee|Electronics|Smartphone| NULL|       2|      1600|
|    104|2025-05-03|Susan Adams|  Furniture|     Table|  300|       1|       300|
|    105|2025-05-04|   John Doe|   Clothing|    Jacket|  120|       3|       360|
+-------+----------+-----------+-----------+----------+-----+--------+----------+



In [10]:
# Print the schema of the frame read with sep=":" and nullValue="MISSING".
# With "MISSING" treated as null, Price is reported as integer rather than
# string.
df_infer_schema.printSchema()

root
 |-- OrderID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- TotalSales: integer (nullable = true)

