# Ex-2220 - errors when reading data


In [1]:
# Build a deliberately messy CSV and write it to disk.
# Each data row exercises a different read problem (the Product column names it):
#   101 - empty Quantity          102 - empty Price
#   103 - non-numeric Price       104 - more fields than the header
#   105 - ':' used as separator   106 - impossible date (also has ':' after the id,
#                                       so its remaining fields are shifted)
csv_rows = [
    "OrderID,Date,Customer,Category,Product,Price,Quantity,TotalSales",
    "101,2025-05-01,John Doe,Electronics,Empty Quantity,1200,,1200",
    "102,2025-05-02,Jane Smith,Furniture,Empty Price,,4,600",
    "103,2025-05-02,Michael Lee,Electronics,Price=MISSING,MISSING,2,1600",
    "104,2025-05-03,Susan Adams,Furniture,To many fields,300,1,300,x,x,x,x,x",
    "105:2025-05-05:John Doe:Clothing:Separator colon:120:3:2",
    "106:2025-02-31,John Doe,Clothing,Nonexistent day,120,3,15",
]
# No trailing newline after the last row, matching the file shown by `cat`.
csv_content = "\n".join(csv_rows)

with open("data-comma.csv", "w") as csv_file:
    csv_file.write(csv_content)

In [2]:
# Print the raw file via the shell to confirm exactly what was written to disk.
! cat data-comma.csv

OrderID,Date,Customer,Category,Product,Price,Quantity,TotalSales
101,2025-05-01,John Doe,Electronics,Empty Quantity,1200,,1200
102,2025-05-02,Jane Smith,Furniture,Empty Price,,4,600
103,2025-05-02,Michael Lee,Electronics,Price=MISSING,MISSING,2,1600
104,2025-05-03,Susan Adams,Furniture,To many fields,300,1,300,x,x,x,x,x
105:2025-05-05:John Doe:Clothing:Separator colon:120:3:2
106:2025-02-31,John Doe,Clothing,Nonexistent day,120,3,15

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Target schema for the orders CSV, plus a trailing string column that receives
# the raw text of any row Spark could not parse (see `_corrupt_record` in the
# read outputs).
#
# Every field is declared nullable (third argument = True). The sample file has
# empty Price/Quantity values, malformed rows come back with NULLs in the typed
# columns, and `_corrupt_record` is NULL for every well-formed row. The earlier
# nullable=False declarations were not being enforced on read (the displayed
# results contain NULLs in those columns), so they only misdescribed the data.
schema = StructType([
    StructField('OrderID', IntegerType(), True),
    StructField('Date', DateType(), True),
    StructField('Customer', StringType(), True),
    StructField('Category', StringType(), True),
    StructField('Product', StringType(), True),
    StructField('Price', IntegerType(), True),
    StructField('Quantity', IntegerType(), True),
    StructField('TotalSales', IntegerType(), True),
    # Must be a nullable string: it is only populated for rows that fail to parse.
    StructField("_corrupt_record", StringType(), True)
    ])

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("ReadErrors")
    .getOrCreate()
)

In [5]:
# Baseline read: header row + explicit schema, no `mode` given.
# Rows that fail to parse are kept; their raw text lands in `_corrupt_record`.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("data-comma.csv")
)
df.show()

+-------+----------+-----------+---------------+--------------+-----+--------+----------+--------------------+
|OrderID|      Date|   Customer|       Category|       Product|Price|Quantity|TotalSales|     _corrupt_record|
+-------+----------+-----------+---------------+--------------+-----+--------+----------+--------------------+
|    101|2025-05-01|   John Doe|    Electronics|Empty Quantity| 1200|    NULL|      1200|                NULL|
|    102|2025-05-02| Jane Smith|      Furniture|   Empty Price| NULL|       4|       600|                NULL|
|    103|2025-05-02|Michael Lee|    Electronics| Price=MISSING| NULL|       2|      1600|103,2025-05-02,Mi...|
|    104|2025-05-03|Susan Adams|      Furniture|To many fields|  300|       1|       300|104,2025-05-03,Su...|
|   NULL|      NULL|       NULL|           NULL|          NULL| NULL|    NULL|      NULL|105:2025-05-05:Jo...|
|   NULL|      NULL|   Clothing|Nonexistent day|           120|    3|      15|      NULL|106:2025-02-31,Jo...|
+

In [6]:
# Same read with the mode spelled out as "permissive": malformed rows are kept,
# unparseable fields become NULL and the raw line is stored in `_corrupt_record`.
df = (
    spark.read
    .schema(schema)
    .options(header=True, mode="permissive")
    .csv("data-comma.csv")
)
df.show()

+-------+----------+-----------+---------------+--------------+-----+--------+----------+--------------------+
|OrderID|      Date|   Customer|       Category|       Product|Price|Quantity|TotalSales|     _corrupt_record|
+-------+----------+-----------+---------------+--------------+-----+--------+----------+--------------------+
|    101|2025-05-01|   John Doe|    Electronics|Empty Quantity| 1200|    NULL|      1200|                NULL|
|    102|2025-05-02| Jane Smith|      Furniture|   Empty Price| NULL|       4|       600|                NULL|
|    103|2025-05-02|Michael Lee|    Electronics| Price=MISSING| NULL|       2|      1600|103,2025-05-02,Mi...|
|    104|2025-05-03|Susan Adams|      Furniture|To many fields|  300|       1|       300|104,2025-05-03,Su...|
|   NULL|      NULL|       NULL|           NULL|          NULL| NULL|    NULL|      NULL|105:2025-05-05:Jo...|
|   NULL|      NULL|   Clothing|Nonexistent day|           120|    3|      15|      NULL|106:2025-02-31,Jo...|
+

In [7]:
# Permissive read that also names the column receiving the raw text of bad rows.
# The name must match a string field present in `schema`.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .option("mode", "permissive")
    .option("columnNameOfCorruptRecord", "_corrupt_record")
    .csv("data-comma.csv")
)
df.show()

+-------+----------+-----------+---------------+--------------+-----+--------+----------+--------------------+
|OrderID|      Date|   Customer|       Category|       Product|Price|Quantity|TotalSales|     _corrupt_record|
+-------+----------+-----------+---------------+--------------+-----+--------+----------+--------------------+
|    101|2025-05-01|   John Doe|    Electronics|Empty Quantity| 1200|    NULL|      1200|                NULL|
|    102|2025-05-02| Jane Smith|      Furniture|   Empty Price| NULL|       4|       600|                NULL|
|    103|2025-05-02|Michael Lee|    Electronics| Price=MISSING| NULL|       2|      1600|103,2025-05-02,Mi...|
|    104|2025-05-03|Susan Adams|      Furniture|To many fields|  300|       1|       300|104,2025-05-03,Su...|
|   NULL|      NULL|       NULL|           NULL|          NULL| NULL|    NULL|      NULL|105:2025-05-05:Jo...|
|   NULL|      NULL|   Clothing|Nonexistent day|           120|    3|      15|      NULL|106:2025-02-31,Jo...|
+

In [8]:
# "dropmalformed" discards every row that fails to parse, so only the rows with
# merely empty values survive. `_corrupt_record` is still part of the schema but
# is NULL for each remaining row.
df = (
    spark.read
    .schema(schema)
    .options(
        header=True,
        mode="dropmalformed",
        columnNameOfCorruptRecord="_corrupt_record",
    )
    .csv("data-comma.csv")
)
df.show()

+-------+----------+----------+-----------+--------------+-----+--------+----------+---------------+
|OrderID|      Date|  Customer|   Category|       Product|Price|Quantity|TotalSales|_corrupt_record|
+-------+----------+----------+-----------+--------------+-----+--------+----------+---------------+
|    101|2025-05-01|  John Doe|Electronics|Empty Quantity| 1200|    NULL|      1200|           NULL|
|    102|2025-05-02|Jane Smith|  Furniture|   Empty Price| NULL|       4|       600|           NULL|
+-------+----------+----------+-----------+--------------+-----+--------+----------+---------------+



In [9]:
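# "failfast" mode raises an exception as soon as Spark meets a malformed row,
# so this read is left commented out to keep "Run All" working.
# Uncomment it to see the error.
# df = spark.read.csv("data-comma.csv",
#                     header=True,
#                     schema=schema,
#                     mode="failfast",
#                     columnNameOfCorruptRecord="_corrupt_record")
# df.show()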