In [0]:
# Spark session
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .appName("Reading from CSV Files")
    .master("local[*]")
    .getOrCreate()
)

spark

In [0]:
# Read a CSV file into the Data frame
# inferSchema allow spark to go back and check data and identify data types for each column
df = spark.read.format('csv').option("header",True).option('inferSchema', True).load("/FileStore/tables/emp_perf.csv")

In [0]:
df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- Joining Date: date (nullable = true)
 |-- Performance Score: double (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Session: string (nullable = true)



In [0]:
df.show()

+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
| ID|              Name|Age|Gender|Department|Salary|Joining Date|Performance Score|Experience|  Status|   Location|Session|
+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
|  1|      Cory Escobar| 48|Female|        HR|  5641|  2015-05-03|              2.0|        16|  Active|   New York|  Night|
|  2|   Timothy Sanchez| 25| Other|     Sales|  4249|  2020-11-09|              2.0|        11|Inactive|Los Angeles|Evening|
|  3|      Chad Nichols| 57| Other|     Sales|  3058|  2019-02-12|             NULL|         1|Inactive|   New York|Morning|
|  4|Christine Williams| 58|Female|        IT|  5895|  2017-09-08|              2.0|        13|Inactive|Los Angeles|Evening|
|  5|      Amber Harris| 35| Other|        IT|  4317|  2020-02-15|              5.0|        16|Inactive|   New York|Evening|


#### Spark Data source option
###### https://spark.apache.org/docs/latest/sql-data-sources-csv.html

In [0]:
# Reading with Schema
_schema = "ID int, Name string, Age int, Gender string, Department string, Salary double, Joining_Date date, Performance_Score double, Experience int, Status string, Location string, Session string"
df_schema = spark.read.format('csv').option('header',True).schema(_schema).load("/FileStore/tables/emp_perf.csv")

In [0]:
df_schema.show()

+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
| ID|              Name|Age|Gender|Department|Salary|Joining_Date|Performance_Score|Experience|  Status|   Location|Session|
+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
|  1|      Cory Escobar| 48|Female|        HR|5641.0|  2015-05-03|              2.0|        16|  Active|   New York|  Night|
|  2|   Timothy Sanchez| 25| Other|     Sales|4249.0|  2020-11-09|              2.0|        11|Inactive|Los Angeles|Evening|
|  3|      Chad Nichols| 57| Other|     Sales|3058.0|  2019-02-12|             NULL|         1|Inactive|   New York|Morning|
|  4|Christine Williams| 58|Female|        IT|5895.0|  2017-09-08|              2.0|        13|Inactive|Los Angeles|Evening|
|  5|      Amber Harris| 35| Other|        IT|4317.0|  2020-02-15|              5.0|        16|Inactive|   New York|Evening|


In [0]:
# Handle BAD records - PREMISSIVE (Default mode)
_schema = "ID int, Name string, Age int, Gender string, Department string, Salary double, Joining_Date date, Performance_Score double, Experience int, Status string, Location string, Session string, bad_record string"
df_p = spark.read.format('csv').option('header',True).option('columnNameOfCorruptRecord', 'bad_record').schema(_schema).load("/FileStore/tables/emp_perf_1-1.csv")

In [0]:
df_p.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: double (nullable = true)
 |-- Joining_Date: date (nullable = true)
 |-- Performance_Score: double (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Session: string (nullable = true)
 |-- bad_record: string (nullable = true)



In [0]:
df_p.where("bad_record is not null").show(truncate=False)
# df_p.show()

+---+-----------+---+------+----------+------+------------+-----------------+----------+------+--------+-------+-------------------------------------------------------------------+
|ID |Name       |Age|Gender|Department|Salary|Joining_Date|Performance_Score|Experience|Status|Location|Session|bad_record                                                         |
+---+-----------+---+------+----------+------+------------+-----------------+----------+------+--------+-------+-------------------------------------------------------------------+
|6  |Ashley Howe|29 |Female|HR        |NULL  |NULL        |1.0              |6         |Active|Chicago |Evening|6,Ashley Howe,29,Female,HR,Low,no date,1.0,6,Active,Chicago,Evening|
+---+-----------+---+------+----------+------+------------+-----------------+----------+------+--------+-------+-------------------------------------------------------------------+



In [0]:
# Handle BAD records - DROPMALFORMED

_schema = "ID int, Name string, Age int, Gender string, Department string, Salary double, Joining_Date date, Performance_Score double, Experience int, Status string, Location string, Session string"
df_m = spark.read.format('csv').option('header',True).option('mode', 'DROPMALFORMED').schema(_schema).load("/FileStore/tables/emp_perf_1-1.csv")

In [0]:
df_m.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: double (nullable = true)
 |-- Joining_Date: date (nullable = true)
 |-- Performance_Score: double (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Session: string (nullable = true)



In [0]:
df_m.show()

+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
| ID|              Name|Age|Gender|Department|Salary|Joining_Date|Performance_Score|Experience|  Status|   Location|Session|
+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
|  1|      Cory Escobar| 48|Female|        HR|5641.0|  2015-05-03|              2.0|        16|  Active|   New York|  Night|
|  2|   Timothy Sanchez| 25| Other|     Sales|4249.0|  2020-11-09|              2.0|        11|Inactive|Los Angeles|Evening|
|  3|      Chad Nichols| 57| Other|     Sales|3058.0|  2019-02-12|             NULL|         1|Inactive|   New York|Morning|
|  4|Christine Williams| 58|Female|        IT|5895.0|  2017-09-08|              2.0|        13|Inactive|Los Angeles|Evening|
|  5|      Amber Harris| 35| Other|        IT|4317.0|  2020-02-15|              5.0|        16|Inactive|   New York|Evening|


In [0]:
# Handle BAD records - FAILFAST

_schema = "ID int, Name string, Age int, Gender string, Department string, Salary double, Joining_Date date, Performance_Score double, Experience int, Status string, Location string, Session string"
df_m = spark.read.format('csv').option('header',True).option('mode', 'FAILFAST').schema(_schema).load("/FileStore/tables/emp_perf_1-1.csv")

In [0]:
df_m.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: double (nullable = true)
 |-- Joining_Date: date (nullable = true)
 |-- Performance_Score: double (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Status: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Session: string (nullable = true)



In [0]:
df_m.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-1676143563767187>, line 1[0m
[0;32m----> 1[0m [43mdf_m[49m[38;5;241;43m.[39;49m[43mshow[49m[43m([49m[43m)[49m

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:47[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     45[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     46[0m [38;5;28;01mtry[39;00m:
[0;32m---> 47[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     48[0m     logger[38;5;241m.[39mlog_success(
[1;32m     49[0m         module_name, class_name, function_name, time[38;5;241m.[39mperf_counter() [38;5;241m-[39m start, signature
[1;32m  

In [0]:
# Multiple option
_options = {
    'header': 'true',
    'inferSchema': 'true',
    'mode': 'PERMISSIVE'
}

df_final = spark.read.format('csv').options(**_options).schema(_schema).load("/FileStore/tables/emp_perf_1-1.csv")

In [0]:
df_final.show()

+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
| ID|              Name|Age|Gender|Department|Salary|Joining_Date|Performance_Score|Experience|  Status|   Location|Session|
+---+------------------+---+------+----------+------+------------+-----------------+----------+--------+-----------+-------+
|  1|      Cory Escobar| 48|Female|        HR|5641.0|  2015-05-03|              2.0|        16|  Active|   New York|  Night|
|  2|   Timothy Sanchez| 25| Other|     Sales|4249.0|  2020-11-09|              2.0|        11|Inactive|Los Angeles|Evening|
|  3|      Chad Nichols| 57| Other|     Sales|3058.0|  2019-02-12|             NULL|         1|Inactive|   New York|Morning|
|  4|Christine Williams| 58|Female|        IT|5895.0|  2017-09-08|              2.0|        13|Inactive|Los Angeles|Evening|
|  5|      Amber Harris| 35| Other|        IT|4317.0|  2020-02-15|              5.0|        16|Inactive|   New York|Evening|
