In [1]:
from pyspark.sql import SparkSession

# Create the session named "myApp" on the local[*] master, or reuse an
# existing one (getOrCreate).
spark = (
    SparkSession.builder
    .appName("myApp")
    .master("local[*]")
    .getOrCreate()
)

In [2]:
# Plain CSV read with no reader options set.
df = (
    spark.read
    .format("csv")
    .load("/content/ItalianRivers.csv")
)

In [3]:
# Without a header option the columns are named _c0.._c5 and the header
# line is displayed as an ordinary data row.
df.show(n=3)

+-----+------+---------+--------------------+--------------+---------------+
|  _c0|   _c1|      _c2|                 _c3|           _c4|            _c5|
+-----+------+---------+--------------------+--------------+---------------+
| Name|Length|ItalyOnly|              Cities|           Sea|NumberOfBridges|
|   Po|   652|     True|Turin, Piacenza, ...|  Adriatic Sea|            123|
|Tiber|   405|     True|         Rome, Terni|Tyrrhenian Sea|             66|
+-----+------+---------+--------------------+--------------+---------------+
only showing top 3 rows



In [4]:
# header=True: the first line of the file supplies the column names.
df = (
    spark.read
    .format("csv")
    .load("/content/ItalianRivers.csv", header=True)
)
df.show(n=3)

+-----+------+---------+--------------------+--------------+---------------+
| Name|Length|ItalyOnly|              Cities|           Sea|NumberOfBridges|
+-----+------+---------+--------------------+--------------+---------------+
|   Po|   652|     True|Turin, Piacenza, ...|  Adriatic Sea|            123|
|Tiber|   405|     True|         Rome, Terni|Tyrrhenian Sea|             66|
| Arno|   241|     True|      Florence, Pisa|  Ligurian Sea|             40|
+-----+------+---------+--------------------+--------------+---------------+
only showing top 3 rows



In [5]:
# With only header=True every column is still read as a string.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Length: string (nullable = true)
 |-- ItalyOnly: string (nullable = true)
 |-- Cities: string (nullable = true)
 |-- Sea: string (nullable = true)
 |-- NumberOfBridges: string (nullable = true)



In [6]:
# inferSchema=True: ask Spark to derive column types from the data
# instead of leaving everything as string.
df = (
    spark.read
    .format("csv")
    .load(
        "/content/ItalianRivers.csv",
        header=True,
        inferSchema=True,
    )
)
df.show(n=3)

+-----+------+---------+--------------------+--------------+---------------+
| Name|Length|ItalyOnly|              Cities|           Sea|NumberOfBridges|
+-----+------+---------+--------------------+--------------+---------------+
|   Po|   652|     true|Turin, Piacenza, ...|  Adriatic Sea|            123|
|Tiber|   405|     true|         Rome, Terni|Tyrrhenian Sea|             66|
| Arno|   241|     true|      Florence, Pisa|  Ligurian Sea|             40|
+-----+------+---------+--------------------+--------------+---------------+
only showing top 3 rows



In [7]:
# After inference Length is integer and ItalyOnly is boolean.
# NOTE(review): NumberOfBridges stays string; presumably at least one row
# holds a non-numeric value there. Confirm against the file.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Length: integer (nullable = true)
 |-- ItalyOnly: boolean (nullable = true)
 |-- Cities: string (nullable = true)
 |-- Sea: string (nullable = true)
 |-- NumberOfBridges: string (nullable = true)



See the Spark CSV data source documentation for the full list of reader options: https://spark.apache.org/docs/latest/sql-data-sources-csv.html


In [8]:
# Read with sep=";" although the file is comma-delimited: each line is
# then parsed as one single column, as the output shows.
df = (
    spark.read
    .format("csv")
    .load(
        "/content/ItalianRivers.csv",
        header=True,
        inferSchema=True,
        sep=";",
    )
)
df.show(n=3)

+------------------------------------------------------------+
|"Name","Length","ItalyOnly","Cities","Sea","NumberOfBridges"|
+------------------------------------------------------------+
|                                        "Po",652,True,"Tu...|
|                                        "Tiber",405,True,...|
|                                        "Arno",241,True,"...|
+------------------------------------------------------------+
only showing top 3 rows



In [9]:
# Same read as before, with the settings passed in one .options(...) call
# on the reader instead of as keyword arguments to load().
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("/content/ItalianRivers.csv")
)
df.show(n=3)

+-----+------+---------+--------------------+--------------+---------------+
| Name|Length|ItalyOnly|              Cities|           Sea|NumberOfBridges|
+-----+------+---------+--------------------+--------------+---------------+
|   Po|   652|     true|Turin, Piacenza, ...|  Adriatic Sea|            123|
|Tiber|   405|     true|         Rome, Terni|Tyrrhenian Sea|             66|
| Arno|   241|     true|      Florence, Pisa|  Ligurian Sea|             40|
+-----+------+---------+--------------------+--------------+---------------+
only showing top 3 rows



In [10]:
# Same read again, setting one key/value pair per .option(...) call.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/content/ItalianRivers.csv")
)
df.show(n=3)

+-----+------+---------+--------------------+--------------+---------------+
| Name|Length|ItalyOnly|              Cities|           Sea|NumberOfBridges|
+-----+------+---------+--------------------+--------------+---------------+
|   Po|   652|     true|Turin, Piacenza, ...|  Adriatic Sea|            123|
|Tiber|   405|     true|         Rome, Terni|Tyrrhenian Sea|             66|
| Arno|   241|     true|      Florence, Pisa|  Ligurian Sea|             40|
+-----+------+---------+--------------------+--------------+---------------+
only showing top 3 rows



In [11]:
# Shorthand: spark.read.csv(...) in place of format("csv").load(...).
df = spark.read.csv(
    "/content/ItalianRivers.csv",
    header=True,
    inferSchema=True,
)
df.show(n=3)

+-----+------+---------+--------------------+--------------+---------------+
| Name|Length|ItalyOnly|              Cities|           Sea|NumberOfBridges|
+-----+------+---------+--------------------+--------------+---------------+
|   Po|   652|     true|Turin, Piacenza, ...|  Adriatic Sea|            123|
|Tiber|   405|     true|         Rome, Terni|Tyrrhenian Sea|             66|
| Arno|   241|     true|      Florence, Pisa|  Ligurian Sea|             40|
+-----+------+---------+--------------------+--------------+---------------+
only showing top 3 rows



In [12]:
# Collect the reader settings in a dict and unpack it into the call.
my_options = dict(header=True, inferSchema=True, sep=",")

df = spark.read.csv("/content/ItalianRivers.csv", **my_options)
df.show(n=3)

+-----+------+---------+--------------------+--------------+---------------+
| Name|Length|ItalyOnly|              Cities|           Sea|NumberOfBridges|
+-----+------+---------+--------------------+--------------+---------------+
|   Po|   652|     true|Turin, Piacenza, ...|  Adriatic Sea|            123|
|Tiber|   405|     true|         Rome, Terni|Tyrrhenian Sea|             66|
| Arno|   241|     true|      Florence, Pisa|  Ligurian Sea|             40|
+-----+------+---------+--------------------+--------------+---------------+
only showing top 3 rows



In [13]:
# Keep the schema inferred for the clean file so later reads can pass it
# explicitly via .schema(...); the bare name displays it.
schema = df.schema
schema

StructType([StructField('Name', StringType(), True), StructField('Length', IntegerType(), True), StructField('ItalyOnly', BooleanType(), True), StructField('Cities', StringType(), True), StructField('Sea', StringType(), True), StructField('NumberOfBridges', StringType(), True)])

In [14]:
# Load the "-bad" file with the schema taken from the clean file instead
# of inferring one. No mode option is set here; values that do not fit
# the declared type show up as NULL in the output.
df_bad = (
    spark.read
    .format("csv")
    .options(header=True)
    .schema(schema)
    .load("/content/ItalianRivers-bad.csv")
)
df_bad.show(n=10, truncate=False)

+----------------------------+------+---------+--------------------------------+--------------+---------------+
|Name                        |Length|ItalyOnly|Cities                          |Sea           |NumberOfBridges|
+----------------------------+------+---------+--------------------------------+--------------+---------------+
|Po (ok)                     |652   |true     |Turin, Piacenza, Ferrara, Rovigo|Adriatic Sea  |123            |
|Tiber (empty length)        |NULL  |true     |Rome, Terni                     |Tyrrhenian Sea|66             |
|Arno ('unknown bridges')    |241   |true     |Florence, Pisa                  |Ligurian Sea  |unknown        |
|Adda ('maybe' italy only)   |313   |NULL     |Lecco, Milan                    |Po River      |63             |
|Piave (no quotes - too long)|220   |true     |Belluno                         | Treviso      |Adriatic Sea   |
|Adige (bad length)          |NULL  |true     |Verona, Trento                  |Adriatic Sea  |105      

In [15]:
# The frame carries exactly the schema that was passed to the reader.
df_bad.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Length: integer (nullable = true)
 |-- ItalyOnly: boolean (nullable = true)
 |-- Cities: string (nullable = true)
 |-- Sea: string (nullable = true)
 |-- NumberOfBridges: string (nullable = true)



In [16]:
from pyspark.sql.types import StructType, StructField, StringType

# Extend the schema with a nullable string column that the CSV reader can
# use to store the raw text of malformed lines.
# Any existing `_corrupt_record` field is filtered out first, so running
# this cell more than once does not append the same column twice.
schema = StructType(
    [field for field in schema.fields if field.name != '_corrupt_record']
    + [StructField('_corrupt_record', StringType(), True)]
)

In [17]:
# Display the field list to check that _corrupt_record is now the last entry.
schema.fields

[StructField('Name', StringType(), True),
 StructField('Length', IntegerType(), True),
 StructField('ItalyOnly', BooleanType(), True),
 StructField('Cities', StringType(), True),
 StructField('Sea', StringType(), True),
 StructField('NumberOfBridges', StringType(), True),
 StructField('_corrupt_record', StringType(), True)]

In [18]:
# PERMISSIVE mode with a corrupt-record column: the output shows NULL in
# _corrupt_record for the rows displayed here, and the next cell shows
# the raw line stored there for the rows it filters out.
df_bad = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("columnNameOfCorruptRecord", "_corrupt_record")
    .option("mode", "PERMISSIVE")
    .schema(schema)
    .load("/content/ItalianRivers-bad.csv")
)
df_bad.show(n=10, truncate=False)

+----------------------------+------+---------+--------------------------------+--------------+---------------+----------------------------------------------------------------------------------------+
|Name                        |Length|ItalyOnly|Cities                          |Sea           |NumberOfBridges|_corrupt_record                                                                         |
+----------------------------+------+---------+--------------------------------+--------------+---------------+----------------------------------------------------------------------------------------+
|Po (ok)                     |652   |true     |Turin, Piacenza, Ferrara, Rovigo|Adriatic Sea  |123            |NULL                                                                                    |
|Tiber (empty length)        |NULL  |true     |Rome, Terni                     |Tyrrhenian Sea|66             |NULL                                                                                 

In [19]:
# Show only the rows for which a raw line was stored in _corrupt_record.
df_bad.filter(
    df_bad["_corrupt_record"].isNotNull()
).show(truncate=False)

+----------------------------+------+---------+-------------------+------------+---------------+----------------------------------------------------------------------------------------+
|Name                        |Length|ItalyOnly|Cities             |Sea         |NumberOfBridges|_corrupt_record                                                                         |
+----------------------------+------+---------+-------------------+------------+---------------+----------------------------------------------------------------------------------------+
|Adda ('maybe' italy only)   |313   |NULL     |Lecco, Milan       |Po River    |63             |"Adda ('maybe' italy only)",313,maybe,"Lecco, Milan","Po River",63                      |
|Piave (no quotes - too long)|220   |true     |Belluno            | Treviso    |Adriatic Sea   |Piave (no quotes - too long),220,True,Belluno, Treviso,Adriatic Sea,19                  |
|Adige (bad length)          |NULL  |true     |Verona, Trento     |Adr

In [20]:
# DROPMALFORMED mode: the rows flagged as corrupt in the PERMISSIVE read
# are absent from this output, and _corrupt_record is NULL for the rows
# that remain.
df_bad = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("columnNameOfCorruptRecord", "_corrupt_record")
    .option("mode", "DROPMALFORMED")
    .schema(schema)
    .load("/content/ItalianRivers-bad.csv")
)
df_bad.show(n=10, truncate=False)

+------------------------+------+---------+--------------------------------+--------------+---------------+---------------+
|Name                    |Length|ItalyOnly|Cities                          |Sea           |NumberOfBridges|_corrupt_record|
+------------------------+------+---------+--------------------------------+--------------+---------------+---------------+
|Po (ok)                 |652   |true     |Turin, Piacenza, Ferrara, Rovigo|Adriatic Sea  |123            |NULL           |
|Tiber (empty length)    |NULL  |true     |Rome, Terni                     |Tyrrhenian Sea|66             |NULL           |
|Arno ('unknown bridges')|241   |true     |Florence, Pisa                  |Ligurian Sea  |unknown        |NULL           |
|Brenta                  |174   |true     |Bassano del Grappa, Padua       |Adriatic Sea  |37             |NULL           |
|Savio                   |126   |true     |Cesena, Ravenna                 |Adriatic Sea  |12             |NULL           |
|Tagliam

In [21]:
# FAILFAST mode: only the reader is configured here and no rows are
# displayed.
# NOTE(review): the show() call below is disabled, presumably because it
# raises on the first malformed record in this mode. Confirm.
df_bad = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("columnNameOfCorruptRecord", "_corrupt_record")
    .option("mode", "FAILFAST")
    .schema(schema)
    .load("/content/ItalianRivers-bad.csv")
)

# df_bad.show(10, truncate=False)

In [22]:
# NOTE(review): this cell contains only a disabled call and produces no
# output; either demonstrate the FAILFAST error explicitly or drop the cell.
# df_bad.show(10, truncate=False)