In [1]:
import pyspark
from pyspark import SparkContext

# Obtain the context and session through getOrCreate() so this cell can be
# re-executed in the same kernel: a second bare SparkContext() raises
# "Cannot run multiple SparkContexts at once", and SparkSession is meant to be
# created through its builder rather than by calling the constructor directly.
sc = SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
# Plain CSV read with no reader options set.
bookDF = spark.read.format("csv").load("bookcontents.csv")

In [3]:
# With no options set, the columns get the default names _c0.._c2 and are all
# typed as string.
bookDF.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [5]:
# The header option was not set, so the file's header line
# (Chapter, Name, Page) is read as the first data row instead of naming the columns.
bookDF.show()

+-------+--------------------+----+
|    _c0|                 _c1| _c2|
+-------+--------------------+----+
|Chapter|                Name|Page|
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



In [6]:
# Inference of schema with no header: inferSchema asks Spark to derive the
# column types from the data; column names stay at their defaults.
# The file name is spelled exactly as in the other reads of this file
# ("bookcontentsNoHeader.csv"); a differing capitalisation only resolves on
# case-insensitive file systems.
bookInferDF = spark.read.option("inferSchema", "true").csv("bookcontentsNoHeader.csv")

In [7]:
# Check the inferred types: _c0 and _c2 are detected as integer, _c1 as string;
# the default column names remain because the file has no header line.
bookInferDF.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: integer (nullable = true)



In [8]:
# Display the rows; show() shortens long string values and marks them with "...".
bookInferDF.show()

+---+--------------------+---+
|_c0|                 _c1|_c2|
+---+--------------------+---+
|  1|        Introduction| 11|
|  2|Basic Engineering...| 19|
|  3|Advanced Engineer...| 28|
|  4|     Hands On Course| 60|
|  5|        Case Studies| 62|
|  6|Best Practices Cl...| 73|
|  7|130+ Data Sources...| 77|
|  8|1001 Interview Qu...| 82|
|  9|Recommended Books...| 87|
+---+--------------------+---+



In [9]:
# Treat the first line of the file as column names (no type inference requested).
bookHeaderDF = spark.read.csv("bookcontents.csv", header=True)

In [15]:
# Only the header option was set on this read, so the header line supplies the
# column names but no type inference was requested; a header-only CSV read
# leaves every column typed as string.
bookHeaderDF.printSchema()

root
 |-- Chapter: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Page: string (nullable = true)



In [16]:
# The header line now names the columns (Chapter, Name, Page) instead of
# appearing as a data row.
bookHeaderDF.show()

+-------+--------------------+----+
|Chapter|                Name|Page|
+-------+--------------------+----+
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



In [17]:
# Combine both reader options: column names from the header line and column
# types inferred from the data.
bookHeaderDF = spark.read.options(inferSchema="true", header="true").csv("bookcontents.csv")

In [18]:
# Both options applied: header names plus inferred integer types for
# Chapter and Page.
bookHeaderDF.printSchema()

root
 |-- Chapter: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Page: integer (nullable = true)



In [19]:
# Display the rows of the DataFrame read with header and inferSchema enabled.
bookHeaderDF.show()

+-------+--------------------+----+
|Chapter|                Name|Page|
+-------+--------------------+----+
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+



In [22]:
# Manual schema: declare column names and types explicitly instead of relying on
# inferSchema, which could infer the wrong types on bigger data.
# NOTE(review): the wildcard import hides which names are needed; the cells in
# view use StructType, StructField, IntegerType and StringType.
from pyspark.sql.types import *

In [23]:
# Reference point: the schema Spark infers for the header-less file, before
# defining the same structure by hand.
spark.read.csv("bookcontentsNoHeader.csv", inferSchema=True).printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: integer (nullable = true)



In [24]:
# Define schema: one StructField per CSV column, in file order, built from
# (column name, Spark type) pairs.
columns = [
    StructField(field_name, field_type())
    for field_name, field_type in (
        ("Chapter", IntegerType),
        ("Name", StringType),
        ("Page", IntegerType),
    )
]
csvSchema = StructType(columns)

In [25]:
# Read the header-less file with the hand-written schema instead of inferring one.
manualSchemaDF = spark.read.csv("bookcontentsNoHeader.csv", schema=csvSchema)

In [26]:
# Column names and types now come from csvSchema rather than from inference.
manualSchemaDF.printSchema()

root
 |-- Chapter: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Page: integer (nullable = true)



In [27]:
# Display the rows read with the manually defined schema.
manualSchemaDF.show()

+-------+--------------------+----+
|Chapter|                Name|Page|
+-------+--------------------+----+
|      1|        Introduction|  11|
|      2|Basic Engineering...|  19|
|      3|Advanced Engineer...|  28|
|      4|     Hands On Course|  60|
|      5|        Case Studies|  62|
|      6|Best Practices Cl...|  73|
|      7|130+ Data Sources...|  77|
|      8|1001 Interview Qu...|  82|
|      9|Recommended Books...|  87|
+-------+--------------------+----+

