In [62]:
# Set the PySpark environment variables
import os
import sys
os.environ['SPARK_HOME'] = r"C:\_dev\spark-3.5.1-hadoop3"
# os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
# os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = sys.executable

In [56]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("Create-DataFrame").getOrCreate()

### Read CSV file into DataFrame

In [59]:
%%bash 
head -10 D:\pyspark-tutorial\data\products.csv

Couldn't find program: 'bash'


#### Read CSV with header

In [60]:
# Read CSV file into DataFrame
csv_file_path = r"D:\pyspark-tutorial\data\products.csv"
df = spark.read.csv(csv_file_path, header=True)

In [61]:
# Display schema of DataFrame, all types are inferred and converted to String, which is not true
df.printSchema()

# Display content of DataFrame
df.show(5)

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- price: string (nullable = true)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



#### Read CSV with an explicit schema definition

In [63]:
# import necessary types
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [64]:
# Define the schema, with correct type
schema = StructType([
    StructField(name="id", dataType=IntegerType(), nullable=True),
    StructField(name="name", dataType=StringType(), nullable=True),
    StructField(name="category", dataType=StringType(), nullable=True),
    StructField(name="quantity", dataType=IntegerType(), nullable=True),
    StructField(name="price", dataType=DoubleType(), nullable=True)
])

In [65]:
# Read CSV file into DataFrame with explicit schema definition
csv_file_path = r"D:\pyspark-tutorial\data\products.csv"
df = spark.read.csv(csv_file_path, header=True, schema=schema)

In [66]:
# Display schema of DataFrame
df.printSchema()

# Display content of DataFrame
df.show(5)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



#### Read CSV with inferSchema

In [67]:
# Read CSV file into DataFrame with inferSchema
csv_file_path = r"D:\pyspark-tutorial\data\products.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

In [68]:
# Display schema of DataFrame
df.printSchema()

# Display content of DataFrame
df.show(5)

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)

+---+--------------------+---------------+--------+------+
| id|                name|       category|quantity| price|
+---+--------------------+---------------+--------+------+
|  1|           iPhone 12|    Electronics|      10|899.99|
|  2|     Nike Air Max 90|       Clothing|      25|119.99|
|  3|KitchenAid Stand ...|Home Appliances|       5|299.99|
|  4|    The Great Gatsby|          Books|      50| 12.99|
|  5|L'Oreal Paris Mas...|         Beauty|     100|  9.99|
+---+--------------------+---------------+--------+------+
only showing top 5 rows



### Read JSON file into DataFrame

#### Single Line JSON

In [69]:
%%bash
head -10 D:\pyspark-tutorial\data\products_singleline.json

Couldn't find program: 'bash'


In [70]:
# Read single line JSON
# Each row is a JSON record, records are separated by new line
json_file_path = r"D:\pyspark-tutorial\data\products_singleline.json"
df = spark.read.json(json_file_path)

In [71]:
# Display schema of DataFrame
df.printSchema()

# Display content of DataFrame
df.show(5)

root
 |-- category: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: long (nullable = true)

+---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
+---------------+---+--------------------+------+--------+
only showing top 5 rows



#### Multi-lines JSON

In [83]:
%%bash
head -20 D:\pyspark-tutorial\data\products_multiline.json

UsageError: Line magic function `%%spark-shell` not found.


In [74]:
# Read multi-line JSON
# JSON is an array of record, records are separated by a comma.
# each record is defined in multiple lines
json_file_path = r"D:\pyspark-tutorial\data\products_multiline.json"
df = spark.read.json(json_file_path, multiLine=True)

In [75]:
# Display schema of DataFrame
df.printSchema()

# Display content of DataFrame
df.show(5)

root
 |-- category: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: long (nullable = true)

+---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
+---------------+---+--------------------+------+--------+
only showing top 5 rows



In [78]:
# write dataframe into parquet file, what is parquet? delete it first, otherwise it will fail
parquet_file_path = r"D:\pyspark-tutorial\data\products.parquet"
df.write.parquet(parquet_file_path)

### Read parquet file into DataFrame

In [79]:
df = spark.read.parquet(parquet_file_path)

In [84]:
# Display schema of DataFrame
df.printSchema()

# Display content of DataFrame
df.show(5)

root
 |-- category: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: long (nullable = true)

+---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
+---------------+---+--------------------+------+--------+
only showing top 5 rows



In [85]:
spark.stop()