In [1]:
import os
# Environment variables consulted by PySpark when it starts up.
# setdefault() only fills in a value when the variable is not already set, so a
# Spark installation configured outside the notebook is respected and the
# machine-specific values below act purely as local fallbacks.
os.environ.setdefault('SPARK_HOME', r'C:\Users\Marcos\Documents\Spark')  # local install path; adjust per machine
os.environ.setdefault('PYSPARK_DRIVER_PYTHON', 'jupyter')
os.environ.setdefault('PYSPARK_DRIVER_PYTHON_OPTS', 'lab')
os.environ.setdefault('PYSPARK_PYTHON', 'python')

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType

In [5]:
# Obtain the SparkSession for this notebook; getOrCreate() hands back an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Create-DataFrame')
    .getOrCreate()
)

Read CSV with no schema

In [6]:
csv_path = 'data/walmart-sales-dataset-of-45stores.csv'
# No schema is supplied and inference is off, so every column is loaded as a
# string; the first line of the file is used for the column names.
df = spark.read.options(header=True).csv(csv_path)

In [7]:
# Print the column names and types Spark assigned when no schema was given.
df.printSchema()

root
 |-- Store: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Weekly_Sales: string (nullable = true)
 |-- Holiday_Flag: string (nullable = true)
 |-- Temperature: string (nullable = true)
 |-- Fuel_Price: string (nullable = true)
 |-- CPI: string (nullable = true)
 |-- Unemployment: string (nullable = true)



In [9]:
# Preview the first 5 rows of the string-typed DataFrame.
df.show(n=5)

+-----+----------+------------+------------+-----------+----------+-----------+------------+
|Store|      Date|Weekly_Sales|Holiday_Flag|Temperature|Fuel_Price|        CPI|Unemployment|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
|    1|05-02-2010|   1643690.9|           0|      42.31|     2.572|211.0963582|       8.106|
|    1|12-02-2010|  1641957.44|           1|      38.51|     2.548|211.2421698|       8.106|
|    1|19-02-2010|  1611968.17|           0|      39.93|     2.514|211.2891429|       8.106|
|    1|26-02-2010|  1409727.59|           0|      46.63|     2.561|211.3196429|       8.106|
|    1|05-03-2010|  1554806.68|           0|       46.5|     2.625|211.3501429|       8.106|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
only showing top 5 rows



Read CSV with an explicitly defined schema

In [20]:
# Explicit schema for the Walmart sales CSV: one nullable field per column,
# listed in file order. 'Date' is kept as a plain string.
schema = (
    StructType()
    .add('Store', IntegerType(), True)
    .add('Date', StringType(), True)
    .add('Weekly_Sales', DoubleType(), True)
    .add('Holiday_Flag', IntegerType(), True)
    .add('Temperature', DoubleType(), True)
    .add('Fuel_Price', DoubleType(), True)
    .add('CPI', DoubleType(), True)
    .add('Unemployment', DoubleType(), True)
)

In [21]:
# Re-read the same CSV, this time applying the hand-written schema so the
# columns get their declared types instead of defaulting to string.
df = spark.read.schema(schema).option('header', True).csv(csv_path)

In [22]:
# Confirm the DataFrame now carries the types declared in the explicit schema.
df.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Weekly_Sales: double (nullable = true)
 |-- Holiday_Flag: integer (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- CPI: double (nullable = true)
 |-- Unemployment: double (nullable = true)



In [23]:
# Preview the first 5 rows read with the explicit schema.
df.show(n=5)

+-----+----------+------------+------------+-----------+----------+-----------+------------+
|Store|      Date|Weekly_Sales|Holiday_Flag|Temperature|Fuel_Price|        CPI|Unemployment|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
|    1|05-02-2010|   1643690.9|           0|      42.31|     2.572|211.0963582|       8.106|
|    1|12-02-2010|  1641957.44|           1|      38.51|     2.548|211.2421698|       8.106|
|    1|19-02-2010|  1611968.17|           0|      39.93|     2.514|211.2891429|       8.106|
|    1|26-02-2010|  1409727.59|           0|      46.63|     2.561|211.3196429|       8.106|
|    1|05-03-2010|  1554806.68|           0|       46.5|     2.625|211.3501429|       8.106|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
only showing top 5 rows



Read CSV with inferSchema

In [24]:
# Let Spark work out the column types from the data itself rather than
# declaring them by hand.
df = spark.read.options(header=True, inferSchema=True).csv(csv_path)

In [25]:
# Print the column types Spark inferred from the CSV contents.
df.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Date: string (nullable = true)
 |-- Weekly_Sales: double (nullable = true)
 |-- Holiday_Flag: integer (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- CPI: double (nullable = true)
 |-- Unemployment: double (nullable = true)



In [26]:
# Preview the first 5 rows read with inferred types.
df.show(n=5)

+-----+----------+------------+------------+-----------+----------+-----------+------------+
|Store|      Date|Weekly_Sales|Holiday_Flag|Temperature|Fuel_Price|        CPI|Unemployment|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
|    1|05-02-2010|   1643690.9|           0|      42.31|     2.572|211.0963582|       8.106|
|    1|12-02-2010|  1641957.44|           1|      38.51|     2.548|211.2421698|       8.106|
|    1|19-02-2010|  1611968.17|           0|      39.93|     2.514|211.2891429|       8.106|
|    1|26-02-2010|  1409727.59|           0|      46.63|     2.561|211.3196429|       8.106|
|    1|05-03-2010|  1554806.68|           0|       46.5|     2.625|211.3501429|       8.106|
+-----+----------+------------+------------+-----------+----------+-----------+------------+
only showing top 5 rows



Read JSON

In [43]:
json_path = 'data/store.json'
# Load the JSON file with the reader's default options.
df = spark.read.format('json').load(json_path)

In [44]:
# Inspect the schema derived from the JSON file, then preview the first 5 rows.
df.printSchema()
df.show(n=5)

root
 |-- category: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- quantity: long (nullable = true)

+---------------+---+--------------------+------+--------+
|       category| id|                name| price|quantity|
+---------------+---+--------------------+------+--------+
|    Electronics|  1|           iPhone 12|899.99|      10|
|       Clothing|  2|     Nike Air Max 90|119.99|      25|
|Home Appliances|  3|KitchenAid Stand ...|299.99|       5|
|          Books|  4|    The Great Gatsby| 12.99|      50|
|         Beauty|  5|L'Oreal Paris Mas...|  9.99|     100|
+---------------+---+--------------------+------+--------+
only showing top 5 rows



In [45]:
json_path_mult = 'data/store-multiline.json'
# multiLine lets the reader parse JSON records that span several lines.
df = spark.read.option('multiLine', True).json(json_path_mult)

In [48]:
# Inspect the schema of the multi-line JSON data, then preview the first 5 rows.
df.printSchema()
df.show(n=5)

root
 |-- CPI: double (nullable = true)
 |-- Date: string (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- Holiday_Flag: long (nullable = true)
 |-- Store: long (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Unemployment: double (nullable = true)
 |-- Weekly_Sales: double (nullable = true)

+-----------+----------+----------+------------+-----+-----------+------------+------------+
|        CPI|      Date|Fuel_Price|Holiday_Flag|Store|Temperature|Unemployment|Weekly_Sales|
+-----------+----------+----------+------------+-----+-----------+------------+------------+
|211.0963582|05-02-2010|     2.572|           0|    1|      42.31|       8.106|   1643690.9|
|211.2421698|12-02-2010|     2.548|           1|    1|      38.51|       8.106|  1641957.44|
|211.2891429|19-02-2010|     2.514|           0|    1|      39.93|       8.106|  1611968.17|
|211.3196429|26-02-2010|     2.561|           0|    1|      46.63|       8.106|  1409727.59|
|211.3501429|05-03-201

Parquet data

In [None]:
parquet_path = 'data/store.parquet'
# Write the current DataFrame out as Parquet. The writer's default save mode
# raises an error when the target path already exists, which made this cell
# fail on every re-run; 'overwrite' replaces the previous output instead.
df.write.mode('overwrite').parquet(parquet_path)

In [None]:
# Read the Parquet output back into a DataFrame, then inspect its schema and
# its first 5 rows.
df = spark.read.format('parquet').load(parquet_path)
df.printSchema()
df.show(n=5)

In [53]:
# Stop the SparkSession; `spark` cannot be used for further reads after this.
spark.stop()