In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, sum, avg, count, when, lit


In [2]:
# Build the SparkSession used by every later cell; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("First App")
    .getOrCreate()
)

# Report which Spark release this kernel is bound to.
print("Spark version:", spark.version)

Spark version: 3.3.4


In [3]:
# Evaluate the session object as the cell's last expression so the notebook renders it.
spark


In [5]:
# Load the 2019 CSV with the reader's default options: no header row is
# consumed and no schema is supplied, so columns come back auto-named
# (_c0, _c1, ...) and typed as strings.
df = spark.read.format("csv").load("../data/2019.csv")

In [6]:
# Print the first 5 rows as a text table to sanity-check the load.
df.show(5)

+-------+---+----------+--------------+--------------------+--------------------+---+--------+--------+
|    _c0|_c1|       _c2|           _c3|                 _c4|                 _c5|_c6|     _c7|     _c8|
+-------+---+----------+--------------+--------------------+--------------------+---+--------+--------+
|SO43701|  1|2019-07-01|   Christy Zhu|christy12@adventu...|Mountain-100 Silv...|  1| 3399.99|271.9992|
|SO43704|  1|2019-07-01|    Julio Ruiz|julio1@adventure-...|Mountain-100 Blac...|  1| 3374.99|269.9992|
|SO43705|  1|2019-07-01|     Curtis Lu|curtis9@adventure...|Mountain-100 Silv...|  1| 3399.99|271.9992|
|SO43700|  1|2019-07-01|  Ruben Prasad|ruben10@adventure...|  Road-650 Black, 62|  1|699.0982| 55.9279|
|SO43703|  1|2019-07-01|Albert Alvarez|albert7@adventure...|    Road-150 Red, 62|  1| 3578.27|286.2616|
+-------+---+----------+--------------+--------------------+--------------------+---+--------+--------+
only showing top 5 rows



In [7]:
# Confirm that the reader returned a Spark DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [8]:
# Show column names and types; with no header/schema given at read time,
# every column is an auto-named string.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)



In [9]:
# The customer file has a header row: tell the reader to use it for the
# column names rather than treating it as data.
df_customers = spark.read.option("header", True).csv("../data/customer.csv")

# Preview the first 5 customers.
df_customers.show(5)

+-----------+-----------+--------------------+---------+-------------+-----------------+---+------+---------------+--------------------+
|customer_id|       name|               email|  country|customer_type|registration_date|age|gender|total_purchases| ingestion_timestamp|
+-----------+-----------+--------------------+---------+-------------+-----------------+---+------+---------------+--------------------+
|         12|Customer 12|customer12@exampl...|Australia|          VIP|       2019-06-16| 65| Other|            758|2024-09-26 18:39:...|
|         38|Customer 38|customer38@exampl...|    India|          VIP|       2021-10-01| 38| Other|             82|2024-09-26 18:39:...|
|         46|Customer 46|customer46@exampl...|   France|          VIP|       2023-03-20| 78| Other|            314|2024-09-26 18:39:...|
|         57|Customer 57|customer57@exampl...|    China|          VIP|       2010-12-08| 41| Other|             10|2024-09-26 18:39:...|
|         76|Customer 76|customer76@examp

In [10]:
# Inspect the customer schema. Only header=True was passed at read time, so
# names come from the file but every column (including age and
# total_purchases) is still a string.
df_customers.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- country: string (nullable = true)
 |-- customer_type: string (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- total_purchases: string (nullable = true)
 |-- ingestion_timestamp: string (nullable = true)



In [11]:
# Confirm that the customer data is also a Spark DataFrame.
type(df_customers)

pyspark.sql.dataframe.DataFrame

In [12]:
# Load the product catalogue from JSON; the schema is derived from the data
# since none is supplied here.
df_product = spark.read.format("json").load("../data/product.json")

# Preview the first 5 products.
df_product.show(5)

+------------+-----------+--------------------+---------+-----------+------+----------+------+--------------+
|       brand|   category| ingestion_timestamp|is_active|       name| price|product_id|rating|stock_quantity|
+------------+-----------+--------------------+---------+-----------+------+----------+------+--------------+
|  BeautyGlow|      Books|2024-09-26T18:41:...|    false| Product 20|515.79|        20|   1.6|           726|
|    SportMax|Electronics|2024-09-26T18:41:...|    false| Product 32|408.99|        32|   1.6|           801|
|    BookWorm|       Toys|2024-09-26T18:41:...|    false|Product 109|621.25|       109|   1.6|           455|
|GardenMaster|       Home|2024-09-26T18:41:...|    false|Product 148|545.57|       148|   1.6|           627|
|    FashionX| Automotive|2024-09-26T18:41:...|    false|Product 272|202.73|       272|   1.6|           135|
+------------+-----------+--------------------+---------+-----------+------+----------+------+--------------+
only showi

In [13]:
# Inspect the product schema; unlike the CSV loads, the JSON read yields
# typed columns (boolean, double, long) alongside strings.
df_product.printSchema()

root
 |-- brand: string (nullable = true)
 |-- category: string (nullable = true)
 |-- ingestion_timestamp: string (nullable = true)
 |-- is_active: boolean (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- product_id: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- stock_quantity: long (nullable = true)



In [14]:
# Load the transaction records from the snappy-compressed parquet file.
df_transactions = spark.read.format("parquet").load("../data/transactions.snappy.parquet")

# Preview the first 5 transactions.
df_transactions.show(5)

+--------------+-----------+----------+--------+------------+----------------+--------------+----------+--------------------+
|transaction_id|customer_id|product_id|quantity|total_amount|transaction_date|payment_method|store_type| ingestion_timestamp|
+--------------+-----------+----------+--------+------------+----------------+--------------+----------+--------------------+
|     TRX000063|        234|        67|       2|      550.83|      2021-09-12| Bank Transfer|    Online|2024-09-26 21:41:...|
|     TRX000115|         58|       475|       2|      299.56|      2022-07-31| Bank Transfer|    Online|2024-09-26 21:41:...|
|     TRX000126|         29|       609|       2|      706.21|      2021-12-02| Bank Transfer|    Online|2024-09-26 21:41:...|
|     TRX000144|        122|       202|       2|      446.44|      2022-09-24| Bank Transfer|    Online|2024-09-26 21:41:...|
|     TRX000311|        378|       719|       2|      945.18|      2020-02-19| Bank Transfer|    Online|2024-09-26 21:

In [15]:
# Inspect the transaction schema. Note that transaction_date is reported as
# a string while ingestion_timestamp is a real timestamp.
df_transactions.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- transaction_date: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- store_type: string (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)



In [16]:
# Confirm that the parquet read also produced a Spark DataFrame.
type(df_transactions)

pyspark.sql.dataframe.DataFrame