In [1]:
# Ячейка 1: Импортируем необходимые модули
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Cell 2: Obtain the SparkSession for this analysis via the builder,
# registering the application under the name "ProductCategoryAnalysis".
spark = (
    SparkSession.builder
    .appName("ProductCategoryAnalysis")
    .getOrCreate()
)

# Cell 3: Load the three CSV inputs. Every file is read with the same
# options: the first row is treated as a header and column types are
# inferred from the data.
csv_read_options = {"header": True, "inferSchema": True}
products_df = spark.read.csv("../data/products.csv", **csv_read_options)
categories_df = spark.read.csv("../data/categories.csv", **csv_read_options)
product_category_df = spark.read.csv(
    "../data/product_category.csv", **csv_read_options
)

# Cell 4: Preview each loaded DataFrame, printing its label first.
for label, frame in (
    ("Products DataFrame:", products_df),
    ("Categories DataFrame:", categories_df),
    ("Product-Category DataFrame:", product_category_df),
):
    print(label)
    frame.show()

# Cell 5: Attach categories to products.
# Step 1: enrich every product-category link row with its category columns
# (left join, so link rows are kept even if no category row matches).
links_with_categories_df = product_category_df.join(
    categories_df, on="category_id", how="left"
)
# Step 2: right-join onto products so that every product row is kept, even
# when it has no link row; its category columns are then null.
product_category_full_df = links_with_categories_df.join(
    products_df, on="product_id", how="right"
)

# Cell 6: List every "product name - category name" pair by projecting the
# joined DataFrame down to those two columns.
pair_columns = ["product_name", "category_name"]
product_category_pairs_df = product_category_full_df.select(*pair_columns)
print("Product-Category Pairs:")
product_category_pairs_df.show()

# Cell 7: Find products that have no category.
# Keep the joined rows whose category_id is null (products the right join
# retained without a matching link row), then reduce to unique product names.
category_is_missing = col("category_id").isNull()
products_without_categories_df = (
    product_category_full_df
    .filter(category_is_missing)
    .select("product_name")
    .distinct()
)
print("Products without Categories:")
products_without_categories_df.show()

# Cell 8: Shut down the SparkSession now that all results have been shown.
spark.stop()

24/07/23 15:40:50 WARN Utils: Your hostname, Sergejs-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.132 instead (on interface en0)
24/07/23 15:40:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/23 15:40:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

Products DataFrame:
+----------+------------+
|product_id|product_name|
+----------+------------+
|         1|    ProductA|
|         2|    ProductB|
|         3|    ProductC|
|         4|    ProductD|
+----------+------------+

Categories DataFrame:
+-----------+-------------+
|category_id|category_name|
+-----------+-------------+
|         10|    Category1|
|         11|    Category2|
|         12|    Category3|
+-----------+-------------+

Product-Category DataFrame:
+----------+-----------+
|product_id|category_id|
+----------+-----------+
|         1|         10|
|         1|         11|
|         2|         10|
|         3|         12|
+----------+-----------+

Product-Category Pairs:
+------------+-------------+
|product_name|category_name|
+------------+-------------+
|    ProductA|    Category2|
|    ProductA|    Category1|
|    ProductB|    Category1|
|    ProductC|    Category3|
|    ProductD|         NULL|
+------------+-------------+

Products without Categories:
+-------