# *PySpark setup*

In [1]:
! pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, max, avg, count, rank, to_date, round

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=95a50679eddf5a374a7851b5d4cec6ed82c17b6a3d2f007a01cbcc6b49a01fc6
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


# **E-commerce Transactions**

In [3]:
# creating session
spark = SparkSession.builder.appName("E-commerce Transactions").getOrCreate()

# loading data
data = '/content/drive/MyDrive/DataEngineering/PysparkCodingAssessment/EcommerceData.csv'
ecommerce_df = spark.read.csv(data, header=True, inferSchema=True)
ecommerce_df.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|             1|        101|      Laptop|   Electronics| 1000|       1|                 10|      2023-08-01|
|             2|        102|  Smartphone|   Electronics|  700|       2|                  5|      2023-08-01|
|             3|        103|       Shirt|       Fashion|   40|       3|                  0|      2023-08-02|
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|
|             5|        101|  Headphones|   Electronics|  100|       2|                 10|      2023-08-03|
|             6|        105|       Shoes|       Fashion|   60|       1|                 20|      2023-08-04|
|             7|   

In [6]:
# 1. Calculate the Total Revenue per Category
total_revenue_by_category = ecommerce_df.groupBy("category").agg(sum(col("price")*col("quantity")).alias("total_revenue"))
total_revenue_by_category.show()

+--------------+-------------+
|      category|total_revenue|
+--------------+-------------+
|       Fashion|          180|
|   Electronics|         3200|
|         Books|           80|
|Home Appliance|          980|
+--------------+-------------+



In [7]:
# 2. Filter Transactions with a Discount Greater Than 10%
discounted_transactions = ecommerce_df.filter(col("discount_percentage") > 10)
discounted_transactions.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|
|             6|        105|       Shoes|       Fashion|   60|       1|                 20|      2023-08-04|
|             7|        106|Refrigerator|Home Appliance|  800|       1|                 25|      2023-08-05|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+



In [10]:
# 3. Find the Most Expensive Product Sold
most_expensive_product = ecommerce_df.orderBy(col("price").desc()).limit(1)
most_expensive_product.show()

+--------------+-----------+-------+-----------+-----+--------+-------------------+----------------+
|transaction_id|customer_id|product|   category|price|quantity|discount_percentage|transaction_date|
+--------------+-----------+-------+-----------+-----+--------+-------------------+----------------+
|             1|        101| Laptop|Electronics| 1000|       1|                 10|      2023-08-01|
+--------------+-----------+-------+-----------+-----+--------+-------------------+----------------+



In [11]:
# 4. Calculate the Average Quantity of Products Sold per Category
avg_quantity_by_category = ecommerce_df.groupBy("category").agg(avg("quantity").alias("avg_quantity"))
avg_quantity_by_category.show()

+--------------+------------+
|      category|avg_quantity|
+--------------+------------+
|       Fashion|         2.0|
|   Electronics|        1.75|
|         Books|         4.0|
|Home Appliance|         1.0|
+--------------+------------+



In [13]:
# 5. Identify Customers Who Purchased More Than One Product
customers_with_multiple_products = ecommerce_df.groupBy("customer_id").agg(count("*").alias("total_products")).filter(col("total_products") > 1)
customers_with_multiple_products.show()

+-----------+--------------+
|customer_id|total_products|
+-----------+--------------+
|        101|             2|
|        102|             2|
+-----------+--------------+



In [18]:
# 6. Find the Top 3 Highest Revenue Transactions
from pyspark.sql.window import Window

ecommerce_df = ecommerce_df.withColumn("transaction_date", to_date(col("transaction_date"), "yyyy-MM-dd"))
window = Window.orderBy(col("final_price").desc())

highest_revenue_transactions = ecommerce_df.withColumn("final_price", col("price")*col('quantity') - (col("price") * col("discount_percentage") / 100)) \
    .withColumn("rank", rank().over(window)) \
    .filter(col("rank") <= 3)
highest_revenue_transactions.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-----------+----+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|final_price|rank|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-----------+----+
|             2|        102|  Smartphone|   Electronics|  700|       2|                  5|      2023-08-01|     1365.0|   1|
|             1|        101|      Laptop|   Electronics| 1000|       1|                 10|      2023-08-01|      900.0|   2|
|             7|        106|Refrigerator|Home Appliance|  800|       1|                 25|      2023-08-05|      600.0|   3|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-----------+----+



In [19]:
# 7. Calculate the Total Number of Transactions per Day
total_transactions_per_day = ecommerce_df.groupBy("transaction_date").agg(count("*").alias("total_transactions"))
total_transactions_per_day.show()

+----------------+------------------+
|transaction_date|total_transactions|
+----------------+------------------+
|      2023-08-03|                 2|
|      2023-08-06|                 2|
|      2023-08-01|                 2|
|      2023-08-05|                 2|
|      2023-08-04|                 1|
|      2023-08-02|                 1|
+----------------+------------------+



In [23]:
# 8. Find the Customer Who Spent the Most Money
total_spending_by_customer = ecommerce_df.groupBy("customer_id") \
                          .agg(sum(col("price")*col('quantity') - (col("price") * col("discount_percentage") / 100)).alias("total_spending")) \
                          .orderBy(col("total_spending").desc()).limit(1)
total_spending_by_customer.show()

+-----------+--------------+
|customer_id|total_spending|
+-----------+--------------+
|        102|        1935.0|
+-----------+--------------+



In [24]:
# 9. Calculate the Average Discount Given per Product Category
avg_discount_by_category = ecommerce_df.groupBy("category").agg(avg("discount_percentage").alias("avg_discount"))
avg_discount_by_category.show()

+--------------+------------+
|      category|avg_discount|
+--------------+------------+
|       Fashion|        10.0|
|   Electronics|        8.75|
|         Books|         0.0|
|Home Appliance|        15.0|
+--------------+------------+



In [25]:
# 10. Create a New Column for Final Price After Discount
ecommerce_df = ecommerce_df.withColumn("final_price", col("price")*col('quantity') - (col("price") * col("discount_percentage") / 100))
ecommerce_df.show()

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-----------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|final_price|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+-----------+
|             1|        101|      Laptop|   Electronics| 1000|       1|                 10|      2023-08-01|      900.0|
|             2|        102|  Smartphone|   Electronics|  700|       2|                  5|      2023-08-01|     1365.0|
|             3|        103|       Shirt|       Fashion|   40|       3|                  0|      2023-08-02|      120.0|
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|      127.5|
|             5|        101|  Headphones|   Electronics|  100|       2|                 10|      2023-08-03|      190.0|
|             6|        105|    

# **Banking Transactions**

In [26]:
spark = SparkSession.builder.appName("Banking Transactions").getOrCreate()

data = '/content/drive/MyDrive/DataEngineering/PysparkCodingAssessment/BankingData.csv'
banking_df = spark.read.csv(data, header=True, inferSchema=True)
banking_df.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             2|        202|      Withdrawal|  2000|      2023-09-01|
|             3|        203|         Deposit|  3000|      2023-09-02|
|             4|        201|      Withdrawal|  1500|      2023-09-02|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             6|        205|      Withdrawal|   500|      2023-09-03|
|             7|        202|         Deposit|  2500|      2023-09-04|
|             8|        206|      Withdrawal|   700|      2023-09-04|
|             9|        203|         Deposit|  4000|      2023-09-05|
|            10|        204|      Withdrawal|  3000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+



In [28]:
# 1. Calculate the Total Deposit and Withdrawal Amounts
total_amounts_by_type = banking_df.groupBy("transaction_type").agg(sum("amount").alias("total_amount"))
total_amounts_by_type.show()

+----------------+------------+
|transaction_type|total_amount|
+----------------+------------+
|         Deposit|       24500|
|      Withdrawal|        7700|
+----------------+------------+



In [29]:
# 2. Filter Transactions Greater Than $3,000
large_transactions = banking_df.filter(col("amount") > 3000)
large_transactions.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             5|        204|         Deposit| 10000|      2023-09-03|
|             9|        203|         Deposit|  4000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+



In [32]:
# 3. Find the Largest Deposit Made
largest_deposit = banking_df.filter(col("transaction_type") == "Deposit").orderBy(col("amount").desc()).limit(1)
largest_deposit.show()

+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             5|        204|         Deposit| 10000|      2023-09-03|
+--------------+-----------+----------------+------+----------------+



In [33]:
# 4. Calculate the Average Transaction Amount for Each Transaction Type
avg_amount_by_type = banking_df.groupBy("transaction_type").agg(avg("amount").alias("avg_amount"))
avg_amount_by_type.show()

+----------------+----------+
|transaction_type|avg_amount|
+----------------+----------+
|         Deposit|    4900.0|
|      Withdrawal|    1540.0|
+----------------+----------+



In [34]:
# 5. Find Customers Who Made Both Deposits and Withdrawals
from pyspark.sql.functions import countDistinct, when

customers_with_both = banking_df.groupBy("customer_id").agg(
    countDistinct(when(col("transaction_type") == "Deposit", 1)).alias("deposit_count"),
    countDistinct(when(col("transaction_type") == "Withdrawal", 1)).alias("withdrawal_count")
).filter((col("deposit_count") > 0) & (col("withdrawal_count") > 0))

customers_with_both.show()

+-----------+-------------+----------------+
|customer_id|deposit_count|withdrawal_count|
+-----------+-------------+----------------+
|        202|            1|               1|
|        204|            1|               1|
|        201|            1|               1|
+-----------+-------------+----------------+



In [35]:
# 6. Calculate the Total Amount of Transactions per Day
banking_df = banking_df.withColumn("transaction_date", to_date(col("transaction_date"), "yyyy-MM-dd"))

total_amount_per_day = banking_df.groupBy("transaction_date").agg(sum("amount").alias("total_amount"))
total_amount_per_day.show()

+----------------+------------+
|transaction_date|total_amount|
+----------------+------------+
|      2023-09-03|       10500|
|      2023-09-01|        7000|
|      2023-09-05|        7000|
|      2023-09-02|        4500|
|      2023-09-04|        3200|
+----------------+------------+



In [37]:
# 7. Find the Customer with the Highest Total Withdrawal
highest_withdrawal_customer = banking_df.filter(col("transaction_type") == "Withdrawal") \
              .groupBy("customer_id").agg(sum("amount").alias("total_withdrawal")) \
              .orderBy(col("total_withdrawal").desc()).limit(1)
highest_withdrawal_customer.show()

+-----------+----------------+
|customer_id|total_withdrawal|
+-----------+----------------+
|        204|            3000|
+-----------+----------------+



In [38]:
# 8. Calculate the Number of Transactions for Each Customer
transaction_count_by_customer = banking_df.groupBy("customer_id").agg(count("*").alias("transaction_count"))
transaction_count_by_customer.show()

+-----------+-----------------+
|customer_id|transaction_count|
+-----------+-----------------+
|        206|                1|
|        205|                1|
|        202|                2|
|        203|                2|
|        204|                2|
|        201|                2|
+-----------+-----------------+



In [53]:
# 9. Find All Transactions That Occurred on the Same Day as a Withdrawal Greater Than $1,000

filtered_withdrawals = banking_df.filter(col("transaction_type") == "Withdrawal") \
    .filter(col("amount") > 1000)

same_day_withdrawals = banking_df.join(filtered_withdrawals, banking_df["transaction_date"] == filtered_withdrawals["transaction_date"], "inner")\
    .select(banking_df["*"])
same_day_withdrawals.show()



+--------------+-----------+----------------+------+----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|
+--------------+-----------+----------------+------+----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|
|             2|        202|      Withdrawal|  2000|      2023-09-01|
|             3|        203|         Deposit|  3000|      2023-09-02|
|             4|        201|      Withdrawal|  1500|      2023-09-02|
|             9|        203|         Deposit|  4000|      2023-09-05|
|            10|        204|      Withdrawal|  3000|      2023-09-05|
+--------------+-----------+----------------+------+----------------+



In [55]:
# 10. Create a New Column to Classify Transactions as "High" or "Low" Value
banking_df = banking_df.withColumn("transaction_value", when(col("amount") > 5000, "High").otherwise("Low"))
banking_df.show()

+--------------+-----------+----------------+------+----------------+-----------------+
|transaction_id|customer_id|transaction_type|amount|transaction_date|transaction_value|
+--------------+-----------+----------------+------+----------------+-----------------+
|             1|        201|         Deposit|  5000|      2023-09-01|              Low|
|             2|        202|      Withdrawal|  2000|      2023-09-01|              Low|
|             3|        203|         Deposit|  3000|      2023-09-02|              Low|
|             4|        201|      Withdrawal|  1500|      2023-09-02|              Low|
|             5|        204|         Deposit| 10000|      2023-09-03|             High|
|             6|        205|      Withdrawal|   500|      2023-09-03|              Low|
|             7|        202|         Deposit|  2500|      2023-09-04|              Low|
|             8|        206|      Withdrawal|   700|      2023-09-04|              Low|
|             9|        203|    

# **Health & Fitness Tracker Data**

In [56]:
spark = SparkSession.builder.appName("Health & Fitness Tracker Data").getOrCreate()

data = '/content/drive/MyDrive/DataEngineering/PysparkCodingAssessment/HealthandFitnessData.csv'
fitness_df = spark.read.csv(data, header=True, inferSchema=True)
fitness_df.show()

+-------+----------+-----+---------------+--------------+------------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|
+-------+----------+-----+---------------+--------------+------------+
|      1|2023-09-01|12000|            500|           7.0|      Cardio|
|      2|2023-09-01| 8000|            400|           6.5|    Strength|
|      3|2023-09-01|15000|            650|           8.0|        Yoga|
|      1|2023-09-02|10000|            450|           6.0|      Cardio|
|      2|2023-09-02| 9500|            500|           7.0|      Cardio|
|      3|2023-09-02|14000|            600|           7.5|    Strength|
|      1|2023-09-03|13000|            550|           8.0|        Yoga|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|
|      3|2023-09-03|16000|            700|           7.0|      Cardio|
+-------+----------+-----+---------------+--------------+------------+



In [57]:
# 1. Find the Total Steps Taken by Each User
total_steps_by_user = fitness_df.groupBy("user_id").agg(sum("steps").alias("total_steps"))
total_steps_by_user.show()

+-------+-----------+
|user_id|total_steps|
+-------+-----------+
|      1|      35000|
|      3|      45000|
|      2|      29500|
+-------+-----------+



In [59]:
# 2. Filter Days with More Than 10,000 Steps
high_step_days = fitness_df.filter(col("steps") > 10000).select("date","steps")
high_step_days.show()

+----------+-----+
|      date|steps|
+----------+-----+
|2023-09-01|12000|
|2023-09-01|15000|
|2023-09-02|14000|
|2023-09-03|13000|
|2023-09-03|12000|
|2023-09-03|16000|
+----------+-----+



In [61]:
# 3. Calculate the Average Calories Burned by Workout Type
avg_calories_by_workout = fitness_df.groupBy("workout_type").agg(round(avg("calories_burned"),2).alias("avg_calories"))
avg_calories_by_workout.show()

+------------+------------+
|workout_type|avg_calories|
+------------+------------+
|    Strength|       500.0|
|        Yoga|      573.33|
|      Cardio|       537.5|
+------------+------------+



In [64]:
# 4. Identify the Day with the Most Steps for Each User
max_steps_day = fitness_df.groupBy("user_id").agg(max("steps").alias("max_steps"), max("date").alias("max_steps_date"))
max_steps_day.show()

+-------+---------+--------------+
|user_id|max_steps|max_steps_date|
+-------+---------+--------------+
|      1|    13000|    2023-09-03|
|      3|    16000|    2023-09-03|
|      2|    12000|    2023-09-03|
+-------+---------+--------------+



In [65]:
# 5. Find Users Who Burned More Than 600 Calories on Any Day
high_calorie_users = fitness_df.filter(col("calories_burned") > 600).select("user_id").distinct()
high_calorie_users.show()

+-------+
|user_id|
+-------+
|      3|
+-------+



In [68]:
# 6. Calculate the Average Hours of Sleep per User
avg_sleep_by_user = fitness_df.groupBy("user_id").agg(round(avg("hours_of_sleep"),2).alias("avg_sleep"))
avg_sleep_by_user.show()

+-------+---------+
|user_id|avg_sleep|
+-------+---------+
|      1|      7.0|
|      3|      7.5|
|      2|     6.67|
+-------+---------+



In [70]:
# 7. Calculate the Total Calories Burned per Day
total_calories_per_day = fitness_df.groupBy("date").agg(sum("calories_burned").alias("total_calories"))
total_calories_per_day.show()

+----------+--------------+
|      date|total_calories|
+----------+--------------+
|2023-09-03|          1770|
|2023-09-01|          1550|
|2023-09-02|          1550|
+----------+--------------+



In [71]:
# 8. Identify Users Who Did Different Types of Workouts
users_with_multiple_workouts = fitness_df.groupBy("user_id").agg(countDistinct("workout_type").alias("workout_types")) \
    .filter(col("workout_types") > 1)
users_with_multiple_workouts.show()

+-------+-------------+
|user_id|workout_types|
+-------+-------------+
|      1|            2|
|      3|            3|
|      2|            3|
+-------+-------------+



In [72]:
# 9. Calculate the Total Number of Workouts per User
workout_count_by_user = fitness_df.groupBy("user_id").agg(count("*").alias("workout_count"))
workout_count_by_user.show()

+-------+-------------+
|user_id|workout_count|
+-------+-------------+
|      1|            3|
|      3|            3|
|      2|            3|
+-------+-------------+



In [74]:
# 10. Create a New Column for "Active" Days
fitness_df = fitness_df.withColumn("active_day", when(col("steps") > 10000, "Active").otherwise("Inactive"))
fitness_df.show()

+-------+----------+-----+---------------+--------------+------------+----------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|active_day|
+-------+----------+-----+---------------+--------------+------------+----------+
|      1|2023-09-01|12000|            500|           7.0|      Cardio|    Active|
|      2|2023-09-01| 8000|            400|           6.5|    Strength|  Inactive|
|      3|2023-09-01|15000|            650|           8.0|        Yoga|    Active|
|      1|2023-09-02|10000|            450|           6.0|      Cardio|  Inactive|
|      2|2023-09-02| 9500|            500|           7.0|      Cardio|  Inactive|
|      3|2023-09-02|14000|            600|           7.5|    Strength|    Active|
|      1|2023-09-03|13000|            550|           8.0|        Yoga|    Active|
|      2|2023-09-03|12000|            520|           6.5|        Yoga|    Active|
|      3|2023-09-03|16000|            700|           7.0|      Cardio|    Active|
+-------+-------

# **Music Streaming Data**

In [75]:
spark = SparkSession.builder.appName("Music Streaming Data").getOrCreate()

data = '/content/drive/MyDrive/DataEngineering/PysparkCodingAssessment/MusicStreamingData.csv'
music_df = spark.read.csv(data, header=True, inferSchema=True)
music_df.show()

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      3|     Levitating|  Dua Lipa|             180|2023-09-01 10:30:00|     London|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      2|    Galway Girl|Ed Sheeran|             190|2023-09-02 10:00:00|Los Angeles|
|      3|      New Rules|  Dua Lipa|             230|2

In [77]:
# 1. Calculate the Total Listening Time for Each User
total_listening_time_by_user = music_df.groupBy("user_id").agg(sum("duration_seconds").alias("total_listening_time"))
total_listening_time_by_user.show()

+-------+--------------------+
|user_id|total_listening_time|
+-------+--------------------+
|      1|                 630|
|      3|                 610|
|      2|                 680|
+-------+--------------------+



In [78]:
# 2. Filter Songs Streamed for More Than 200 Seconds
long_songs = music_df.filter(col("duration_seconds") > 200)
long_songs.show()

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      3|      New Rules|  Dua Lipa|             230|2023-09-02 11:00:00|     London|
+-------+---------------+----------+----------------+-------------------+-----------+



In [82]:
# 3. Find the Most Popular Artist (by Total Streams)
most_popular_artist = music_df.groupBy("artist").agg(count("*").alias("total_streams")) \
    .orderBy(col("total_streams").desc())
most_popular_artist.show()

+----------+-------------+
|    artist|total_streams|
+----------+-------------+
|  Dua Lipa|            3|
|Ed Sheeran|            3|
|The Weeknd|            3|
+----------+-------------+



In [84]:
# 4. Identify the Song with the Longest Duration
longest_song = music_df.orderBy(col("duration_seconds").desc()).limit(1)
longest_song.show()

+-------+----------+----------+----------------+-------------------+-----------+
|user_id|song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+----------+----------+----------------+-------------------+-----------+
|      2|   Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
+-------+----------+----------+----------------+-------------------+-----------+



In [87]:
# 5. Calculate the Average Song Duration by Artist
avg_duration_by_artist = music_df.groupBy("artist").agg(round(avg("duration_seconds"),2).alias("avg_duration"))
avg_duration_by_artist.show()

+----------+------------+
|    artist|avg_duration|
+----------+------------+
|  Dua Lipa|      203.33|
|Ed Sheeran|      226.67|
|The Weeknd|       210.0|
+----------+------------+



In [90]:
# 6. Find the Top 3 Most Streamed Songs per User
window = Window.partitionBy("user_id").orderBy(col("stream_count").desc())

top_streamed_songs = music_df.groupBy("user_id", "song_title").agg(sum("duration_seconds").alias("stream_count")) \
    .withColumn("rank", rank().over(window)) \
    .filter(col("rank") <= 3)

top_streamed_songs.show()

+-------+---------------+------------+----+
|user_id|     song_title|stream_count|rank|
+-------+---------------+------------+----+
|      1|        Starboy|         220|   1|
|      1|Save Your Tears|         210|   2|
|      1|Blinding Lights|         200|   3|
|      2|        Perfect|         250|   1|
|      2|   Shape of You|         240|   2|
|      2|    Galway Girl|         190|   3|
|      3|      New Rules|         230|   1|
|      3|Don't Start Now|         200|   2|
|      3|     Levitating|         180|   3|
+-------+---------------+------------+----+



In [92]:
# 7. Calculate the Total Number of Streams per Day
music_df = music_df.withColumn("streaming_date", to_date(col("streaming_time"), "yyyy-MM-dd HH:mm:ss"))
total_streams_per_day = music_df.groupBy("streaming_date").agg(count("*").alias("total_streams"))
total_streams_per_day.show()

+--------------+-------------+
|streaming_date|total_streams|
+--------------+-------------+
|    2023-09-01|            5|
|    2023-09-02|            4|
+--------------+-------------+



In [93]:
# 8. Identify Users Who Streamed Songs from More Than One Artist
users_with_multiple_artists = music_df.groupBy("user_id").agg(countDistinct("artist").alias("artists_streamed")) \
    .filter(col("artists_streamed") > 1)
users_with_multiple_artists.show()

+-------+----------------+
|user_id|artists_streamed|
+-------+----------------+
+-------+----------------+



In [94]:
# 9. Calculate the Total Streams for Each Location
total_streams_by_location = music_df.groupBy("location").agg(count("*").alias("total_streams"))
total_streams_by_location.show()

+-----------+-------------+
|   location|total_streams|
+-----------+-------------+
|Los Angeles|            3|
|     London|            3|
|   New York|            3|
+-----------+-------------+



In [95]:
# 10. Create a New Column to Classify Long and Short Songs
music_df = music_df.withColumn("song_length", when(col("duration_seconds") > 200, "Long").otherwise("Short"))
music_df.show()

+-------+---------------+----------+----------------+-------------------+-----------+--------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|streaming_date|song_length|
+-------+---------------+----------+----------------+-------------------+-----------+--------------+-----------+
|      1|Blinding Lights|The Weeknd|             200|2023-09-01 08:15:00|   New York|    2023-09-01|      Short|
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|    2023-09-01|       Long|
|      3|     Levitating|  Dua Lipa|             180|2023-09-01 10:30:00|     London|    2023-09-01|      Short|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|    2023-09-01|       Long|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|    2023-09-01|       Long|
|      3|Don't Start Now|  Dua Lipa|             200|2023-09-02 08:10:00|     London|    2023-09

# **Retail Store Sales Data**

In [96]:
spark = SparkSession.builder.appName("Retail Store Sales Data").getOrCreate()

data = '/content/drive/MyDrive/DataEngineering/PysparkCodingAssessment/RetailStoreSalesData.csv'
retail_df = spark.read.csv(data, header=True, inferSchema=True)
retail_df.show()

+--------------+------------+-----------+-----+--------+----------+
|transaction_id|product_name|   category|price|quantity|sales_date|
+--------------+------------+-----------+-----+--------+----------+
|             1|       Apple|  Groceries|  0.5|      10|2023-09-01|
|             2|     T-shirt|   Clothing| 15.0|       2|2023-09-01|
|             3|    Notebook| Stationery|  2.0|       5|2023-09-02|
|             4|      Banana|  Groceries|  0.3|      12|2023-09-02|
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|
|             6|       Pants|   Clothing| 25.0|       3|2023-09-03|
|             7|  Headphones|Electronics|100.0|       2|2023-09-04|
|             8|         Pen| Stationery|  1.0|      10|2023-09-04|
|             9|      Orange|  Groceries|  0.6|       8|2023-09-05|
|            10|    Sneakers|   Clothing| 50.0|       1|2023-09-05|
+--------------+------------+-----------+-----+--------+----------+



In [97]:
# 1. Calculate the Total Revenue per Category
total_revenue_by_category = retail_df.withColumn("total_sales", col("price") * col("quantity")) \
    .groupBy("category").agg(sum("total_sales").alias("total_revenue"))
total_revenue_by_category.show()

+-----------+------------------+
|   category|     total_revenue|
+-----------+------------------+
| Stationery|              20.0|
|  Groceries|13.399999999999999|
|Electronics|            1000.0|
|   Clothing|             155.0|
+-----------+------------------+



In [98]:
# 2. Filter Transactions Where the Total Sales Amount is Greater Than $100
high_value_transactions = retail_df.filter(col("price") * col("quantity") > 100)
high_value_transactions.show()

+--------------+------------+-----------+-----+--------+----------+
|transaction_id|product_name|   category|price|quantity|sales_date|
+--------------+------------+-----------+-----+--------+----------+
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|
|             7|  Headphones|Electronics|100.0|       2|2023-09-04|
+--------------+------------+-----------+-----+--------+----------+



In [99]:
# 3. Find the Most Sold Product
most_sold_product = retail_df.groupBy("product_name").agg(sum("quantity").alias("total_quantity")) \
    .orderBy(col("total_quantity").desc()).limit(1)
most_sold_product.show()

+------------+--------------+
|product_name|total_quantity|
+------------+--------------+
|      Banana|            12|
+------------+--------------+



In [101]:
# 4. Calculate the Average Price per Product Category
avg_price_by_category = retail_df.groupBy("category").agg(round(avg("price"),2).alias("avg_price"))
avg_price_by_category.show()

+-----------+---------+
|   category|avg_price|
+-----------+---------+
| Stationery|      1.5|
|  Groceries|     0.47|
|Electronics|    450.0|
|   Clothing|     30.0|
+-----------+---------+



In [103]:
# 5. Find the Top 3 Highest Grossing Products
window = Window.orderBy(col("total_revenue").desc())

top_grossing_products = retail_df.withColumn("total_sales", col("price") * col("quantity")) \
    .groupBy("product_name").agg(sum("total_sales").alias("total_revenue")) \
    .withColumn("rank", rank().over(window)) \
    .filter(col("rank") <= 3)
top_grossing_products.show()

+------------+-------------+----+
|product_name|total_revenue|rank|
+------------+-------------+----+
|      Laptop|        800.0|   1|
|  Headphones|        200.0|   2|
|       Pants|         75.0|   3|
+------------+-------------+----+



In [104]:
# 6. Calculate the Total Number of Items Sold per Day
retail_df = retail_df.withColumn("sales_date", to_date(col("sales_date"), "yyyy-MM-dd"))

total_items_sold_per_day = retail_df.groupBy("sales_date").agg(sum("quantity").alias("total_items_sold"))
total_items_sold_per_day.show()

+----------+----------------+
|sales_date|total_items_sold|
+----------+----------------+
|2023-09-03|               4|
|2023-09-01|              12|
|2023-09-05|               9|
|2023-09-02|              17|
|2023-09-04|              12|
+----------+----------------+



In [112]:
# 7. Identify the Product with the Lowest Price in Each Category

lowest_price_product = retail_df.withColumn("min_price", min("price").over(Window.partitionBy("category"))) \
                            .filter(col("price") == col("min_price")) \
                            .select("category", "product_name", "price").distinct()
lowest_price_product.show()

+-----------+------------+-----+
|   category|product_name|price|
+-----------+------------+-----+
|   Clothing|     T-shirt| 15.0|
|Electronics|  Headphones|100.0|
|  Groceries|      Banana|  0.3|
| Stationery|         Pen|  1.0|
+-----------+------------+-----+



In [115]:
# 8. Calculate the Total Revenue for Each Product
total_revenue_by_product = retail_df.groupBy("product_name").agg((sum("price") * sum("quantity")).alias("total_revenue"))
total_revenue_by_product.show()

+------------+------------------+
|product_name|     total_revenue|
+------------+------------------+
|     T-shirt|              30.0|
|    Sneakers|              50.0|
|      Orange|               4.8|
|      Banana|3.5999999999999996|
|         Pen|              10.0|
|       Pants|              75.0|
|      Laptop|             800.0|
|    Notebook|              10.0|
|       Apple|               5.0|
|  Headphones|             200.0|
+------------+------------------+



In [119]:
# 9. Find the Total Sales per Day for Each Category
total_sales_per_day_category = retail_df.groupBy("sales_date", "category").agg((sum("price") * sum("quantity")).alias("total_sales"))\
                                        .orderBy("category","sales_date")
total_sales_per_day_category.show()

+----------+-----------+------------------+
|sales_date|   category|       total_sales|
+----------+-----------+------------------+
|2023-09-01|   Clothing|              30.0|
|2023-09-03|   Clothing|              75.0|
|2023-09-05|   Clothing|              50.0|
|2023-09-03|Electronics|             800.0|
|2023-09-04|Electronics|             200.0|
|2023-09-01|  Groceries|               5.0|
|2023-09-02|  Groceries|3.5999999999999996|
|2023-09-05|  Groceries|               4.8|
|2023-09-02| Stationery|              10.0|
|2023-09-04| Stationery|              10.0|
+----------+-----------+------------------+



In [120]:
# 10. Create a New Column for Discounted Price
retail_df = retail_df.withColumn("discounted_price", col("price") * 0.9)
retail_df.show()

+--------------+------------+-----------+-----+--------+----------+----------------+
|transaction_id|product_name|   category|price|quantity|sales_date|discounted_price|
+--------------+------------+-----------+-----+--------+----------+----------------+
|             1|       Apple|  Groceries|  0.5|      10|2023-09-01|            0.45|
|             2|     T-shirt|   Clothing| 15.0|       2|2023-09-01|            13.5|
|             3|    Notebook| Stationery|  2.0|       5|2023-09-02|             1.8|
|             4|      Banana|  Groceries|  0.3|      12|2023-09-02|            0.27|
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|           720.0|
|             6|       Pants|   Clothing| 25.0|       3|2023-09-03|            22.5|
|             7|  Headphones|Electronics|100.0|       2|2023-09-04|            90.0|
|             8|         Pen| Stationery|  1.0|      10|2023-09-04|             0.9|
|             9|      Orange|  Groceries|  0.6|       8|2023-09-0