In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=7ee74b154a8ac8b198b25b6cc2700536b008a9dea6cedd5653ecb475da78eec5
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [70]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Obtain the Spark session used for the e-commerce exercises.
spark = SparkSession.builder.appName("Ecommerce").getOrCreate()

# Read the transactions CSV: first row is the header, column types are inferred.
ecommerce_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/content/sample_data/ecommerce_data.csv")
)

# Discounted value of one transaction: price * quantity, reduced by
# discount_percentage percent. Shared by tasks 1, 6 and 8.
net_amount = (col("price") * col("quantity")) * (1 - col("discount_percentage") / 100)

# 1. Total revenue per category (sum of discounted transaction values).
total_revenue_df = (
    ecommerce_df
    .withColumn("revenue", net_amount)
    .groupBy("category")
    .agg(F.sum("revenue").alias("total_revenue"))
)
total_revenue_df.show()

# 2. Transactions whose discount is greater than 10%.
high_discount_df = ecommerce_df.where(col("discount_percentage") > 10)
high_discount_df.show()

# 3. Most expensive product sold: the single row with the highest unit price.
expensive_df = ecommerce_df.orderBy(F.desc("price")).limit(1)
expensive_df.show()

# 4. Average quantity sold per category.
avg_quantity_df = (
    ecommerce_df
    .groupBy("category")
    .agg(F.avg("quantity").alias("average_quantity"))
)
avg_quantity_df.show()

# 5. Transactions in which more than one unit was purchased.
high_buy_df = ecommerce_df.where(col("quantity") > 1)
high_buy_df.show()

# 6. Top 3 transactions by discounted revenue.
top_3_highest_df = (
    ecommerce_df
    .withColumn("revenue", net_amount)
    .orderBy(F.desc("revenue"))
    .limit(3)
)
top_3_highest_df.show()

# 7. Number of transactions per transaction_date.
transaction_per_day = (
    ecommerce_df
    .groupBy("transaction_date")
    .agg(F.count("*").alias("transaction_count"))
)
transaction_per_day.show()

# 8. Customer with the largest total discounted spend.
high_customer_df = (
    ecommerce_df
    .withColumn("total_spent", net_amount)
    .groupBy("customer_id")
    .agg(F.sum("total_spent").alias("total_spent"))
    .orderBy(F.desc("total_spent"))
    .limit(1)
)
high_customer_df.show()

# 9. Average discount percentage per category.
avg_discount_df = (
    ecommerce_df
    .groupBy("category")
    .agg(F.avg("discount_percentage").alias("average_discount"))
)
avg_discount_df.show()

# 10. Add final_price: the unit price after the percentage discount is removed.
# Note: this rebinds ecommerce_df to the frame that carries the extra column.
ecommerce_df = ecommerce_df.withColumn(
    "final_price",
    col("price") - (col("price") * col("discount_percentage") / 100),
)
ecommerce_df.show()



+--------------+-------------+
|      category|total_revenue|
+--------------+-------------+
|       Fashion|        168.0|
|   Electronics|       2950.0|
|         Books|         80.0|
|Home Appliance|        756.0|
+--------------+-------------+

+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|transaction_id|customer_id|     product|      category|price|quantity|discount_percentage|transaction_date|
+--------------+-----------+------------+--------------+-----+--------+-------------------+----------------+
|             4|        104|     Blender|Home Appliance|  150|       1|                 15|      2023-08-03|
|             6|        105|       Shoes|       Fashion|   60|       1|                 20|      2023-08-04|
|             7|        106|Refrigerator|Home Appliance|  800|       1|                 25|      2023-08-05|
+--------------+-----------+------------+--------------+-----+--------+-------------------+------

In [71]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Obtain the Spark session used for the banking exercises.
spark = SparkSession.builder.appName("BankingTransaction").getOrCreate()

# Read the bank transactions CSV: header row present, column types inferred.
banking_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/content/sample_data/bank_transaction.csv")
)

# Row predicates reused by several of the tasks below.
is_deposit = F.col("transaction_type") == "Deposit"
is_withdrawal = F.col("transaction_type") == "Withdrawal"

# 1. Total amount per transaction type (deposits vs. withdrawals).
amounts_df = (
    banking_df
    .groupBy("transaction_type")
    .agg(F.sum("amount").alias("total_amount"))
)
amounts_df.show()

# 2. Transactions with an amount greater than 3000.
high_transaction = banking_df.where(F.col("amount") > 3000)
high_transaction.show()

# 3. Largest deposit: the deposit row with the highest amount.
largest_deposit = (
    banking_df
    .where(is_deposit)
    .orderBy(F.desc("amount"))
    .limit(1)
)
largest_deposit.show()

# 4. Average amount per transaction type.
avg_transaction_df = (
    banking_df
    .groupBy("transaction_type")
    .agg(F.avg("amount").alias("average_amount"))
)
avg_transaction_df.show()

# 5. Customers that appear in both the deposit set and the withdrawal set.
deposit_customers = banking_df.where(is_deposit).select("customer_id").distinct()
withdrawal_customers = banking_df.where(is_withdrawal).select("customer_id").distinct()

deposit_customers.intersect(withdrawal_customers).show()

# 6. Total transacted amount per transaction_date.
total_transaction_perday = (
    banking_df
    .groupBy("transaction_date")
    .agg(F.sum("amount").alias("total_amount"))
)
total_transaction_perday.show()

# 7. Customer with the highest summed withdrawal amount.
highest_withdrawal = (
    banking_df
    .where(is_withdrawal)
    .groupBy("customer_id")
    .agg(F.sum("amount").alias("total_withdrawn"))
    .orderBy(F.desc("total_withdrawn"))
    .limit(1)
)

highest_withdrawal.show()

# 8. Number of transactions per customer.
transaction_per_customer = (
    banking_df
    .groupBy("customer_id")
    .agg(F.count("transaction_id").alias("transaction_count"))
)
transaction_per_customer.show()

# 9. Every transaction sharing a transaction_date with a withdrawal above 1000:
#    collect those dates first, then inner-join the full table against them.
withdrawal_dates = (
    banking_df
    .where(is_withdrawal & (F.col("amount") > 1000))
    .select("transaction_date")
    .distinct()
)

banking_df.join(withdrawal_dates, on="transaction_date").show()

# 10. Label each transaction "High" when amount exceeds 5000, otherwise "Low".
# Note: this rebinds banking_df to the frame that carries the extra column.
banking_df = banking_df.withColumn(
    "transaction_value",
    F.when(F.col("amount") > 5000, "High").otherwise("Low"),
)

banking_df.show()




+----------------+------------+
|transaction_type|total_amount|
+----------------+------------+
|         Deposit|       24500|
|      Withdrawal|        7700|
+----------------+------------+

+--------------+-----------+----------------+------+-------------------+
|transaction_id|customer_id|transaction_type|amount|   transaction_date|
+--------------+-----------+----------------+------+-------------------+
|             1|        201|         Deposit|  5000|2023-09-01 00:00:00|
|             5|        204|         Deposit| 10000|2023-09-03 00:00:00|
|             9|        203|         Deposit|  4000|2023-09-05 00:00:00|
+--------------+-----------+----------------+------+-------------------+

+--------------+-----------+----------------+------+-------------------+
|transaction_id|customer_id|transaction_type|amount|   transaction_date|
+--------------+-----------+----------------+------+-------------------+
|             5|        204|         Deposit| 10000|2023-09-03 00:00:00|
+--

In [72]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Obtain the Spark session used for the health & fitness exercises.
spark = SparkSession.builder.appName("HealthFitness").getOrCreate()

# Read the fitness log CSV: header row present, column types inferred.
health_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/content/sample_data/health_fitness.csv")
)

# 1. Total steps per user.
steps_user_df = (
    health_df
    .groupBy("user_id")
    .agg(F.sum("steps").alias("total_steps"))
)
steps_user_df.show()

# 2. (date, user) pairs where more than 10,000 steps were logged.
high_steps = health_df.where(F.col("steps") > 10000).select("date", "user_id")
high_steps.show()

# 3. Average calories burned for each workout type.
avg_calories = (
    health_df
    .groupBy("workout_type")
    .agg(F.avg("calories_burned").alias("avg_calories"))
)

avg_calories.show()

# 4. Day with the most steps per user: compute each user's maximum, join it back
#    to the log, and keep the rows whose step count equals that maximum.
max_user_steps = (
    health_df
    .groupBy("user_id")
    .agg(F.max("steps").alias("max_steps"))
)

max_user_steps_dates = (
    max_user_steps
    .join(health_df, "user_id")
    .where(F.col("steps") == F.col("max_steps"))
    .select("user_id", "date", "steps")
)
max_user_steps_dates.show()

# 5. Log entries where more than 600 calories were burned.
high_calories_burned = health_df.where(F.col("calories_burned") > 600)
high_calories_burned.show()

# 6. Average hours of sleep per user.
avg_sleep_hrs = (
    health_df
    .groupBy("user_id")
    .agg(F.avg("hours_of_sleep").alias("avg_sleep"))
)
avg_sleep_hrs.show()

# 7. Total calories burned per date.
calories_burned_perday = (
    health_df
    .groupBy("date")
    .agg(F.sum("calories_burned").alias("total_calories"))
)
calories_burned_perday.show()

# 8. Users with more than one distinct workout type.
different_workout = (
    health_df
    .groupBy("user_id")
    .agg(F.countDistinct("workout_type").alias("workout_types"))
    .where(F.col("workout_types") > 1)
)
different_workout.show()

# 9. Number of logged workouts per user (non-null workout_type values).
total_workout_peruser = (
    health_df
    .groupBy("user_id")
    .agg(F.count("workout_type").alias("total_workouts"))
)
total_workout_peruser.show()

# 10. Flag each row "Active" when steps exceed 10,000, otherwise "Inactive".
# Note: this rebinds health_df to the frame that carries the extra column.
health_df = health_df.withColumn(
    "active_day",
    F.when(F.col("steps") > 10000, "Active").otherwise("Inactive"),
)
health_df.show()





+-------+-----------+
|user_id|total_steps|
+-------+-----------+
|      1|      35000|
|      3|      45000|
|      2|      29500|
+-------+-----------+

+----------+-------+
|      date|user_id|
+----------+-------+
|2023-09-01|      1|
|2023-09-01|      3|
|2023-09-02|      3|
|2023-09-03|      1|
|2023-09-03|      2|
|2023-09-03|      3|
+----------+-------+

+------------+-----------------+
|workout_type|     avg_calories|
+------------+-----------------+
|    Strength|            500.0|
|        Yoga|573.3333333333334|
|      Cardio|            537.5|
+------------+-----------------+

+-------+----------+-----+
|user_id|      date|steps|
+-------+----------+-----+
|      1|2023-09-03|13000|
|      2|2023-09-03|12000|
|      3|2023-09-03|16000|
+-------+----------+-----+

+-------+----------+-----+---------------+--------------+------------+
|user_id|      date|steps|calories_burned|hours_of_sleep|workout_type|
+-------+----------+-----+---------------+--------------+------------+

In [73]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Obtain the Spark session used for the music-streaming exercises.
spark = SparkSession.builder.appName("MusicStreaming").getOrCreate()

# Read the streaming log CSV: header row present, column types inferred.
music_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/content/sample_data/music_streaming.csv")
)

# 1. Calculate the Total Listening Time for Each User (sum of duration_seconds)

total_listening_time = music_df.groupBy("user_id").agg(F.sum("duration_seconds").alias("total_listening_time"))
total_listening_time.show()

# 2. Filter Songs Streamed for More Than 200 Seconds

high_duration = music_df.filter(col("duration_seconds") > 200)
high_duration.show()

# 3. Find the Most Popular Artist (by Total Streams): one row per stream, so
#    the row count per artist is the stream count.

popular_artist = (
    music_df.groupBy("artist")
    .agg(F.count("*").alias("total_streams"))
    .orderBy(col("total_streams").desc())
    .limit(1)
)
popular_artist.show()

# 4. Identify the Song with the Longest Duration

longest_duration_song = (
    music_df.orderBy(col("duration_seconds").desc())
    .select("song_title", "artist", "duration_seconds")
    .limit(1)
)
longest_duration_song.show()

# 5. Calculate the Average Song Duration by Artist

avg_artist_duration = music_df.groupBy("artist").agg(F.avg("duration_seconds").alias("average_duration"))
avg_artist_duration.show()

# 6. Find the Top 3 Most Streamed Songs per User

# Plays of each song by each user.
grouped_df = music_df.groupBy("user_id", "song_title").agg(F.count("*").alias("play_count"))

# Rank a user's songs by play count. song_title is a secondary sort key so that
# songs with equal play counts always receive the same rank: row_number() over
# an ordering with ties is otherwise free to pick a different top 3 on each run.
window_spec = Window.partitionBy("user_id").orderBy(col("play_count").desc(), col("song_title"))

ranked_df = grouped_df.withColumn("rank", F.row_number().over(window_spec))

top_3_df = ranked_df.filter(col("rank") <= 3).orderBy(col("user_id"), col("rank"))
top_3_df.show()

# 7. Calculate the Total Number of Streams per Day
#    streaming_time carries a time of day, so reduce it to a date before grouping.

streams_per_day = (
    music_df.withColumn("stream_date", F.to_date("streaming_time"))
    .groupBy("stream_date")
    .agg(F.count("*").alias("total_streams"))
)
streams_per_day.show()

# 8. Identify Users Who Streamed Songs from More Than One Artist

more_than_oneArtist = (
    music_df.groupBy("user_id")
    .agg(F.countDistinct("artist").alias("artist_count"))
    .filter(col("artist_count") > 1)
)
more_than_oneArtist.show()

# 9. Calculate the Total Streams for Each Location

streams_per_location = music_df.groupBy("location").agg(F.count("*").alias("total_streams"))
streams_per_location.show()

# 10. Create a New Column to Classify Long and Short Songs
#     "Long" when duration_seconds exceeds 200, otherwise "Short".
#     Note: this rebinds music_df to the frame that carries the extra column.

music_df = music_df.withColumn("song_length", F.when(col("duration_seconds") > 200, "Long").otherwise("Short"))
music_df.show()





+-------+--------------------+
|user_id|total_listening_time|
+-------+--------------------+
|      1|                 630|
|      3|                 610|
|      2|                 680|
+-------+--------------------+

+-------+---------------+----------+----------------+-------------------+-----------+
|user_id|     song_title|    artist|duration_seconds|     streaming_time|   location|
+-------+---------------+----------+----------------+-------------------+-----------+
|      2|   Shape of You|Ed Sheeran|             240|2023-09-01 09:20:00|Los Angeles|
|      1|        Starboy|The Weeknd|             220|2023-09-01 11:00:00|   New York|
|      2|        Perfect|Ed Sheeran|             250|2023-09-01 12:15:00|Los Angeles|
|      1|Save Your Tears|The Weeknd|             210|2023-09-02 09:00:00|   New York|
|      3|      New Rules|  Dua Lipa|             230|2023-09-02 11:00:00|     London|
+-------+---------------+----------+----------------+-------------------+-----------+

+------

In [74]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Obtain the Spark session used for the retail-store exercises.
spark = SparkSession.builder.appName("RetailStore").getOrCreate()

# Read the retail sales CSV: header row present, column types inferred.
retail_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/content/sample_data/retail_data.csv")
)

# Gross value of one transaction (no discount applied); reused by tasks 1, 2, 5, 8 and 9.
line_total = col("price") * col("quantity")

# 1. Calculate the Total Revenue per Category

total_revenue_per_category = (
    retail_df.withColumn("total_revenue", line_total)
    .groupBy("category")
    .agg(F.sum("total_revenue").alias("total_revenue"))
)

total_revenue_per_category.show()

# 2. Filter Transactions Where the Total Sales Amount is Greater Than $100

high_transactions = retail_df.withColumn("total_sales", line_total).filter(col("total_sales") > 100)

high_transactions.show()

# 3. Find the Most Sold Product (largest summed quantity)

most_sold_product = (
    retail_df.groupBy("product_name")
    .agg(F.sum("quantity").alias("total_quantity"))
    .orderBy(col("total_quantity").desc())
    .limit(1)
)
most_sold_product.show()

# 4. Calculate the Average Price per Product Category

avg_price_category = retail_df.groupBy("category").agg(F.avg("price").alias("average_price"))
avg_price_category.show()

# 5. Find the Top 3 Highest Grossing Products

top_grossing_products = (
    retail_df.withColumn("total_revenue", line_total)
    .groupBy("product_name")
    .agg(F.sum("total_revenue").alias("total_revenue"))
    .orderBy(col("total_revenue").desc())
    .limit(3)
)

top_grossing_products.show()

# 6. Calculate the Total Number of Items Sold per Day

items_sold_perDay = retail_df.groupBy("sales_date").agg(F.sum("quantity").alias("total_quantity"))
items_sold_perDay.show()

# 7. Identify the Product with the Lowest Price in Each Category
#    The per-category minimum alone does not say which product it belongs to,
#    so join it back to the sales rows on (category, price) to recover the
#    product name. Products tied for the minimum are all kept; distinct()
#    collapses repeated sales of the same product at that price.

lowest_price_per_category = retail_df.groupBy("category").agg(F.min("price").alias("price"))
lowest_cost = (
    retail_df.join(lowest_price_per_category, on=["category", "price"])
    .select("category", "product_name", "price")
    .distinct()
)
lowest_cost.show()

# 8. Calculate the Total Revenue for Each Product

revenue_product = (
    retail_df.withColumn("total_revenue", line_total)
    .groupBy("product_name")
    .agg(F.sum("total_revenue").alias("total_revenue"))
)
revenue_product.show()

# 9. Find the Total Sales per Day for Each Category

total_sales_per_category = (
    retail_df.withColumn("total_sales", line_total)
    .groupBy("sales_date", "category")
    .agg(F.sum("total_sales").alias("total_sales"))
)

total_sales_per_category.show()

# 10. Create a New Column for Discounted Price (price reduced by a flat 10%)
#     Note: this rebinds retail_df to the frame that carries the extra column.

retail_df = retail_df.withColumn("discounted_price", col("price") * 0.9)

retail_df.show()








+-----------+------------------+
|   category|     total_revenue|
+-----------+------------------+
| Stationery|              20.0|
|  Groceries|13.399999999999999|
|Electronics|            1000.0|
|   Clothing|             155.0|
+-----------+------------------+

+--------------+------------+-----------+-----+--------+----------+-----------+
|transaction_id|product_name|   category|price|quantity|sales_date|total_sales|
+--------------+------------+-----------+-----+--------+----------+-----------+
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|      800.0|
|             7|  Headphones|Electronics|100.0|       2|2023-09-04|      200.0|
+--------------+------------+-----------+-----+--------+----------+-----------+

+------------+--------------+
|product_name|total_quantity|
+------------+--------------+
|      Banana|            12|
+------------+--------------+

+-----------+------------------+
|   category|     average_price|
+-----------+------------------+
| St