In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=d3a1bcb1568353e06137a781d2baeedd4231bc47c080dc4f278d707ee2f1af60
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("FitnessTracker").getOrCreate()

fitness_df = spark.read.format("csv").option("header","true").option("inferSchema","true").load("/content/sample_data/fitness_tracker.csv")

# 1. Find the Total Steps Taken by Each User
#  Calculate the total number of steps taken by each user across all days.
df_total_steps_user = fitness_df.groupBy("user_id").agg(F.sum("steps").alias("TotalSteps"))
df_total_steps_user.show()

# 2. Filter Days Where a User Burned More Than 500 Calories
#  Identify all days where a user burned more than 500 calories.

df_high_calories = fitness_df.filter(col("calories") > 500).select("date")
df_high_calories.show()

#  3. Calculate the Average Distance Traveled by Each User
#  Calculate the average distance traveled (
#  across all days.
#  distance_km ) by each user

df_avg_dist = fitness_df.groupBy("user_id").agg(F.avg("distance_km").alias("AverageDistance"))
df_avg_dist.show()

# 4. Identify the Day with the Maximum Steps for Each User
#  For each user, find the day when they took the maximum number of steps.

window_spec = Window.partitionBy("user_id").orderBy(col("steps").desc())
df_max_steps = fitness_df.withColumn("rank",F.rank().over(window_spec)).filter(col("rank")==1)
df_max_steps.show()

# 5. Find Users Who Were Active for More Than 100 Minutes on Any Day
#  Identify users who had active minutes greater than 100 on any day.

df_high_active = fitness_df.filter(col("active_minutes ") > 100).select("user_id","date","active_minutes ")
df_high_active.show()

#  6. Calculate the Total Calories Burned per Day
#  Group by
# date and calculate the total number of calories burned by all
#  users combined for each day.

df_calories_day = fitness_df.groupBy("date").agg(F.sum("calories").alias("TotalCalories"))
df_calories_day.show()

#  7. Calculate the Average Steps per Day
#  Find the average number of steps taken across all users for each day.

df_avg_steps = fitness_df.groupBy("date").agg(F.avg("steps").alias("AverageSteps"))
df_avg_steps.show()

#  8. Rank Users by Total Distance Traveled
#  Rank the users by their total distance traveled, from highest to lowest.

total_distance_df = fitness_df.groupBy("user_id").agg(F.sum("distance_km").alias("totalDistance"))
window_spec2 = Window.orderBy(col("totalDistance").desc())
ranked_user_df = total_distance_df.withColumn("rank", F.rank().over(window_spec2))
ranked_user_df.show()

#  9. Find the Most Active User by Total Active Minutes
#  Identify the user with the highest total active minutes across all days.

df_total_active_minutes = fitness_df.groupBy("user_id").agg(F.sum("active_minutes ").alias("totalMinutes"))
most_active_user = df_total_active_minutes.orderBy(col("totalMinutes").desc()).limit(1)
most_active_user.show()

# 10. Create a New Column for Calories Burned per Kilometer
# Add a new column called calories_per_km that calculates how many calories were burned per kilometer (
# calories / distance_km ) for each row.

fitness_df = fitness_df.withColumn("calories_per_km", col("calories") / col("distance_km"))
fitness_df.show()



+-------+----------+
|user_id|TotalSteps|
+-------+----------+
|      1|     33000|
|      3|     44000|
|      2|     24000|
+-------+----------+

+----------+
|      date|
+----------+
|01-07-2023|
|02-07-2023|
|03-07-2023|
+----------+

+-------+------------------+
|user_id|   AverageDistance|
+-------+------------------+
|      1| 7.833333333333333|
|      3|10.066666666666666|
|      2| 5.566666666666667|
+-------+------------------+

+-------+----------+-----+--------+-----------+---------------+----+
|user_id|      date|steps|calories|distance_km|active_minutes |rank|
+-------+----------+-----+--------+-----------+---------------+----+
|      1|01-07-2023|12000|     500|        8.5|             90|   1|
|      2|02-07-2023| 9000|     400|        6.2|             70|   1|
|      3|03-07-2023|16000|     620|       11.0|            130|   1|
+-------+----------+-----+--------+-----------+---------------+----+

+-------+----------+---------------+
|user_id|      date|active_minutes 

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("BookSales").getOrCreate()

book_df = spark.read.format("csv").option("header","true").option("inferSchema","true").load("/content/sample_data/book_data.csv")


# 1. Total Sales Revenue per Genre
total_sales_per_genre = book_df.groupBy("genre").agg(F.sum(F.col("sale_price") * F.col("quantity")).alias("total_revenue"))
total_sales_per_genre.show()

# 2. Filter Books Sold in the "Fiction" Genre
fiction_books = book_df.filter(F.col("genre") == "Fiction")
fiction_books.show()

# 3. Book with the Highest Sale Price
book_highest_price = book_df.orderBy(F.col("sale_price").desc()).limit(1)
book_highest_price.show()

# 4. Total Quantity of Books Sold by Author
total_quantity_by_author = book_df.groupBy("author").agg(F.sum("quantity").alias("total_quantity"))
total_quantity_by_author.show()

# 5. Sales Transactions Worth More Than $50
sales_over_50 = book_df.filter((F.col("sale_price") * F.col("quantity")) > 50)
sales_over_50.show()

# 6. Average Sale Price per Genre
average_sale_price_per_genre = book_df.groupBy("genre").agg(F.avg("sale_price").alias("average_price"))
average_sale_price_per_genre.show()

# 7. Count the Number of Unique Authors
unique_authors_count = book_df.select("author").distinct().count()
print(f"Number of unique authors: {unique_authors_count}")

# 8. Top 3 Best-Selling Books by Quantity
top_3_books_by_quantity = book_df.groupBy("book_title").agg(F.sum("quantity").alias("total_quantity")).orderBy(F.col("total_quantity").desc()).limit(3)
top_3_books_by_quantity.show()

# 9. Total Sales for Each Month
total_sales_by_month = book_df.withColumn("month", F.month("date")).groupBy("month").agg(F.sum(F.col("sale_price") * F.col("quantity")).alias("total_revenue"))
total_sales_by_month.show()

# 10. Create a New Column for Total Sales Amount
book_df_with_total_sales = book_df.withColumn("total_sales", F.col("sale_price") * F.col("quantity"))
book_df_with_total_sales.show()

+---------------+-----------------+
|          genre|    total_revenue|
+---------------+-----------------+
|        Fiction|            135.9|
|      Self-Help|            68.97|
|Science Fiction|            25.99|
|    Non-Fiction|             22.5|
|      Biography|95.97999999999999|
+---------------+-----------------+

+-------+--------------------+-------------------+-------+----------+--------+-------------------+
|sale_id|          book_title|             author|  genre|sale_price|quantity|               date|
+-------+--------------------+-------------------+-------+----------+--------+-------------------+
|      1|The Catcher in th...|      J.D. Salinger|Fiction|     15.99|       2|2023-01-05 00:00:00|
|      2|To Kill a Mocking...|         Harper Lee|Fiction|     18.99|       1|2023-01-10 00:00:00|
|      6|    The Great Gatsby|F. Scott Fitzgerald|Fiction|     10.99|       5|2023-03-15 00:00:00|
|      9|                1984|      George Orwell|Fiction|     14.99|       2|202

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("FoodDelivery").getOrCreate()

food_df = spark.read.format("csv").option("header","true").option("inferSchema","true").load("/content/sample_data/food_delivery.csv")


# 1. Calculate Total Revenue per Restaurant
total_revenue_per_restaurant = food_df.groupBy("restaurant_name").agg(F.sum(F.col("price") * F.col("quantity")).alias("total_revenue"))
total_revenue_per_restaurant.show()

# 2. Find the Fastest Delivery
fastest_delivery = food_df.orderBy(F.col("delivery_time_mins")).limit(1)
fastest_delivery.show()

# 3. Calculate Average Delivery Time per Restaurant
avg_delivery_time_per_restaurant = food_df.groupBy("restaurant_name").agg(F.avg("delivery_time_mins").alias("avg_delivery_time"))
avg_delivery_time_per_restaurant.show()

# 4. Filter Orders for a Specific Customer (customer_id = 201)
customer_orders = food_df.filter(F.col("customer_id") == 201)
customer_orders.show()

# 5. Find Orders Where Total Amount Spent is Greater Than $20
orders_over_20 = food_df.filter((F.col("price") * F.col("quantity")) > 20)
orders_over_20.show()

# 6. Calculate the Total Quantity of Each Food Item Sold
total_quantity_per_food_item = food_df.groupBy("food_item").agg(F.sum("quantity").alias("total_quantity"))
total_quantity_per_food_item.show()

# 7. Find the Top 3 Most Popular Restaurants by Number of Orders
top_3_restaurants = food_df.groupBy("restaurant_name").agg(F.count("order_id").alias("num_orders")).orderBy(F.col("num_orders").desc()).limit(3)
top_3_restaurants.show()

# 8. Calculate Total Revenue per Day
total_revenue_per_day = food_df.groupBy("order_date").agg(F.sum(F.col("price") * F.col("quantity")).alias("total_revenue"))
total_revenue_per_day.show()

# 9. Find the Longest Delivery Time for Each Restaurant
longest_delivery_time = food_df.groupBy("restaurant_name").agg(F.max("delivery_time_mins").alias("max_delivery_time"))
longest_delivery_time.show()

# 10. Create a New Column for Total Order Value
total_order_value = food_df.withColumn("total_order_value", F.col("price") * F.col("quantity"))
total_order_value.show()

+---------------+------------------+
|restaurant_name|     total_revenue|
+---------------+------------------+
|         Subway|              13.0|
|      Pizza Hut|             12.99|
|    Burger King|              6.99|
|            KFC|             35.96|
|       Domino's|             23.98|
|     McDonald's|20.950000000000003|
|      Starbucks|              13.5|
+---------------+------------------+

+--------+-----------+---------------+---------+--------+-----+------------------+----------+
|order_id|customer_id|restaurant_name|food_item|quantity|price|delivery_time_mins|order_date|
+--------+-----------+---------------+---------+--------+-----+------------------+----------+
|       6|        205|      Starbucks|   Coffee|       1|  4.5|                15|2023-06-18|
+--------+-----------+---------------+---------+--------+-----+------------------+----------+

+---------------+-----------------+
|restaurant_name|avg_delivery_time|
+---------------+-----------------+
|         Sub

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("WeatherData").getOrCreate()

weather_df = spark.read.format("csv").option("header","true").option("inferSchema","true").load("/content/sample_data/weather_data.csv")


# 1. Find the Average Temperature for Each City

df_avg_temmp_city = weather_df.groupBy("city").agg(F.avg("temperature_c").alias("average_temperature"))
df_avg_temmp_city.show()

#  2. Filter Days with Temperature Below Freezing

df_freezing = weather_df.filter(col("temperature_c") < 0)
df_freezing.show()

#  3. Find the City with the Highest Wind Speed on a Specific Day

df_highest_wind = weather_df.filter(col("date") == "2023-01-02").orderBy(col("wind_speed_kph").desc()).limit(1)
df_highest_wind.show()

# 4. Calculate the Total Number of Days with Rainy Weather

rainy_days_count = weather_df.filter(col("condition") == "Rain").count()
print(f"Total number of rainy days: {rainy_days_count}")

# 5. Calculate the Average Humidity for Each Weather Condition

df_avg_humidity_per_condition = weather_df.groupBy("condition").agg(F.avg("humidity").alias("avg_humidity"))
df_avg_humidity_per_condition.show()

# 6. Find the Hottest Day in Each City

df_hottest_day = weather_df.groupBy("city").agg(F.max("temperature_c").alias("max_temperature"))
df_hottest_day.show()

# 7. Identify Cities That Experienced Snow

df_snow = weather_df.filter(col("condition") == "Snow").select("city")
df_snow.show()

# 8. Calculate the Average Wind Speed for Days When the Condition was Sunny

df_avg_wind_sunny_days = weather_df.filter(col("condition") == "Sunny").agg(F.avg("wind_speed_kph").alias("avg_wind_speed"))
df_avg_wind_sunny_days.show()

# 9. Find the Coldest Day Across All Cities

coldest_day = weather_df.orderBy(col("temperature_c").asc()).limit(1)
coldest_day.show()

# 10. Create a New Column for Wind Chill

wind_chill_df = weather_df.withColumn("wind_chill", 13.12+0.6215 * col("temperature_c") - 11.37 * (col("wind_speed_kph")**0.16) + 0.3965 * col("temperature_c") * (col("wind_speed_kph")**0.16))
wind_chill_df.show()



+-----------+-------------------+
|       city|average_temperature|
+-----------+-------------------+
|Los Angeles| 17.666666666666668|
|    Chicago|-2.6666666666666665|
|   New York|  4.666666666666667|
+-----------+-------------------+

+----------+-------+-------------+--------+--------------+---------+
|      date|   city|temperature_c|humidity|wind_speed_kph|condition|
+----------+-------+-------------+--------+--------------+---------+
|2023-01-01|Chicago|           -2|      75|            25|    Snow |
|2023-01-02|Chicago|           -5|      80|            30|     Snow|
|2023-01-03|Chicago|           -1|      70|            18|   Cloudy|
+----------+-------+-------------+--------+--------------+---------+

+----------+-------+-------------+--------+--------------+---------+
|      date|   city|temperature_c|humidity|wind_speed_kph|condition|
+----------+-------+-------------+--------+--------------+---------+
|2023-01-02|Chicago|           -5|      80|            30|     Snow|
+

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("FlightData").getOrCreate()

flight_df = spark.read.format("csv").option("header","true").option("inferSchema","true").load("/content/sample_data/flight_data.csv")


# 1. Find the Total Distance Traveled by Each Airline

df_airline_distance = flight_df.groupBy("airline").agg(F.sum("distance_travelled").alias("distance_travelled"))
df_airline_distance.show()

# 2. Filter Flights with Delays Greater than 30 Minutes

df_high_delay = flight_df.filter(col("delay_min")>30)
df_high_delay.show()

# 3. Find the Flight with the Longest Distance

df_longest_dist = flight_df.orderBy(col("distance_travelled").desc()).limit(1)
df_longest_dist.show()

# 4. Calculate the Average Delay Time for Each Airline

df_avg_delay = flight_df.groupBy("airline").agg(F.avg("delay_min").alias("average_delay"))
df_avg_delay.show()

# 5. Identify Flights That Were Not Delayed

df_no_delay = flight_df.filter(col("delay_min") == 0)
df_no_delay.show()

# 6. Find the Top 3 Most Frequent Routes

df_frequent_routes = flight_df.groupBy("origin","destination").agg(F.count("*").alias("route_count")).orderBy(F.col("route_count").desc()).limit(3)
df_frequent_routes.show()

# 7. Calculate the Total Number of Flights per Day

df_total_per_day = flight_df.groupBy("date").agg(F.count("*").alias("flight_per_day"))
df_total_per_day.show()

# 8. Find the Airline with the Most Flights

airline_with_most_flights = flight_df.groupBy("airline").agg(F.count("*").alias("flight_count")).orderBy(F.col("flight_count").desc()).limit(1)
airline_with_most_flights.show()

# 9. Calculate the Average Flight Distance per Day

avg_flight_distance_per_day = flight_df.groupBy("date").agg(F.avg("distance_travelled").alias("avg_distance"))
avg_flight_distance_per_day.show()

# 10. Create a New Column for On-Time Status

flight_df_with_on_time = flight_df.withColumn("on_time", F.when(F.col("delay_min") == 0, True).otherwise(False))
flight_df_with_on_time.show()



+---------+------------------+
|  airline|distance_travelled|
+---------+------------------+
|    Delta|             11840|
|   United|              5920|
|  JetBlue|              4180|
|Southwest|              2300|
| American|              5540|
+---------+------------------+

+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+------------------+----------+
|flight_id|airline|flight_number|origin|destination|     departure_time|       arrival_time|delay_min|distance_travelled|      date|
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+------------------+----------+
|        2| United|        UA456|   SFO|        ORD|2024-09-09 09:30:00|2024-09-09 15:00:00|       45|              2960|2023-07-01|
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+------------------+----------+

+---------+-------+-------------+------+-----------+--