In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, month, year, avg as spark_avg, max as spark_max


# 1. Create Spark session

In [2]:
# Obtain the Spark session used by every later cell (an already-running
# session is reused by getOrCreate, otherwise a new one is started).
spark = (
    SparkSession.builder
    .appName("ExpenseAnalysis")
    .getOrCreate()
)

# 2. Load CSV files

In [7]:
# Colab-only step: prompt for the input CSVs through the browser upload widget.
# The cell output shows categories.csv, expense.csv and users.csv being saved
# under their own names, which are the relative paths the read cells use.
from google.colab import files
uploaded = files.upload()

Saving categories.csv to categories.csv
Saving expense.csv to expense.csv
Saving users.csv to users.csv


In [8]:
# Sanity check: list the working directory to confirm the uploaded CSVs are present.
!ls


 categories.csv   sample_data  'WEEK 1_MongoDB.js'
 expense.csv	  users.csv    'WEEK 1_SQL_TASK.sql'


In [9]:
# Load the users table: the first row supplies column names and column
# types are inferred from the data.
users_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("users.csv")
)

In [10]:
# Preview the loaded users table. show() truncates long cell values in the
# printed table (see the "..." suffixes in the email column of the output).
print("=== Users Data ===")
users_df.show()


=== Users Data ===
+-------+-------+--------------------+
|user_id|   name|               email|
+-------+-------+--------------------+
|      1|Karthik|karthik2025@mail.com|
|      2|  Sneha|sneha.raju@email.com|
|      3| Vikram|vikram_nair@domai...|
|      4|  Pooja|pooja.star95@exam...|
|      5|  Tejas|tejas.mani@gmail.com|
+-------+-------+--------------------+



In [12]:
# Load the expenses table: the first row supplies column names and column
# types are inferred from the data.
expenses_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("expense.csv")
)


In [13]:
# Preview the loaded expenses table. show() truncates long cell values in the
# printed table (see the "..." suffixes in the description column of the output).
print("=== Expenses Data ===")
expenses_df.show()



=== Expenses Data ===
+----------+-------+-----------+------+------------+--------------------+
|expense_id|user_id|category_id|amount|expense_date|         description|
+----------+-------+-----------+------+------------+--------------------+
|         1|      1|          1| 950.0|  2025-07-02|Doctor consultati...|
|         2|      1|          2|2000.0|  2025-07-04|   Online course fee|
|         3|      2|          3|7000.0|  2025-07-06|     July month rent|
|         4|      3|          4| 450.0|  2025-07-09|       Movie tickets|
|         5|      4|          5|3000.0|  2025-07-11|        Weekend trip|
|         6|      5|          1| 600.0|  2025-07-12|   Pharmacy purchase|
|         7|      2|          5|1250.0|  2025-07-13|Cab and hotel for...|
|         8|      3|          2|1800.0|  2025-07-16|Textbooks and mat...|
|         9|      4|          3|7200.0|  2025-07-19|Shared apartment ...|
|        10|      5|          4| 520.0|  2025-07-21|Stand-up comedy show|
+----------+-------+-----------+------+------------+--------------------+

In [16]:
# Load the categories lookup table: the first row supplies column names and
# column types are inferred from the data.
categories_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("categories.csv")
)

In [17]:
# Preview the loaded categories lookup table (category_id -> category_name).
print("=== Categories Data ===")
categories_df.show()

=== Categories Data ===
+-----------+-------------+
|category_id|category_name|
+-----------+-------------+
|          2|    Education|
|          4|Entertainment|
|          1|   Healthcare|
|          3|         Rent|
|          5|       Travel|
+-----------+-------------+



# 3. Calculate monthly total spend per user

In [18]:
# Total spend per user per calendar month. The year and month are derived
# from expense_date directly in the grouping keys, so the result has the
# columns user_id, year, month, total_monthly_spend.
monthly_spend = (
    expenses_df
    .groupBy(
        col("user_id"),
        year(col("expense_date")).alias("year"),
        month(col("expense_date")).alias("month"),
    )
    .agg(spark_sum(col("amount")).alias("total_monthly_spend"))
)

print("=== Monthly Total Spend per User ===")
monthly_spend.show()

=== Monthly Total Spend per User ===
+-------+----+-----+-------------------+
|user_id|year|month|total_monthly_spend|
+-------+----+-----+-------------------+
|      5|2025|    7|             1120.0|
|      3|2025|    7|             2250.0|
|      1|2025|    7|             2950.0|
|      2|2025|    7|             8250.0|
|      4|2025|    7|            10200.0|
+-------+----+-----+-------------------+




# 4. Compute average monthly spend per user

In [19]:
# Average of each user's monthly totals, i.e. one row per user with the
# mean of total_monthly_spend across all of that user's months.
avg_spend_per_user = (
    monthly_spend
    .groupBy(col("user_id"))
    .agg(spark_avg(col("total_monthly_spend")).alias("avg_spend"))
)

print("=== Average Monthly Spend per User ===")
avg_spend_per_user.show()

=== Average Monthly Spend per User ===
+-------+---------+
|user_id|avg_spend|
+-------+---------+
|      1|   2950.0|
|      3|   2250.0|
|      5|   1120.0|
|      4|  10200.0|
|      2|   8250.0|
+-------+---------+



# 5. Detect unusual spending spikes (monthly spend > 2x average)

In [20]:
# Attach each user's average monthly spend to every one of their monthly
# totals, then keep only the months that exceed twice that average.
spikes = (
    monthly_spend
    .join(avg_spend_per_user, on="user_id", how="inner")
    .where(col("avg_spend") * 2 < col("total_monthly_spend"))
)

print("=== Users with Spending Spikes (>2x Average) ===")
spikes.show()


=== Users with Spending Spikes (>2x Average) ===
+-------+----+-----+-------------------+---------+
|user_id|year|month|total_monthly_spend|avg_spend|
+-------+----+-----+-------------------+---------+
+-------+----+-----+-------------------+---------+




# 6. Detect large one-time expenses (expense > 3x user's average)


In [21]:
# Per-user expense statistics computed in a single aggregation pass.
# Previously the same DataFrame was grouped by user_id twice (once for max,
# once for avg) and the two results were joined back on user_id; one
# groupBy().agg() yields the same columns without the extra shuffle and join.
user_expense_stats = expenses_df.groupBy("user_id").agg(
    spark_max("amount").alias("max_expense"),
    spark_avg("amount").alias("avg_expense"),
)

# Kept as projections of the combined stats so any cell that refers to
# these names continues to work.
user_max_expense = user_expense_stats.select("user_id", "max_expense")
user_avg_expense = user_expense_stats.select("user_id", "avg_expense")

# A user is flagged when their largest single expense exceeds three times
# their own average expense amount.
large_one_time = user_expense_stats.filter(
    col("max_expense") > 3 * col("avg_expense")
)

print("=== Users with Large One-Time Expenses (>3x Avg) ===")
large_one_time.show()

=== Users with Large One-Time Expenses (>3x Avg) ===
+-------+-----------+-----------+
|user_id|max_expense|avg_expense|
+-------+-----------+-----------+
+-------+-----------+-----------+



# 7. Combine unusual spending users


In [22]:
# Merge the user_ids flagged by either rule (monthly spike or large
# one-time expense) and collapse repeats so each user appears once.
unusual_users = (
    spikes.select("user_id")
    .union(large_one_time.select("user_id"))
    .dropDuplicates()
)

print("=== Unusual Spending Users (Combined) ===")
unusual_users.show()

=== Unusual Spending Users (Combined) ===
+-------+
|user_id|
+-------+
+-------+



# 8. Join with users for final report


In [23]:
# Enrich the flagged user_ids with name and email for the final report.
#
# The join is a LEFT join so that every flagged user is reported even when
# users.csv has no matching row. Joining on the column name (rather than an
# equality expression) yields a single user_id column taken from the left
# side; the previous version selected users_df.user_id, which is NULL for
# unmatched rows and therefore dropped the ID of exactly the users the left
# join was meant to preserve.
final_output = (
    unusual_users
    .join(users_df, on="user_id", how="left")
    .select("user_id", "name", "email")
)

print("=== Final Output: Users with Potential Unusual Spending ===")
final_output.show()

=== Final Output: Users with Potential Unusual Spending ===
+-------+----+-----+
|user_id|name|email|
+-------+----+-----+
+-------+----+-----+

