In [1]:
!pip install pyspark==3.5.0 delta-spark==3.1.0


Collecting pyspark==3.5.0
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.9/316.9 MB 4.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting delta-spark==3.1.0
  Downloading delta_spark-3.1.0-py3-none-any.whl.metadata (1.9 kB)
Downloading delta_spark-3.1.0-py3-none-any.whl (21 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425346 sha256=e4a504d4d86264a96b8656fdefab0996b0a02862c3f2812a1c323c252a6a3de7
  Stored in directory: /root/.cache/pip/wheels/38/df/61/8c121f50c3cffd77f8178180dd232d90b3b99d1bd61fb6d6be
Successfully built pyspark
Installing collected packages: pyspark, delta-spark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.1
    Uninstalling pyspark-3.5.1:
      Successfully uninstalled

In [2]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Start from a named application builder, then layer on the two settings that
# register Delta Lake's SQL extension and catalog implementation.
builder = SparkSession.builder.appName("CapstoneETL")
for _conf_key, _conf_value in (
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
):
    builder = builder.config(_conf_key, _conf_value)

# configure_spark_with_delta_pip wraps the builder before the session is
# created (or an existing one is reused) via getOrCreate().
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [3]:
from pathlib import Path

from google.colab import files

# Input files this notebook reads by name from the working directory.
REQUIRED_FILES = ("users.csv", "expense.csv", "categories.csv")

# Opens Colab's browser upload dialog; the chosen files are saved into the
# current working directory.
uploaded = files.upload()

# Fail fast with a clear message if an expected input is absent. The check is
# done on disk (not on the keys of `uploaded`) so that re-running this cell
# without re-uploading files that are already present still succeeds.
missing_files = [name for name in REQUIRED_FILES if not Path(name).exists()]
if missing_files:
    raise FileNotFoundError(
        "Missing required input file(s): " + ", ".join(missing_files)
    )


Saving categories.csv to categories.csv
Saving expense.csv to expense.csv
Saving users.csv to users.csv


In [4]:
# Load the three uploaded CSV files. Each file's first row is treated as the
# header, and column types are inferred from the data.
users_df = spark.read.options(header=True, inferSchema=True).csv("users.csv")
expenses_df = spark.read.options(header=True, inferSchema=True).csv("expense.csv")
categories_df = spark.read.options(header=True, inferSchema=True).csv("categories.csv")

# Preview the users and expenses tables.
print("=== Users ===")
users_df.show()
print("=== Expenses ===")
expenses_df.show()


=== Users ===
+-------+-------+--------------------+
|user_id|   name|               email|
+-------+-------+--------------------+
|      1|Karthik|karthik2025@mail.com|
|      2|  Sneha|sneha.raju@email.com|
|      3| Vikram|vikram_nair@domai...|
|      4|  Pooja|pooja.star95@exam...|
|      5|  Tejas|tejas.mani@gmail.com|
+-------+-------+--------------------+

=== Expenses ===
+----------+-------+-----------+------+------------+--------------------+
|expense_id|user_id|category_id|amount|expense_date|         description|
+----------+-------+-----------+------+------------+--------------------+
|         1|      1|          1| 950.0|  2025-07-02|Doctor consultati...|
|         2|      1|          2|2000.0|  2025-07-04|   Online course fee|
|         3|      2|          3|7000.0|  2025-07-06|     July month rent|
|         4|      3|          4| 450.0|  2025-07-09|       Movie tickets|
|         5|      4|          5|3000.0|  2025-07-11|        Weekend trip|
|         6|      5|     

In [5]:
from pyspark.sql.functions import month, year, sum as spark_sum

# Attach each expense row to its user (inner join on user_id), then derive the
# calendar month and year from expense_date.
expenses_with_users = expenses_df.join(users_df, on="user_id", how="inner")

combined_df = expenses_with_users.withColumns(
    {
        "month": month("expense_date"),
        "year": year("expense_date"),
    }
)

combined_df.show()


+-------+----------+-----------+------+------------+--------------------+-------+--------------------+-----+----+
|user_id|expense_id|category_id|amount|expense_date|         description|   name|               email|month|year|
+-------+----------+-----------+------+------------+--------------------+-------+--------------------+-----+----+
|      1|         1|          1| 950.0|  2025-07-02|Doctor consultati...|Karthik|karthik2025@mail.com|    7|2025|
|      1|         2|          2|2000.0|  2025-07-04|   Online course fee|Karthik|karthik2025@mail.com|    7|2025|
|      2|         3|          3|7000.0|  2025-07-06|     July month rent|  Sneha|sneha.raju@email.com|    7|2025|
|      3|         4|          4| 450.0|  2025-07-09|       Movie tickets| Vikram|vikram_nair@domai...|    7|2025|
|      4|         5|          5|3000.0|  2025-07-11|        Weekend trip|  Pooja|pooja.star95@exam...|    7|2025|
|      5|         6|          1| 600.0|  2025-07-12|   Pharmacy purchase|  Tejas|tejas.m

In [8]:
from pyspark.sql.functions import lit, when, col, month, year, sum as spark_sum

# Demo assumption: every user has the same fixed monthly income of 10000.
income = 10000

# Total amount spent per user per calendar month.
monthly_spend = combined_df.groupBy("user_id", "name", "year", "month").agg(
    spark_sum("amount").alias("total_spend")
)

# Savings are whatever remains of the income after spending; a month is
# flagged "High Spend" when spending exceeds 80% of the income.
savings_col = lit(income) - col("total_spend")
alert_col = when(col("total_spend") > 0.8 * income, "High Spend").otherwise("OK")

summary_df = monthly_spend.withColumn("savings", savings_col).withColumn("alert", alert_col)

summary_df.show()

+-------+-------+----+-----+-----------+-------+----------+
|user_id|   name|year|month|total_spend|savings|     alert|
+-------+-------+----+-----+-----------+-------+----------+
|      2|  Sneha|2025|    7|     8250.0| 1750.0|High Spend|
|      1|Karthik|2025|    7|     2950.0| 7050.0|        OK|
|      3| Vikram|2025|    7|     2250.0| 7750.0|        OK|
|      4|  Pooja|2025|    7|    10200.0| -200.0|High Spend|
|      5|  Tejas|2025|    7|     1120.0| 8880.0|        OK|
+-------+-------+----+-----+-----------+-------+----------+



In [10]:
# Persist the summary twice, replacing any previous output at each path:
# once as a Delta table and once as CSV with a header row.
summary_df.write.save("/content/summary_delta", format="delta", mode="overwrite")
summary_df.write.mode("overwrite").option("header", True).csv("/content/summary_csv")

print(" Summary saved as Delta and CSV")


 Summary saved as Delta and CSV
