In [5]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every report cell, registered under the app name "Reporting".
spark = SparkSession.builder.appName("Reporting").getOrCreate()

In [6]:
from pyspark.sql import functions as F

# Load the "invoice" Parquet dataset that the invoice-based reports are built from.
global_invoice = spark.read.parquet("invoice")

# Per-customer totals, computed only over rows whose data_total is positive,
# ranked from the largest total data usage to the smallest.
sortedCustomersByInvoice = (
    global_invoice
    .where(F.col("data_total") > 0)
    .groupBy("customer_id")
    .agg(
        F.sum("data_total").alias("total_data_usage"),
        F.sum("amount_due").alias("total_amount_due"),
    )
    .sort(F.col("total_data_usage").desc())
)

# Persist the ranking as CSV with a header row, replacing any previous output.
sortedCustomersByInvoice.write.csv(
    "report/sortedCustomersByInvoices.csv",
    mode="overwrite",
    header=True,
)

In [7]:
# Tax-inclusive (TTC) revenue per rate plan

# Reload the "invoice" dataset so this cell does not rely on an earlier read.
global_invoice = spark.read.parquet("invoice")

# Sum amount_due for each rate plan, highest revenue first.
rev_plan = (
    global_invoice
    .groupBy("rate_plan_id")
    .agg(F.sum("amount_due").alias("revenue_TTC"))
    .sort(F.col("revenue_TTC").desc())
)

# Export with a header row, replacing any previous output.
rev_plan.write.csv(
    "report/revenueByPlan.csv",
    mode="overwrite",
    header=True,
)


In [8]:
# Sum amount_due for every (region, rate plan) pair; rows are ordered by region
# ascending and, within a region, by revenue descending.
plan_region = (
    global_invoice
    .groupBy("region", "rate_plan_id")
    .agg(F.sum("amount_due").alias("revenue_TTC"))
    .orderBy(F.col("region").asc(), F.col("revenue_TTC").desc())
)

# Export with a header row, replacing any previous output.
plan_region.write.csv(
    "report/planByRegion.csv",
    mode="overwrite",
    header=True,
)


In [9]:
# Load the Parquet dataset stored under "rated_data/".
rated = spark.read.parquet("rated_data/")

# Cost summary per product code, restricted to rows whose rating_status is 'rated':
#   total_cost          - summed cost, floored to two decimal places
#   record_count        - number of rows in the group
#   avg_cost_per_record - mean cost (left unrounded)
# Products are listed from the highest total_cost to the lowest.
# NOTE(review): if "cost" is a floating-point column, sum * 100 can land just
# below an integer and the floor would then drop a cent - confirm the column type.
product_costs = (
    rated
    .where(F.col("rating_status") == "rated")
    .groupBy("product_code")
    .agg(
        (F.floor(F.sum("cost") * 100) / 100).alias("total_cost"),
        F.count("*").alias("record_count"),
        F.avg("cost").alias("avg_cost_per_record"),
    )
    .sort(F.col("total_cost").desc())
)

# Export with a header row, replacing any previous output.
product_costs.write.csv(
    "report/product_cost_analysis.csv",
    mode="overwrite",
    header=True,
)