In [11]:
from pyspark.sql import SparkSession

# Location of the PostgreSQL JDBC driver jar, relative to the working directory.
postgres_jar = r".\conf\postgresql-42.7.6.jar"

# Create the "BillingEngine" session, or reuse one that is already running.
# The driver jar is registered both through spark.jars and on the driver
# classpath.
session_builder = SparkSession.builder.appName("BillingEngine")
session_builder = session_builder.config("spark.jars", postgres_jar)
session_builder = session_builder.config("spark.driver.extraClassPath", postgres_jar)
spark = session_builder.getOrCreate()

In [12]:
# Load the parquet dataset stored under rated_data/ and print its columns.
rated = spark.read.format("parquet").load("rated_data/")
rated.printSchema()


root
 |-- rate_plan_id: string (nullable = true)
 |-- product_code: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- record_ID: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- cell_id: string (nullable = true)
 |-- technology: string (nullable = true)
 |-- caller_id: string (nullable = true)
 |-- callee_id: string (nullable = true)
 |-- duration_sec: integer (nullable = true)
 |-- rating_status: string (nullable = true)
 |-- sender_id: string (nullable = true)
 |-- receiver_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- session_duration_sec: string (nullable = true)
 |-- data_volume_mb: string (nullable = true)
 |-- batch_id: integer (nullable = true)
 |-- record_type: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- rate_type: string (nullable = true)
 |-- description: string (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- free_units: integer (nullable = true)
 |-- tier_thresh

In [13]:
# Keep only the records whose rating_status is exactly "rated".
israted = rated.where(rated["rating_status"] == "rated")

In [14]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, lit, round

def _subtotal(record_type, alias):
    """Build an aggregate summing ``cost`` over rows of one ``record_type``.

    Rows with a different record_type contribute 0 to the sum. The resulting
    column is named ``alias``.
    """
    matching_cost = when(col("record_type") == record_type, col("cost")).otherwise(0)
    return F.sum(matching_cost).alias(alias)


# One output row per (customer_id, billing_period) pair.
per_customer_period = israted.groupBy("customer_id", "billing_period")

invoices = per_customer_period.agg(
    # Subtotals per record type
    _subtotal("voice", "voice_total"),
    _subtotal("sms", "sms_total"),
    _subtotal("data", "data_total"),
    # Total before tax: sum of cost over every record of the group
    F.sum(col("cost")).alias("amount_before_tax"),
)


In [15]:
TVA = 0.20            # tax rate applied to the discounted amount
regulatory_fees = 1   # flat amount added to every computed invoice

# Same discount rate (0.05) on every invoice row.
with_discount = invoices.withColumn("discount_pct", F.lit(0.05))

# Amount left once the discount has been taken off the pre-tax total.
discount_factor = 1 - F.col("discount_pct")
with_discount = with_discount.withColumn(
    "after_discount", F.col("amount_before_tax") * discount_factor
)

# Tax is computed on the discounted amount.
with_tax = with_discount.withColumn("tax", F.col("after_discount") * F.lit(TVA))

# Final amount: discounted amount + tax + flat fee, rounded to 2 decimals.
# Only amount_due is rounded; after_discount and tax keep full precision.
gross_amount = F.col("after_discount") + F.col("tax") + F.lit(regulatory_fees)
invoices = with_tax.withColumn("amount_due", F.round(gross_amount, 2))

invoices.show(truncate=False)


+------------+--------------+------------------+---------+------------------+------------------+------------+------------------+--------------------+----------+
|customer_id |billing_period|voice_total       |sms_total|data_total        |amount_before_tax |discount_pct|after_discount    |tax                 |amount_due|
+------------+--------------+------------------+---------+------------------+------------------+------------+------------------+--------------------+----------+
|212781778912|2025-06       |0.0               |0.1      |0.0               |0.1               |0.05        |0.095             |0.019000000000000003|1.11      |
|212745994411|2025-06       |102.86            |0.0      |0.0               |102.86            |0.05        |97.717            |19.543400000000002  |118.26    |
|212753433014|2025-06       |103.665           |0.0      |0.0               |103.665           |0.05        |98.48175          |19.696350000000002  |119.18    |
|212671680918|2025-06       |50.36

In [16]:
import os
from getpass import getpass

# Connection settings are read from the environment so that no credential is
# stored in the notebook. The URL and user fall back to the local development
# values; the password has no fallback and is prompted for when PGPASSWORD is
# not set.
pg_url = os.environ.get("TELECOMDB_JDBC_URL", "jdbc:postgresql://localhost:5432/telecomdb")
pg_user = os.environ.get("PGUSER", "postgres")
pg_password = os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: ")

# Load the customer_subscriptions table through the PostgreSQL JDBC source.
customersDf = (spark.read
    .format("jdbc")
    .option("url",      pg_url)
    .option("dbtable",  "customer_subscriptions")
    .option("user",     pg_user)
    .option("password", pg_password)
    .load())

# Preview the first 5 rows.
# NOTE(review): the preview prints customer names; clear the output before
# sharing the notebook if this is real customer data.
customersDf.show(5)

+------------+-----------------+---------------+-----------------+------+--------------------+------------+
| customer_id|    customer_name|activation_date|subscription_type|status|              region|rate_plan_id|
+------------+-----------------+---------------+-----------------+------+--------------------+------------+
|212603516508|   Noura El Fassi|     2024-04-03|         postpaid|active|Tanger-Tétouan-Al...|    Titanium|
|212754643912| Hamza Benjelloun|     2023-09-07|         postpaid|active|      Drâa-Tafilalet|        Gold|
|212646849604|Zahra El Khattabi|     2023-10-14|         postpaid|active|          Fès-Meknès|        Gold|
|212637298216|     Zahra Kabbaj|     2025-01-09|         postpaid|active|         Souss-Massa|      Silver|
|212698457056|    Othman Daoudi|     2024-09-29|         postpaid|active|Dakhla-Oued Ed-Dahab|        Gold|
+------------+-----------------+---------------+-----------------+------+--------------------+------------+
only showing top 5 rows



In [17]:
from pyspark.sql import functions as F

# 1) Billing period to invoice (e.g. June 2025); could become a loop over
#    several periods.
mois_courant = "2025-06"

# 2) Eligible customers: active AND postpaid, tagged with the billing period
#    so they can be joined on (customer_id, billing_period).
is_active = F.col("status") == "active"
is_postpaid = F.col("subscription_type") == "postpaid"
clients_elig = (
    customersDf
    .where(is_active & is_postpaid)
    .select("customer_id", "region", "rate_plan_id")
    .withColumn("billing_period", F.lit(mois_courant))
)

# 3) Invoices computed for that period only.
period_invoices = invoices.where(F.col("billing_period") == mois_courant)

# 4) Values used for eligible customers that have no invoice row after the
#    left join (NULL columns): amounts become 0.0, discount_pct becomes 0.05.
# NOTE(review): amount_due is filled with 0.0 here, whereas computed invoices
# include the flat regulatory fee -- confirm this is the intended rule.
null_replacements = {
    "voice_total":          0.0,
    "sms_total":            0.0,
    "data_total":           0.0,
    "amount_before_tax":    0.0,
    "discount_pct":         0.05,
    "after_discount":       0.0,
    "tax":                  0.0,
    "amount_due":           0.0
}

# Left join keeps every eligible customer, with or without usage.
joined = clients_elig.join(
    period_invoices,
    on=["customer_id", "billing_period"],
    how="left",
)
facturation_complete = joined.na.fill(null_replacements)

facturation_complete.show()


+------------+--------------+--------------------+------------+------------------+---------+----------+------------------+------------+------------------+------------------+----------+
| customer_id|billing_period|              region|rate_plan_id|       voice_total|sms_total|data_total| amount_before_tax|discount_pct|    after_discount|               tax|amount_due|
+------------+--------------+--------------------+------------+------------------+---------+----------+------------------+------------+------------------+------------------+----------+
|212603516508|       2025-06|Tanger-Tétouan-Al...|    Titanium|             14.54|      0.0|       0.0|             14.54|        0.05|13.812999999999999|            2.7626|     17.58|
|212754643912|       2025-06|      Drâa-Tafilalet|        Gold|2.2750000000000004|      0.0|   1.89291| 4.167910000000001|        0.05|3.9595145000000005|0.7919029000000002|      5.75|
|212646849604|       2025-06|          Fès-Meknès|        Gold|            

In [18]:
# Persist the invoices as parquet under invoice/, one sub-directory per
# billing_period.
# With the default (static) partition overwrite mode, mode("overwrite") clears
# the whole invoice/ directory first, which would delete previously written
# billing periods. Dynamic mode replaces only the partitions present in this
# DataFrame and leaves the other periods untouched.
(facturation_complete.write
    .mode("overwrite")
    .option("partitionOverwriteMode", "dynamic")
    .partitionBy("billing_period")
    .parquet("invoice/"))


In [19]:
import os
from getpass import getpass

# Connection settings are read from the environment so that no credential is
# stored in the notebook. The URL and user fall back to the local development
# values; the password has no fallback and is prompted for when PGPASSWORD is
# not set.
pg_url = os.environ.get("TELECOMDB_JDBC_URL", "jdbc:postgresql://localhost:5432/telecomdb")
pg_user = os.environ.get("PGUSER", "postgres")
pg_password = os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: ")

# Write the invoices to the "invoices" table through the JDBC source.
# NOTE(review): mode("overwrite") replaces the entire table content, so rows
# of other billing periods already stored there are lost -- confirm this is
# intended before running for a second period.
(facturation_complete.write
 .format("jdbc")
 .option("url",      pg_url)
 .option("dbtable",  "invoices")
 .option("user",     pg_user)
 .option("password", pg_password)
 .mode("overwrite")
 .save())


In [20]:
# Preview the final invoice table: first 20 rows, long cell values truncated.
facturation_complete.show(n=20, truncate=True)

+------------+--------------+--------------------+------------+------------------+---------+----------+------------------+------------+------------------+------------------+----------+
| customer_id|billing_period|              region|rate_plan_id|       voice_total|sms_total|data_total| amount_before_tax|discount_pct|    after_discount|               tax|amount_due|
+------------+--------------+--------------------+------------+------------------+---------+----------+------------------+------------+------------------+------------------+----------+
|212603516508|       2025-06|Tanger-Tétouan-Al...|    Titanium|             14.54|      0.0|       0.0|             14.54|        0.05|13.812999999999999|            2.7626|     17.58|
|212754643912|       2025-06|      Drâa-Tafilalet|        Gold|2.2750000000000004|      0.0|   1.89291| 4.167910000000001|        0.05|3.9595145000000005|0.7919029000000002|      5.75|
|212646849604|       2025-06|          Fès-Meknès|        Gold|            