### Ingest status file
1. Create schema
2. Read the file
3. Perform a small transformation (select and rename columns, add the ingestion_time column)
4. Write the data as a new table, or upsert it into an existing table with the same schema

In [0]:
# Declare the notebook parameter "p_file_date" as a text input widget.
# The second argument is the widget's default value (an empty string here).
dbutils.widgets.text("p_file_date", "")

In [0]:
# Read the current value of the "p_file_date" text widget.
# The value is later interpolated into the raw CSV path as a folder name.
# NOTE(review): no validation is done here; with the widget's empty default the
# path has no date folder - confirm callers always supply a file date.
var_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Read the CSV file

In [0]:
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

In [0]:
# Explicit schema applied when reading the status CSV file.
# Only the two ID columns are declared non-nullable; every other column may be null.
# NOTE(review): Spark file readers may treat all columns as nullable regardless
# of this flag - confirm before relying on it as a constraint.
status_schema = StructType([
    StructField("Status ID", StringType(), nullable=False),
    StructField("Customer ID", StringType(), nullable=False),
    StructField("Count", IntegerType(), nullable=True),
    StructField("Quarter", StringType(), nullable=True),
    StructField("Satisfaction Score", StringType(), nullable=True),
    StructField("Customer Status", StringType(), nullable=True),
    StructField("Churn Label", StringType(), nullable=True),
    StructField("Churn Value", StringType(), nullable=True),
    StructField("Churn Score", IntegerType(), nullable=True),
    StructField("CLTV", IntegerType(), nullable=True),
    StructField("Churn Category", StringType(), nullable=True),
    StructField("Churn Reason", StringType(), nullable=True),
])

In [0]:
# Load the status CSV found under raw_folder_path in the folder named by
# var_file_date. The first line is treated as a header, fields are
# comma-delimited, and column names/types come from status_schema.
status_df = (
    spark.read
    .options(header="true", delimiter=",")
    .schema(status_schema)
    .csv(f"{raw_folder_path}/{var_file_date}/Telco_customer_churn_status.csv")
)

### Step 2 - Select only required columns + Rename the columns + Add ingestion date

In [0]:
# Project five of the source columns and give each a snake_case name
# in the same select, instead of renaming them one by one afterwards.
status_renamed_df = status_df.select(
    col("Customer ID").alias("customer_id"),
    col("Quarter").alias("quarter"),
    col("Churn Label").alias("churn_label"),
    col("Churn Value").alias("churn_value"),
    col("Churn Category").alias("churn_category"),
)

In [0]:
# Add the ingestion column using the add_ingestion helper. The helper is not
# defined in this notebook; presumably it comes from ../includes/common_functions
# (TODO confirm). The saved sample output further down shows the resulting
# frame carrying an ingestion_time column.
status_final_df = add_ingestion(status_renamed_df)

In [0]:
# Check the description of the data (call left commented out; the table below is saved output from an earlier run)
# status_final_df.describe().show()

+-------+-----------+-------+-----------+-------------------+--------------+
|summary|customer_id|quarter|churn_label|        churn_value|churn_category|
+-------+-----------+-------+-----------+-------------------+--------------+
|  count|       6598|   6598|       6598|               6598|          1869|
|   mean|       NULL|   NULL|       NULL|0.28326765686571687|          NULL|
| stddev|       NULL|   NULL|       NULL|0.45061942602265187|          NULL|
|    min| 0002-ORFBO|     Q3|         No|                  0|      Attitude|
|    max| 9995-HOTOH|     Q3|        Yes|                  1|         Price|
+-------+-----------+-------+-----------+-------------------+--------------+



In [0]:
# Show the top 5 rows of the dataframe for checking (call left commented out; the table below is saved output from an earlier run)
# status_final_df.show(5)

+-----------+-------+-----------+-----------+---------------+--------------------+
|customer_id|quarter|churn_label|churn_value| churn_category|      ingestion_time|
+-----------+-------+-----------+-----------+---------------+--------------------+
| 8779-QRDMV|     Q3|        Yes|          1|     Competitor|2024-09-19 10:12:...|
| 7495-OOKFY|     Q3|        Yes|          1|     Competitor|2024-09-19 10:12:...|
| 1658-BYGOY|     Q3|        Yes|          1|     Competitor|2024-09-19 10:12:...|
| 4598-XLKNJ|     Q3|        Yes|          1|Dissatisfaction|2024-09-19 10:12:...|
| 4846-WHAFZ|     Q3|        Yes|          1|          Price|2024-09-19 10:12:...|
+-----------+-------+-----------+-----------+---------------+--------------------+
only showing top 5 rows



### Step 3 - Write data

In [0]:
# Using pre-defined create_or_upsert_managed_delta_table function
# to write new table or upsert data into existing table with same schema.
# The target is passed as a three-part name, presumably <catalog>.<schema>.<table>.
# NOTE(review): the saved cell output reports 'customer_churn.silver.status',
# which does not match the "customerchurn" catalog name used here - confirm
# which spelling is the intended target before running.
delta_table_catalog_path = "customerchurn.silver.status"
create_or_upsert_managed_delta_table(status_final_df, delta_table_catalog_path)

'A new table has been succesfully created at: customer_churn.silver.status'

In [0]:
# End the notebook run, passing the string "Success" as its exit value.
# Useful when this notebook is run as one of several from a calling notebook,
# which can then inspect the value.
dbutils.notebook.exit("Success")