### Ingest service file
1. Create schema
2. Read the file
3. Perform small transformations (rename columns, add ingestion_time column)
4. Write the data as a new table, or upsert it into an existing table with the same schema

In [0]:
# Register a text widget named "p_file_date" with an empty-string default.
# Its value is read back in the next cell and later used as a folder segment
# of the raw-file path when the CSV is loaded.
dbutils.widgets.text("p_file_date", "")

In [0]:
# Read the current value of the "p_file_date" widget into var_file_date.
# NOTE(review): the widget default is "", so running without setting the parameter
# produces an empty folder segment in the CSV path built further down — confirm
# that callers always supply a value.
var_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Read the CSV file

In [0]:
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType

In [0]:
# Explicit schema applied when loading the services CSV.
# Each .add(name, type, nullable) call declares one column; only the two ID
# columns are declared non-nullable.
# NOTE(review): Spark file readers may relax nullable=False on load — confirm
# whether null IDs need an explicit check downstream.
service_schema = (
    StructType()
    .add("Service ID", StringType(), False)
    .add("Customer ID", StringType(), False)
    .add("Count", IntegerType(), True)
    .add("Quarter", StringType(), True)
    .add("Referred a Friend", StringType(), True)
    .add("Number of Referrals", IntegerType(), True)
    .add("Tenure in Months", IntegerType(), True)
    .add("Offer", StringType(), True)
    .add("Phone Service", StringType(), True)
    .add("Avg Monthly Long Distance Charges", FloatType(), True)
    .add("Multiple Lines", StringType(), True)
    .add("Internet Service", StringType(), True)
    .add("Internet Type", StringType(), True)
    .add("Avg Monthly GB Download", IntegerType(), True)
    .add("Online Security", StringType(), True)
    .add("Online Backup", StringType(), True)
    .add("Device Protection Plan", StringType(), True)
    .add("Premium Tech Support", StringType(), True)
    .add("Streaming TV", StringType(), True)
    .add("Streaming Movies", StringType(), True)
    .add("Streaming Music", StringType(), True)
    .add("Unlimited Data", StringType(), True)
    .add("Contract", StringType(), True)
    .add("Paperless Billing", StringType(), True)
    .add("Payment Method", StringType(), True)
    .add("Monthly Charge", FloatType(), True)
    .add("Total Charges", FloatType(), True)
    .add("Total Refunds", FloatType(), True)
    .add("Total Extra Data Charges", FloatType(), True)
    .add("Total Long Distance Charges", FloatType(), True)
    .add("Total Revenue", FloatType(), True)
)

In [0]:
# Load the services CSV for the requested file date from the raw folder.
# The first line of the file is treated as a header, fields are comma-separated,
# and the explicit service_schema is applied instead of inferring types.
service_df = (
    spark.read
    .options(header="true", delimiter=",")
    .schema(service_schema)
    .csv(f"{raw_folder_path}/{var_file_date}/Telco_customer_churn_services.csv")
)

### Step 2 - Select only required columns + Rename the columns + Add ingestion date

In [0]:
# Keep only the columns used downstream and give each one its snake_case name
# in the same projection. The order of the aliases below is the order of the
# columns in the resulting dataframe.
service_renamed_df = service_df.select(
    col("Customer ID").alias("customer_id"),
    col("Quarter").alias("quarter"),
    col("Contract").alias("contract"),
    col("Tenure in Months").alias("tenure"),
    col("Phone Service").alias("phone_service"),
    col("Internet Type").alias("internet_type"),
    col("Online Security").alias("online_security"),
    col("Online Backup").alias("online_backup"),
    col("Device Protection Plan").alias("device_protection_plan"),
    col("Premium Tech Support").alias("premium_tech_support"),
    col("Streaming TV").alias("streaming_tv"),
    col("Streaming Movies").alias("streaming_movies"),
    col("Streaming Music").alias("streaming_music"),
    col("Unlimited Data").alias("unlimited_data"),
    col("Monthly Charge").alias("monthly_charge"),
    col("Total Charges").alias("total_charges"),
)

In [0]:
# Pass the renamed dataframe through add_ingestion, a helper that is not defined
# in this notebook (presumably loaded by the %run of ../includes/common_functions
# — confirm). The recorded preview output of this notebook shows an additional
# ingestion_time column after this step.
service_final_df = add_ingestion(service_renamed_df)

In [0]:
# Check the description of the data
# service_final_df.describe().show()

+-------+-----------+-------+--------------+------------------+-------------+-------------+---------------+-------------+----------------------+--------------------+------------+----------------+---------------+--------------+-----------------+------------------+
|summary|customer_id|quarter|      contract|            tenure|phone_service|internet_type|online_security|online_backup|device_protection_plan|premium_tech_support|streaming_tv|streaming_movies|streaming_music|unlimited_data|   monthly_charge|     total_charges|
+-------+-----------+-------+--------------+------------------+-------------+-------------+---------------+-------------+----------------------+--------------------+------------+----------------+---------------+--------------+-----------------+------------------+
|  count|       6598|   6598|          6598|              6598|         6598|         6598|           6598|         6598|                  6598|                6598|        6598|            6598|           65

In [0]:
# Preview the dataframe for checking
# display(service_final_df)

customer_id,quarter,contract,tenure,phone_service,internet_type,online_security,online_backup,device_protection_plan,premium_tech_support,streaming_tv,streaming_movies,streaming_music,unlimited_data,monthly_charge,total_charges,ingestion_time
8779-QRDMV,Q3,Month-to-Month,1,No,DSL,No,No,Yes,No,No,Yes,No,No,39.65,39.65,2024-09-19T10:10:36.098Z
7495-OOKFY,Q3,Month-to-Month,8,Yes,Fiber Optic,No,Yes,No,No,No,No,No,Yes,80.65,633.3,2024-09-19T10:10:36.098Z
1658-BYGOY,Q3,Month-to-Month,18,Yes,Fiber Optic,No,No,No,No,Yes,Yes,Yes,Yes,95.45,1752.55,2024-09-19T10:10:36.098Z
4598-XLKNJ,Q3,Month-to-Month,25,Yes,Fiber Optic,No,Yes,Yes,No,Yes,Yes,No,Yes,98.5,2514.5,2024-09-19T10:10:36.098Z
4846-WHAFZ,Q3,Month-to-Month,37,Yes,Fiber Optic,No,No,No,No,No,No,No,Yes,76.5,2868.15,2024-09-19T10:10:36.098Z
4412-YLTKF,Q3,Month-to-Month,27,Yes,Fiber Optic,No,No,Yes,No,No,No,No,No,78.05,2135.5,2024-09-19T10:10:36.098Z
0390-DCFDQ,Q3,Month-to-Month,1,Yes,Fiber Optic,No,No,No,No,No,No,No,Yes,70.45,70.45,2024-09-19T10:10:36.098Z
3445-HXXGF,Q3,Month-to-Month,58,No,DSL,No,Yes,Yes,No,No,Yes,No,Yes,45.3,2651.2,2024-09-19T10:10:36.098Z
2656-FMOKZ,Q3,Month-to-Month,15,Yes,Fiber Optic,No,No,No,No,No,No,No,Yes,74.45,1145.7,2024-09-19T10:10:36.098Z
2070-FNEXE,Q3,Month-to-Month,7,Yes,Fiber Optic,Yes,No,No,No,No,No,No,No,76.45,503.6,2024-09-19T10:10:36.098Z


### Step 3 - Write data

In [0]:
# Hand the final dataframe and the target table name to
# create_or_upsert_managed_delta_table, a helper that is not defined in this
# notebook (presumably loaded from ../includes/common_functions — confirm).
# Per this notebook's stated intent, it writes a new table or upserts the data
# into an existing table with the same schema.
# NOTE(review): the recorded output of this cell names
# "customer_churn.silver.service", while the literal below is
# "customerchurn.silver.service" — confirm which catalog name is correct.
delta_table_catalog_path = "customerchurn.silver.service"
create_or_upsert_managed_delta_table(service_final_df, delta_table_catalog_path)

'A new table has been succesfully created at: customer_churn.silver.service'

In [0]:
# End the notebook run and return the string "Success" as its exit value,
# so a caller that runs several notebooks in sequence can check the result.
dbutils.notebook.exit("Success")