### Ingest demographic file
1. Create schema
2. Read the file
3. Perform small transformations (select and rename columns, add ingestion_time column)
4. Write the data as a new table, or upsert it into the existing table with the same schema

In [0]:
# Register the "p_file_date" text widget (empty default) so the file date can be supplied as a notebook parameter
dbutils.widgets.text(name="p_file_date", defaultValue="")

In [0]:
# Read the current value of the "p_file_date" text widget into var_file_date.
# NOTE(review): the widget's default is an empty string; if no value is supplied, the CSV path
# built from var_file_date later in this notebook contains an empty folder segment —
# confirm that callers always pass a file date.
var_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Create the schema and read the CSV file

In [0]:
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

In [0]:
# Explicit schema applied when reading the demographics CSV.
# Only "Customer ID" is declared non-nullable; every other field allows nulls.
demographic_schema = StructType([
    StructField("Customer ID", StringType(), nullable=False),
    StructField("Count", IntegerType(), nullable=True),
    StructField("Gender", StringType(), nullable=True),
    StructField("Age", IntegerType(), nullable=True),
    StructField("Under 30", StringType(), nullable=True),
    StructField("Senior Citizen", StringType(), nullable=True),
    StructField("Married", StringType(), nullable=True),
    StructField("Dependents", StringType(), nullable=True),
    StructField("Number of Dependents", IntegerType(), nullable=True),
])

In [0]:
# Load the demographics CSV from the raw folder for the requested file date,
# treating the first row as a header and enforcing the predefined schema
demographic_df = (
    spark.read
    .schema(demographic_schema)
    .option("header", "true")
    .option("delimiter", ",")
    .csv(f"{raw_folder_path}/{var_file_date}/Telco_customer_churn_demographics.csv")
)

### Step 2 - Select only the required columns + Rename the columns + Add ingestion time

In [0]:
# Keep only the four columns that are used, renaming each to snake_case within the same select
demographic_renamed_df = demographic_df.select(
    col("Customer ID").alias("customer_id"),
    col("Gender").alias("customer_gender"),
    col("Under 30").alias("under_30"),
    col("Married").alias("married"),
)

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:728)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:446)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:446)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
# Add the ingestion column using the shared add_ingestion helper.
# add_ingestion is not defined in this notebook (presumably it comes from
# ../includes/common_functions — confirm); the saved sample output of the final
# dataframe shows an additional "ingestion_time" column.
demographic_final_df = add_ingestion(demographic_renamed_df)

In [0]:
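# Optional sanity check (kept disabled): summary statistics of the final dataframe.
# Uncomment to run interactively; .show() triggers a Spark job.
# demographic_final_df.describe().show()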

+-------+-----------+---------------+--------+-------+
|summary|customer_id|customer_gender|under_30|married|
+-------+-----------+---------------+--------+-------+
|  count|        445|            445|     445|    445|
|   mean|       NULL|           NULL|    NULL|   NULL|
| stddev|       NULL|           NULL|    NULL|   NULL|
|    min| 0048-PIHNL|         Female|      No|     No|
|    max| 9979-RGMZT|           Male|     Yes|    Yes|
+-------+-----------+---------------+--------+-------+



In [0]:
# Optional sanity check (kept disabled): preview the first 5 rows of the final dataframe.
# demographic_final_df.show(5)

+-----------+---------------+--------+-------+--------------------+
|customer_id|customer_gender|under_30|married|      ingestion_time|
+-----------+---------------+--------+-------+--------------------+
| 3451-VAWLI|         Female|      No|    Yes|2024-09-19 08:27:...|
| 3407-QGWLG|           Male|     Yes|     No|2024-09-19 08:27:...|
| 2347-WKKAE|           Male|      No|    Yes|2024-09-19 08:27:...|
| 8735-DCXNF|           Male|      No|    Yes|2024-09-19 08:27:...|
| 3214-IYUUQ|         Female|     Yes|    Yes|2024-09-19 08:27:...|
+-----------+---------------+--------+-------+--------------------+
only showing top 5 rows



### Step 3 - Write data

In [0]:
# Write the final dataframe with the shared create_or_upsert_managed_delta_table helper
# (defined outside this notebook). Per its name and the saved output message, it creates
# the table or merges the data into an existing table with the same schema.
# NOTE(review): the saved output of this cell names "customer_churn.silver.demographic",
# while the path below uses "customerchurn" — confirm which catalog name is correct.
delta_table_catalog_path = "customerchurn.silver.demographic"
create_or_upsert_managed_delta_table(demographic_final_df, delta_table_catalog_path)

'New data has been succesfully merged to table: customer_churn.silver.demographic'

In [0]:
# Exit the notebook with the value "Success" — useful when this notebook is one of several
# run by a caller, which can read the exit value to confirm the run completed
dbutils.notebook.exit("Success")