### Ingest population file
1. Create schema
2. Read the file
3. Perform small transformation (rename columns, add ingestion_time column)
4. Write the data to a Delta table, overwriting any existing contents (population data is not upserted)

In [0]:
# Declare the notebook parameter "p_file_date" as a text input widget with an
# empty string as its default value. Its value is read further down and used
# as a folder segment of the raw file path.
dbutils.widgets.text("p_file_date", "")

In [0]:
# Read the current value of the "p_file_date" text widget.
# var_file_date is later interpolated into the path of the CSV file to ingest.
var_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Define the schema and read the CSV file

In [0]:
from pyspark.sql.functions import col, lit, translate
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

In [0]:
# Explicit schema for the population CSV (three columns, in file order).
# "Population" is declared as a string here; the transformation step strips
# "," characters from it before casting it to an integer.
population_schema = StructType([
    StructField("ID", StringType(), nullable=False),
    StructField("Zip Code", IntegerType(), nullable=True),
    StructField("Population", StringType(), nullable=True),
])

In [0]:
# Read the population CSV for the requested file date using the schema defined above.
# The file is expected at <raw_folder_path>/<p_file_date>/Telco_customer_churn_population.csv.
#
# If the error text contains PATH_NOT_FOUND, the notebook exits early with a
# "skipped" message instead of failing. Any other error is re-raised unchanged.
try:
    population_df = (
        spark.read
        .option("header", "true")
        .option("delimiter", ",")
        .schema(population_schema)
        .csv(f"{raw_folder_path}/{var_file_date}/Telco_customer_churn_population.csv")
    )
except Exception as e:
    if 'PATH_NOT_FOUND' in str(e):
        # No population file for this run: end the notebook with the skip message.
        dbutils.notebook.exit("Population data ingestion is skipped in this run: No population data found.")
    else:
        # Bare `raise` re-raises the active exception with its original traceback.
        raise

### Step 2 - Select only the required columns, rename them, and add the ingestion time

In [0]:
# Keep only the zip code and the population count, under snake_case names.
# Population is read as text: remove the "," characters, then cast to integer.
population_renamed_df = population_df.select(
    col("Zip Code").alias("zip_code"),
    translate(col("Population"), ",", "").cast(IntegerType()).alias("population"),
)

In [0]:
# Add the ingestion column using the shared add_ingestion helper.
# NOTE(review): add_ingestion is not defined in this notebook — presumably it comes
# from the "../includes/common_functions" notebook run above; confirm there.
# The saved sample output below shows an "ingestion_time" column on the result.
population_final_df = add_ingestion(population_renamed_df)

In [0]:
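# Summary statistics (count, mean, stddev, min, max) of the final DataFrame - uncomment to run; the output below is from an earlier run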
# population_final_df.describe().show()

+-------+------------------+------------------+
|summary|          zip_code|        population|
+-------+------------------+------------------+
|  count|              1671|              1671|
|   mean|  93678.9922202274|  20276.3842010772|
| stddev|1817.7635912275455|20689.117300066067|
|    min|             90001|                11|
|    max|             96161|            105285|
+-------+------------------+------------------+



In [0]:
# Show the first 5 rows of the final DataFrame as a sanity check - uncomment to run; the output below is from an earlier run
# population_final_df.show(5)

+--------+----------+--------------------+
|zip_code|population|      ingestion_time|
+--------+----------+--------------------+
|   90001|     54492|2024-09-19 13:40:...|
|   90002|     44586|2024-09-19 13:40:...|
|   90003|     58198|2024-09-19 13:40:...|
|   90004|     67852|2024-09-19 13:40:...|
|   90005|     43019|2024-09-19 13:40:...|
+--------+----------+--------------------+
only showing top 5 rows



### Step 3 - Write data

In [0]:
# Plain overwrite of the Delta table rather than an upsert:
# population data changes rarely, and an update normally means replacing
# the existing contents entirely.
(
    population_final_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("customerchurn.silver.population")
)

In [0]:
# End the notebook run with the exit value "Success".
# Useful when this notebook is run as one of several notebooks, so the caller
# can tell a completed ingestion apart from the "skipped" exit message used
# when no population file is found.
dbutils.notebook.exit("Success")