### Ingest location file
1. Create schema
2. Read the file
3. Perform small transformations (select required columns, rename columns, add ingestion_time column)
4. Write the data as a new table, or upsert it into an existing table with the same schema

In [0]:
# Register the notebook parameter "p_file_date" as a text widget (default: empty string)
dbutils.widgets.text(name="p_file_date", defaultValue="")

In [0]:
# Read the current value of the "p_file_date" text widget. The value is later
# interpolated into the raw-file path as the folder name that holds the CSV.
var_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Step 1 - Read the CSV file

In [0]:
from pyspark.sql.functions import col, lit
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType

In [0]:
# Explicit schema for the location CSV, so column types are fixed up front
# instead of being inferred from the file.
# NOTE(review): "Zip Code" is typed as an integer, so a leading zero would not
# survive the read - confirm this is acceptable for the source data.
# NOTE(review): the two ID columns are declared non-nullable; confirm that the
# CSV reader actually enforces this rather than treating them as nullable.
location_schema = StructType([
    StructField("Location ID", StringType(), nullable=False),
    StructField("Customer ID", StringType(), nullable=False),
    StructField("Count", IntegerType(), nullable=True),
    StructField("Country", StringType(), nullable=True),
    StructField("State", StringType(), nullable=True),
    StructField("City", StringType(), nullable=True),
    StructField("Zip Code", IntegerType(), nullable=True),
    StructField("Lat Long", StringType(), nullable=True),
    StructField("Latitude", DoubleType(), nullable=True),
    StructField("Longitude", DoubleType(), nullable=True),
])

In [0]:
# Load the location CSV found under the folder named by var_file_date,
# treating the first row as a header and applying the explicit schema
location_df = (
    spark.read
    .options(header="true", delimiter=",")
    .schema(location_schema)
    .csv(f"{raw_folder_path}/{var_file_date}/Telco_customer_churn_location.csv")
)

### Step 2 - Select only required columns + Rename the columns + Add ingestion date

In [0]:
# Keep only the four columns needed downstream, giving each its snake_case
# name at the moment it is selected
location_renamed_df = location_df.select(
    col("Customer ID").alias("customer_id"),
    col("Zip Code").alias("zip_code"),
    col("Latitude").alias("latitude"),
    col("Longitude").alias("longitude"),
)

In [0]:
# add_ingestion is not defined in this notebook (presumably it is brought in by
# the %run of ../includes/common_functions - verify). The sample output of the
# resulting DataFrame shows an extra ingestion_time column.
location_final_df = add_ingestion(location_renamed_df)

In [0]:
# Optional sanity check, currently disabled: un-comment the line below to print
# summary statistics (count, mean, stddev, min, max) of the final DataFrame.
# Any output stored with this cell presumably dates from an earlier run in
# which the line was active.
# location_final_df.describe().show()

+-------+-----------+------------------+------------------+-------------------+
|summary|customer_id|          zip_code|          latitude|          longitude|
+-------+-----------+------------------+------------------+-------------------+
|  count|       6598|              6598|              6598|               6598|
|   mean|       NULL| 93522.09639284632| 36.21661099257365|-119.80349616717132|
| stddev|       NULL|1812.3485655719824|2.4334029577773046| 2.1608903865645774|
|    min| 0002-ORFBO|             90001|         32.555828|        -124.301372|
|    max| 9995-HOTOH|             96150|         41.962127|        -114.192901|
+-------+-----------+------------------+------------------+-------------------+



In [0]:
# Optional sanity check, currently disabled: un-comment the line below to print
# the first 5 rows of the final DataFrame.
# Any output stored with this cell presumably dates from an earlier run in
# which the line was active.
# location_final_df.show(5)

+-----------+--------+---------+-----------+--------------------+
|customer_id|zip_code| latitude|  longitude|      ingestion_time|
+-----------+--------+---------+-----------+--------------------+
| 8779-QRDMV|   90022| 34.02381|-118.156582|2024-09-19 09:45:...|
| 7495-OOKFY|   90063|34.044271|-118.185237|2024-09-19 09:45:...|
| 1658-BYGOY|   90065|34.108833|-118.229715|2024-09-19 09:45:...|
| 4598-XLKNJ|   90303|33.936291|-118.332639|2024-09-19 09:45:...|
| 4846-WHAFZ|   90602|33.972119|-118.020188|2024-09-19 09:45:...|
+-----------+--------+---------+-----------+--------------------+
only showing top 5 rows



### Step 3 - Write data

In [0]:
# Write the final DataFrame with the pre-defined
# create_or_upsert_managed_delta_table helper, which (per the notebook
# description) creates a new table or upserts into an existing table with the
# same schema. The helper is not defined in this notebook.
# NOTE(review): the target below is "customerchurn.silver.location", while the
# output stored with this cell mentions "customer_churn.silver.location" -
# confirm which catalog name is the intended one.
delta_table_catalog_path = "customerchurn.silver.location"
create_or_upsert_managed_delta_table(location_final_df, delta_table_catalog_path)

'A new table has been succesfully created at: customer_churn.silver.location'

In [0]:
# End the notebook run and return the string "Success" as its exit value -
# useful when this notebook is one of several run together, so the caller can
# check the outcome.
dbutils.notebook.exit("Success")