# Week 6 Assignment: File Ingestion Pipeline with PySpark
This notebook:
1. Generates sample CSV files for three file types
2. Ingests them with PySpark
3. Extracts date metadata from filenames
4. Writes out the data as Parquet (simulating a truncate-load)

No manual uploads are needed—just run all cells; the outputs are saved to your Google Drive, where you can download them.

## 1. Install and configure PySpark

In [None]:
# Environment setup: each `!` line runs as a shell command on the notebook host.
# Refresh the apt package index (-qq keeps the output quiet).
!apt-get update -qq
# Install a headless Java 8 JDK; Spark runs on the JVM, so a Java runtime is needed.
!apt-get install openjdk-8-jdk-headless -qq
# Install PySpark from PyPI.
# NOTE(review): the version is unpinned, so the latest release is installed on
# every run — confirm that release still supports Java 8, or pin a version.
!pip install pyspark

## 2. Mount Google Drive and set up directories

In [None]:
from google.colab import drive
import os
import pandas as pd

# Attach Google Drive under /content/drive
drive.mount('/content/drive')

# Drive folders used by the rest of the notebook:
#   DATA_DIR   - where the generated sample CSV files are written and read back
#   OUTPUT_DIR - where the Parquet output is written
DATA_DIR = '/content/drive/MyDrive/Week6Data'
OUTPUT_DIR = '/content/drive/MyDrive/Week6Output'

# Create both folders; exist_ok=True makes re-running this cell harmless
for folder in (DATA_DIR, OUTPUT_DIR):
    os.makedirs(folder, exist_ok=True)

for label, folder in (("Data directory:", DATA_DIR), ("Output directory:", OUTPUT_DIR)):
    print(label, folder)

## 3. Generate sample CSV files

In [None]:
# Dates (yyyyMMdd) embedded in the names of the dated sample files
dates = ['20191112', '20191113']

# Customer master rows; the same three rows are written for every date
df = pd.DataFrame({
    'CustID': [1, 2, 3],
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [30, 25, 35],
})

# Parent/child export rows; again identical for every date
mdf = pd.DataFrame({
    'ParentID': [1, 2, 3],
    'ChildID': [10, 20, 30],
    'Value': [100.0, 200.5, 300.75],
})

# E-commerce orders; a single undated file
odf = pd.DataFrame({
    'OrderID': [1001, 1002, 1003],
    'CustomerID': [1, 2, 3],
    'Amount': [250.5, 100.0, 450.75],
})

# Write the files in a fixed order: all CUST_MSTR files, then all
# master_child_export files, then the single orders file.
for d in dates:
    df.to_csv(f"{DATA_DIR}/CUST_MSTR_{d}.csv", index=False)

for d in dates:
    mdf.to_csv(f"{DATA_DIR}/master_child_export-{d}.csv", index=False)

odf.to_csv(f"{DATA_DIR}/H_ECOM_ORDER.csv", index=False)

# Confirm what ended up in the data folder
print("Sample files created:")
print(os.listdir(DATA_DIR))

## 4. Start Spark session

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, to_date, input_file_name, col

# Obtain the Spark session for this notebook; getOrCreate() reuses an already
# running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Week6_FileIngestion")
    .getOrCreate()
)

## 5. Read CSV files by pattern

In [None]:
# Reader settings shared by all three sources: the first row holds the column
# names, and column types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}

# The dated sources are read with a glob so every matching file lands in one DataFrame
cust_df = spark.read.csv(f"{DATA_DIR}/CUST_MSTR_*.csv", **csv_options)
mc_df = spark.read.csv(f"{DATA_DIR}/master_child_export-*.csv", **csv_options)

# The orders source is a single, undated file
orders_df = spark.read.csv(f"{DATA_DIR}/H_ECOM_ORDER.csv", **csv_options)

# Row counts as a quick check that each source was picked up
for label, frame in (
    ("CUST_MSTR count:", cust_df),
    ("master_child_export count:", mc_df),
    ("H_ECOM_ORDER count:", orders_df),
):
    print(label, frame.count())

## 6. Extract date information from filenames

In [None]:
# Each row is tagged with the path of the file it was read from (src_file), and
# the 8-digit date embedded in that file name is parsed into FileDate.
#
# The patterns are raw strings, so the literal dot before "csv" is written with
# a single backslash ("\."). Writing "\\." in a raw string produces the regex
# "literal backslash followed by any character", which cannot match these file
# names and therefore extracts no date.

# CUST_MSTR_<yyyyMMdd>.csv -> FileDate
cust_df = cust_df.withColumn("src_file", input_file_name()) \
    .withColumn("FileDate", to_date(regexp_extract(col("src_file"), r"CUST_MSTR_(\d{8})\.csv$", 1), "yyyyMMdd"))

# master_child_export-<yyyyMMdd>.csv -> DateKey (the raw 8-digit string) and
# FileDate (the same value parsed as a date)
mc_df = mc_df.withColumn("src_file", input_file_name()) \
    .withColumn("DateKey", regexp_extract(col("src_file"), r"master_child_export-(\d{8})\.csv$", 1)) \
    .withColumn("FileDate", to_date(col("DateKey"), "yyyyMMdd"))

# Orders: the file name carries no date, so only the source path is recorded
# for traceability
orders_df = orders_df.withColumn("src_file", input_file_name())

## 7. Show schemas and sample data

In [None]:
# For each dataset: print a heading, then its schema, then the first 5 rows
for title, frame in (
    ('CUST_MSTR Schema & Sample:', cust_df),
    ('master_child_export Schema & Sample:', mc_df),
    ('H_ECOM_ORDER Schema & Sample:', orders_df),
):
    print(title)
    frame.printSchema()
    frame.show(5)

## 8. Write output (truncate-load simulation)

In [None]:
# Each DataFrame is paired with the Parquet folder it is written to
targets = [
    (cust_df, f"{OUTPUT_DIR}/CUST_MSTR"),
    (mc_df, f"{OUTPUT_DIR}/master_child"),
    (orders_df, f"{OUTPUT_DIR}/H_ECOM_Orders"),
]

# mode("overwrite") replaces whatever already exists at the target path, which
# is how this notebook simulates a daily truncate-load
for frame, path in targets:
    frame.write.mode("overwrite").parquet(path)

# Report the destinations once all writes have finished
print("Data written to:")
for _, path in targets:
    print(f"- {path}")

## 9. Next Steps: Automation
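- Convert this notebook to a Python script (first remove or replace the Colab-only cells—the `!` install commands and the `google.colab` Drive mount—since they will not run outside Colab):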
  ```bash
  jupyter nbconvert --to script week6_assignment_complete.ipynb
  ```
- Schedule via `cron` on your local machine or using GitHub Actions for daily runs.
- When migrating to Azure Data Factory, map each code block to ADF activities.