In [None]:
# etl_transform.ipynb

import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Use seaborn's white-grid look for every figure drawn further down
sns.set(style='whitegrid')

# Step 1: Read the two cleaned input files from the 'data/' folder
# (first path -> raw_df, second path -> incremental_df)
raw_df, incremental_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/raw_data.csv', 'data/incremental_data.csv')
)

# Make sure the output folder is present; no error if it already exists
os.makedirs('transformed', exist_ok=True)

### ---------------- TRANSFORMATION 1: Remove Duplicates ---------------- ###
print("\n--- TRANSFORMATION 1: Remove Duplicate Rows ---")

# Tally of duplicated (True) vs. non-duplicated (False) rows before cleaning
print("Raw Data - Before:")
display(raw_df.duplicated().value_counts())

# Drop fully identical rows from both datasets (pandas keeps the first copy)
raw_df, incremental_df = raw_df.drop_duplicates(), incremental_df.drop_duplicates()

# Same tally again, to show what the de-duplication left behind
print("Raw Data - After:")
display(raw_df.duplicated().value_counts())

### ---------------- TRANSFORMATION 2: Handle Missing Values ---------------- ###
print("\n--- TRANSFORMATION 2: Handle Missing Values ---")
print("Missing values BEFORE (raw_df):")
display(raw_df.isnull().sum())

# NOTE(review): in the recorded run the data has no 'age' or 'email' column
# (the BEFORE and AFTER counts are identical), so this step was a no-op there.
# Confirm these are the intended column names for the dataset.

# Fill missing 'age' with the median of the same dataset. Each frame is
# checked on its own so that a column present in only one of the two files
# neither raises a KeyError nor gets silently skipped.
if 'age' in raw_df.columns:
    raw_df['age'] = raw_df['age'].fillna(raw_df['age'].median())
if 'age' in incremental_df.columns:
    incremental_df['age'] = incremental_df['age'].fillna(incremental_df['age'].median())

# Drop rows where 'email' is missing, again guarded per frame.
if 'email' in raw_df.columns:
    raw_df = raw_df.dropna(subset=['email'])
if 'email' in incremental_df.columns:
    incremental_df = incremental_df.dropna(subset=['email'])

print("Missing values AFTER (raw_df):")
display(raw_df.isnull().sum())

### ---------------- TRANSFORMATION 3: Convert Dates to Datetime ---------------- ###
print("\n--- TRANSFORMATION 3: Convert signup_date to datetime ---")
# NOTE(review): the recorded run printed nothing for this step, i.e. raw_df had
# no 'signup_date' column (its date column there is 'order_date'). Confirm
# which column this step is meant to convert.

# Each frame is guarded separately so that a 'signup_date' column missing from
# one dataset cannot raise a KeyError while converting the other.
# errors='coerce' turns unparseable values into NaT instead of raising.
if 'signup_date' in raw_df.columns:
    print("Before:")
    display(raw_df['signup_date'].head())

    raw_df['signup_date'] = pd.to_datetime(raw_df['signup_date'], errors='coerce')

    print("After:")
    display(raw_df['signup_date'].head())

if 'signup_date' in incremental_df.columns:
    incremental_df['signup_date'] = pd.to_datetime(incremental_df['signup_date'], errors='coerce')

### ---------------- TRANSFORMATION 4: Add Enrichment Column ---------------- ###
print("\n--- TRANSFORMATION 4: Add total_price column = quantity * unit_price ---")
# total_price is the element-wise product quantity * unit_price; a row missing
# either input ends up with a missing total_price as well.
# Each frame is guarded separately so that an input column absent from one
# dataset cannot raise a KeyError while enriching the other.
if 'quantity' in raw_df.columns and 'unit_price' in raw_df.columns:
    raw_df['total_price'] = raw_df['quantity'] * raw_df['unit_price']

    print("New column total_price added:")
    display(raw_df[['quantity', 'unit_price', 'total_price']].head())

if 'quantity' in incremental_df.columns and 'unit_price' in incremental_df.columns:
    incremental_df['total_price'] = incremental_df['quantity'] * incremental_df['unit_price']

### ---------------- TRANSFORMATION 5 (Optional): Categorize Age ---------------- ###
print("\n--- (Optional) TRANSFORMATION 5: Categorize Customers by Age Groups ---")
def categorize_age(age):
    """Map a numeric age to a coarse age-group label.

    Args:
        age: Age as a number; may be missing (None, NaN or pd.NA).

    Returns:
        'Teen' for ages below 18, 'Young Adult' for ages from 18 up to (but
        not including) 35, 'Adult' for ages from 35 up to (but not including)
        60, 'Senior' for 60 and above, or None when the age is missing.
    """
    # A missing age must not be classified: NaN fails every comparison below
    # and would fall through to 'Senior', while None would raise a TypeError.
    if pd.isna(age):
        return None
    # Thresholds are tested in ascending order, so each branch only needs
    # its upper bound.
    if age < 18:
        return 'Teen'
    if age < 35:
        return 'Young Adult'
    if age < 60:
        return 'Adult'
    return 'Senior'

# Derive 'age_group' from 'age' via categorize_age. Each frame is guarded
# separately so that an 'age' column absent from one dataset cannot raise a
# KeyError while labelling the other.
if 'age' in raw_df.columns:
    raw_df['age_group'] = raw_df['age'].apply(categorize_age)

    print("New column age_group added:")
    display(raw_df[['age', 'age_group']].head())

if 'age' in incremental_df.columns:
    incremental_df['age_group'] = incremental_df['age'].apply(categorize_age)

### ---------------- Save the Transformed Files ---------------- ###
# Persist both transformed frames as CSV, leaving the index out of the files
raw_df.to_csv('transformed/transformed_full.csv', index=False)
incremental_df.to_csv('transformed/transformed_incremental.csv', index=False)
print("\nTransformed files saved to 'transformed/' folder.")

# Read the files straight back so the previews show what was actually written.
# NOTE: no parse_dates is passed, so date columns are not re-parsed on reload.
full_df, inc_df = (
    pd.read_csv(saved_csv)
    for saved_csv in (
        'transformed/transformed_full.csv',
        'transformed/transformed_incremental.csv',
    )
)

# Preview the first five rows of each reloaded dataset
print("Transformed Full Dataset:")
display(full_df.head(n=5))

print("\nTransformed Incremental Dataset:")
display(inc_df.head(n=5))

### ---------------- VISUALIZATION SECTION ---------------- ###
print("\n--- DATA VISUALIZATIONS ---")

# 1. Age Distribution: 20-bin histogram of raw_df['age'] with a KDE curve
if 'age' in raw_df.columns:
    plt.figure(figsize=(8, 5))
    # histplot draws on the figure created above and hands back its Axes
    age_ax = sns.histplot(raw_df['age'], bins=20, kde=True, color='steelblue')
    age_ax.set_title('Customer Age Distribution')
    age_ax.set_xlabel('Age')
    age_ax.set_ylabel('Frequency')
    plt.show(block=True)

# 2. Total Sales Over Time: total_price summed per calendar day of signup_date
if {'signup_date', 'total_price'}.issubset(raw_df.columns):
    # .dt.date reduces each timestamp to its calendar date for grouping
    signup_day = raw_df['signup_date'].dt.date
    sales_by_day = raw_df.groupby(signup_day)['total_price'].sum()

    plt.figure(figsize=(10, 5))
    # Series.plot draws on the figure created above and returns its Axes
    sales_ax = sales_by_day.plot(marker='o', color='green')
    sales_ax.set_title('Total Sales Over Time')
    sales_ax.set_xlabel('Signup Date')
    sales_ax.set_ylabel('Total Price')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# 3. Age Group Breakdown: pie chart of how many rows fall in each age_group
if 'age_group' in raw_df.columns:
    group_counts = raw_df['age_group'].value_counts()
    # Each slice is labelled with its share, formatted to one decimal place
    pie_ax = group_counts.plot.pie(autopct='%1.1f%%', startangle=90, figsize=(6, 6))
    pie_ax.set_title('Customer Age Group Distribution')
    # Blank out the y-axis label
    pie_ax.set_ylabel('')
    plt.show()


--- TRANSFORMATION 1: Remove Duplicate Rows ---
Raw Data - Before:


False    99
True      1
Name: count, dtype: int64

Raw Data - After:


False    99
Name: count, dtype: int64


--- TRANSFORMATION 2: Handle Missing Values ---
Missing values BEFORE (raw_df):


order_id          0
customer_name     1
product           0
quantity         26
unit_price       35
order_date        1
region           25
dtype: int64

Missing values AFTER (raw_df):


order_id          0
customer_name     1
product           0
quantity         26
unit_price       35
order_date        1
region           25
dtype: int64


--- TRANSFORMATION 3: Convert signup_date to datetime ---

--- TRANSFORMATION 4: Add total_price column = quantity * unit_price ---
New column total_price added:


Unnamed: 0,quantity,unit_price,total_price
0,,500.0,
1,,,
2,2.0,250.0,500.0
3,2.0,750.0,1500.0
4,3.0,,



--- (Optional) TRANSFORMATION 5: Categorize Customers by Age Groups ---

Transformed files saved to 'transformed/' folder.
Transformed Full Dataset:


Unnamed: 0,order_id,customer_name,product,quantity,unit_price,order_date,region,total_price
0,1,Diana,Tablet,,500.0,2024-01-20,South,
1,2,Eve,Laptop,,,2024-04-29,North,
2,3,Charlie,Laptop,2.0,250.0,2024-01-08,,500.0
3,4,Eve,Laptop,2.0,750.0,2024-01-07,West,1500.0
4,5,Eve,Tablet,3.0,,2024-03-07,South,



Transformed Incremental Dataset:


Unnamed: 0,order_id,customer_name,product,quantity,unit_price,order_date,region,total_price
0,101,Alice,Laptop,,900.0,2024-05-09,Central,
1,102,,Laptop,1.0,300.0,2024-05-07,Central,300.0
2,103,,Laptop,1.0,600.0,2024-05-04,Central,600.0
3,104,,Tablet,,300.0,2024-05-26,Central,
4,105,Heidi,Tablet,2.0,600.0,2024-05-21,North,1200.0



--- DATA VISUALIZATIONS ---
