In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

**Import Data and Data Clean**

In [None]:
# Directory holding the raw CSV extracts.
base = Path("/content/data/raw")
# Sort the matches: glob order depends on the filesystem, and a stable order keeps
# the load log and every per-table summary reproducible between runs.
files = sorted(base.glob("*.csv"))

In [None]:
# Read each discovered CSV into a DataFrame, keyed by its file name (e.g. "Sales.csv"),
# and log the shape of every table as it is loaded.
dfs = {}
for f in files:
    dfs[f.name] = pd.read_csv(f)
    df = dfs[f.name]
    print(f"Loaded {f.name} — {df.shape[0]} rows, {df.shape[1]} columns")

Loaded Sales.csv — 135 rows, 6 columns
Loaded Stores.csv — 7 rows, 4 columns
Loaded BundledProducts_Products.csv — 20 rows, 2 columns
Loaded Customers.csv — 12 rows, 5 columns
Loaded Products.csv — 17 rows, 4 columns
Loaded Categories.csv — 5 rows, 2 columns
Loaded BundledProductsPromotions.csv — 9 rows, 2 columns
Loaded Promotions.csv — 12 rows, 5 columns
Loaded BundledProducts.csv — 9 rows, 2 columns


Check if data loaded correctly



In [None]:
# Preview the first rows of the Sales table to confirm it was parsed as expected.
dfs["Sales.csv"].head()

Unnamed: 0,OrderID,CustomerID,BundledProductID,OrderDate,SalesAmount,StoreID
0,1001.0,5001.0,3001.0,15/01/2024,150.0,101.0
1,1002.0,5002.0,3002.0,16/01/2024,200.0,102.0
2,1003.0,5003.0,3003.0,17/01/2024,300.0,101.0
3,1004.0,5001.0,3001.0,15/02/2024,100.0,103.0
4,1005.0,5002.0,3002.0,16/02/2024,250.0,101.0


In [None]:
# Show the first rows of every loaded table, each preceded by its file name.
for name in dfs:
    print(name)
    display(dfs[name].head())

Sales.csv


Unnamed: 0,OrderID,CustomerID,BundledProductID,OrderDate,SalesAmount,StoreID
0,1001.0,5001.0,3001.0,15/01/2024,150.0,101.0
1,1002.0,5002.0,3002.0,16/01/2024,200.0,102.0
2,1003.0,5003.0,3003.0,17/01/2024,300.0,101.0
3,1004.0,5001.0,3001.0,15/02/2024,100.0,103.0
4,1005.0,5002.0,3002.0,16/02/2024,250.0,101.0


Stores.csv


Unnamed: 0,StoreID,StoreName,Location,Manager
0,101,Store A,New York,Alice
1,102,Store B,Los Angeles,Bob
2,103,Store C,Chicago,Charlie
3,104,Store D,San Francisco,David
4,105,Store E,Miami,Eve


BundledProducts_Products.csv


Unnamed: 0,BundledProductID,ProductID
0,3001,2001
1,3001,2002
2,3001,2003
3,3002,2004
4,3002,2005


Customers.csv


Unnamed: 0,CustomerID,Name,Age,Gender,City
0,5001,John Doe,28.0,M,New York
1,5002,Jane Smith,34.0,F,Los Angeles
2,5003,Jim Brown,45.0,M,Chicago
3,5004,Susan Green,23.0,F,San Francisco
4,5005,Tom White,39.0,M,Miami


Products.csv


Unnamed: 0,ProductID,ProductName,CategoryID,Price
0,2001,Product A,100,150.0
1,2002,Product B,101,200.0
2,2003,Product C,102,300.0
3,2004,Product D,103,100.0
4,2005,Product E,104,350.0


Categories.csv


Unnamed: 0,CategoryID,CategoryName
0,100,Electronics
1,101,Clothing
2,102,Home & Kitchen
3,103,Sports
4,104,Books


BundledProductsPromotions.csv


Unnamed: 0,BundledProductID,PromotionID
0,3001,4001
1,3002,4002
2,3003,4003
3,3004,4004
4,3005,4005


Promotions.csv


Unnamed: 0,PromotionID,PromotionName,StartDate,EndDate,DiscountPercentage
0,4001,Promo A,2024-01-01,2024-01-31,10
1,4002,Promo B,2024-02-01,2024-02-28,15
2,4003,Promo C,2024-03-01,2024-03-31,20
3,4004,Promo D,2024-04-01,2024-04-30,25
4,4005,Promo E,2024-05-01,2024-05-31,30


BundledProducts.csv


Unnamed: 0,BundledProductID,BundledProductName
0,3001,Bundle A
1,3002,Bundle B
2,3003,Bundle C
3,3004,Bundle D
4,3005,Bundle E


Data Profiling Summary



In [None]:

def profile_datasets(frames):
    """Build a one-row-per-table data-quality summary.

    Parameters
    ----------
    frames : dict[str, pandas.DataFrame]
        Tables to profile, keyed by the label shown in the ``Dataset`` column.

    Returns
    -------
    pandas.DataFrame
        Columns: Dataset, Rows, Columns, Missing Cells %, Rows w/ Missing %,
        Duplicates, Duplicates %. Percentages are rounded to 2 decimals and
        reported as 0 for a table with no rows/cells instead of dividing by zero.
    """
    records = []
    for name, df in frames.items():
        # basic info
        total_rows, total_cols = df.shape
        total_cells = total_rows * total_cols

        # missing values: per cell, and per row with at least one missing cell
        null_mask = df.isna()
        missing_cells = int(null_mask.sum().sum())
        rows_with_missing = int(null_mask.any(axis=1).sum())

        # duplicates: fully identical rows, not counting the first occurrence
        dup_count = int(df.duplicated().sum())

        records.append({
            "Dataset": name,
            "Rows": total_rows,
            "Columns": total_cols,
            "Missing Cells %": round((missing_cells / total_cells) * 100, 2) if total_cells > 0 else 0,
            "Rows w/ Missing %": round((rows_with_missing / total_rows) * 100, 2) if total_rows > 0 else 0,
            "Duplicates": dup_count,
            "Duplicates %": round((dup_count / total_rows) * 100, 2) if total_rows > 0 else 0,
        })
    return pd.DataFrame(records)


summary_df = profile_datasets(dfs)
summary_df


Unnamed: 0,Dataset,Rows,Columns,Missing Cells %,Rows w/ Missing %,Duplicates,Duplicates %
0,Sales.csv,135,6,3.7,11.85,5,3.7
1,Stores.csv,7,4,0.0,0.0,0,0.0
2,BundledProducts_Products.csv,20,2,0.0,0.0,0,0.0
3,Customers.csv,12,5,3.33,16.67,1,8.33
4,Products.csv,17,4,1.47,5.88,1,5.88
5,Categories.csv,5,2,0.0,0.0,0,0.0
6,BundledProductsPromotions.csv,9,2,0.0,0.0,0,0.0
7,Promotions.csv,12,5,0.0,0.0,0,0.0
8,BundledProducts.csv,9,2,0.0,0.0,0,0.0


  Observation:
  - Duplicates need to be deduplicated
  - Missing values in [Sales], [Products] and [Customers] need to be handled. Because the missing percentage is relatively small, we can either safely drop some rows or impute them (numeric columns with the median, categorical columns with 'Unknown', etc.). However, if a primary key is missing we need to handle those rows specifically, and if foreign keys are missing we need to investigate and decide on an approach
  - For outliers we can have better visualization in PBI later on

Data Clean

In [None]:
# Holds the cleaned version of each table, keyed by the same file name used in `dfs`.
cleaned_dfs = {}

In [None]:
def _strip_if_str(value):
    """Return `value` without surrounding whitespace if it is a string, else unchanged."""
    return value.strip() if isinstance(value, str) else value


# Spellings treated as a missing value once whitespace has been trimmed.
NULL_TOKENS = ["", " ", "None", "NULL", "null", "N/A", "na", "-", "--"]

for name, df in dfs.items():
    # Trim whitespace cell by cell. DataFrame.applymap is deprecated in favour of
    # DataFrame.map (pandas >= 2.1); fall back to applymap on older versions.
    if hasattr(pd.DataFrame, "map"):
        df = df.map(_strip_if_str)
    else:
        df = df.applymap(_strip_if_str)

    # Standardise the nulls before removing null values
    df = df.replace(NULL_TOKENS, np.nan)

    # Deduplicate fully identical rows (the first occurrence is kept)
    df = df.drop_duplicates()

    # If all columns are missing for one row, it is safe to drop
    df = df.dropna(how="all")

    cleaned_dfs[name] = df



  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


**Define and validate primary keys**

For table [BundledProducts_Products], BundledProductID and ProductID have a many-to-many relationship, so we create a surrogate key for this table

For table [BundledProductsPromotions], the relationship between BundledProductID and PromotionID looks 1:1 in the data; however, from a business perspective a many-to-many relationship makes more sense, so we also create a surrogate key for this table

In [None]:

def _with_bridge_key(frame, pair_cols, key_name):
    """Return a copy of `frame` reduced to unique `pair_cols` pairs, with a surrogate key.

    The key is a 1-based auto-increment integer inserted as the first column.
    An existing `key_name` column is dropped first, so re-running this cell
    regenerates the key instead of failing on a duplicate-column insert.
    """
    bridge = (
        frame.drop(columns=[key_name], errors="ignore")
        .drop_duplicates(subset=pair_cols)
        .reset_index(drop=True)
    )
    bridge.insert(0, key_name, range(1, len(bridge) + 1))
    return bridge


# BundledProducts_Products bridge
bp = _with_bridge_key(
    cleaned_dfs["BundledProducts_Products.csv"],
    ["BundledProductID", "ProductID"],
    "BP_Prod_BridgeID",
)
cleaned_dfs["BundledProducts_Products.csv"] = bp

# BundledProductsPromotions bridge
bpp = _with_bridge_key(
    cleaned_dfs["BundledProductsPromotions.csv"],
    ["BundledProductID", "PromotionID"],
    "BP_Promo_BridgeID",
)
cleaned_dfs["BundledProductsPromotions.csv"] = bpp


In [None]:
# Primary-key column(s) expected in each cleaned table.
primary_keys = {
    "Products.csv": ["ProductID"],
    "BundledProducts.csv": ["BundledProductID"],
    "Promotions.csv": ["PromotionID"],
    "Customers.csv": ["CustomerID"],
    "Stores.csv": ["StoreID"],
    "Categories.csv": ["CategoryID"],
    "BundledProducts_Products.csv": ["BP_Prod_BridgeID"],
    "BundledProductsPromotions.csv": ["BP_Promo_BridgeID"],
    "Sales.csv": ["OrderID"]
}

# For every table, report how many rows have a null key and how many repeat an earlier key.
pk_results = []
for name, df in cleaned_dfs.items():
    pks = primary_keys.get(name, [])
    missing_cols = [c for c in pks if c not in df.columns]

    if not pks:
        # No key configured for this table.
        pk_label, miss_rows, dup_rows = "None defined", "N/A", "N/A"
    elif missing_cols:
        # A configured key column is absent from the table.
        pk_label, miss_rows, dup_rows = f"Missing columns: {missing_cols}", "N/A", "N/A"
    else:
        key_frame = df[pks]
        pk_label = ", ".join(pks)
        miss_rows = int(key_frame.isna().any(axis=1).sum())
        dup_rows = int(key_frame.duplicated().sum())

    pk_results.append({"Dataset": name, "PK": pk_label, "Missing PK values": miss_rows, "Duplicate PKs": dup_rows})

pd.DataFrame(pk_results)


Unnamed: 0,Dataset,PK,Missing PK values,Duplicate PKs
0,Sales.csv,OrderID,0,0
1,Stores.csv,StoreID,0,0
2,BundledProducts_Products.csv,BP_Prod_BridgeID,0,0
3,Customers.csv,CustomerID,0,0
4,Products.csv,ProductID,0,0
5,Categories.csv,CategoryID,0,0
6,BundledProductsPromotions.csv,BP_Promo_BridgeID,0,0
7,Promotions.csv,PromotionID,0,0
8,BundledProducts.csv,BundledProductID,0,0


Primary keys are all clean. Next we need to check the foreign keys and missing values. From inspecting the datasets, the [Sales] table is the only one that needs specific handling

In [None]:
sales = cleaned_dfs["Sales.csv"]

# Foreign keys in Sales
key_columns = [ "CustomerID", "StoreID", "BundledProductID"]

# Count nulls per foreign-key column, then express each count as a share of all Sales rows.
total = len(sales)
null_counts = {col: int(sales[col].isna().sum()) for col in key_columns}
null_summary = [
    {
        "Column": col,
        "Missing Count": count,
        "Missing %": round((count / total) * 100, 2),
    }
    for col, count in null_counts.items()
]

missing_df = pd.DataFrame(null_summary)
display(missing_df)


Unnamed: 0,Column,Missing Count,Missing %
0,CustomerID,8,6.2
1,StoreID,0,0.0
2,BundledProductID,0,0.0


CustomerID in [Sales] has 8 null values. We assign these rows CustomerID = -1 and add a CustomerID = -1 row to the [Customers] dimension table labelled 'Unknown', so the gap has a clear explanation and PBI can still join these rows successfully

In [None]:
# Point Sales rows that have no CustomerID at the sentinel key -1
missing_customer = sales["CustomerID"].isna()
sales.loc[missing_customer, "CustomerID"] = -1

In [None]:
customers = cleaned_dfs["Customers.csv"]

# Add the "unknown" dimension row that Sales rows with CustomerID = -1 join to.
# The membership check keeps the cell safe to re-run without appending the row twice.
if -1 not in customers["CustomerID"].values:
    unknown_row = {
        "CustomerID": -1,
        "Name": "Unknown Customer"
    }
    # Label the text attributes too, so the row appears as an explicit 'Unknown'
    # category rather than blanks. Age is left missing: there is no meaningful
    # numeric placeholder for it.
    unknown_row.update(
        {col: "Unknown" for col in ("Gender", "City") if col in customers.columns}
    )
    customers = pd.concat([customers, pd.DataFrame([unknown_row])], ignore_index=True)

cleaned_dfs["Sales.csv"] = sales
cleaned_dfs["Customers.csv"] = customers

In [None]:
# Re-run the data profiling on cleaned_dfs to confirm the effect of the cleaning steps
summary = []

for name, df in cleaned_dfs.items():
    # table dimensions
    total_rows, total_cols = df.shape

    # share of missing cells, and share of rows with at least one missing cell
    null_mask = df.isna()
    missing_cells = null_mask.sum().sum()
    missing_pct_cells = round((missing_cells / (total_rows * total_cols)) * 100, 2)
    missing_pct_rows = round(null_mask.any(axis=1).mean() * 100, 2)

    # fully duplicated rows (first occurrence not counted)
    dup_count = df.duplicated().sum()
    if total_rows > 0:
        dup_pct = round((dup_count / total_rows) * 100, 2)
    else:
        dup_pct = 0

    row = {
        "Dataset": name,
        "Rows": total_rows,
        "Columns": total_cols,
        "Missing Cells %": missing_pct_cells,
        "Rows w/ Missing %": missing_pct_rows,
        "Duplicates": dup_count,
        "Duplicates %": dup_pct,
    }
    summary.append(row)

summary_df = pd.DataFrame(summary)
summary_df

Unnamed: 0,Dataset,Rows,Columns,Missing Cells %,Rows w/ Missing %,Duplicates,Duplicates %
0,Sales.csv,129,6,1.29,7.75,0,0.0
1,Stores.csv,7,4,0.0,0.0,0,0.0
2,BundledProducts_Products.csv,20,3,0.0,0.0,0,0.0
3,Customers.csv,12,5,8.33,25.0,0,0.0
4,Products.csv,16,4,1.56,6.25,0,0.0
5,Categories.csv,5,2,0.0,0.0,0,0.0
6,BundledProductsPromotions.csv,9,3,0.0,0.0,0,0.0
7,Promotions.csv,12,5,0.0,0.0,0,0.0
8,BundledProducts.csv,9,2,0.0,0.0,0,0.0


In [None]:
# Write every cleaned table to the clean-data folder under a lower-cased file name.
# Uses pathlib (imported at the top of the notebook) instead of a mid-notebook `import os`.
clean_dir = Path("/content/data/clean")
clean_dir.mkdir(parents=True, exist_ok=True)

for name, df in cleaned_dfs.items():
    save_path = clean_dir / name.lower()

    # index=False: the row index is not written to the file.
    df.to_csv(save_path, index=False)
    print(f"Saved {save_path} ({len(df):,} rows)")


✅ Saved /content/data/clean/sales.csv (129 rows)
✅ Saved /content/data/clean/stores.csv (7 rows)
✅ Saved /content/data/clean/bundledproducts_products.csv (20 rows)
✅ Saved /content/data/clean/customers.csv (12 rows)
✅ Saved /content/data/clean/products.csv (16 rows)
✅ Saved /content/data/clean/categories.csv (5 rows)
✅ Saved /content/data/clean/bundledproductspromotions.csv (9 rows)
✅ Saved /content/data/clean/promotions.csv (12 rows)
✅ Saved /content/data/clean/bundledproducts.csv (9 rows)


In [None]:
!zip -r /content/data_cleaned.zip /content/data
from google.colab import files
files.download('/content/data_cleaned.zip')


  adding: content/data/ (stored 0%)
  adding: content/data/raw/ (stored 0%)
  adding: content/data/raw/Sales.csv (deflated 73%)
  adding: content/data/raw/Stores.csv (deflated 32%)
  adding: content/data/raw/BundledProducts_Products.csv (deflated 59%)
  adding: content/data/raw/Customers.csv (deflated 41%)
  adding: content/data/raw/Products.csv (deflated 61%)
  adding: content/data/raw/Categories.csv (deflated 10%)
  adding: content/data/raw/BundledProductsPromotions.csv (deflated 38%)
  adding: content/data/raw/Promotions.csv (deflated 59%)
  adding: content/data/raw/BundledProducts.csv (deflated 51%)
  adding: content/data/clean/ (stored 0%)
  adding: content/data/clean/bundledproducts_products.csv (deflated 53%)
  adding: content/data/clean/bundledproducts.csv (deflated 51%)
  adding: content/data/clean/promotions.csv (deflated 59%)
  adding: content/data/clean/stores.csv (deflated 31%)
  adding: content/data/clean/sales.csv (deflated 77%)
  adding: content/data/clean/customers.csv

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>