In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

**Import Data**

In [None]:
# Directory that holds the raw CSV extracts.
base = Path("/content/sample_data/data")
# Sort the matches so tables are loaded (and reported) in the same order on
# every run; glob() returns them in arbitrary filesystem order.
files = sorted(base.glob("*.csv"))

In [None]:
# Read every discovered CSV into a DataFrame keyed by its file name
# (e.g. "Sales.csv") and print each table's shape as it loads.
if not files:
    # Fail here with a clear message instead of a KeyError in a later cell.
    raise FileNotFoundError(f"No CSV files found in {base}")

dfs = {}
for f in files:
    df = pd.read_csv(f)
    dfs[f.name] = df
    print(f"Loaded {f.name} — {df.shape[0]} rows, {df.shape[1]} columns")

Loaded Sales.csv — 135 rows, 6 columns
Loaded Categories.csv — 5 rows, 2 columns
Loaded BundledProducts_Products.csv — 20 rows, 2 columns
Loaded Stores.csv — 7 rows, 4 columns
Loaded Promotions.csv — 12 rows, 5 columns
Loaded BundledProductsPromotions.csv — 9 rows, 2 columns
Loaded BundledProducts.csv — 9 rows, 2 columns
Loaded Products.csv — 17 rows, 4 columns
Loaded Customers.csv — 12 rows, 5 columns


**Check if data loaded correctly**

In [None]:
# Spot-check the sales table: show its first five rows.
dfs["Sales.csv"].head(n=5)

Unnamed: 0,OrderID,CustomerID,BundledProductID,OrderDate,SalesAmount,StoreID
0,1001.0,5001.0,3001.0,15/01/2024,150.0,101.0
1,1002.0,5002.0,3002.0,16/01/2024,200.0,102.0
2,1003.0,5003.0,3003.0,17/01/2024,300.0,101.0
3,1004.0,5001.0,3001.0,15/02/2024,100.0,103.0
4,1005.0,5002.0,3002.0,16/02/2024,250.0,101.0


In [None]:
# Preview every loaded table: file name first, then its first five rows.
for name in dfs:
    df = dfs[name]
    print(name)
    display(df.head(n=5))

Sales.csv


Unnamed: 0,OrderID,CustomerID,BundledProductID,OrderDate,SalesAmount,StoreID
0,1001.0,5001.0,3001.0,15/01/2024,150.0,101.0
1,1002.0,5002.0,3002.0,16/01/2024,200.0,102.0
2,1003.0,5003.0,3003.0,17/01/2024,300.0,101.0
3,1004.0,5001.0,3001.0,15/02/2024,100.0,103.0
4,1005.0,5002.0,3002.0,16/02/2024,250.0,101.0


Categories.csv


Unnamed: 0,CategoryID,CategoryName
0,100,Electronics
1,101,Clothing
2,102,Home & Kitchen
3,103,Sports
4,104,Books


BundledProducts_Products.csv


Unnamed: 0,BundledProductID,ProductID
0,3001,2001
1,3001,2002
2,3001,2003
3,3002,2004
4,3002,2005


Stores.csv


Unnamed: 0,StoreID,StoreName,Location,Manager
0,101,Store A,New York,Alice
1,102,Store B,Los Angeles,Bob
2,103,Store C,Chicago,Charlie
3,104,Store D,San Francisco,David
4,105,Store E,Miami,Eve


Promotions.csv


Unnamed: 0,PromotionID,PromotionName,StartDate,EndDate,DiscountPercentage
0,4001,Promo A,2024-01-01,2024-01-31,10
1,4002,Promo B,2024-02-01,2024-02-28,15
2,4003,Promo C,2024-03-01,2024-03-31,20
3,4004,Promo D,2024-04-01,2024-04-30,25
4,4005,Promo E,2024-05-01,2024-05-31,30


BundledProductsPromotions.csv


Unnamed: 0,BundledProductID,PromotionID
0,3001,4001
1,3002,4002
2,3003,4003
3,3004,4004
4,3005,4005


BundledProducts.csv


Unnamed: 0,BundledProductID,BundledProductName
0,3001,Bundle A
1,3002,Bundle B
2,3003,Bundle C
3,3004,Bundle D
4,3005,Bundle E


Products.csv


Unnamed: 0,ProductID,ProductName,CategoryID,Price
0,2001,Product A,100,150.0
1,2002,Product B,101,200.0
2,2003,Product C,102,300.0
3,2004,Product D,103,100.0
4,2005,Product E,104,350.0


Customers.csv


Unnamed: 0,CustomerID,Name,Age,Gender,City
0,5001,John Doe,28.0,M,New York
1,5002,Jane Smith,34.0,F,Los Angeles
2,5003,Jim Brown,45.0,M,Chicago
3,5004,Susan Green,23.0,F,San Francisco
4,5005,Tom White,39.0,M,Miami


**Data Profiling Summary**

In [None]:

# Build one profiling row per table: size, missing-value rates and the
# number of fully duplicated rows.
summary = []

for name, df in dfs.items():
    # --- basic info ---
    total_rows, total_cols = df.shape
    total_cells = total_rows * total_cols

    # --- missing value analysis ---
    na_mask = df.isna()
    missing_cells = na_mask.sum().sum()
    if total_cells > 0:
        missing_pct_cells = round((missing_cells / total_cells) * 100, 2)
        missing_pct_rows = round(na_mask.any(axis=1).mean() * 100, 2)
    else:
        # An empty table has nothing missing: report 0 instead of dividing
        # by zero, matching the guard used for the duplicate percentage.
        missing_pct_cells = 0
        missing_pct_rows = 0

    # --- duplicates (rows identical across every column) ---
    dup_count = df.duplicated().sum()
    dup_pct = round((dup_count / total_rows) * 100, 2) if total_rows > 0 else 0

    # --- summary dictionary ---
    summary.append({
        "Dataset": name,
        "Rows": total_rows,
        "Columns": total_cols,
        "Missing Cells %": missing_pct_cells,
        "Rows w/ Missing %": missing_pct_rows,
        "Duplicates": dup_count,
        "Duplicates %": dup_pct
    })

summary_df = pd.DataFrame(summary)
summary_df


Unnamed: 0,Dataset,Rows,Columns,Missing Cells %,Rows w/ Missing %,Duplicates,Duplicates %
0,Sales.csv,135,6,3.7,11.85,5,3.7
1,Categories.csv,5,2,0.0,0.0,0,0.0
2,BundledProducts_Products.csv,20,2,0.0,0.0,0,0.0
3,Stores.csv,7,4,0.0,0.0,0,0.0
4,Promotions.csv,12,5,0.0,0.0,0,0.0
5,BundledProductsPromotions.csv,9,2,0.0,0.0,0,0.0
6,BundledProducts.csv,9,2,0.0,0.0,0,0.0
7,Products.csv,17,4,1.47,5.88,1,5.88
8,Customers.csv,12,5,3.33,16.67,1,8.33


  Observations:
  - Duplicate rows need to be removed.
  - Missing values in [Sales], [Products] and [Customers] need to be handled. Because the missing percentage is relatively small, we can either safely drop the affected rows or impute them (numeric columns with the median, categorical columns with 'unknown', etc.). However, rows where the primary key is missing need to be handled separately.
  - Outliers can be examined with better visualizations in PBI later on.

**Clean the data**

In [None]:
# Cleaned tables, keyed by source file name; filled in by the cleaning loop.
cleaned_dfs = dict()

In [None]:
# Placeholder strings that should be treated as missing values.
NULL_TOKENS = ["", " ", "None", "NULL", "null", "N/A", "na", "-", "--"]


def _strip_if_str(value):
    """Return `value` with surrounding whitespace removed if it is a string."""
    return value.strip() if isinstance(value, str) else value


# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas 2.1+).
# Use whichever this pandas version provides so the cell runs without the
# deprecation warning while still working on older versions.
_ELEMENTWISE = "map" if hasattr(pd.DataFrame, "map") else "applymap"

for name, df in dfs.items():
    # Trim whitespace from every string cell
    df = getattr(df, _ELEMENTWISE)(_strip_if_str)

    # Standardise null placeholders before removing null values
    df = df.replace(NULL_TOKENS, np.nan)

    # Deduplicate
    df = df.drop_duplicates()

    # A row with every column missing carries no information; safe to drop
    df = df.dropna(how="all")

    cleaned_dfs[name] = df



  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


**Define and validate primary keys**

In [None]:
# Declared primary key column(s) for each table.
primary_keys = {
    "Products.csv": ["ProductID"],
    "BundledProducts.csv": ["BundledProductID"],
    "Promotions.csv": ["PromotionID"],
    "Customers.csv": ["CustomerID"],
    "Stores.csv": ["StoreID"],
    "Categories.csv": ["CategoryID"],
    # Many-to-many relationship: composite key.
    "BundledProducts_Products.csv": ["BundledProductID", "ProductID"],
    # Many-to-many relationship: composite key. The current data looks 1:1,
    # but a many-to-many business rule makes more sense.
    "BundledProductsPromotions.csv": ["BundledProductID", "PromotionID"],
    # Replace with the real unique column if a different one exists.
    "Sales.csv": ["OrderID"],
}

# Check each cleaned table for rows with a missing key value and for rows
# that repeat an earlier key.
pk_results = []
for name, df in cleaned_dfs.items():
    pks = primary_keys.get(name, [])
    missing_cols = [col for col in pks if col not in df.columns]

    if not pks:
        # No key declared for this table.
        row = {
            "Dataset": name,
            "PK": "None defined",
            "Missing PK values": "N/A",
            "Duplicate PKs": "N/A",
        }
    elif missing_cols:
        # A declared key column is absent from the table.
        row = {
            "Dataset": name,
            "PK": f"Missing columns: {missing_cols}",
            "Missing PK values": "N/A",
            "Duplicate PKs": "N/A",
        }
    else:
        key_has_null = df[pks].isna().any(axis=1)
        key_repeated = df.duplicated(subset=pks)
        row = {
            "Dataset": name,
            "PK": ", ".join(pks),
            "Missing PK values": int(key_has_null.sum()),
            "Duplicate PKs": int(key_repeated.sum()),
        }

    pk_results.append(row)

pd.DataFrame(pk_results)


Unnamed: 0,Dataset,PK,Missing PK values,Duplicate PKs
0,Customers.csv,CustomerID,0,0
1,Sales.csv,OrderID,0,0
2,Categories.csv,CategoryID,0,0
3,BundledProducts_Products.csv,"BundledProductID, ProductID",0,0
4,Stores.csv,StoreID,0,0
5,Promotions.csv,PromotionID,0,0
6,BundledProductsPromotions.csv,"BundledProductID, PromotionID",0,0
7,BundledProducts.csv,BundledProductID,0,0
8,Products.csv,ProductID,0,0


In [None]:
def _join_or_none(cols):
    """Comma-join the given column names, or return "None" when there are none."""
    return ", ".join(cols) if cols else "None"


# For each cleaned table, list its primary key columns and any other
# columns whose name ends in "ID".
key_summary = []

for name, df in cleaned_dfs.items():
    pk_cols = primary_keys.get(name, [])
    id_cols = [col for col in df.columns if col.endswith("ID")]
    other_keys = [col for col in id_cols if col not in pk_cols]

    key_summary.append({
        "Dataset": name,
        "Primary Keys": _join_or_none(pk_cols),
        "Other ID Columns": _join_or_none(other_keys),
        "Total ID Columns": len(id_cols),
    })

key_summary_df = pd.DataFrame(key_summary)
display(key_summary_df)

Unnamed: 0,Dataset,Primary Keys,Other ID Columns,Total ID Columns
0,Customers.csv,CustomerID,,1
1,Sales.csv,OrderID,"CustomerID, BundledProductID, StoreID",4
2,Categories.csv,CategoryID,,1
3,BundledProducts_Products.csv,"BundledProductID, ProductID",,2
4,Stores.csv,StoreID,,1
5,Promotions.csv,PromotionID,,1
6,BundledProductsPromotions.csv,"BundledProductID, PromotionID",,2
7,BundledProducts.csv,BundledProductID,,1
8,Products.csv,ProductID,CategoryID,2
