In [4]:
# Phase 3: Feature Engineering
# ==========================================
# Objective: Calculate financial metrics and categorize data.

import pandas as pd
import numpy as np
import os

# CONFIGURATION
# ---------------------------------------------------------
FILENAME = 'Siddharth_Associates_sample data 2 - Sheet1.csv'

# Candidate locations for the raw CSV, tried in order: the project data
# folder, the current working directory, and a parent-relative data folder.
PATHS_TO_CHECK = [
    f'siddharth_trade_pipeline/data/raw/{FILENAME}',
    FILENAME,
    f'../data/raw/{FILENAME}'
]

# Select the first candidate that is a regular file. isfile() (rather than
# exists()) skips a directory that happens to share the name, which could
# never be parsed as CSV anyway.
FILE_PATH = None
for path in PATHS_TO_CHECK:
    if os.path.isfile(path):
        FILE_PATH = path
        break

if FILE_PATH is None:
    raise FileNotFoundError(f"Could not find '{FILENAME}' in any of the checked paths: {PATHS_TO_CHECK}")

print(f"Loading data from: {FILE_PATH}")

# Try UTF-8 first and fall back to ISO-8859-1 only when the bytes are not
# valid UTF-8. Any other failure (malformed CSV, permissions, interrupt)
# propagates instead of being hidden behind a pointless second attempt.
try:
    df = pd.read_csv(FILE_PATH, encoding='utf-8')
except UnicodeDecodeError:
    print("UTF-8 decoding failed; retrying with ISO-8859-1")
    df = pd.read_csv(FILE_PATH, encoding='ISO-8859-1')

# Normalize headers: trim surrounding whitespace, replace inner spaces with
# underscores, and upper-case, so later code can use names like
# TOTAL_VALUE_INR.
df.columns = [c.strip().replace(' ', '_').upper() for c in df.columns]

print("--- 1. LANDED COST CALCULATION ---")
# Coerce the value, duty and quantity columns to numbers; entries that
# cannot be parsed (or are missing) become 0.
for column in ('TOTAL_VALUE_INR', 'DUTY_PAID_INR', 'QUANTITY'):
    numeric = pd.to_numeric(df[column], errors='coerce')
    df[column] = numeric.fillna(0)

# Grand total is the declared value plus the duty paid on it.
df['GRAND_TOTAL'] = df['TOTAL_VALUE_INR'].add(df['DUTY_PAID_INR'])

# Per-unit landed cost; rows without a positive quantity get 0 rather than
# the inf/NaN that the raw division would produce.
has_quantity = df['QUANTITY'] > 0
per_unit = df['GRAND_TOTAL'] / df['QUANTITY']
df['LANDED_COST_PER_UNIT'] = per_unit.where(has_quantity, 0)

preview_columns = ['GRAND_TOTAL', 'LANDED_COST_PER_UNIT']
print(df[preview_columns].head())


print("\n--- 2. HIERARCHICAL CATEGORIZATION ---")
def categorize(row):
    """Classify a row by keywords found in its GOODS_DESCRIPTION.

    The description is converted to an upper-cased string and matched by
    substring. Kitchenware keywords (CUTLERY / SPOON / FORK) take precedence
    over GLASS; anything else is GENERAL. The sub-category defaults to
    STANDARD unless a more specific keyword is present (SPOON wins over
    FORK when both appear).

    Returns:
        pd.Series of two strings: [category, sub_category].
    """
    text = str(row['GOODS_DESCRIPTION']).upper()

    has_spoon = 'SPOON' in text
    has_fork = 'FORK' in text

    # Kitchenware is checked first, so it wins over the glassware rule.
    if has_spoon or has_fork or 'CUTLERY' in text:
        if has_spoon:
            detail = 'SPOON'
        elif has_fork:
            detail = 'FORK'
        else:
            detail = 'STANDARD'
        return pd.Series(['KITCHENWARE', detail])

    # Glassware, with a dedicated sub-category for borosilicate items.
    if 'GLASS' in text:
        detail = 'BOROSILICATE' if 'BOROSILICATE' in text else 'STANDARD'
        return pd.Series(['GLASSWARE', detail])

    # No rule matched.
    return pd.Series(['GENERAL', 'STANDARD'])

print("Assigning categories...")
# categorize() returns a two-element Series per row, so the row-wise apply
# yields a two-column frame that maps onto the two new columns.
labels = df.apply(categorize, axis=1)
df[['CATEGORY', 'SUB_CATEGORY']] = labels
print(df['CATEGORY'].value_counts())


print("\n--- 3. ANOMALY DETECTION (DUTY %) ---")
# Duty as a percentage of the declared value; rows without a positive
# value get 0 instead of a division artefact.
base_value = df['TOTAL_VALUE_INR']
duty_ratio = df['DUTY_PAID_INR'] / base_value
df['DUTY_PCT'] = (duty_ratio * 100).where(base_value > 0, 0)

# Rows whose duty exceeds 50% of the declared value are treated as anomalies.
high_duty_mask = df['DUTY_PCT'] > 50
anomalies = df.loc[high_duty_mask]
print(f"High Duty Anomalies Found: {len(anomalies)}")
if not anomalies.empty:
    report_columns = ['GOODS_DESCRIPTION', 'DUTY_PCT']
    print(anomalies[report_columns].head())

Loading data from: ../data/raw/Siddharth_Associates_sample data 2 - Sheet1.csv
--- 1. LANDED COST CALCULATION ---
   GRAND_TOTAL  LANDED_COST_PER_UNIT
0    149236.94            514.610138
1   6216051.24            634.290943
2   2097266.23            523.792765
3    750865.21            484.804500
4   5142274.46            512.066525

--- 2. HIERARCHICAL CATEGORIZATION ---
Assigning categories...
CATEGORY
GENERAL        2043
KITCHENWARE      34
GLASSWARE         2
Name: count, dtype: int64

--- 3. ANOMALY DETECTION (DUTY %) ---
High Duty Anomalies Found: 0
