In [4]:
# Phase 2: Advanced Parsing Logic
# ==========================================
# Objective: Extract Model, Material, Price, and Quantity from unstructured text.

import pandas as pd
import re
import os

# Load Data (Simulating previous step)
FILENAME = 'Siddharth_Associates_sample data 2 - Sheet1.csv'

# Candidate locations, tried in order: the project's raw-data folder, the
# current working directory, and the raw-data folder one level up.
PATHS_TO_CHECK = [
    f'siddharth_trade_pipeline/data/raw/{FILENAME}',
    FILENAME,
    f'../data/raw/{FILENAME}'
]

# First candidate that is an actual file wins. isfile (rather than exists)
# keeps a same-named directory from being picked and failing later in read_csv.
FILE_PATH = None
for path in PATHS_TO_CHECK:
    if os.path.isfile(path):
        FILE_PATH = path
        break

if not FILE_PATH:
    raise FileNotFoundError(f"Could not find '{FILENAME}' in any of the checked paths: {PATHS_TO_CHECK}")

print(f"File found at: {FILE_PATH}")

# Try UTF-8 first and fall back to ISO-8859-1 only when decoding fails.
# Catching just UnicodeDecodeError lets every other problem (malformed CSV,
# permissions, Ctrl-C) surface with its real traceback instead of being
# hidden behind a pointless second read.
try:
    df = pd.read_csv(FILE_PATH, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(FILE_PATH, encoding='ISO-8859-1')

# Normalize headers: trim, spaces -> underscores, upper-case
# (e.g. ' Goods Description' -> 'GOODS_DESCRIPTION').
df.columns = [c.strip().replace(' ', '_').upper() for c in df.columns]

print("--- 1. REGEX DEVELOPMENT ---")
# A deliberately awkward sample row used to sanity-check the patterns below.
test_string = "TH5170 STEEL CUTLERY HOLDER (QTY:600 PCS/USD 2.03 PER PCS)"
print(f"Testing on: {test_string}")

# Both patterns share one shape: a keyword, an optional single separator
# (':', '-' or whitespace) padded by optional whitespace, then the captured
# number. Price allows digits and dots; quantity allows digits only.
price_regex = r'USD\s*[:\-\s]?\s*([\d\.]+)'
qty_regex = r'QTY\s*[:\-\s]?\s*([\d]+)'

# Pattern 1: Price
price_match = re.search(price_regex, test_string)
price_shown = 'None' if price_match is None else price_match.group(1)
print(f"Price Match: {price_shown}")

# Pattern 2: Quantity
qty_match = re.search(qty_regex, test_string)
qty_shown = 'None' if qty_match is None else qty_match.group(1)
print(f"Qty Match: {qty_shown}")


print("\n--- 2. APPLYING LOGIC TO DATASET ---")

# Compiled once so the per-row call does not repeat the pattern lookup.
# The price group only captures a well-formed decimal ("12", "12.5", ".5").
# A looser "digits and dots" class would also capture tokens such as "." or
# "2.03." (a price followed by a full stop), which float() rejects.
_PRICE_RE = re.compile(r'USD\s*[:\-\s]?\s*(\d+(?:\.\d+)?|\.\d+)')
_QTY_RE = re.compile(r'QTY\s*[:\-\s]?\s*(\d+)')

# Checked in this order; the first keyword found in the text wins.
_KNOWN_MATERIALS = ("STEEL", "GLASS", "PLASTIC")


def extract_features(text):
    """Pull unit price, embedded quantity and material out of a description.

    The text is upper-cased before matching, so matching is case-insensitive.

    Args:
        text: Free-form goods description; may be missing (None / NaN).

    Returns:
        A ``(price, embedded_qty, material)`` tuple:
          * price -- float following ``USD`` (optional ':', '-' or whitespace
            separator), or None when no valid number follows it.
          * embedded_qty -- float of the integer following ``QTY``, or None.
          * material -- "STEEL", "GLASS" or "PLASTIC" (first one present, in
            that order), otherwise "UNKNOWN".
        For a missing ``text`` all three elements are None.
    """
    if pd.isna(text):
        return None, None, None
    text = str(text).upper()

    # Extract Price
    usd = _PRICE_RE.search(text)
    price = float(usd.group(1)) if usd else None

    # Extract Qty
    qty = _QTY_RE.search(text)
    embedded_qty = float(qty.group(1)) if qty else None

    # Extract Material
    material = next((m for m in _KNOWN_MATERIALS if m in text), "UNKNOWN")

    return price, embedded_qty, material

print("Applying extraction function...")
# Each description yields a (price, qty, material) tuple; zip(*...) transposes
# the per-row tuples into three per-column sequences.
features = df['GOODS_DESCRIPTION'].apply(extract_features)
if features.empty:
    # With zero rows zip(*...) produces nothing, and unpacking that into three
    # targets would raise ValueError -- fall back to three empty columns.
    prices, quantities, materials = [], [], []
else:
    prices, quantities, materials = zip(*features)

df['EXT_PRICE'] = prices
df['EXT_QTY'] = quantities
df['EXT_MAT'] = materials

print(df[['GOODS_DESCRIPTION', 'EXT_PRICE', 'EXT_QTY', 'EXT_MAT']].head())


print("\n--- 3. UNIT STANDARDIZATION ---")
# Spelling variants grouped by the canonical unit they collapse to.
unit_map = {
    **dict.fromkeys(('PCS', 'PIECES', 'NOS', 'NO'), 'PCS'),
    **dict.fromkeys(('KGS', 'KG'), 'KGS'),
    'SETS': 'SETS',
}

# Upper-case and trim the raw unit, look it up in the map, and label anything
# the map does not cover (including missing values) as 'OTHER'.
cleaned_units = df['UNIT'].str.upper().str.strip()
df['STD_UNIT'] = cleaned_units.map(unit_map).fillna('OTHER')

print("Standardized Unit Distribution:")
print(df['STD_UNIT'].value_counts())

File found at: ../data/raw/Siddharth_Associates_sample data 2 - Sheet1.csv
--- 1. REGEX DEVELOPMENT ---
Testing on: TH5170 STEEL CUTLERY HOLDER (QTY:600 PCS/USD 2.03 PER PCS)
Price Match: 2.03
Qty Match: 600

--- 2. APPLYING LOGIC TO DATASET ---
Applying extraction function...
                                   GOODS_DESCRIPTION  EXT_PRICE   EXT_QTY  \
0  TH5170 STEEL CUTLERY HOLDER (QTY:600 PCS/USD 2...     2.0300     600.0   
1  8001-2 STEEL SCRUBBER 2PCS SET (QTY: 336000 SE...     0.1390  336000.0   
2  MILD STEEL MULTI FUNCTION CLOTH STAND (RYX-02-...        NaN       1.0   
3  SB-12 STEEL TEA STRAINER BIG (QTY 6336 PCS/USD...     0.9730    6336.0   
4  SB-12 STEEL TEA STRAINER BIG (QTY 43038 PCS/US...     0.9718   43038.0   

  EXT_MAT  
0   STEEL  
1   STEEL  
2   STEEL  
3   STEEL  
4   STEEL  

--- 3. UNIT STANDARDIZATION ---
Standardized Unit Distribution:
STD_UNIT
KGS      1644
PCS       401
OTHER      34
Name: count, dtype: int64
