In [18]:
import pandas as pd
import re
from difflib import SequenceMatcher
import numpy as np

# --- Serial extraction and match logic ---
def extract_serials(text):
    """Extract serial-like tokens from free text.

    A token is a run of letters, digits, dots and dashes (matched on the
    upper-cased text). Dots and dashes at either end of a token are dropped so
    that sentence punctuation ("AB1234.") does not become part of the serial.
    A token is kept only if, after that trimming, it is still at least 4
    characters long and contains at least one digit.

    Args:
        text: The text to scan. ``None`` yields an empty list; any other
            non-string value is converted with ``str()`` first.

    Returns:
        A list of upper-cased serial strings in order of appearance
        (duplicates are preserved).
    """
    if text is None:
        return []

    tokens = re.findall(r'[A-Z0-9\.\-]{4,}', str(text).upper())

    serials = []
    for token in tokens:
        # Punctuation is only meaningful inside a serial, never at its edges
        serial = token.strip('.-')
        if len(serial) >= 4 and any(ch.isdigit() for ch in serial):
            serials.append(serial)
    return serials

def similarity(a, b):
    """Return the difflib similarity ratio between sequences ``a`` and ``b``.

    The value is whatever ``SequenceMatcher.ratio()`` reports for the pair,
    with ``a`` as the first sequence and ``b`` as the second and no junk
    filter installed.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def find_closest_serial_match(search_serials, source_serials, threshold=0.85):
    """Find the source serial most similar to any of the search serials.

    Each (search, source) pair is scored with
    ``SequenceMatcher(None, search, source).ratio()``. A pair can only win if
    its score is at least ``threshold`` and strictly greater than the best
    score seen so far, so on equal scores the pair encountered first wins.

    Args:
        search_serials: Iterable of serial strings to look for.
        source_serials: Re-iterable collection of candidate serial strings
            (it is walked once per search serial).
        threshold: Minimum ratio a pair must reach to count as a match.

    Returns:
        A ``(best_match, best_score)`` tuple, where ``best_match`` is the
        winning element of ``source_serials``. When no pair reaches the
        threshold the result is ``(None, 0)``.
    """
    best_match = None
    best_score = 0
    for search_serial in search_serials:
        for source_serial in source_serials:
            matcher = SequenceMatcher(None, search_serial, source_serial)

            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); if even the bound cannot reach the threshold or beat
            # the current best, the expensive ratio() cannot either.
            bound = matcher.real_quick_ratio()
            if bound < threshold or bound <= best_score:
                continue
            bound = matcher.quick_ratio()
            if bound < threshold or bound <= best_score:
                continue

            score = matcher.ratio()
            if score > best_score and score >= threshold:
                best_score = score
                best_match = source_serial
                if best_score >= 1.0:
                    # 1.0 is the maximum ratio; no later pair can beat it
                    return best_match, best_score
    return best_match, best_score

# --- Load files ---
list_file = "List_Of_Items.csv"
source_file = "Source.csv"

df = pd.read_csv(list_file, encoding="ISO-8859-1")
source_df = pd.read_csv(source_file, encoding="ISO-8859-1")

# Clean column names
df.columns = df.columns.str.strip()
source_df.columns = source_df.columns.str.strip()

# Get relevant columns: the first header containing each keyword is used
actual_part_number_col = next((col for col in source_df.columns if 'Part Number' in col), None)
actual_unit_cost_col = next((col for col in source_df.columns if 'Unit Cost' in col), None)
item_col = next((col for col in df.columns if "Item" in col), None)
# The free-text columns are addressed by exact name further down, so they are
# validated here together with the other required columns.
missing_text_cols = [col for col in ("Description", "Details") if col not in df.columns]

if not actual_part_number_col or not actual_unit_cost_col or not item_col or missing_text_cols:
    raise Exception("❌ Missing required column(s) in your input files.")

# Extract rows 250–500 (1-based, inclusive), i.e. positions 249..499
subset_df = df.iloc[249:500].copy()

# Clean up text fields
for text_col in ("Description", "Details"):
    subset_df[text_col] = subset_df[text_col].fillna("").astype(str).str.upper().str.strip()
source_df[actual_part_number_col] = source_df[actual_part_number_col].fillna("").astype(str).str.upper().str.strip()

# Extract serial numbers from source
source_df["SERIALS"] = source_df[actual_part_number_col].apply(extract_serials)
all_source_serials = {s for sub in source_df["SERIALS"] for s in sub}
# A set has no guaranteed iteration order, so equal-score ties could resolve
# differently from run to run; a sorted list keeps the matching reproducible.
candidate_serials = sorted(all_source_serials)

# Index the source rows by serial once, instead of re-scanning the whole
# source frame for every matched row.
rows_by_serial = {}
for row_label, serials in source_df["SERIALS"].items():
    for serial in set(serials):  # a row counts once even if it repeats a serial
        rows_by_serial.setdefault(serial, []).append(row_label)

# Detect duplicates: every row (of the FULL list, not just the subset) whose
# item value occurs more than once, keyed by row label.
full_item_list = df[item_col].dropna().astype(str)
duplicates_dict = full_item_list[full_item_list.duplicated(keep=False)].to_dict()

# Initialize output columns
subset_df["Vendor"] = "NA"  # placeholder only; nothing in this script fills it
# Explicit object dtype: this column holds the "NA" placeholder next to floats
subset_df["UNITCOST"] = pd.Series("NA", index=subset_df.index, dtype=object)
# Flag duplicates for every row up front, so rows without any extractable
# serial (skipped by the matching loop) are still marked.
subset_df["DUPLICATES"] = [duplicates_dict.get(row_label, "NA") for row_label in subset_df.index]
subset_df["Macro"] = "NA"

matches_found = 0

# --- Main loop: Match rows ---
for idx, row in subset_df.iterrows():
    combined_text = (row["Description"] + " " + row["Details"]).strip()
    search_serials = extract_serials(combined_text)

    if not search_serials:
        continue

    best_serial, _ = find_closest_serial_match(search_serials, candidate_serials, threshold=0.85)

    if not best_serial:
        continue

    matched_rows = source_df.loc[rows_by_serial[best_serial]]
    costs = matched_rows[actual_unit_cost_col].dropna().astype(float)

    if costs.empty:
        # A serial matched, but none of its source rows carries a cost
        continue

    # Cost policy: a single price is taken as is, two prices -> the higher
    # one, three or more -> the median.
    if len(costs) == 1:
        chosen_cost = round(costs.iloc[0], 2)
    elif len(costs) == 2:
        chosen_cost = round(max(costs), 2)
    else:
        chosen_cost = round(np.median(costs), 2)

    subset_df.at[idx, "Macro"] = best_serial
    subset_df.at[idx, "UNITCOST"] = chosen_cost
    matches_found += 1

# --- Save results ---
output_file = "Updated_List_of_Items_250_to_500.xlsx"
subset_df.to_excel(output_file, index=False)

# --- Summary ---
print(f"✅ Matching complete! Saved to '{output_file}'")
print(f"📊 Matches found: {matches_found} out of {len(subset_df)} rows")


✅ Matching complete! Saved to 'Updated_List_of_Items_250_to_500.xlsx'
📊 Matches found: 19 out of 251 rows
