<a href="https://colab.research.google.com/github/riyasharma-kline/HackWeek-TheDebuggingFive/blob/main/Challenge2-Test-DataSet-Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM so the files read by the later cells
# are reachable through the local filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Switch the working directory to the Drive root so the relative file names
# used by the following cell (CSV input, .pkl artifacts, CSV output) resolve there.
%cd /content/drive/MyDrive/

/content/drive/MyDrive


In [None]:
import re
from io import BytesIO

import joblib
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score

# ------------------------
# STEP 1: Load Raw Data
# ------------------------
# All paths are relative to the current working directory.
RAW_CSV_PATH = "HackWeekProductsVerify.csv"
df = pd.read_csv(RAW_CSV_PATH)
# For a quicker dry run, subsample before inference, e.g.
#   df[df["MappingModeName"] == "Omit"].sample(n=100000, random_state=42)
# or df.sample(n=10000, random_state=42).copy()

# ------------------------
# STEP 2: Load Model + Encoders + Embedder
# ------------------------
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# that come from a trusted source.
CLASSIFIER_PATH = "clf_map_model_1m.pkl"
LABEL_ENCODERS_PATH = "label_encoders_1m.pkl"
EMBEDDER_NAME = 'all-MiniLM-L6-v2'

clf_map = joblib.load(CLASSIFIER_PATH)
label_encoders = joblib.load(LABEL_ENCODERS_PATH)
model_embed = SentenceTransformer(EMBEDDER_NAME)

# ------------------------
# STEP 3: Preprocessing Functions
# ------------------------

def clean_text(val):
    """Normalise one cell value into lowercase, punctuation-stripped text.

    Values that ``pd.isnull`` reports as missing become the empty string.
    Anything else is converted with ``str``, has every character removed that
    is not an ASCII letter, a digit, whitespace or one of ``- . % /``, and is
    then lowercased and trimmed of surrounding whitespace.

    Args:
        val: A scalar cell value (string, number, ``None``, ``NaN``, ...).

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    if not pd.isnull(val):
        allowed_only = re.sub(r"[^a-zA-Z0-9\s\-\.\%\/]", "", str(val))
        return allowed_only.lower().strip()
    return ""

def clean_dataframe(df):
    """Return a cleaned copy of ``df`` ready for feature preparation.

    Two things happen:

    * A fixed list of columns that are not used as model inputs is dropped
      (only the ones actually present in ``df``).
    * Each of the source text columns that is present has missing values
      replaced by ``""``, is cast to ``str`` and passed through ``clean_text``.

    The input frame is left untouched: columns are dropped first, which yields
    a new DataFrame, and the text cleaning is applied to that new frame only.

    Args:
        df: Raw input DataFrame.

    Returns:
        A new DataFrame with the unused columns removed and the text columns
        normalised.
    """
    text_cols_to_clean = [
        "SourceMasterBrand", "SourceBrand", "SourceSubBrand",
        "SourceCategory", "SourceSubcategory", "SourceDescription",
    ]
    columns_to_drop = [
        "SourceBarcode","CleanBarcode","IsValidBarcode","Splitter",
        "SourcePackagingTypeName","ProductMasterId","CompanyName",
        "MasterBrandName","BrandName","SubBrandName","ProductName",
        "CategoryName","SubcategoryFormName","SubcategoryFunctionName",
        "ProductSizeOnLabel"
    ]

    # Drop first: DataFrame.drop returns a new object, so the column
    # assignments below never write into the caller's frame.
    present_drops = [col for col in columns_to_drop if col in df.columns]
    cleaned = df.drop(columns=present_drops)

    for col in text_cols_to_clean:
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].fillna("").astype(str).apply(clean_text)

    return cleaned

def safe_label_transform(le, values):
    """Encode ``values`` with a fitted encoder while tolerating unseen labels.

    Every value that is not one of ``le.classes_`` is replaced by the first
    entry of ``le.classes_`` before ``le.transform`` is called, so the
    transform never sees a label outside ``le.classes_``.

    Args:
        le: Fitted encoder exposing ``classes_`` and ``transform``.
        values: Iterable of labels to encode.

    Returns:
        Whatever ``le.transform`` returns for the substituted labels.
    """
    default_label = le.classes_[0]
    recognised = set(le.classes_)

    substituted = []
    for item in values:
        substituted.append(item if item in recognised else default_label)

    return le.transform(substituted)

def prepare_features_for_inference(df, label_encoders, model_embed):
    """Build the model input matrix for a raw product DataFrame.

    Steps, in order:

    1. Clean the frame with ``clean_dataframe``.
    2. Replace the four categorical source columns by integer codes using the
       matching entry of ``label_encoders`` (via ``safe_label_transform``).
    3. Concatenate those four columns (as strings) and ``SourceDescription``
       into a space-separated ``FullText`` column.
    4. Embed ``FullText`` with ``model_embed.encode``.
    5. Horizontally stack the float32 categorical codes and the embeddings.

    NOTE(review): ``FullText`` is assembled *after* step 2, so the embedded
    text begins with the integer codes rather than the original brand/category
    strings. Presumably this mirrors how the classifier was trained; confirm
    against the training pipeline before changing the order.

    Args:
        df: Raw input DataFrame.
        label_encoders: Mapping from categorical column name to its encoder.
        model_embed: Object whose ``encode(list_of_str, show_progress_bar=...)``
            returns one embedding row per input text.

    Returns:
        ``(X, frame)`` where ``X`` is the stacked feature matrix and ``frame``
        is the processed DataFrame including the ``FullText`` column.
    """
    frame = clean_dataframe(df)
    categorical_features = ['SourceBrand', 'SourceSubBrand', 'SourceCategory', 'SourceSubcategory']

    # Encode each categorical column in place on the cleaned frame.
    for name in categorical_features:
        frame[name] = safe_label_transform(label_encoders[name], frame[name].astype(str))

    # Join the (already encoded) categorical columns and the description
    # with single spaces, then trim the result.
    text_parts = [frame[name].astype(str) for name in categorical_features]
    text_parts.append(frame['SourceDescription'].fillna(''))
    joined = text_parts[0]
    for part in text_parts[1:]:
        joined = joined + ' ' + part
    frame['FullText'] = joined.str.strip()

    texts = frame['FullText'].tolist()
    desc_embeddings = model_embed.encode(texts, show_progress_bar=True)
    dense_matrix = frame[categorical_features].astype(np.float32).to_numpy()

    return np.hstack([dense_matrix, desc_embeddings]), frame

# ------------------------
# STEP 4: Inference
# ------------------------

X_eval, df_processed = prepare_features_for_inference(df, label_encoders, model_embed)

# Column 1 of predict_proba is treated as the probability of 'Map'.
# NOTE(review): this assumes the classifier's second class is 'Map' - confirm
# against clf_map.classes_ / the training code.
probs = clf_map.predict_proba(X_eval)[:, 1]
pred_classes = (probs >= 0.5).astype(int)
pred_labels = np.where(pred_classes == 1, 'Map', 'Omit')

df_processed['Outcome'] = pred_labels
# 'Confidence' stores the (rounded) positive-class probability, not the
# probability of the predicted label; low values therefore mean a confident 'Omit'.
df_processed['Confidence'] = probs.round(4)
df_processed['Actual Class'] = df_processed['MappingModeName']
# Ids are synthesised 1..N by row position.
# NOTE(review): if the input CSV already carries ServiceAndProductMappingId,
# this overwrites it - verify that is intended.
df_processed['ServiceAndProductMappingId'] = np.arange(1, len(df_processed) + 1)

# ------------------------
# STEP 5: Confidence Evaluation
# ------------------------

# A row is "high confidence" when the positive-class probability is at least
# `threshold` or at most `1 - threshold`, i.e. confidently either class.
threshold = 0.6
df_processed['HighConfidence'] = (df_processed['Confidence'] >= threshold) | (df_processed['Confidence'] <= (1 - threshold))
high_conf_df = df_processed[df_processed['HighConfidence']]

# Map string labels to binary once and reuse for every metric.
label_to_int = {'Map': 1, 'Omit': 0}
y_true = high_conf_df['Actual Class'].map(label_to_int)
y_pred = high_conf_df['Outcome'].map(label_to_int)

# Series.map turns labels missing from the dict into NaN; fail with a clear
# message instead of letting the metric functions raise an opaque error.
if y_true.isna().any():
    unexpected = sorted(
        high_conf_df.loc[y_true.isna(), 'Actual Class'].astype(str).unique()
    )
    raise ValueError(
        f"'Actual Class' contains labels other than 'Map'/'Omit': {unexpected}"
    )

accuracy = accuracy_score(y_true, y_pred)

# ------------------------
# STEP 6: Export Results
# ------------------------

df_processed[['ServiceAndProductMappingId', 'Outcome', 'Confidence', 'Actual Class']].to_csv(
    "final_predictions_cleaned.csv", index=False
)

# ------------------------
# STEP 7: Reporting
# ------------------------

total = len(df_processed)
high_conf_total = len(high_conf_df)
# Guard against an empty input frame so the report never divides by zero.
high_conf_pct = (high_conf_total / total) * 100 if total else 0.0

print("\nInference Complete.")
print(f"Total Records Evaluated: {total}")
print(f"High-Confidence Records: {high_conf_total} ({high_conf_pct:.2f}%)")
print(f"Accuracy on High-Confidence: {accuracy:.4f}")
print("\nOutput saved to: final_predictions_cleaned.csv")

# Precision and recall are computed on the same high-confidence subset,
# with 'Map' (1) as the positive class.
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/31250 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)



Inference Complete.
Total Records Evaluated: 1000000
High-Confidence Records: 949496 (94.95%)
Accuracy on High-Confidence: 0.9468

Output saved to: final_predictions_cleaned.csv
Precision: 0.9556
Recall: 0.9728
