In [23]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F
import pandas as pd
import re
from datetime import datetime           
from sqlalchemy import create_engine
from urllib.parse import quote_plus
import os
import glob
# -----------------------------
# Load the model and tokenizer
# -----------------------------
model_path = "./results/final_model"

# Print every file in the model directory together with its last-modified
# time.
for artifact in glob.glob(f"{model_path}/*"):
    modified_at = datetime.fromtimestamp(os.path.getmtime(artifact))
    artifact_name = os.path.basename(artifact)
    print(f"{artifact_name:20s}  {modified_at}")

# Tokenizer and classifier are both restored from the same directory.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()  # switch the module to evaluation mode

# Run on CUDA when torch reports it as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# -----------------------------
# Connect to the database
# -----------------------------
# The password is read from the PGPASSWORD environment variable so that no
# secret is stored in this script. It is an empty string when the variable
# is not set.
db_type = 'postgresql'
host = 'localhost'
dbname = 'Qrious'
user = 'postgres'
# quote_plus percent-encodes characters such as '@' that would otherwise be
# read as URL delimiters inside the connection string.
password = quote_plus(os.environ.get('PGPASSWORD', ''))
port = '5432'

conn_str = f'{db_type}://{user}:{password}@{host}:{port}/{dbname}'
engine = create_engine(conn_str)

# Alternative input source (currently disabled): read distinct raw product
# names whose relevant_code_binary is NULL directly from the database.
# query = """
#     SELECT DISTINCT product_name_raw
#     FROM cleaned_retailer_events
#     WHERE relevant_code_binary IS NULL
#     LIMIT 10000
# """

#df = pd.read_sql(query, engine)

# Active input source: a CSV export. `df_split` is bound to the same DataFrame
# object as `df`.
df = df_split = pd.read_csv("/Users/ramana/Documents/shopper_analysis/inferred_inputs/InputForInference_202504270250.csv")

# -----------------------------
# Clean the product names
# -----------------------------
def clean_product_name(text):
    """Normalise a raw product name for the classifier.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` or
    whitespace is replaced by a space, runs of whitespace are collapsed to a
    single space, and leading/trailing whitespace is removed.

    Args:
        text: The raw product name.

    Returns:
        The cleaned name, or an empty string when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Anything outside the ASCII alphanumerics becomes a separator.
        separated = re.sub(r"[^a-z0-9\s]", " ", lowered)
        collapsed = re.sub(r"\s+", " ", separated)
        return collapsed.strip()
    return ''

# Prepare the product names in a single chain:
#   1. drop rows whose product_name_raw is missing,
#   2. cast the names to str and strip surrounding whitespace,
#   3. drop rows whose name is empty after stripping,
#   4. keep only the first occurrence of each name,
#   5. add the cleaned_product_name column.
# `assign` returns a new DataFrame, so no column is ever set on a filtered
# selection of another frame (the pattern that triggers pandas'
# SettingWithCopyWarning).
df = (
    df.dropna(subset=["product_name_raw"])
    .assign(
        product_name_raw=lambda frame: frame["product_name_raw"].astype(str).str.strip()
    )
    .loc[lambda frame: frame["product_name_raw"] != ""]
    .drop_duplicates(subset=["product_name_raw"])
    .assign(
        cleaned_product_name=lambda frame: frame["product_name_raw"].apply(
            clean_product_name
        )
    )
)

# -----------------------------
# Save cleaned input
# -----------------------------
# The timestamp is taken once here; the output file written further down
# reuses the same value in its name.
run_started_at = datetime.now()
timestamp = run_started_at.strftime("%Y%m%d_%H%M%S")
cleaned_path = f"../inferred_inputs/input_to_predict_{timestamp}.csv"

# Create the target directory if it does not exist yet.
os.makedirs("../inferred_inputs", exist_ok=True)

# Only the raw and cleaned name columns are written, without the index.
columns_to_save = ["product_name_raw", "cleaned_product_name"]
df[columns_to_save].to_csv(cleaned_path, index=False)
print(f"Cleaned input saved to: {cleaned_path}")

# -----------------------------
# Inference
# -----------------------------
def predict_label(text):
    """Classify a single text with the module-level tokenizer and model.

    The text is tokenized to exactly 64 tokens (padded or truncated), moved
    to ``device``, and passed through ``model`` with gradient tracking
    disabled. The logits are turned into probabilities with a softmax over
    the class dimension.

    Args:
        text: The text to classify.

    Returns:
        A tuple ``(pred_label, confidence, probs)``: the index of the class
        with the highest probability, that class's probability, and the
        list of probabilities for all classes.
    """
    batch = tokenizer(
        text,
        return_tensors="pt",
        max_length=64,
        truncation=True,
        padding="max_length",
    )
    token_ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(token_ids, attention_mask=mask).logits
        class_probs = F.softmax(logits, dim=1)
        best_class = torch.argmax(class_probs, dim=1).item()
        best_prob = class_probs[0][best_class].item()

    # The batch holds a single row, so element 0 is this text's distribution.
    return best_class, best_prob, class_probs.tolist()[0]

# Run predictions
# Each entry is the (label, confidence, class probabilities) tuple returned
# by predict_label for one cleaned product name, in DataFrame row order.
predictions = [
    predict_label(cleaned_name) for cleaned_name in df["cleaned_product_name"]
]

# Spread the tuples over result columns; probability index 0 goes to
# prob_nonfashion and index 1 to prob_fashion.
df["predicted_label"] = [label for label, _, _ in predictions]
df["confidence"] = [confidence for _, confidence, _ in predictions]
df["prob_nonfashion"] = [class_probs[0] for _, _, class_probs in predictions]
df["prob_fashion"] = [class_probs[1] for _, _, class_probs in predictions]

# -----------------------------
# Save output
# -----------------------------
# The file name carries the same run timestamp as the cleaned-input file.
inferred_path = f"../inferred_outputs/predictedoutput_{timestamp}.csv"

# Create the target directory if it does not exist yet.
output_dir = "../inferred_outputs"
os.makedirs(output_dir, exist_ok=True)

# Write every column of the DataFrame, without the index.
df.to_csv(path_or_buf=inferred_path, index=False)
print(f"Inference results saved to: {inferred_path}")

model.safetensors     2025-04-27 01:24:13.947944
tokenizer_config.json  2025-04-27 01:24:13.962625
special_tokens_map.json  2025-04-27 01:24:13.962734
config.json           2025-04-27 01:24:13.716560
training_args.bin     2025-04-27 01:24:13.961356
vocab.txt             2025-04-27 01:24:13.968700
Cleaned input saved to: ../inferred_inputs/input_to_predict_20250427_025122.csv
Inference results saved to: ../inferred_outputs/predictedoutput_20250427_025122.csv
