### Model - 1 Food Classifier
We build a zero-shot classifier using `facebook/bart-large-mnli` to label product names as Food vs Non-Food. The steps:
- Install dependencies and import required libraries
- Load & clean sample data
- Load zero-shot pipeline and define labels
- Run batched classification and evaluate
- Test on example products and save the model

In [None]:
!pip install transformers torch pandas scikit-learn 

In [35]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import accuracy_score, classification_report
import torch 

In [36]:
# Read the two sample files; each is expected to provide a 'product_name' column
# (that column is what the cleaning step below operates on).
food_df = pd.read_csv('food_sample.csv')
nonfood_df = pd.read_csv('non_food_sample.csv')

# Tag every row with its ground-truth class (1 = food, 0 = non-food) and
# stack both sets into a single frame with a fresh 0..n-1 index.
labelled_frames = [
    food_df.assign(true_label=1),
    nonfood_df.assign(true_label=0),
]
df = pd.concat(labelled_frames, ignore_index=True)

def clean_text(text):
    """Normalise a product name for classification.

    Lower-cases the text, trims leading/trailing whitespace and collapses
    every internal run of whitespace to a single space.

    Args:
        text: The raw value to clean.

    Returns:
        The normalised string, or ``None`` when ``text`` is not a ``str``
        (for example a missing value read from the CSV).
    """
    if not isinstance(text, str):
        return None
    tokens = text.lower().strip().split()
    return ' '.join(tokens)

# Normalise every product name; non-string entries come back as None.
df['product_name'] = df['product_name'].apply(clean_text)

# Keep only rows that still have a non-empty name after cleaning.
# reset_index(drop=True) rebuilds the frame so that:
#   * the index is contiguous again (0..n-1), matching positional access later, and
#   * df is a standalone frame rather than a filtered slice, so adding columns to
#     it in later cells does not emit pandas' SettingWithCopyWarning.
df = df.dropna(subset=['product_name'])
df = df[df['product_name'].str.strip() != ''].reset_index(drop=True)

# Class counts via vectorised sums instead of iterating the Series in Python.
n_food = int((df['true_label'] == 1).sum())
n_nonfood = int((df['true_label'] == 0).sum())

print(f"Total valid samples: {len(df)} (Food: {n_food}, Non-Food: {n_nonfood})")
print("\nSample data:")
print(df[['product_name', 'true_label']].head(10))

Total valid samples: 9226 (Food: 849, Non-Food: 8377)

Sample data:
                                        product_name  true_label
0  véritable pâte à tartiner noisettes chocolat noir           1
1                               chamomile herbal tea           1
2                     lagg's, herbal tea, peppermint           1
3                                 linden flowers tea           1
4                               herbal tea, hibiscus           1
5                               apple & cinnamon tea           1
6                                          green tea           1
7                             shave grass herbal tea           1
8               lagg's, herbal tea, chamomile * mint           1
9                               artichoke herbal tea           1


In [None]:
# Labels every product name is scored against; the first one is treated as
# the "food" class by the cells that follow.
candidate_labels = ["grocery food product", "household product"]

# Device index 0 when CUDA is available, otherwise -1.
device_index = 0 if torch.cuda.is_available() else -1

# Build the zero-shot classification pipeline on the chosen device.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device_index,
)

print("Zero-shot classifier loaded!")
print(f"Candidate labels: {candidate_labels}")

In [None]:
def classify_batch(texts, classifier, labels, batch_size=16):
    """Classify ``texts`` in fixed-size batches, one result per input.

    Args:
        texts: Sequence of candidate strings. Entries that are not
            non-blank strings are never sent to the classifier.
        classifier: Callable invoked as
            ``classifier(batch, labels, multi_label=False)`` and expected to
            return one result dict (with ``"labels"`` and ``"scores"``) per
            batch entry.
        labels: Candidate label strings.
        batch_size: Number of inputs taken from ``texts`` per batch.

    Returns:
        A list of result dicts with the same length and order as ``texts``.
        Inputs that were skipped, or whose batch raised, get a neutral
        placeholder that lists *every* label with a uniform score, so callers
        can still look scores up by label name.
    """
    def _placeholder():
        # Uniform scores over all labels (0.5 each for two labels); labels[0]
        # stays first, so the "top label" of a placeholder is labels[0].
        n_labels = len(labels)
        uniform = 1.0 / n_labels if n_labels else 0.0
        return {"labels": list(labels), "scores": [uniform] * n_labels}

    results = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        # Remember the positions of usable inputs so results can be written
        # back to the right slot; this keeps output aligned with input.
        valid_positions = [
            pos for pos, text in enumerate(chunk)
            if isinstance(text, str) and text.strip()
        ]
        chunk_results = [_placeholder() for _ in chunk]

        if valid_positions:
            batch = [chunk[pos] for pos in valid_positions]
            try:
                batch_results = classifier(batch, labels, multi_label=False)
                # Defensive: a classifier may hand back a bare dict for a
                # single input; extending with a dict would add its keys.
                if isinstance(batch_results, dict):
                    batch_results = [batch_results]
                for pos, result in zip(valid_positions, batch_results):
                    chunk_results[pos] = result
            except Exception as e:
                # Best effort: report the failure and keep the placeholders
                # for this batch instead of aborting the whole run.
                print(f"Error in batch {start // batch_size + 1}: {e}")

        results.extend(chunk_results)
    return results

valid_texts = df['product_name'].tolist()

predictions = classify_batch(valid_texts, classifier, candidate_labels, batch_size=16)

# Predictions are matched to rows by position, so the counts must agree.
if len(predictions) != len(df):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(df)} rows; cannot align them"
    )

# Take the label names from candidate_labels (food first, non-food second)
# instead of repeating the literals: rewording a candidate label would
# otherwise silently turn every prediction into 0.
food_label, nonfood_label = candidate_labels

# The top-ranked label of each result decides the binary prediction.
df['predicted_label'] = [1 if pred['labels'][0] == food_label else 0 for pred in predictions]

accuracy = accuracy_score(df['true_label'], df['predicted_label'])
print(f"\nZero-Shot Model Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(df['true_label'], df['predicted_label'], target_names=['Non-Food (0)', 'Food (1)']))

print("\nSample Predictions with Scores:")
preview_rows = df[['product_name', 'true_label', 'predicted_label']].head(10)
for (prod, true, pred), result in zip(preview_rows.itertuples(index=False, name=None), predictions):
    scores = dict(zip(result['labels'], result['scores']))
    # .get with NaN: a result that lacks one of the labels (e.g. a fallback
    # entry) prints 'nan' instead of aborting the cell with a KeyError.
    food_score = scores.get(food_label, float('nan'))
    nonfood_score = scores.get(nonfood_label, float('nan'))
    print(f"'{prod}' | True: {true}, Pred: {pred} | Scores: Food={food_score:.2f}, Non-Food={nonfood_score:.2f}")

In [None]:

# Hand-picked spot-check items: the first four are food/drink products,
# the remaining four are non-food products.
tesco_products = [
    "Tesco Choco Snaps Cereal 350g",
    "Tesco Gold Instant Coffee 200g",
    "Tesco Whole Cucumber",
    "Tesco Malted Milk Biscuits 200g",
    "Tesco 4711 Acqua Colonia “Floral Fields of Ireland” Eau de Cologne 50ml",
    "Tesco F&F 3-in-1 Ottoman Storage Chair",
    "Tesco Household Washing Liquid",
    "Tesco Toothpaste (Own-brand)",
]

# Apply the same normalisation used for the evaluation data, then classify.
tesco_cleaned = list(map(clean_text, tesco_products))
tesco_results = classify_batch(tesco_cleaned, classifier, candidate_labels, batch_size=2)

print("\nTesco Product Predictions (Fixed):")
for product, result in zip(tesco_products, tesco_results):
    # Index 0 of each result is read as the top prediction and its score.
    pred_label, pred_score = result['labels'][0], result['scores'][0]
    if pred_label == "grocery food product":
        label = "Food"
    else:
        label = "Non-Food"
    print(f"{product}: {label} (confidence: {pred_score:.2f}, top label: {pred_label})")

In [43]:
# Write the loaded pipeline to a local directory via its save_pretrained method.
save_dir = 'zero_shot_food_classifier'
classifier.save_pretrained(save_dir)
print("Zero-shot classifier saved to 'zero_shot_food_classifier' directory!")

Non-default generation parameters: {'forced_eos_token_id': 2}


Zero-shot classifier saved to 'zero_shot_food_classifier' directory!
