In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.corpus import stopwords
import re
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sentence_transformers import SentenceTransformer, util, InputExample
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils import is_torch_sdpa_available 
import torch
import math
import os
import json
from tqdm import tqdm
import random
# Placeholder binding so the name `model` exists before the model-loading cell runs `del model`.
model = ''

In [2]:
file_path = "winemag-data-130k-v2.csv"

# Load the latest version of the "zynicide/wine-reviews" Kaggle dataset as a
# pandas DataFrame. `kagglehub.load_dataset` is the deprecated spelling of this
# call; `kagglehub.dataset_load` takes the same (adapter, handle, path)
# arguments and does not emit the deprecation warning.
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "zynicide/wine-reviews",
  file_path,
)

  df = kagglehub.load_dataset(




In [None]:
import gc

# Release any model left over from an earlier run of this cell. On a fresh
# kernel `model` may not be bound yet, so a missing name is not an error.
try:
    del model
except NameError:
    pass
gc.collect()  # collect lingering reference cycles before emptying the CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Keep only the math scaled-dot-product-attention kernel enabled; the flash and
# memory-efficient kernels are switched off.
if is_torch_sdpa_available():
    torch._dynamo.config.suppress_errors = True  # optional
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
    torch.backends.cuda.enable_mem_efficient_sdp(False)

# Instruction-tuned generator used to write synthetic search queries.
model_name = "Qwen/Qwen2-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
model.config.sliding_window = None
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Inputs are moved with `model.device` in generate_sample_query; this name is
# kept for convenience and falls back to CPU when no GPU is visible.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
import re

def trim_query(text, max_parts=3):
    """Shorten a generated query to its first few descriptor phrases.

    The text is split on commas, " and ", " with " and sentence breaks (". ").
    The first ``max_parts`` non-empty pieces are re-joined as "a, b and c" and
    the result is capitalized (first character upper-case, the rest lower-case).

    Args:
        text: Raw query text.
        max_parts: Maximum number of descriptor phrases to keep.

    Returns:
        The trimmed query, or an empty string when no phrase is left.
    """
    pieces = re.split(r'\s*(?:,| and | with |\. )\s*', text.strip())

    # Leading, trailing or doubled separators produce empty pieces; drop them so
    # the result never ends in a dangling "and" and max_parts counts real phrases.
    pieces = [piece.strip() for piece in pieces]
    pieces = [piece for piece in pieces if piece]

    # Keep only the first few phrases
    kept = pieces[:max_parts]
    if not kept:
        return ""

    # Re-join them with commas and a final 'and' for flow
    if len(kept) > 1:
        result = ', '.join(kept[:-1]) + ' and ' + kept[-1]
    else:
        result = kept[0]

    return result.strip().capitalize()

def generate_sample_query(wine_desc):
    """Generate one short, user-style search query for a wine description.

    A randomly chosen system prompt plus few-shot examples is sent to the
    module-level chat ``model`` / ``tokenizer``. The reply is stripped of double
    quotes, shortened with ``trim_query`` and finally wrapped in a randomly
    chosen phrasing template.

    Args:
        wine_desc: Full wine description text.

    Returns:
        The generated query string.

    Raises:
        ValueError: If the model reply is empty after trimming, so that callers
            can skip the row instead of storing a query with no content.
    """
    prompt_variants = [
    """You are a helpful assistant that creates short, human-like wine search queries from wine descriptions. The queries should be realistic, not too detailed, and focus on 1–3 key aspects of the wine.""",

    """You are an assistant that turns wine descriptions into simple, casual search phrases that a real person might use. Do not list everything—just highlight a few main traits.""",

    """You generate concise, informal wine search queries based on tasting notes. Keep it brief and include only the most noticeable characteristics.""",

    """You're helping wine lovers describe wines in simple search terms. Summarize the wine's essence in 1–2 quick flavor notes or attributes.""",

    """You generate short, realistic search queries from wine descriptions. Keep the language natural and intuitive. Mention just a couple key traits.""",

    """Your job is to turn wine descriptions into casual search-style queries using only the most important descriptors. Avoid over-detailing or repetition.""",

    """You create short user-style search queries based on wine descriptions. Stick to 1–3 high-impact descriptors. Be realistic and natural, not formal.""",

    """You are generating imperfect but useful search queries from wine descriptions. The goal is to sound like a real wine shopper, not list every flavor.""",

    """Turn the wine descriptions into realistic, search-friendly phrases a person might type. Include only standout flavors or textures.""",

    """You're writing very short, human-friendly search terms based on full wine descriptions. Focus on the most prominent flavors or textures."""
    ]

    # Select a random prompt variant
    prompt = random.choice(prompt_variants)

    messages = [
       {"role": "system", "content": f"""{prompt} Queries must sound like a real person searching online. No more than 2 descriptors. Avoid full sentences or wine jargon.

        Examples:

        Wine: "Flavors of blackberry, plum, leather, and dried herbs with firm tannins and a smoky finish."  
        Query: "bold wine with dark fruit."

        Wine: "Aromas of green apple, lime zest, and flinty minerality with bright acidity."  
        Query: "Crisp white with citrus notes"
        
        Wine: "Bright citrus and minerality with crisp acidity."
        Query: "Zesty white with citrus"

        Wine: "Ripe red cherry, baking spice, and light oak tannins."
        Query: "Fruity red with spice"

        Now generate a query for this wine:
        """},
        {"role": "user", "content": f"Write a natural user-style query for the following wine:\n{wine_desc}\nQuery:"}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = tokenizer(text, return_tensors="pt", padding=True, return_attention_mask=True).to(model.device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=100,
        # temperature/top_p only take effect when sampling is on; request it
        # explicitly instead of relying on the checkpoint's generation defaults.
        do_sample=True,
        temperature=0.8,
        top_p=0.85,
    )

    # generate() returns prompt + completion; keep only the newly generated tokens.
    prompt_length = model_inputs.input_ids.shape[1]
    completion_ids = generated_ids[:, prompt_length:]
    response = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

    # Prune off excess description and double quotes
    response = response.replace('"', "")
    response = trim_query(response)
    if not response:
        raise ValueError("Model returned an empty query.")

    restructure_patterns = [
    lambda desc: f"{desc}",
    lambda desc: f"A {desc} wine",
    lambda desc: f"{desc}, perfect with food",
    lambda desc: f"Looking for a {desc} wine",
    lambda desc: f"{desc} ideal with dinner",
    lambda desc: f"something that is {desc}",
    lambda desc: f"Wine with {desc}",
    lambda desc: f"Searching for {desc}",
    lambda desc: f"Wine that is {desc}",
    ]

    # Randomly select a restructuring pattern and apply it
    restructure = random.choice(restructure_patterns)
    return restructure(response)



In [None]:
def generate_positive_pairs(
    df: pd.DataFrame,
    num_samples=1000,
    output_file="positive_pairs.jsonl"
):
    """Create (generated query, description) positive pairs and save them as JSONL.

    Rows without a description are dropped, ``num_samples`` rows are drawn at
    random (the frame is tiled first when it has fewer rows than requested),
    and a query is generated for each sampled description. Rows whose query
    generation fails are reported and skipped.

    Args:
        df: DataFrame with a "description" column.
        num_samples: Number of rows to sample.
        output_file: Path of the JSONL file to write.

    Returns:
        List of InputExample objects with label 1.0.

    Raises:
        ValueError: If no row has a description.
    """
    described = df.dropna(subset=["description"]).reset_index(drop=True)
    available = len(described)
    if available == 0:
        raise ValueError("No valid wine descriptions found in the dataset.")

    # Tile the frame when more samples are requested than rows exist.
    pool = described
    if num_samples > available:
        copies = math.ceil(num_samples / available)
        pool = pd.concat([described] * copies, ignore_index=True)
    sampled = pool.sample(n=num_samples).reset_index(drop=True)

    examples = []
    with open(output_file, "w", encoding="utf-8") as sink:
        progress = tqdm(sampled.iterrows(), total=num_samples, desc="Generating positive pairs")
        for row_idx, record in progress:
            description = record["description"]
            try:
                generated = generate_sample_query(description)
                examples.append(InputExample(texts=[generated, description], label=1.0))
                payload = {"texts": [generated, description], "label": 1.0}
                sink.write(json.dumps(payload) + "\n")
            except Exception as err:
                print(f"[x] Skipped row {row_idx}: {err}")

    return examples

In [None]:
def generate_negative_pairs_from_positive(positive_path, output_path="negative_pairs.jsonl"):
    """Build mismatched (query, description) pairs from a positive-pairs JSONL file.

    Every query is re-paired with the description of a different row. The
    re-pairing is a random cyclic shift over a shuffled row order, so no row is
    ever paired with itself. Pairs whose borrowed description happens to have
    the same text as the query's own description (duplicate texts, or a file
    with a single row) are skipped rather than written with a wrong 0.0 label.

    Args:
        positive_path: JSONL file with {"texts": [query, description], ...} rows.
        output_path: JSONL file to write the negative pairs to.

    Returns:
        List of InputExample objects with label 0.0.
    """
    # Load the positive examples (blank lines are ignored)
    with open(positive_path, "r", encoding="utf-8") as f:
        positive_data = [json.loads(line) for line in f if line.strip()]

    queries = [item["texts"][0] for item in positive_data]
    descriptions = [item["texts"][1] for item in positive_data]
    n_pairs = len(queries)

    # Walk the rows in random order and give each one the description of the
    # next row in that order (wrapping around). For two or more rows this maps
    # no row to itself, unlike repeated shuffling which usually leaves fixed points.
    order = list(range(n_pairs))
    random.shuffle(order)
    donor = {row: order[(pos + 1) % n_pairs] for pos, row in enumerate(order)}

    # Build negative examples
    negative_examples = []
    skipped = 0
    with open(output_path, "w", encoding="utf-8") as f_out:
        for row, query in tqdm(enumerate(queries), total=n_pairs, desc="Generating negative pairs"):
            wrong_desc = descriptions[donor[row]]
            if wrong_desc == descriptions[row]:
                # Same text as the true description: this would be a positive pair.
                skipped += 1
                continue
            example = {
                "texts": [query, wrong_desc],
                "label": 0.0
            }
            f_out.write(json.dumps(example) + "\n")
            negative_examples.append(InputExample(texts=[query, wrong_desc], label=0.0))

    if skipped:
        print(f"⚠️ Skipped {skipped} pair(s) whose shuffled description matched the original text.")

    return negative_examples

In [None]:
# Generate 6000 synthetic (query, description) positives from `df`; the pairs
# are also written to positive_pairs.jsonl.
positive_test_pairs = generate_positive_pairs(
    df,
    num_samples=6000,
    output_file="positive_pairs.jsonl"
)

# Re-pair the stored queries with other descriptions to get label-0.0 negatives.
negative_test_pairs = generate_negative_pairs_from_positive(
    positive_path="positive_pairs.jsonl",
    output_path="negative_pairs.jsonl"
)

# Print 5 positive and negative pairs
def print_pairs(pairs, label):
    """Print the first five (query, description) pairs under a "<label> pairs:" heading.

    Args:
        pairs: Sequence of objects whose ``texts`` attribute unpacks into
            (query, description).
        label: Heading text, e.g. "Positive".
    """
    divider = "-" * 60
    print(f"\n{label} pairs:")
    for position, example in enumerate(pairs[:5], start=1):
        user_query, wine_text = example.texts
        print(
            f"Pair {position}:",
            f"Query: {user_query}",
            f"Description: {wine_text}",
            divider,
            sep="\n",
        )
# Spot-check the first five pairs of each kind.
print_pairs(positive_test_pairs, "Positive")
print_pairs(negative_test_pairs, "Negative")

In [None]:
# 🔢 Price bucket function
def price_bucket(price):
    """Map a price to a coarse tier.

    Args:
        price: Number or numeric string.

    Returns:
        "low" for prices below 15, "medium" for prices below 30, "high"
        otherwise; None when the price is missing, NaN, or not convertible
        to float.
    """
    try:
        value = float(price)
    except (TypeError, ValueError, OverflowError):
        return None
    if math.isnan(value):
        # NaN fails both "<" comparisons and would otherwise be labeled "high".
        return None
    if value < 15:
        return "low"
    if value < 30:
        return "medium"
    return "high"

# 🧪 Proportional stratified sampling
def stratified_sample_fixed_total(df, target_total=3000):
    """Draw a proportional stratified sample by (country, price bucket).

    Rows missing a description, country or usable price are dropped. Each
    (country, price_bucket) group receives a share of ``target_total``
    proportional to its size. Shares are allocated with the largest-remainder
    method, so the sample has exactly ``min(target_total, usable rows)`` rows
    instead of drifting from the target through independent rounding.

    Args:
        df: DataFrame with "description", "country" and "price" columns.
        target_total: Desired number of sampled rows.

    Returns:
        A new DataFrame (index reset) with an added "price_bucket" column.
        Empty when nothing can be sampled.
    """
    df = df.dropna(subset=["description", "country", "price"]).copy()
    df["price_bucket"] = df["price"].apply(price_bucket)
    df = df.dropna(subset=["price_bucket"])

    total_available = len(df)
    sample_total = max(0, min(int(target_total), total_available))
    if sample_total == 0:
        return df.iloc[0:0].reset_index(drop=True)

    grouped = df.groupby(["country", "price_bucket"])
    group_sizes = grouped.size()

    # Largest-remainder allocation: give every group the floor of its exact
    # quota, then hand the leftover rows to the groups with the largest
    # fractional parts.
    quotas = group_sizes * (sample_total / total_available)
    group_targets = np.floor(quotas).astype(int)
    shortfall = sample_total - int(group_targets.sum())
    if shortfall > 0:
        remainders = (quotas - group_targets).to_numpy()
        bump = np.argsort(-remainders, kind="stable")[:shortfall]
        group_targets.iloc[bump] += 1

    # Sample from each group; a fixed seed keeps the draw repeatable.
    sampled_dfs = []
    for group, n_rows in group_targets.items():
        if n_rows <= 0:
            continue
        group_df = grouped.get_group(group)
        sampled_dfs.append(group_df.sample(min(int(n_rows), len(group_df)), random_state=42))

    return pd.concat(sampled_dfs).reset_index(drop=True)

def generate_stratified_queries_fixed_total(df, target_total=3000, output_path="fixed_stratified_pairs.jsonl"):
    """Generate queries for a stratified sample of wines and save them as JSONL.

    Each output line is {"texts": [query, description], "label": 1.0}. Rows
    whose query generation fails are reported and skipped. The final message
    reports the number of rows actually written.

    Args:
        df: DataFrame passed to ``stratified_sample_fixed_total``.
        target_total: Number of rows to sample.
        output_path: Path of the JSONL file to write.
    """
    stratified_df = stratified_sample_fixed_total(df, target_total)

    written = 0
    with open(output_path, "w", encoding="utf-8") as f:
        for _, row in tqdm(stratified_df.iterrows(), total=len(stratified_df), desc="Generating queries"):
            desc = row["description"]
            try:
                query = generate_sample_query(desc)
                item = {
                    "texts": [query, desc],
                    "label": 1.0
                }
                f.write(json.dumps(item) + "\n")
                written += 1
            except Exception as e:
                print(f"[x] Skipping row due to error: {e}")
                continue

    # Report what was written, not what was sampled.
    print(f"\n✅ Saved {written} examples to: {output_path}")
    skipped = len(stratified_df) - written
    if skipped:
        print(f"[x] {skipped} sampled rows were skipped")

# Write the positives to the file that the negative-pair, diversity-check and
# training cells read ("fixed_positive_stratified_pairs.jsonl"). Writing to
# "fixed_stratified_pairs.jsonl" leaves those cells without their input file.
generate_stratified_queries_fixed_total(
    df,
    target_total=20000,
    output_path="fixed_positive_stratified_pairs.jsonl"
)

In [None]:
# Generate negatives by re-pairing the queries stored in
# fixed_positive_stratified_pairs.jsonl with descriptions from other rows.
negative_test_pairs = generate_negative_pairs_from_positive(
    positive_path="fixed_positive_stratified_pairs.jsonl",
    output_path="fixed_negative_stratified_pairs.jsonl"
)

In [None]:
# Diversity check: make sure the generated queries are neither near-duplicates
# of each other nor wildly unrelated.

# Read in the positive and negative pairs
positive_test_pairs = []
with open("fixed_positive_stratified_pairs.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        positive_test_pairs.append(InputExample(**json.loads(line)))
negative_test_pairs = []
with open("fixed_negative_stratified_pairs.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        negative_test_pairs.append(InputExample(**json.loads(line)))

combined_pairs = positive_test_pairs + negative_test_pairs
# Pull out just the user queries
user_queries = [example.texts[0] for example in combined_pairs]

# Rebinding `model` here also drops this notebook's reference to the generator model.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(user_queries, convert_to_tensor=True)

# Mean cosine similarity over all distinct query pairs, computed without
# building the n x n similarity matrix. For unit vectors u_i:
#   sum_{i != j} u_i . u_j = ||sum_i u_i||^2 - sum_i ||u_i||^2
# Unlike masking the upper triangle with `!= 0`, this also counts pairs whose
# similarity is exactly zero.
unit_embeddings = torch.nn.functional.normalize(embeddings.float(), dim=1)
n_queries = unit_embeddings.shape[0]
if n_queries < 2:
    raise ValueError("Need at least two queries to measure diversity.")
pair_similarity_sum = unit_embeddings.sum(dim=0).pow(2).sum() - unit_embeddings.pow(2).sum()
mean_pair_similarity = (pair_similarity_sum / (n_queries * (n_queries - 1))).item()
print(1.0 - mean_pair_similarity)

In [15]:
def load_examples_from_jsonl(path):
    """Read a JSON-lines file of {"texts": ..., "label": ...} records.

    Args:
        path: Path to the JSONL file.

    Returns:
        One InputExample per line, in file order, with the label cast to float.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return [
            InputExample(texts=record["texts"], label=float(record["label"]))
            for record in map(json.loads, handle)
        ]

# Load the labeled pairs from the two stratified JSONL files.
positive_examples = load_examples_from_jsonl("fixed_positive_stratified_pairs.jsonl")
negative_examples = load_examples_from_jsonl("fixed_negative_stratified_pairs.jsonl")

# Combine and shuffle (no random seed is set in the visible cells, so the order differs between runs)
all_examples = positive_examples + negative_examples
random.shuffle(all_examples)

In [16]:
from torch.utils.data import DataLoader

# Training batches are drawn from the combined positive + negative examples.
train_dataloader = DataLoader(all_examples, shuffle=True, batch_size=16)  # Adjust batch size if needed

In [17]:
from sentence_transformers import SentenceTransformer, losses
# Use the GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# From here on `model` is the MiniLM sentence encoder that gets fine-tuned.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2").to(device)

# Use CosineSimilarityLoss (good for binary positive/negative matching)
train_loss = losses.CosineSimilarityLoss(model=model)

In [18]:
# Get baseline accuracy
def compute_accuracy(model, query_desc_pairs, candidates_per_query=10):
    """Measure how often a query retrieves its own description from a random candidate set.

    For every pair, the correct description is mixed with
    ``candidates_per_query - 1`` randomly chosen other descriptions and the
    candidates are ranked by cosine similarity to the query embedding.
    Distractors are drawn from the distinct description texts, so a candidate
    set never contains the same text twice. All texts are encoded once up
    front instead of once per query.

    Args:
        model: Object with ``encode(list_of_str, convert_to_tensor=True)``
            returning a 2-D tensor of embeddings.
        query_desc_pairs: Sequence of objects whose ``texts`` is
            [query, description].
        candidates_per_query: Size of each candidate set, including the
            correct description.

    Returns:
        (top-1 accuracy, top-5 accuracy) as floats. When fewer than five
        candidates are used, "top-5" covers all of them.

    Raises:
        ValueError: If there are no pairs, ``candidates_per_query`` is below 1,
            or there are not enough distinct descriptions to draw from.
    """
    total = len(query_desc_pairs)
    if total == 0:
        raise ValueError("query_desc_pairs must contain at least one pair.")
    if candidates_per_query < 1:
        raise ValueError("candidates_per_query must be at least 1.")

    queries = [pair.texts[0] for pair in query_desc_pairs]
    unique_descs = list(dict.fromkeys(pair.texts[1] for pair in query_desc_pairs))
    desc_position = {desc: pos for pos, desc in enumerate(unique_descs)}

    n_distractors = candidates_per_query - 1
    if n_distractors > len(unique_descs) - 1:
        raise ValueError(
            f"candidates_per_query={candidates_per_query} needs at least "
            f"{candidates_per_query} distinct descriptions, got {len(unique_descs)}."
        )

    # Unit-normalised embeddings: a dot product is then the cosine similarity.
    query_embs = torch.nn.functional.normalize(
        model.encode(queries, convert_to_tensor=True), dim=-1
    )
    desc_embs = torch.nn.functional.normalize(
        model.encode(unique_descs, convert_to_tensor=True), dim=-1
    )

    top_k = min(5, candidates_per_query)  # topk fails if k exceeds the candidate count
    correct = 0
    top_5_correct = 0

    for row, pair in enumerate(query_desc_pairs):
        correct_pos = desc_position[pair.texts[1]]

        # Sample distractor positions from every description except the correct
        # one: draw from a range that is one shorter and shift past the gap.
        drawn = random.sample(range(len(unique_descs) - 1), n_distractors)
        candidates = [correct_pos] + [pos if pos < correct_pos else pos + 1 for pos in drawn]
        random.shuffle(candidates)

        sims = desc_embs[candidates] @ query_embs[row]
        ranked = [candidates[j] for j in torch.topk(sims, k=top_k).indices.tolist()]

        if ranked[0] == correct_pos:
            correct += 1
        if correct_pos in ranked:
            top_5_correct += 1

    return correct / total, top_5_correct / total



# Baseline before fine-tuning: rank each query's own description among 50 candidates.
compute_accuracy(model, positive_examples[:5000], candidates_per_query=50)


(0.4564, 0.6796)

In [19]:
# Fine-tune for 5 epochs with the cosine-similarity loss; output_path points
# at the "fine-tuned-minilm" directory.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=5,
    warmup_steps=100,
    show_progress_bar=True,
    output_path="fine-tuned-minilm"
)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1157
1000,0.0835
1500,0.0775
2000,0.0726
2500,0.0712
3000,0.063
3500,0.0616
4000,0.0601
4500,0.0601
5000,0.06


In [21]:
# Same measurement as the baseline, now with the fine-tuned model.
# NOTE(review): positive_examples are part of all_examples, which the training
# DataLoader was built from, so these numbers are measured on training pairs,
# not on held-out data.
perfect, top5 = compute_accuracy(model, positive_examples[:5000], candidates_per_query=50)
print(f"Perfect accuracy: {perfect:.4f}")
print(f"Top 5 accuracy: {top5:.4f}")

Perfect accuracy: 0.8112
Top 5 accuracy: 0.9828


In [22]:
# Save the fine-tuned model to the same directory name that fit() was given as output_path.
model.save("fine-tuned-minilm")

In [23]:
# Demo: retrieve the wine descriptions closest to a free-text query.
query = "I'm looking for a bold red wine with black fruit and spice."
descriptions = df["description"].dropna().tolist()

# Embed the query and every non-null description in the dataset.
query_emb = model.encode(query, convert_to_tensor=True).to(device)
desc_embs = model.encode(descriptions, batch_size=32, convert_to_tensor=True).to(device)

# Compute cosine similarities between the query and each description
cosine_scores = util.cos_sim(query_emb, desc_embs)[0]

# Get the top N results
top_k = 5
top_results = torch.topk(cosine_scores, k=top_k)

# Show results, best match first
print("\nTop matching wines:\n" + "-" * 40)
for score, idx in zip(top_results.values, top_results.indices):
    print(f"Score: {score.item():.4f}")
    print(f"Wine Description: {descriptions[idx]}")
    print("-" * 40)


Top matching wines:
----------------------------------------
Score: 0.9609
Wine Description: A slam-dunk to have with tonight’s lasagna. It’s loaded with black fruit and spices, both on the nose and on the palate. Medium-bodied and well balanced, the finish is delightfully long.
----------------------------------------
Score: 0.9605
Wine Description: This ripe and spicy wine from the Dop family's 27 acres offers layers of black fruits. It has a concentrated texture and a firm structure that is juicy while dense. With this richness, the wine will develop into a smooth and generous wine. Drink from 2019.
----------------------------------------
Score: 0.9593
Wine Description: Big, bold and spicy, this wine is ripe with black fruit. Tannins and a rich texture confirm its upfront style. Likely to take its time to develop, it will be ready to drink from 2020.
----------------------------------------
Score: 0.9571
Wine Description: This wine is rich, spicy and full of black fruit. With soli

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub
from huggingface_hub import HfApi, upload_folder

# One id for both calls, so the repo that is created is the repo that is uploaded to.
HUB_REPO_ID = "SpencerCreveling99/fine-tuned-minilm-wine"

api = HfApi()
api.create_repo(
    repo_id=HUB_REPO_ID,
    repo_type="model",
    private=False,
    exist_ok=True,  # allow re-running the cell once the repo exists
)

upload_folder(
    repo_id=HUB_REPO_ID,
    folder_path="./fine-tuned-minilm",
    commit_message="Initial commit",
    repo_type="model",  # "." is not a valid repo type
)