In [None]:
#%pip install --upgrade transformers accelerate safetensors einops

import json
import math
import random
import re

import kagglehub
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util, InputExample, losses
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

### Step 1 | Load our data set
We use the WineMag wine review dataset, which contains about 130k wine reviews written by several reviewers and covering wines from all over the world.

In [None]:
# Download the Kaggle wine-review dataset and read the 130k-review CSV into a DataFrame.
path = kagglehub.dataset_download("zynicide/wine-reviews")
reviews_csv = path + "/winemag-data-130k-v2.csv"
df = pd.read_csv(reviews_csv)

### Step 2 | Generate Synthetic User Queries
To train a BERT-based model to find semantic similarity between user queries and the desired wine, we need good and bad examples of which search query is paired with which wine.
Because our dataset does not already contain such pairs, we use prompt engineering along with some post-processing to create the sample user queries we expect to see.


In [None]:
# Load the instruction-tuned LLM used to generate the synthetic search queries.
# Qwen was chosen because it is a free model we have had experience with in the past.

model_name = "Qwen/Qwen2-7B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
model.config.sliding_window = None
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Detect the accelerator instead of assuming CUDA, matching the detection used
# later in the notebook when the sentence-transformer is loaded.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Trim the query to at most `max_parts` descriptors. The prompt already asks the
# model to do this, but it does not always comply, so this is a safety net.
def trim_query(text, max_parts=3):
    """Reduce a generated query to its first `max_parts` descriptors.

    The text is split on the joiners ",", " and ", " with " and ". "; the first
    `max_parts` non-empty fragments are re-joined as "a, b and c".

    Args:
        text: Raw query text produced by the generator model.
        max_parts: Maximum number of descriptor fragments to keep.

    Returns:
        The trimmed query with only its first character upper-cased
        (``str.capitalize`` lower-cases the rest), or "" when nothing is left.
    """
    # Split by common descriptor joiners; the pattern also swallows the
    # whitespace around each joiner.
    parts = re.split(r'\s*(?:,| and | with |\. )\s*', text.strip())

    # Consecutive or trailing joiners ("a, , b", "a,") yield empty fragments;
    # dropping them avoids output such as "A,  and b" or "A and".
    parts = [part for part in parts if part]

    # Keep only the first few parts
    trimmed_parts = parts[:max_parts]
    if not trimmed_parts:
        return ""

    # Re-join them with commas and a final 'and' for flow
    if len(trimmed_parts) > 1:
        result = ', '.join(trimmed_parts[:-1]) + ' and ' + trimmed_parts[-1]
    else:
        result = trimmed_parts[0]

    return result.strip().capitalize()

def generate_sample_query(wine_desc):
    """Generate one synthetic, user-style search query for a wine description.

    Uses the module-level `model` and `tokenizer` (the Qwen chat model loaded
    earlier in the notebook). A system prompt is picked at random, the model
    is asked for a short query, the answer is cleaned with `trim_query`, and
    the result is wrapped in a randomly chosen phrasing template.

    Args:
        wine_desc: Free-text tasting note / description of the wine.

    Returns:
        The synthetic search query as a string.
    """
    # To keep the model from producing near-identical responses we pick the
    # system prompt at random from ten similar-but-different variants.
    prompt_variants = [
    """You are a helpful assistant that creates short, human-like wine search queries from wine descriptions. The queries should be realistic, not too detailed, and focus on 1–3 key aspects of the wine.""",

    """You are an assistant that turns wine descriptions into simple, casual search phrases that a real person might use. Do not list everything—just highlight a few main traits.""",

    """You generate concise, informal wine search queries based on tasting notes. Keep it brief and include only the most noticeable characteristics.""",

    """You're helping wine lovers describe wines in simple search terms. Summarize the wine's essence in 1–2 quick flavor notes or attributes.""",

    """You generate short, realistic search queries from wine descriptions. Keep the language natural and intuitive. Mention just a couple key traits.""",

    """Your job is to turn wine descriptions into casual search-style queries using only the most important descriptors. Avoid over-detailing or repetition.""",

    """You create short user-style search queries based on wine descriptions. Stick to 1–3 high-impact descriptors. Be realistic and natural, not formal.""",

    """You are generating imperfect but useful search queries from wine descriptions. The goal is to sound like a real wine shopper, not list every flavor.""",

    """Turn the wine descriptions into realistic, search-friendly phrases a person might type. Include only standout flavors or textures.""",

    """You're writing very short, human-friendly search terms based on full wine descriptions. Focus on the most prominent flavors or textures."""
    ]

    # Select a random prompt variant
    prompt = random.choice(prompt_variants)

    # Build the chat: the chosen system prompt, a few worked examples that
    # show the model the expected output style, then the wine to describe.
    messages = [
       {"role": "system", "content": f"""{prompt} Queries must sound like a real person searching online. No more than 2 descriptors. Avoid full sentences or wine jargon.

        Examples:

        Wine: "Flavors of blackberry, plum, leather, and dried herbs with firm tannins and a smoky finish."  
        Query: "bold wine with dark fruit."

        Wine: "Aromas of green apple, lime zest, and flinty minerality with bright acidity."  
        Query: "Crisp white with citrus notes"
        
        Wine: "Bright citrus and minerality with crisp acidity."
        Query: "Zesty white with citrus"

        Wine: "Ripe red cherry, baking spice, and light oak tannins."
        Query: "Fruity red with spice"

        Now generate a query for this wine:
        """},
        {"role": "user", "content": f"Write a natural user-style query for the following wine:\n{wine_desc}\nQuery:"}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = tokenizer(text, return_tensors="pt", padding=True, return_attention_mask=True).to(model.device)

    generated_ids = model.generate(
        model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        max_new_tokens=100,
        # temperature / top_p only apply when sampling is enabled; request it
        # explicitly instead of relying on the checkpoint's generation config.
        do_sample=True,
        temperature=0.8,
        top_p=0.85,
    )
    # generate() returns prompt + completion; keep only the newly generated tokens.
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Prune off excess description and the surrounding double quotes
    response = response.replace('"', "")
    response = trim_query(response)

    # As with the prompt variants, wrap the query in a randomly chosen phrasing
    # to keep the data diverse and reduce over-fitting in the BERT model.
    # NOTE(review): "somthing" below is misspelled; it is left unchanged so new
    # data matches any previously generated files - confirm whether to fix it.
    restructure_patterns = [
    lambda desc: f"{desc}",  
    lambda desc: f"A {desc} wine", 
    lambda desc: f"{desc}, perfect with food",  
    lambda desc: f"Looking for a {desc} wine",  
    lambda desc: f"{desc} ideal with dinner",  
    lambda desc: f"somthing that is {desc}",
    lambda desc: f"Wine with {desc}",
    lambda desc: f"Searching for {desc}",
    lambda desc: f"Wine that is {desc}",
    ]

    # Randomly select a restructuring pattern and apply it
    restructure = random.choice(restructure_patterns)
    return restructure(response)



Now that the model code has been set up we can now make the code to generate the pairs. Because we are using a subset of our entire dataset we want to use some stratification to ensure we get a good subset. In this case we are using 
- price buckets
- country of origin


Note: if you don't have a powerful GPU, you can skip this step and use the pre-generated data.

In [None]:
# Generate price buckets for stratified sampling
def price_bucket(price):
    """Map a price to a coarse bucket label.

    Args:
        price: Anything ``float()`` accepts (number or numeric string).

    Returns:
        "low" for prices below 15, "medium" for prices below 30, "high"
        otherwise, or None when the value is missing, NaN or not numeric.
    """
    try:
        price = float(price)
    except (TypeError, ValueError, OverflowError):
        # None, non-numeric strings, or integers too large for a float
        return None

    # NaN fails every comparison and would otherwise fall through to "high".
    if math.isnan(price):
        return None

    if price < 15:
        return "low"
    if price < 30:
        return "medium"
    return "high"

# Proportional stratified sampling so the subset mirrors the dataset's mix
def stratified_samples(df, target_total=3000, random_state=42):
    """Draw a sample stratified by (country, price bucket).

    Rows missing a description, country or usable price are discarded. Each
    (country, price_bucket) group then contributes a number of rows
    proportional to its share of the remaining data. Because per-group targets
    are rounded independently, the result may not contain exactly
    `target_total` rows.

    Args:
        df: DataFrame with at least "description", "country" and "price" columns.
        target_total: Approximate number of rows to return.
        random_state: Seed passed to ``DataFrame.sample`` for every group.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with an added "price_bucket"
        column; empty when no rows qualify.
    """
    df = df.dropna(subset=["description", "country", "price"]).copy()
    df["price_bucket"] = df["price"].apply(price_bucket)
    df = df.dropna(subset=["price_bucket"])

    grouped = df.groupby(["country", "price_bucket"])
    group_sizes = grouped.size()

    # Nothing to sample from: return the (empty) frame rather than letting
    # pd.concat raise on an empty list further down.
    total_available = group_sizes.sum()
    if total_available == 0:
        return df.reset_index(drop=True)

    # Proportional number of rows to draw from each group
    group_targets = (group_sizes / total_available * target_total).round().astype(int)

    sampled_dfs = []
    for group, size in group_targets.items():
        if size <= 0:
            # Group too small to earn a row at this target size
            continue
        group_df = grouped.get_group(group)
        sampled_dfs.append(group_df.sample(min(size, len(group_df)), random_state=random_state))

    if not sampled_dfs:
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_dfs).reset_index(drop=True)


def generate_stratified_queries(df, target_total=3000, output_path="fixed_stratified_pairs.jsonl"):
    """Generate a synthetic query for each sampled wine and write positive pairs.

    Each output line is a JSON object ``{"texts": [query, description],
    "label": 1.0}``. Rows whose query generation raises are reported and
    skipped so one bad generation does not abort the whole run.

    Args:
        df: Wine review DataFrame (passed to `stratified_samples`).
        target_total: Approximate number of wines to sample.
        output_path: Destination JSONL file (overwritten).
    """
    stratified_df = stratified_samples(df, target_total)

    saved = 0
    with open(output_path, "w", encoding="utf-8") as f:
        for _, row in tqdm(stratified_df.iterrows(), total=len(stratified_df), desc="Generating queries"):
            desc = row["description"]
            try:
                query = generate_sample_query(desc)
            except Exception as e:
                print(f"[x] Skipping row due to error: {e}")
                continue

            item = {
                "texts": [query, desc],
                "label": 1.0
            }
            f.write(json.dumps(item) + "\n")
            saved += 1

    # Report what was actually written, not how many rows were sampled.
    print(f"Saved {saved} of {len(stratified_df)} examples to: {output_path}")

In [None]:
# Take the positive examples and generate negative examples by re-pairing
# every query with a description that belongs to a different example.
def generate_negative_pairs_from_positive(positive_path, output_path="negative_pairs.jsonl"):
    """Write mismatched (query, description) pairs labelled 0.0.

    Descriptions are permuted with Sattolo's algorithm, which produces a
    single-cycle permutation: no description stays at its original position.
    The earlier "shuffle up to 10 times" approach could give up with fixed
    points left, writing true pairs labelled as negatives. Pairs that still
    match because the same description text occurs more than once are skipped.

    Args:
        positive_path: JSONL file of positive pairs (``{"texts": [q, d], ...}``).
        output_path: Destination JSONL file (overwritten).
    """
    # Load the positive examples
    with open(positive_path, "r", encoding="utf-8") as f:
        positive_data = [json.loads(line) for line in f if line.strip()]

    queries = [item["texts"][0] for item in positive_data]
    descriptions = [item["texts"][1] for item in positive_data]

    # Sattolo's algorithm: swap each position only with a strictly earlier one.
    shuffled_descriptions = descriptions.copy()
    for i in range(len(shuffled_descriptions) - 1, 0, -1):
        j = random.randrange(i)
        shuffled_descriptions[i], shuffled_descriptions[j] = shuffled_descriptions[j], shuffled_descriptions[i]

    # Build and write the negative examples
    skipped = 0
    with open(output_path, "w", encoding="utf-8") as f_out:
        for query, true_desc, wrong_desc in tqdm(zip(queries, descriptions, shuffled_descriptions), total=len(queries), desc="Generating negative pairs"):
            if wrong_desc == true_desc:
                # Duplicate description text (or a single-example file): this
                # would be a positive pair mislabelled as negative.
                skipped += 1
                continue
            example = {
                "texts": [query, wrong_desc],
                "label": 0.0
            }
            f_out.write(json.dumps(example) + "\n")

    if skipped:
        print(f"[x] Skipped {skipped} pairs whose shuffled description matched the original")

Now that we have code that selects wines and generates queries, it is time to run it. If you don't have a GPU capable of running this model, skip the first code cell below and load one of the pre-generated data files instead.

In [None]:
# Build the positive (query, description) pairs first, then derive the
# mismatched negative pairs from that file.
positive_pairs_path = "stratified_positive_pairs_mini.jsonl"
negative_pairs_path = "stratified_negative_pairs_mini.jsonl"

generate_stratified_queries(df, target_total=100, output_path=positive_pairs_path)
generate_negative_pairs_from_positive(positive_pairs_path, output_path=negative_pairs_path)

In [None]:
# Load pre-generated pairs
def load_examples_from_jsonl(path):
    """Read a JSONL pair file into a list of `InputExample` objects.

    Every line must be a JSON object with a "texts" list and a "label" value;
    the label is converted to float.
    """
    with open(path, "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f]

    return [
        InputExample(texts=record["texts"], label=float(record["label"]))
        for record in records
    ]

positive_examples = load_examples_from_jsonl("stratified_positive_pairs_mini.jsonl")
negative_examples = load_examples_from_jsonl("stratified_negative_pairs_mini.jsonl")

# Merge both label classes into one training list and shuffle it in place.
all_examples = [*positive_examples, *negative_examples]
random.shuffle(all_examples)

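Now that we have our generated data, we want to check how diverse it is, i.e. how different the queries are from one another. The cell below prints a diversity score, computed as 1 minus the mean pairwise cosine similarity of the query embeddings. If the queries are too similar (mean similarity > 0.7, i.e. diversity < 0.3), the BERT model will be too focused on one type of query and will not perform well.
If they are too dissimilar (mean similarity < 0.3, i.e. diversity > 0.7), the data we made is effectively meaningless and no trend can be found. In general, user data from things like Google searches has a diversity of around 0.4 to 0.6, so that is what we are aiming for.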

In [None]:
# Load the BERT-based sentence-transformer; its embedding function is needed
# to compute the similarity scores.
# NOTE: this rebinds `model`, which previously held the Qwen generator, so the
# Qwen loading cell must be re-run before calling generate_sample_query again.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model_name).to(device)

In [None]:
# Extract all synthetic user queries (first text of every positive pair)
user_queries = [example.texts[0] for example in positive_examples]

# Get the embeddings for the user queries
embeddings = model.encode(user_queries, convert_to_tensor=True)

# Cosine similarity between every pair of query embeddings
sim_matrix = util.pytorch_cos_sim(embeddings, embeddings)

# Pick each unordered pair exactly once through strict-upper-triangle indices.
# Filtering a triu() result with `!= 0` would also discard genuine pairs whose
# similarity happens to be exactly zero.
num_queries = sim_matrix.shape[0]
pair_rows, pair_cols = torch.triu_indices(num_queries, num_queries, offset=1, device=sim_matrix.device)
pairwise_sims = sim_matrix[pair_rows, pair_cols]

if pairwise_sims.numel() == 0:
    print("Need at least two queries to compute a diversity score")
else:
    # Diversity = 1 - mean pairwise similarity
    print(1.0 - pairwise_sims.mean().item())

### Step 3 | Train the Model
Now that we have created our complete dataset using prompt engineering, we can begin training the BERT-based model so that it better selects wine recommendations based on taste-profile descriptors.


Before we do that, we first take a baseline so we can compare how well the model performs before and after fine-tuning. To do this, we pre-compute all embeddings and, for each query, calculate its similarity score against the wine descriptions to decide which one the model would recommend.

In [None]:
class InputExampleEmbedding:
    """Structure for one input example with texts, the label and a unique id"""

    def __init__(self, guid: str = "", texts: list[str] = None, label: int | float = 0, prompt: torch.Tensor = None, desc: torch.Tensor = None):
        """
        Creates one InputExample with the given texts, guid and label

        Args:
            guid: id for the example
            texts: the texts for the example
            label: the label for the example
            prompt: the prompt embedding for the example
            desc: the description embedding for the example
        """
        self.guid = guid
        self.texts = texts
        self.label = label
        self.prompt = prompt
        self.desc = desc

    def __str__(self):
        return "<InputExample> label: {}, texts: {}, prompt: {}, desc: {}".format(str(self.label), "; ".join(self.texts), str(self.prompt), str(self.desc))
    
# Pre-encode pairs
def encode_all(query_desc_pairs):
    """Embed the query and description of every pair with the global `model`.

    All queries and all descriptions are encoded in two batched calls instead
    of two `encode` calls per pair, which lets the encoder batch the work.

    Args:
        query_desc_pairs: Sequence of examples whose ``texts`` are
            ``[query, description]`` and which carry a ``label``.

    Returns:
        A list of `InputExampleEmbedding`, one per input pair, in input order,
        with embeddings placed on the global `device`.
    """
    if not query_desc_pairs:
        return []

    queries = [pair.texts[0] for pair in query_desc_pairs]
    descs = [pair.texts[1] for pair in query_desc_pairs]

    # One row per input sentence; make sure everything is on the same device
    query_embs = model.encode(queries, convert_to_tensor=True).to(device)
    desc_embs = model.encode(descs, convert_to_tensor=True).to(device)

    # Store the embeddings alongside the original texts and label
    return [
        InputExampleEmbedding(
            texts=[query, desc],
            label=pair.label,
            prompt=query_emb,
            desc=desc_emb
        )
        for pair, query, desc, query_emb, desc_emb in zip(query_desc_pairs, queries, descs, query_embs, desc_embs)
    ]

# Save the encoded pairs to a file for future use
def save_encoded_pairs(encoded_pairs, file_path="encoded_pairs.jsonl"):
    """Write encoded pairs to a JSONL file, one JSON object per line.

    Each record holds the texts, the label and both embeddings converted to
    plain Python lists.

    Args:
        encoded_pairs: Iterable of objects exposing ``texts``, ``label`` and
            the tensors ``prompt`` and ``desc``.
        file_path: Destination file (overwritten).
    """
    # Explicit utf-8, consistent with the other JSONL readers/writers here.
    with open(file_path, 'w', encoding="utf-8") as f:
        for pair in encoded_pairs:
            record = {
                'texts': pair.texts,
                'label': pair.label,
                # Tensors are not JSON serialisable: move to CPU, then to lists
                'query_embedding': pair.prompt.cpu().tolist(),
                'desc_embedding': pair.desc.cpu().tolist()
            }
            f.write(json.dumps(record) + '\n')

# Load saved pairs from a file
def load_encoded_pairs(file_path):
    """Read a JSONL file of encoded pairs back into `InputExampleEmbedding` objects.

    Embeddings are rebuilt as tensors on CUDA when available, otherwise on the
    CPU. Blank lines are ignored.

    Args:
        file_path: JSONL file whose records contain "texts", "label",
            "query_embedding" and "desc_embedding".

    Returns:
        A list of `InputExampleEmbedding`, in file order.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loaded_data = []
    # Explicit utf-8, consistent with the other JSONL readers/writers here.
    with open(file_path, 'r', encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank / trailing empty lines
                continue
            record = json.loads(line)

            # Convert embeddings back to tensors
            query_emb = torch.tensor(record['query_embedding'], device=device)
            desc_emb = torch.tensor(record['desc_embedding'], device=device)

            # Reconstruct the structure
            loaded_data.append(InputExampleEmbedding(
                texts=record['texts'],
                label=record['label'],
                prompt=query_emb,
                desc=desc_emb
            ))
    return loaded_data

Now that we have the code to generate the embeddings, let's go ahead and do that. As before, if your GPU can't handle this, feel free to use the provided saved embeddings.

In [None]:
# Embed the first 100 positive pairs with the model as it is before
# fine-tuning, then persist the result so it can be reloaded later.
baseline_subset = positive_examples[:100]
encoded_pairs = encode_all(baseline_subset)
save_encoded_pairs(encoded_pairs, file_path="pre_train_encoded_pairs.jsonl")

In [None]:
# Reload the pre-training embeddings from disk
pre_train_path = "pre_train_encoded_pairs.jsonl"
encoded_pairs = load_encoded_pairs(file_path=pre_train_path)

now that the embeddings have been pre-computed we can use them to test the accuracy of the model

In [None]:
# Get baseline accuracy
def compute_accuracy_pre_emb(encoded_pairs, candidates_per_query=10):
    """Estimate retrieval accuracy from pre-computed embeddings.

    For every pair, the query is ranked against its own description plus
    randomly drawn descriptions of other pairs, using cosine similarity.

    Args:
        encoded_pairs: Sequence of objects exposing ``texts`` (query,
            description) and the tensors ``prompt`` and ``desc``.
        candidates_per_query: Desired number of candidates per query, the
            correct one included. Fewer are used when not enough distinct
            descriptions exist.

    Returns:
        ``(top1_accuracy, top5_accuracy)`` as floats; ``(0.0, 0.0)`` for empty
        input. "Top 5" means top ``min(5, number of candidates)``.
    """
    if not encoded_pairs:
        return 0.0, 0.0

    correct = 0
    top_5_correct = 0
    total = 0

    for pair in encoded_pairs:
        correct_desc = pair.texts[1]

        # Negative samples: embeddings of every description with different
        # text. Clamp the sample size so random.sample cannot raise when the
        # pool is smaller than requested.
        negative_pool = [p.desc for p in encoded_pairs if p.texts[1] != correct_desc]
        num_negatives = min(max(candidates_per_query - 1, 0), len(negative_pool))
        negatives = random.sample(negative_pool, k=num_negatives)

        # The correct description is always candidate 0.
        cand_embs_tensor = torch.stack([pair.desc] + negatives)
        query_emb_tensor = pair.prompt.unsqueeze(0)

        # Compute similarity of the query against every candidate
        sims = util.cos_sim(query_emb_tensor, cand_embs_tensor)[0]

        # topk raises when k exceeds the number of candidates
        top_indices = torch.topk(sims, k=min(5, sims.numel())).indices.tolist()

        if top_indices[0] == 0:
            correct += 1
        if 0 in top_indices:
            top_5_correct += 1
        total += 1

    return correct / total, top_5_correct / total

In [None]:
# Raise candidates_per_query for a more reliable accuracy estimate, at the
# cost of extra computation time.
perfect, top5 = compute_accuracy_pre_emb(encoded_pairs, candidates_per_query=10)
for metric_name, metric_value in (("Perfect accuracy", perfect), ("Top 5 accuracy", top5)):
    print(f"{metric_name}: {metric_value:.4f}")

Finally, now that we have a baseline, it's time to train the model. Luckily for us, the sentence-transformers library has a training function built in, so fine-tuning is easy from here.

In [None]:
num_epochs = 5
train_dataloader = DataLoader(all_examples, shuffle=True, batch_size=16)  # Adjust batch size if needed
train_loss = losses.CosineSimilarityLoss(model=model)

# Warm up over roughly 10% of all optimisation steps. A fixed 100 steps is
# more than the whole run on the mini dataset generated above (about 200
# examples / 16 per batch * 5 epochs = ~65 steps), which would keep the
# learning rate in its warm-up ramp for the entire training.
total_steps = len(train_dataloader) * num_epochs
warmup_steps = max(1, int(0.1 * total_steps))

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
    output_path="fine-tuned-minilm"
)


Now let's take a look at the model's results. If they are not the best, don't worry: we have a fine-tuned version that can be downloaded and played with later to see its results. As before, we pre-compute embeddings to use for the accuracy evaluation, and provide pre-computed ones in case you don't have a strong enough GPU.

In [None]:
# Embed the same first 100 positive pairs again, this time with the model
# after fine-tuning, and persist the result.
post_train_subset = positive_examples[:100]
encoded_pairs = encode_all(post_train_subset)
save_encoded_pairs(encoded_pairs, file_path="post_train_encoded_pairs.jsonl")

In [None]:
# Reload the post-training embeddings from disk
post_train_path = "post_train_encoded_pairs.jsonl"
encoded_pairs = load_encoded_pairs(file_path=post_train_path)

In [None]:
# Raise candidates_per_query for a more reliable accuracy estimate, at the
# cost of extra computation time.
perfect, top5 = compute_accuracy_pre_emb(encoded_pairs, candidates_per_query=10)
for metric_name, metric_value in (("Perfect accuracy", perfect), ("Top 5 accuracy", top5)):
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# Persist the fine-tuned model to a local directory
output_dir = "fine-tuned-minilm"
model.save(output_dir)

### Step 4 | play with the model
Now that the model is working, feel free to test it out: just change the query as you see fit and look at the results.

In [None]:
# Pull the published fine-tuned checkpoint from the hub
hub_model_id = "SpencerCreveling99/fine-tuned-minilm-wine"
model = SentenceTransformer(hub_model_id)

# Load the embeddings that go with this model
sample_embeddings_path = "post_train_sample_encoded_pairs.jsonl"
encoded_pairs = load_encoded_pairs(file_path=sample_embeddings_path)


In [None]:
query = "I'm looking for a bold red wine with black fruit and spice."

# The i-th embedding in `desc_embs` belongs to encoded_pairs[i], so the
# descriptions must come from the same list. Indexing the full review table
# with these positions would print unrelated wines.
descriptions = [example.texts[1] for example in encoded_pairs]

query_emb = model.encode(query, convert_to_tensor=True).to(device)
desc_embs = torch.stack([example.desc for example in encoded_pairs]).to(device)

# Compute cosine similarities between the query and every description
cosine_scores = util.cos_sim(query_emb, desc_embs)[0]

# Get top N results (never more than the number of available descriptions)
top_k = min(5, len(descriptions))
top_results = torch.topk(cosine_scores, k=top_k)

# Show results
print("\nTop matching wines:\n" + "-" * 40)
for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist()):
    print(f"Score: {score:.4f}")
    print(f"Wine Description: {descriptions[idx]}")
    print("-" * 40)