# Lab 6: Generate Synthetic Queries with Misspellings

In [1]:
import pandas as pd
import re
import json
import ollama

## 1. Data Loading

In [4]:
df = pd.read_csv('web_search_queries.csv')
print(f"Loaded {len(df)} queries")
df.head(50)

Loaded 45 queries


Unnamed: 0,Topic,Query
0,Map Search,restaurants near Central Park
1,Map Search,shortest route to Times Square from JFK Airport
2,Map Search,petrol stations open now in downtown Los Angeles
3,Map Search,walking directions to Eiffel Tower from Louvre...
4,Map Search,nearest hiking trails in Yosemite National Park
5,Job Search,software engineer jobs in San Francisco
6,Job Search,remote work opportunities in marketing
7,Job Search,freelance graphic design projects for beginners
8,Job Search,data scientist positions at Google
9,Job Search,entry-level mechanical engineering jobs in Austin


## 2. Abbreviation Detection

In [6]:
KNOWN_ABBREVIATIONS = {
    'JFK', 'LAX', 'SFO', 'ORD', 'DFW', 'ATL', 'LHR', 'CDG',
    'NBC', 'CBS', 'ABC', 'CNN', 'BBC', 'HBO', 'IBM',
    'NASA', 'FBI', 'CIA', 'USPS', 'UPS',
    'AI', 'ML', 'NLP', 'API', 'URL', 'HTML', 'CSS', 'SQL',
    'TV', 'DVD', 'CEO', 'HR', 'IT', 'FAQ',
}

def find_protected_words(query: str) -> list:
    protected = []
    for word in query.split():
        clean = re.sub(r'[^a-zA-Z]', '', word).upper()
        if clean in KNOWN_ABBREVIATIONS:
            protected.append(word)
    return protected


print(find_protected_words("shortest route from JFK Airport"))

['JFK']


## 3. LLM-Based Misspelling Generator

In [59]:
def generate_misspellings(query: str, n: int = 5, model: str = "llama3") -> list:
    protected = find_protected_words(query)
    protected_str = ', '.join(protected) if protected else 'none'
    
    prompt = f"""Generate exactly {n} misspelling variants of this query:
"{query}"

CRITICAL: These words must stay EXACTLY as-is, do not change them: [{protected_str}]
Include: omission (like: machine->machin), transposition (like: the->teh), phonetic (like: phone->fone), repetition (like: running->runnning)
Each variant: 1-3 typos in OTHER words only.

Return ONLY JSON array: ["variant1", "variant2", ...]"""
    
    response = ollama.chat(model=model, messages=[{'role': 'user', 'content': prompt}])
    content = response['message']['content']
    
    match = re.search(r'\[.*?\]', content, re.DOTALL)
    if match:
        try:
            return json.loads(match.group())[:n]
        except:
            pass
    return re.findall(r'"([^"]+)"', content)[:n]

In [12]:
MODEL = "llama3" #"mistral"

test_queries = [
    "machine learning applications",
    "shortest route to Times Square from JFK Airport",
    "software engineer jobs in San Francisco",
]

for query in test_queries:
    print(f"\nOriginal: {query}")
    print(f"Protected: {find_protected_words(query)}")
    for i, v in enumerate(generate_misspellings(query, n=5, model=MODEL), 1):
        print(f"  {i}. {v}")


Original: machine learning applications
Protected: []
  1. machin learing applications
  2. machine learnign applicaitons
  3. macnine learning appications
  4. machien learning appliacations
  5. machines lerning appliations

Original: shortest route to Times Square from JFK Airport
Protected: ['JFK']
  1. shortest rout to Times Squair from JFk Airport
  2. shorrest route to Time Square from JFK Air Port
  3. shortesrt route to Timmes Square from JKf airport
  4. shortest rote to Tiimes Square from FKJ Airport
  5. shotest route to Tymes Sqare from JFK Aiport

Original: software engineer jobs in San Francisco
Protected: []
  1. softwear engineeer jobs in San Fransico
  2. software engenier jobz in San Franiscso
  3. sworfeware engineerin' jobs in San Frahisco
  4. softwar engineer joobs in Sah Francisco
  5. sofware enjinieer jobs in San Fancisco


In [26]:
# test
MODEL = "mistral"

test_queries = [
    "machine learning applications",
    "shortest route to Times Square from JFK Airport",
    "software engineer jobs in San Francisco",
]

for query in test_queries:
    print(f"\nOriginal: {query}")
    print(f"Protected: {find_protected_words(query)}")
    for i, v in enumerate(generate_misspellings(query, n=5, model=MODEL), 1):
        print(f"  {i}. {v}")


Original: machine learning applications
Protected: []
  1. machin learning applications
  2. machine learnings applicatons
  3. machine learing applcations
  4. machine learning appilcations
  5. machine learing applications

Original: shortest route to Times Square from JFK Airport
Protected: ['JFK']
  1. shortest route to Time Square from JFK Airport
  2. shortest rout to Times Squre from JFK Airpot
  3. shortest route to Tims Square from JFK Airpot
  4. shortest route to Times Squar from JFK Airport
  5. shortest route to Times Square form JFK Airport

Original: software engineer jobs in San Francisco
Protected: []
  1. software enginer jobs in San Francico
  2. software engineer jabs in San Francisco
  3. software enginner jobs in San Franscisco
  4. software engineer jobs in San Frantisco
  5. software engineer job in San Francisco


## 4. Process All Queries

In [13]:
def process_all_queries(df, n_variants=5, model="llama3"):
    results = []
    for _, row in df.iterrows():
        variants = generate_misspellings(row['Query'], n=n_variants, model=model)
        for i, v in enumerate(variants):
            results.append({'topic': row['Topic'], 'original': row['Query'], 
                           'variant_id': i+1, 'variant': v, 'model': model})
    return pd.DataFrame(results)

results_df = process_all_queries(df.head(50), n_variants=3, model=MODEL)
results_df

Unnamed: 0,topic,original,variant_id,variant,model
0,Map Search,restaurants near Central Park,1,resturants neer Centrail Park,llama3
1,Map Search,restaurants near Central Park,2,resaturants near Centreal Parke,llama3
2,Map Search,restaurants near Central Park,3,restrant's near Centrul PArk,llama3
3,Map Search,shortest route to Times Square from JFK Airport,1,shortest rute to Times Squre from JKF Airport,llama3
4,Map Search,shortest route to Times Square from JFK Airport,2,shoret routhe to Time Square from JFK Airport,llama3
...,...,...,...,...,...
130,Local Services,affordable wedding photographers in New York,2,afordable wdding photograhers in New Yorkk,llama3
131,Local Services,affordable wedding photographers in New York,3,afrordable weddinng photagraphes in Neyw York,llama3
132,Local Services,dog groomers open on Sundays,1,dog greemers open on Sundaze,llama3
133,Local Services,dog groomers open on Sundays,2,dog groomes opwn on Sundays,llama3


In [67]:
results_df1 = process_all_queries(df.head(50), n_variants=3, model="mistral")
results_df_=pd.concat([results_df,results_df1])
results_df_.to_csv("results_week6lab2.csv")
results_df_

Unnamed: 0,topic,original,variant_id,variant,model
0,Map Search,restaurants near Central Park,1,resturants neer Centrail Park,llama3
1,Map Search,restaurants near Central Park,2,resaturants near Centreal Parke,llama3
2,Map Search,restaurants near Central Park,3,restrant's near Centrul PArk,llama3
3,Map Search,shortest route to Times Square from JFK Airport,1,shortest rute to Times Squre from JKF Airport,llama3
4,Map Search,shortest route to Times Square from JFK Airport,2,shoret routhe to Time Square from JFK Airport,llama3
...,...,...,...,...,...
128,Local Services,affordable wedding photographers in New York,2,affordable wedding phographers in New Yotk,mistral
129,Local Services,affordable wedding photographers in New York,3,affordable weding photograpers in New York,mistral
130,Local Services,dog groomers open on Sundays,1,dog groomers open on Sundaies,mistral
131,Local Services,dog groomers open on Sundays,2,dog groomers opes on Sundays,mistral


## 5. Multi-Model Comparison

In [15]:
def check_protected_preserved(original: str, variant: str) -> bool:
    for word in find_protected_words(original):
        if word not in variant:
            return False
    return True

def analyze_error_types(original: str, variant: str) -> list:
    errors = []
    for ow, vw in zip(original.lower().split(), variant.lower().split()):
        if len(vw) < len(ow): errors.append('omission')
        elif len(vw) > len(ow): errors.append('repetition')
        elif ow != vw:
            errors.append('transposition' if sorted(ow) == sorted(vw) else 'substitution')
    return list(set(errors))

In [21]:

MODELS = ["llama3"]  
TEST_QUERY = "shortest route to Times Square from JFK Airport"
N=50
for model in MODELS:
    print(f"\n=== {model} ===")
    try:
        variants = generate_misspellings(TEST_QUERY, n=N, model=model)
        preserved_count = 0
        for v in variants:
            preserved = check_protected_preserved(TEST_QUERY, v)
            preserved_count += preserved
            status = "✓" if preserved else "✗"
            errors = analyze_error_types(TEST_QUERY, v)
            print(f"  {status} {v} [{', '.join(errors)}]")
        print(f"  Protected preserved: {preserved_count}/{N}")
    except Exception as e:
        print(f"  Error: {e}")


=== llama3 ===
  ✗ shorist rout to Times Square from JKF Airport [omission, transposition]
  ✓ shortest rute to Time Squre from JFK Airpot [omission]
  ✗ shotest rout to Tims Square from JKf Airport [omission, transposition]
  ✓ shiortest route to Times Sqaure from JFK Arpoirt [repetition, transposition]
  ✗ shorturst rouete to Times Sqare from JFk Airport [omission, repetition]
  ✓ shourtest route to Timze Square from JFK Aiprport [repetition, substitution]
  ✗ schorist route to Time Squre from JKF Airpordt [omission, repetition, transposition, substitution]
  ✗ shorest rout to Timsqurare from JKf Airpotr [omission, repetition]
  ✓ shorteset rout to Times Squear from JFK Airopirt [omission, repetition, transposition]
  ✗ shotest rouete to Timze Square from JFk Aipport [omission, repetition, substitution]
  ✓ shorist rooute to Time Squre from JFK Arpoirtt [omission, repetition]
  ✗ shorest route to Timsqurare from JKf Airpordt [omission, repetition]
  ✗ shorturst rout to Times Sqare f

In [62]:

MODELS = ["mistral"]  
TEST_QUERY = "shortest route to Times Square from JFK Airport"
N=26
for model in MODELS:
    print(f"\n=== {model} ===")

    variants = generate_misspellings(TEST_QUERY, n=N, model=model)
    preserved_count = 0
    for v in variants:
        preserved = check_protected_preserved(TEST_QUERY, v)
        preserved_count += preserved
        status = "✓" if preserved else "✗"
        errors = analyze_error_types(TEST_QUERY, v)
        print(f"  {status} {v} [{', '.join(errors)}]")
    print(f"  Protected preserved: {preserved_count}/{len(variants)}")



=== mistral ===
  ✓ shortest route to Times Square from JFK Airpot [omission]
  ✓ shoertest rout to Time Square from JFK Airport [omission, repetition]
  ✓ shortest route to Tines Square from JFK Airport [substitution]
  ✓ shortest rute to Times Square from JFK Airport [omission]
  ✓ shortest route too Times Square from JFK Airport [repetition]
  ✓ shortest route to Timess Square from JFK Airport [repetition]
  ✓ shortest route to Time Squre from JFK Airport [omission]
  ✓ shorrest rout to Times Square from JFK Airpot [omission, substitution]
  ✓ shorthst route to Tines Square from JFK Airport [substitution]
  ✓ shortest rout too Timess Square from JFK Airport [omission, repetition]
  ✓ shortest route to Timess Squre from JFK Airport [omission, repetition]
  ✓ shortest route to Time Squr from JFK Airport [omission]
  ✓ shortest rout to Times Sqare from JFK Airpot [omission]
  ✓ shortest rout too Tines Sqare from JFK Airport [omission, repetition, substitution]
  ✓ shortest rout too Ti

In [35]:

MODELS = ["mistral"]  
TEST_QUERY = "free streaming plaftorms for documentary"
N=50
for model in MODELS:
    print(f"\n=== {model} ===")
    try:
        variants = generate_misspellings(TEST_QUERY, n=N, model=model)
        preserved_count = 0
        for v in variants:
            preserved = check_protected_preserved(TEST_QUERY, v)
            preserved_count += preserved
            status = "✓" if preserved else "✗"
            errors = analyze_error_types(TEST_QUERY, v)
            print(f"  {status} {v} [{', '.join(errors)}]")
        print(f"  Protected preserved: {preserved_count}/{N}")
    except Exception as e:
        print(f"  Error: {e}")


=== mistral ===
  ✓ free streaming platfroms for documentary [transposition]
  ✓ free streaming plaftorms for documetary [omission]
  ✓ free streaming platfomrs for documentry [omission, transposition]
  ✓ free streaming plaftorms for documentary []
  ✓ free streaming plaftorms for doocumentary [repetition]
  ✓ free streaming platfoms for documetnary [omission, transposition]
  ✓ free streaming plaftorms for documentarie [repetition]
  ✓ free streaming platfroms for documetaryy [transposition, substitution]
  ✓ free streaming platfroms for documeny [omission, transposition]
  ✓ free streaming platfroms for doocumentaries [repetition, transposition]
  ✓ free streaming platfroms for documentariesy [repetition, transposition]
  ✓ free streaming platfroms for documentarites [repetition, transposition]
  ✓ free streaming platfroms for documentariesies [repetition, transposition]
  ✓ free streaming platfroms for documetaryi [transposition, substitution]
  ✓ free streaming platfroms for docu

## 6. Search Engine Testing

In [14]:
import urllib.parse

query = "machine learning applications"
variants = generate_misspellings(query, n=3, model=MODEL)

print(f"Original: https://www.google.com/search?q={urllib.parse.quote(query)}")
for i, v in enumerate(variants, 1):
    print(f"Variant {i}: https://www.google.com/search?q={urllib.parse.quote(v)}")

Original: https://www.google.com/search?q=machine%20learning%20applications
Variant 1: https://www.google.com/search?q=machien%20lerning%20applicaitons
Variant 2: https://www.google.com/search?q=maachine%20leerinig%20appliacations
Variant 3: https://www.google.com/search?q=machiene%20learnin%20appilications


In [None]:
https://www.google.com/search?q=shoret+routhe+to+Time+Square+from+JFK+Airpor&oq=shoret+routhe+to+Time+Square+from+JFK+Airpor