This notebook implements a natural language processing system for restaurant recommendation using semantic embeddings and similarity search. The system processes Turkish restaurant data through character normalization, location input correction, semantic embedding generation, and hybrid similarity matching. Built on Sentence-BERT architecture with the all-MiniLM-L6-v2 model, it generates dense vector representations of restaurant summaries and keywords to enable semantic understanding of user queries. The pipeline incorporates Turkish language processing through Zemberek morphology library for proper text normalization and character handling. Location input correction utilizes a hybrid approach combining local language model inference through LM Studio API with fuzzy string matching to automatically correct misspelled city and district names. The semantic search system implements weighted similarity scoring that combines restaurant summary embeddings with keyword embeddings using an 80-20 weighting scheme. Query processing involves normalization of user input, embedding generation using Sentence-BERT, and cosine similarity computation against pre-computed restaurant embeddings. The final recommendation ranking combines semantic similarity scores with restaurant quality metrics through Total Weighted Score, ensuring recommendations are both semantically relevant and of high quality. The system supports location-based filtering with automatic correction capabilities and includes comprehensive Turkish character normalization and format standardization for consistent embedding generation.

In [None]:
pip install zemberek-python

Note: you may need to restart the kernel to use updated packages.


In [None]:
# First, let's examine the CSV data and normalize Turkish characters
import pandas as pd

def normalize_turkish(text):
    """
    Lower-case *text* and fold Turkish letters to their ASCII look-alikes.

    Empty strings, ``None`` and NaN values are handed back unchanged, so the
    function can be applied directly to DataFrame columns with missing cells.
    Note that the result is always lower-case (e.g. 'İstanbul' -> 'istanbul').
    """
    is_blank = not text or pd.isna(text)
    if is_blank:
        return text

    # After lower-casing, only the lower-case Turkish letters can remain,
    # so one translation table covers both cases of every letter.
    folding_table = str.maketrans('çğıöşü', 'cgiosu')
    folded = text.lower().translate(folding_table)

    # str.lower() turns 'İ' into 'i' followed by U+0307 (combining dot
    # above); drop that leftover mark so the result is plain ASCII 'i'.
    return folded.replace('i\u0307', 'i')

# Read the restaurant dataset from disk
df = pd.read_csv('/Users/Serra/Desktop/bitirme/kullanılan csvler/Final_Data.csv')

# Attach lower-cased, ASCII-folded copies of the city ('Il') and district ('Ilce') columns
df = df.assign(
    Il_normalized=df['Il'].apply(normalize_turkish),
    Ilce_normalized=df['Ilce'].apply(normalize_turkish),
)

# Distinct, non-missing normalized names
unique_cities = df['Il_normalized'].dropna().unique()
unique_districts = df['Ilce_normalized'].dropna().unique()

# Report the distinct locations and the layout of the table
print(
    f"Number of unique cities: {len(unique_cities)}",
    f"Cities: {sorted(unique_cities)}",
    f"\nNumber of unique districts: {len(unique_districts)}",
    f"First 10 districts: {sorted(unique_districts)[:10]}",
    f"\nDataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

Number of unique cities: 3
Cities: ['ankara', 'istanbul', 'izmir']

Number of unique districts: 75
First 10 districts: ['adalar', 'akyurt', 'alsancak', 'altindag', 'arnavutkoy', 'atasehir', 'avcilar', 'ayas', 'bagcilar', 'bahcelievler']

Dataset shape: (2365, 13)
Columns: ['Mekan_Adı', 'Ilce', 'Il', 'Total_Weighted_Score', 'Özet', 'Fiyat-Performans', 'Hizmet', 'Menü Çeşitliliği', 'Ortam', 'Tat', 'Temizlik', 'Il_normalized', 'Ilce_normalized']


In [None]:
import pandas as pd

# Configuration for LM Studio
# Base URL of the local LM Studio server; query_local_llm appends
# "/v1/chat/completions" to it when posting a request.
LM_STUDIO_BASE_URL = "http://localhost:1234"
# Model identifier sent in the "model" field of every chat-completion request.
MODEL_NAME = "turkish-gemma-9b-v0.1-i1"

import requests
import json
from difflib import get_close_matches

def query_local_llm(prompt, system_prompt, temperature=0, max_tokens=50, timeout=10):
    """
    Query the local LLM via LM Studio API (OpenAI-compatible).

    Args:
        prompt: Text sent as the "user" message.
        system_prompt: Text sent as the "system" message.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Upper bound on generated tokens forwarded to the API.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The stripped content of the first returned choice, or ``None`` when
        the request fails, the server answers with a non-200 status, or the
        response body does not have the expected shape. Failures are printed
        rather than raised so callers can fall back to another strategy.
    """
    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
        "stream": False,
    }

    # Transport-level problems (server not running, timeout, bad URL, ...).
    try:
        response = requests.post(
            f"{LM_STUDIO_BASE_URL}/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"LLM connection failed: {e}")
        return None

    if response.status_code != 200:
        print(f"LLM API error: {response.status_code}")
        return None

    # The server answered, but the body may be invalid JSON or lack the
    # expected choices/message/content structure; report that separately so
    # it is not mistaken for a connection problem.
    try:
        return response.json()["choices"][0]["message"]["content"].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        print(f"LLM response malformed: {e!r}")
        return None

# ASCII-folded, lower-case names used to recognise a real province that is
# simply absent from the dataset (so it is rejected instead of being
# "corrected" to a different city).
# NOTE(review): "izmit" is not one of the 81 provinces (it belongs to
# Kocaeli); it is kept from the original list — presumably so the colloquial
# name is rejected rather than fuzzy-matched to "izmir". Confirm intent.
_TURKISH_PROVINCES = frozenset({
    "adana", "adiyaman", "afyonkarahisar", "agri", "aksaray", "amasya", "ankara", "antalya",
    "ardahan", "artvin", "aydin", "balikesir", "bartin", "batman", "bayburt", "bilecik",
    "bingol", "bitlis", "bolu", "burdur", "bursa", "canakkale", "cankiri", "corum",
    "denizli", "diyarbakir", "duzce", "edirne", "elazig", "erzincan", "erzurum", "eskisehir",
    "gaziantep", "giresun", "gumushane", "hakkari", "hatay", "igdir", "isparta", "istanbul",
    "izmir", "izmit", "kahramanmaras", "karabuk", "karaman", "kars", "kastamonu", "kayseri", "kilis",
    "kirikkale", "kirklareli", "kirsehir", "kocaeli", "konya", "kutahya", "malatya", "manisa",
    "mardin", "mersin", "mugla", "mus", "nevsehir", "nigde", "ordu", "osmaniye", "rize",
    "sakarya", "samsun", "sanliurfa", "siirt", "sinop", "sivas", "sirnak", "tekirdag",
    "tokat", "trabzon", "tunceli", "usak", "van", "yalova", "yozgat", "zonguldak",
})


def normalize_location(text, allowed_values, location_type="location"):
    """
    Map a user-typed Turkish location name onto one of *allowed_values*.

    Resolution order:
      1. Exact match after ``normalize_turkish`` folding (case and
         diacritics are ignored).
      2. For ``location_type == "city"``: if the input is a known province
         that is not in *allowed_values*, give up immediately (return None).
      3. LLM correction via ``query_local_llm`` - skipped for
         ``location_type == "district"``.
      4. Fuzzy matching with ``difflib.get_close_matches`` (cutoff 0.6).

    Args:
        text: Raw user input.
        allowed_values: Iterable of valid names as they appear in the dataset.
        location_type: "city", "district" or any other label; it is also
            inserted into the LLM prompt.

    Returns:
        The matching entry of *allowed_values* (original spelling), or
        ``None`` when the input is blank or no match is found.
    """
    if not text or not text.strip():
        return None

    text = text.strip()

    # Folded name -> original spelling, so every comparison below is
    # insensitive to case and Turkish diacritics.
    normalized_input = normalize_turkish(text)
    normalized_allowed = {normalize_turkish(val): val for val in allowed_values}

    # Check if already valid (exact match)
    if normalized_input in normalized_allowed:
        return normalized_allowed[normalized_input]

    # From here on the input is known NOT to be an exact match.

    if location_type == "city" and normalized_input in _TURKISH_PROVINCES:
        # A correctly spelled province that the dataset does not cover must
        # not be "corrected" into some other city.
        print(f"Valid Turkish province '{text}' but not available in our dataset")
        return None

    # Districts are never sent to the LLM; only fuzzy matching is attempted.
    skip_llm = location_type == "district"
    if skip_llm:
        print(f"District '{text}' not found - trying fuzzy matching only")
    else:
        system_prompt = ("You are a Turkish location normalizer. Return only the corrected city or district name in Turkish, "
                        "matching diacritics (İ/ı, Ş, Ğ, Ü, Ö, Ç), with correct title casing. "
                        "Only correct obvious spelling errors, not valid location names. "
                        "If unsure, return the closest valid option from the provided list. "
                        "Output only the name, nothing else.")

        allowed_list = ", ".join(sorted(allowed_values))
        prompt = f"Correct this Turkish {location_type} name: '{text}'\nValid options: {allowed_list}"

        llm_result = query_local_llm(prompt, system_prompt)

        if llm_result:
            # Accept the LLM answer only if it names one of the allowed values.
            llm_normalized = normalize_turkish(llm_result)
            if llm_normalized in normalized_allowed:
                corrected = normalized_allowed[llm_normalized]
                if corrected != text:
                    print(f"LLM corrected '{text}' → '{corrected}'")
                return corrected

    # Fallback to fuzzy matching. Compare the folded forms so that a missing
    # diacritic (e.g. 'uskudar' vs 'Üsküdar') does not lower the similarity.
    candidates = [key for key in normalized_allowed if isinstance(key, str)]
    fuzzy_matches = get_close_matches(normalized_input, candidates, n=1, cutoff=0.6)

    if fuzzy_matches:
        matched = normalized_allowed[fuzzy_matches[0]]
        print(f"Fuzzy matched '{text}' → '{matched}'")
        return matched

    return None  # No correction found

print("LM Studio client and location normalizer ready!")

LM Studio client and location normalizer ready!


In [None]:
def get_location_and_filter_restaurants_enhanced(
    csv_path='/Users/Serra/Desktop/bitirme/kullanılan csvler/Final_Data.csv',
):
    """
    Interactively ask for a city and/or district and return matching restaurants.

    Steps:
      1. Load the restaurant CSV and add ``Il_normalized`` / ``Ilce_normalized``
         columns.
      2. Prompt for city and district until at least one is given and every
         given value can be resolved by ``normalize_location``.
      3. Filter the table by the resolved city and/or district.
      4. Print a short summary and return the filtered DataFrame.

    Args:
        csv_path: Location of the restaurant CSV. Defaults to the path the
            notebook was originally run with.

    Returns:
        DataFrame with the rows whose 'Il' / 'Ilce' equal the resolved values.
        It may be empty when the chosen district does not belong to the
        chosen city.
    """
    # Load data and normalize Turkish characters
    df = pd.read_csv(csv_path)
    df['Il_normalized'] = df['Il'].apply(normalize_turkish)
    df['Ilce_normalized'] = df['Ilce'].apply(normalize_turkish)

    available_cities = list(df['Il'].dropna().unique())
    available_districts = list(df['Ilce'].dropna().unique())

    print("Restaurant Recommendation System (with Auto-Correction)")
    print("Please provide location information:")

    # Keep asking until the input can be resolved; both values are re-read
    # on every retry.
    while True:
        city_input = input("Enter city (or press Enter to skip): ").strip()
        district_input = input("Enter district (or press Enter to skip): ").strip()

        # Check if at least one is provided
        if not city_input and not district_input:
            print("Warning: At least one location (city or district) must be provided!")
            continue

        # Auto-correct city if provided
        city_corrected = None
        if city_input:
            city_corrected = normalize_location(city_input, available_cities, "city")
            if not city_corrected:
                print(f"City '{city_input}' is not available in our dataset. Please try again.")
                print(f"Available cities in dataset: {sorted(available_cities)}")
                continue

        # Auto-correct district if provided
        district_corrected = None
        if district_input:
            district_corrected = normalize_location(district_input, available_districts, "district")
            if not district_corrected:
                print(f"District '{district_input}' is not available in our dataset. Please try again.")
                print(f"First 10 available districts in dataset: {sorted(available_districts)[:10]}...")
                continue

        # Every value that was provided has been resolved (a skipped one stays None)
        break

    print(f"Final location: City='{city_corrected}', District='{district_corrected}'")

    # Filter restaurants based on corrected input
    filtered_df = df.copy()

    # Apply city filter if provided
    if city_corrected:
        filtered_df = filtered_df[filtered_df['Il'] == city_corrected]
        print(f"Filtered by city '{city_corrected}': {len(filtered_df)} restaurants found")

    # Apply district filter if provided
    if district_corrected:
        filtered_df = filtered_df[filtered_df['Ilce'] == district_corrected]
        print(f"Filtered by district '{district_corrected}': {len(filtered_df)} restaurants found")

    print(f"Final filtered results: {len(filtered_df)} restaurants")

    # Show sample results
    if not filtered_df.empty:
        print("\nSample restaurants:")
        print(filtered_df[['Mekan_Adı', 'Il', 'Ilce', 'Total_Weighted_Score']].head())
    else:
        print("No restaurants found for the given location.")

    return filtered_df

# Run the interactive location prompt and keep the result for the embedding cell below.
# Start LM Studio first if LLM-based correction of city names is wanted; when the
# server cannot be reached query_local_llm returns None and normalize_location
# falls back to fuzzy matching.
filtered_restaurants = get_location_and_filter_restaurants_enhanced()


Restaurant Recommendation System (with Auto-Correction)
Please provide location information:
Final location: City='İstanbul', District='Kadıköy'
Filtered by city 'İstanbul': 1003 restaurants found
Filtered by district 'Kadıköy': 39 restaurants found
Final filtered results: 39 restaurants

Sample restaurants:
                           Mekan_Adı        Il     Ilce  Total_Weighted_Score
28                     5masa_kalamis  İstanbul  Kadıköy              0.149464
29                          700_gram  İstanbul  Kadıköy              0.226943
63                    affan_ocakbasi  İstanbul  Kadıköy              0.296856
106                    alef_ocakbasi  İstanbul  Kadıköy              0.198858
179  antebi_restaurant_ciftehavuzlar  İstanbul  Kadıköy              0.137211


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from zemberek import TurkishSentenceNormalizer, TurkishMorphology

# Rank the location-filtered restaurants against a free-text user query.
df = filtered_restaurants
morph = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morph)
sbert = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

if df.empty:
    # Nothing to embed or rank; avoids a shape error in the matrix products below.
    print("No restaurants to rank for the selected location.")
else:
    # embed all summaries (missing summaries become empty strings)
    summ_texts = df['Özet'].fillna("").astype(str).tolist()
    summ_emb = sbert.encode(summ_texts, convert_to_numpy=True, normalize_embeddings=True)

    # embed all keywords, if the dataset provides them; the CSV loaded in this
    # notebook has no 'Keywords' column, which previously raised KeyError here
    has_keywords = 'Keywords' in df.columns
    if has_keywords:
        keywords_texts = df['Keywords'].fillna("").astype(str).tolist()
        keywords_emb = sbert.encode(keywords_texts, convert_to_numpy=True, normalize_embeddings=True)
    else:
        print("Warning: 'Keywords' column not found - ranking by summary similarity only.")

    # query
    q = input("Cümleni yaz: ")
    q_norm = normalizer.normalize(q)
    q_emb = sbert.encode(q_norm, convert_to_numpy=True, normalize_embeddings=True)

    # similarity: embeddings are requested normalized, so the dot product is
    # the cosine similarity
    sims_summ = summ_emb @ q_emb

    if has_keywords:
        sims_kw = keywords_emb @ q_emb
        # weighted similarity (80% summaries + 20% keywords)
        sims = (0.8 * sims_summ) + (0.2 * sims_kw)
    else:
        sims = sims_summ

    # top-5 by similarity (argsort on the negated scores gives descending order)
    top5_idx = np.argsort(-sims)[:5]
    top5_df = df.iloc[top5_idx].copy()
    top5_df["sim_score"] = sims[top5_idx]

    # sort again by Total_Weighted_Score (desc)
    top5_sorted = top5_df.sort_values(by="Total_Weighted_Score", ascending=False)

    # print results
    for _, row in top5_sorted.iterrows():
        print(f"{row['Mekan_Adı']}  (sim={row['sim_score']:.3f}, total={row['Total_Weighted_Score']})")

2025-09-03 21:27:48,839 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 1.9773387908935547

2025-09-03 21:27:52,074 - sentence_transformers.SentenceTransformer - INFO
Msg: Use pytorch device_name: mps

2025-09-03 21:27:52,074 - sentence_transformers.SentenceTransformer - INFO
Msg: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2



Batches:   0%|          | 0/2 [00:00<?, ?it/s]

KeyError: 'Keywords'