In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
import re

# Download required NLTK data
nltk.download('punkt')

# Load the data
df = pd.read_csv('lastt.csv')
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())


[nltk_data] Downloading package punkt to /Users/Serra/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset shape: (27108, 19)

Columns: ['Unnamed: 0', 'Mekan_Adı', 'Kullanıcı_Adı', 'Yorum', 'Puan', 'Yorum_Tarihi', 'Restoran_Kategorisi', 'Restoran_Toplam_Yorum_Sayısı', 'Ortalama_Restoran_Puanı', 'Restoran_Adresi', 'sentiment_score', 'label', 'AI_Result', 'Tat', 'Hizmet', 'Ortam', 'Fiyat-Performans', 'Menü Çeşitliliği', 'Temizlik']


# 1.1 Veri Hazırlığı & Temizlik

## 1. Null Değer Kontrolü


In [2]:
# Check null values in each column
null_counts = df.isnull().sum()
print("Null value counts per column:")
print(null_counts)

# Calculate percentage of null values
null_percentages = (null_counts / len(df)) * 100
print("\nNull value percentages per column:")
print(null_percentages)

# List columns with null values
columns_with_nulls = null_counts[null_counts > 0].index.tolist()
print("\nColumns containing null values:", columns_with_nulls)


Null value counts per column:
Unnamed: 0                          0
Mekan_Adı                           0
Kullanıcı_Adı                       0
Yorum                               0
Puan                                0
Yorum_Tarihi                        0
Restoran_Kategorisi                 0
Restoran_Toplam_Yorum_Sayısı        0
Ortalama_Restoran_Puanı             0
Restoran_Adresi                     0
sentiment_score                     3
label                               3
AI_Result                           1
Tat                             10595
Hizmet                          12499
Ortam                           19167
Fiyat-Performans                20356
Menü Çeşitliliği                25165
Temizlik                        23473
dtype: int64

Null value percentages per column:
Unnamed: 0                       0.000000
Mekan_Adı                        0.000000
Kullanıcı_Adı                    0.000000
Yorum                            0.000000
Puan                           

In [3]:
df = df.replace('', np.nan)
df = df.dropna(subset=['sentiment_score', 'label', 'AI_Result'], how='any')
null_counts = df.isnull().sum()
print("Null value counts per column:")
print(null_counts)

Null value counts per column:
Unnamed: 0                          0
Mekan_Adı                           0
Kullanıcı_Adı                       0
Yorum                               0
Puan                                0
Yorum_Tarihi                        0
Restoran_Kategorisi                 0
Restoran_Toplam_Yorum_Sayısı        0
Ortalama_Restoran_Puanı             0
Restoran_Adresi                     0
sentiment_score                     0
label                               0
AI_Result                           0
Tat                             10592
Hizmet                          12497
Ortam                           19164
Fiyat-Performans                20353
Menü Çeşitliliği                25161
Temizlik                        23470
dtype: int64


## 2. Kategori Skorlarının Normalizasyonu


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27104 entries, 0 to 27107
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    27104 non-null  int64  
 1   Mekan_Adı                     27104 non-null  object 
 2   Kullanıcı_Adı                 27104 non-null  object 
 3   Yorum                         27104 non-null  object 
 4   Puan                          27104 non-null  int64  
 5   Yorum_Tarihi                  27104 non-null  object 
 6   Restoran_Kategorisi           27104 non-null  object 
 7   Restoran_Toplam_Yorum_Sayısı  27104 non-null  int64  
 8   Ortalama_Restoran_Puanı       27104 non-null  float64
 9   Restoran_Adresi               27104 non-null  object 
 10  sentiment_score               27104 non-null  float64
 11  label                         27104 non-null  object 
 12  AI_Result                     27104 non-null  object 
 13  Tat   

In [10]:
col = 'Tat'  # or any column you want
# Try to convert, marking errors as NaN
converted = pd.to_numeric(df[col], errors='coerce')
# Find problematic cells (where conversion failed but value is not null/empty)
problematic = df.loc[converted.isna() & df[col].notna(), col]
print("⚠️ Problematic cells that could not be converted to float:")
print(problematic)


⚠️ Problematic cells that could not be converted to float:
11696    {'Roast beef pizza': 1.0, 'Cannoli tatlısı': -...
19776                                       Balıklar taze.
Name: Tat, dtype: object


In [11]:
cols_to_check = ['Tat', 'Hizmet']
for col in cols_to_check:
    converted = pd.to_numeric(df[col], errors='coerce')
    # Drop problematic rows
    df = df.loc[~(converted.isna() & df[col].notna())].copy()
    # Convert the column to float
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [12]:
df['Tat'] = df['Tat'].astype(float)
df['Hizmet'] = df['Hizmet'].astype(float)

In [7]:
# Assuming we have category score columns, let's normalize them to 0-1 range
# First, identify category score columns (you might need to adjust this based on your actual column names)
# Select columns that contain 'kategori' or 'score' in the name AND are float type
category_columns = [
    col for col in df.columns 
    if  (pd.api.types.is_float_dtype(df[col]))]

if category_columns:
    # Min-Max normalization for category scores
    for col in category_columns:
        min_val = df[col].min()
        max_val = df[col].max()
        if pd.notnull(min_val) and pd.notnull(max_val) and max_val != min_val:  # avoid division by zero
            df[f'{col}_normalized'] = (df[col] - min_val) / (max_val - min_val)
            print(f"Normalized {col}: Min={df[f'{col}_normalized'].min():.2f}, Max={df[f'{col}_normalized'].max():.2f}")
        else:
            print(f"Skipped {col}: constant or empty values")
else:
    print("No category score float columns found. Please specify the correct column names.")



Normalized Ortalama_Restoran_Puanı: Min=0.00, Max=1.00
Normalized sentiment_score: Min=0.00, Max=1.00
Normalized Ortam: Min=0.00, Max=1.00
Normalized Fiyat-Performans: Min=0.00, Max=1.00
Normalized Menü Çeşitliliği: Min=0.00, Max=1.00
Normalized Temizlik: Min=0.00, Max=1.00


## 3. Adres Parsing (Şehir, İlçe Çıkarma)


In [16]:
import re
import unicodedata
import pandas as pd

# Full list of Turkish provinces (İl)
TR_CITIES = {'ankara','istanbul','izmir'}

def _norm(s: str) -> str:
    """Basic normalize: unicode, collapse spaces, trim."""
    s = unicodedata.normalize("NFKC", str(s))
    s = re.sub(r'\s+', ' ', s).strip(' ,.;-')
    return s

def _clean_piece(s: str) -> str:
    """Drop postal codes & extra symbols from a piece likely holding İlçe."""
    s = re.sub(r'\b\d{4,6}\b', ' ', s)      # remove postal codes like 06490
    s = re.sub(r'[,.;]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def _turkish_title(s: str) -> str:
    """Simple title-case that preserves Turkish chars reasonably."""
    # Fix common dotted-i artifacts if present
    s = s.replace('i̇', 'i').replace('İ', 'İ')
    return s.title()

def extract_location_info(address: str):
    """Return (Ilce, Il) with Turkish casing; None, None if not found."""
    if pd.isna(address):
        return None, None

    addr = _norm(address)

    # 1) Strong pattern: "... <ilçe> / <il>" at the END of address
    # Allow letters, Turkish diacritics, dots and spaces in ilçe; only letters in il
    m = re.search(r'([A-Za-zÇĞİIÖŞÜçğıiöşü\.\-\s]+)\s*/\s*([A-Za-zÇĞİIÖŞÜçğıiöşü]+)\s*$', addr)
    if m:
        raw_ilce = _clean_piece(m.group(1))
        raw_il   = _clean_piece(m.group(2))
        il_l = raw_il.lower()
        if il_l in TR_CITIES:
            ilce = _turkish_title(raw_ilce.split()[-1]) if raw_ilce else None
            il   = _turkish_title(raw_il)
            return ilce or None, il

    # 2) No slash? Try to detect a city name near the end of string
    # Find last occurrence of a known city token
    tokens = [t.strip(' ,.;') for t in addr.split()]
    lowers = [t.lower() for t in tokens]
    city_pos = None
    for i in range(len(lowers)-1, -1, -1):
        if lowers[i] in TR_CITIES:
            city_pos = i
            break

    if city_pos is not None:
        il = _turkish_title(tokens[city_pos])
        # Guess district as the closest previous alpha token skipping numbers/abbr
        # e.g., "... 06490 Çankaya Ankara" -> take "Çankaya"
        ilce = None
        for j in range(city_pos-1, -1, -1):
            token = tokens[j]
            if re.fullmatch(r'\d+|no:?|sk\.?|cd\.?|mah\.?|mahallesi', token.lower()):
                continue
            if re.search(r'[A-Za-zÇĞİIÖŞÜçğıiöşü]', token):
                ilce = _turkish_title(token)
                break
        return ilce, il

    # 3) Give up if we can't confidently parse
    return None, None

# ---- Apply to your DataFrame ----
address_column = 'Restoran_Adresi'  # change if needed
if address_column in df.columns:
    pairs = df[address_column].apply(extract_location_info)
    df[['Ilce', 'Il']] = pd.DataFrame(pairs.tolist(), index=df.index)

    # Quick sanity prints
    print("İl dağılımı (top 10):")
    print(df['Il'].value_counts(dropna=True).head(10))
    print("\nİlçe dağılımı (top 10):")
    print(df['Ilce'].value_counts(dropna=True).head(10))
else:
    print(f"Address column '{address_column}' not found. Available columns:", df.columns.tolist())


İl dağılımı (top 10):
Il
Ankara    26206
Name: count, dtype: int64

İlçe dağılımı (top 10):
Ilce
Çankaya          4717
Etimesgut        3626
Keçiören         3218
Altındağ         2962
Mamak            2903
Pursaklar        2421
Beypazarı        1630
Sincan           1460
Kahramankazan    1230
Yenimahalle      1208
Name: count, dtype: int64


In [21]:
df = df.drop(columns=['extracted_city', 'extracted_district', 'Restoran_Adresi'])
df

Unnamed: 0.1,Unnamed: 0,Mekan_Adı,Kullanıcı_Adı,Yorum,Puan,Yorum_Tarihi,Restoran_Kategorisi,Restoran_Toplam_Yorum_Sayısı,Ortalama_Restoran_Puanı,sentiment_score,...,Menü Çeşitliliği,Temizlik,Ortalama_Restoran_Puanı_normalized,sentiment_score_normalized,Ortam_normalized,Fiyat-Performans_normalized,Menü Çeşitliliği_normalized,Temizlik_normalized,Ilce,Il
0,0,aynen_street_food,Meric Esmebasi,Ankarada denenmesi gereken burger’cilerin en b...,5,2024-11-20,Restoran,208,4.7,0.994092,...,,,0.925,0.997103,,,,,Çankaya,Ankara
1,1,aynen_street_food,Berkay ÇOBANOĞLU,Mükemmel bir mekan Güleryüz ve lezzet konusund...,5,2024-11-20,Restoran,208,4.7,0.996734,...,,,0.925,0.998424,1.0,,,,Çankaya,Ankara
2,2,aynen_street_food,Ozan Özkan,Aynen burger çok iyiydi. Özellikle içerisinde ...,5,2024-11-20,Restoran,208,4.7,0.934129,...,,,0.925,0.967116,,,,,Çankaya,Ankara
3,3,aynen_street_food,Salih Karagöz,Ankaradaki favori hamburgercilerimden. Özellik...,5,2024-10-20,Restoran,208,4.7,0.954556,...,,,0.925,0.977331,,,,,Çankaya,Ankara
4,4,aynen_street_food,Alp Ankara,Hamburgeri ve patatesi çok lezzetli. Soğan hal...,5,2024-10-20,Restoran,208,4.7,-0.976560,...,-0.7,,0.925,0.011592,,,0.15,,Çankaya,Ankara
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27097,27105,aynen_street_food,Toprak Bircan,Patatesleri aşırı aşırı aşırı yağlı. Biz beğen...,2,2025-03-20,Restoran,208,4.7,-0.999584,...,,,0.925,0.000078,,0.05,,,Çankaya,Ankara
27098,27106,aynen_street_food,KÖKSAL BAĞDU,Ayak üstü Google yorumlarıyla gittiğimiz bir y...,5,2025-03-20,Restoran,208,4.7,-0.947900,...,0.5,,0.925,0.025925,,,0.75,,Çankaya,Ankara
27099,27107,aynen_street_food,Rümeysa Çakmak,Gerçekten en başarılı en lezzetli hamburgerler...,5,2025-03-20,Restoran,208,4.7,0.968671,...,,,0.925,0.984390,,,,,Çankaya,Ankara
27100,27108,aynen_street_food,Umut Yıldırım,10 masalı bir işletme burgerlar çok güzel özel...,5,2025-02-20,Restoran,208,4.7,0.982163,...,,,0.925,0.991137,,0.85,,,Çankaya,Ankara


In [22]:
df["Ilce"].value_counts()

Ilce
Çankaya          4711
Etimesgut        3620
Keçiören         3210
Altındağ         2950
Mamak            2903
Pursaklar        2405
Beypazarı        1630
Sincan           1451
Kahramankazan    1228
Yenimahalle      1208
Elmadağ           346
Gölbaşı           175
Eryaman            90
Çayyolu            88
Kalecik            64
35;;A              53
No:66/C            15
Name: count, dtype: int64

# 1.2 NLP Altyapısı Kurma


In [None]:
import stanza
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Download Turkish model if not already present
stanza.download("tr")
nlp_stanza = stanza.Pipeline("tr", processors="tokenize,pos,lemma", use_gpu=torch.cuda.is_available())

# Stopword set
TR_STOP = {"ve","veya","ile","de","da","ki","bu","şu","o","bir","çok","az","en","mi","mu","mı","mü",
           "için","gibi","ama","fakat","hem","daha","her","şey","şimdi","ne","niçin","neden","çünkü",
           "olan","oldu","oluyor","yani","ise","ya","ya da","şöyle"}

def clean_token(t: str) -> str:
    t = t.lower()
    t = re.sub(r"[^\wçğıöşüâîû]", "", t)
    return t

def tr_lemma_tokenizer(text: str):
    doc = nlp_stanza(text)
    toks = []
    for sent in doc.sentences:
        for w in sent.words:
            lemma = clean_token(w.lemma or w.text)
            if lemma and lemma not in TR_STOP:
                toks.append(lemma)
    return toks


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Token:         Bu | Lemma:         bu | POS: DET
Token:   restoran | Lemma:   restoran | POS: NOUN
Token:        çok | Lemma:        çok | POS: ADV
Token:      güzel | Lemma:      güzel | POS: ADJ
Token:          . | Lemma:          . | POS: PUNCT
Token:   Yemekler | Lemma:      yemek | POS: NOUN
Token:     lezzet | Lemma:     lezzet | POS: NOUN
Token:         li | Lemma:         li | POS: ADP
Token:         ve | Lemma:         ve | POS: CCONJ
Token:     servis | Lemma:     servis | POS: NOUN
Token:      hızlı | Lemma:      hızlı | POS: ADJ
Token:          . | Lemma:          . | POS: PUNCT

Embeddings: (1, 384)

TF-IDF: (1, 7) ['güzel' 'hızlı' 'lezzet' 'li' 'restoran' 'servis' 'yemek']




# KONTROL GEREK 
Token:  burger’cilerin | Lemma:       burgeroci | POS: ADJ

In [38]:
# --- Take first 100 non-empty rows from df["Yorum"] ---
assert 'Yorum' in df.columns, "Column 'Yorum' not found in df."
texts = (
    df['Yorum']
      .dropna()
      .astype(str)
      .map(lambda s: s.strip())
      .loc[lambda s: s.ne('')]
      .head(100)
      .tolist()
)
print(f"Processing {len(texts)} rows from df['Yorum'].")

# --- Embeddings ---
device = "cuda" if torch.cuda.is_available() else "cpu"
embed_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', device=device)
embeddings = embed_model.encode(texts)
print("Embeddings shape:", embeddings.shape)  # (N, 384)

# --- TF-IDF (lemma-based) ---
tfidf = TfidfVectorizer(
    tokenizer=tr_lemma_tokenizer,
    lowercase=False,
    ngram_range=(1,2),
    max_df=0.95
)
X = tfidf.fit_transform(texts)
feat_names = tfidf.get_feature_names_out()
print("TF-IDF shape:", X.shape)
print("Vocab size:", len(feat_names))
print("Feature sample (first 30):", feat_names[:30])

# --- Detailed token/lemma/POS print for the first few rows (adjust MAX_PRINT) ---
MAX_PRINT = 3  # set to 100 if you want all
for i, text in enumerate(texts[:MAX_PRINT], 1):
    print(f"\n=== Row {i} ===")
    print(text)
    doc = nlp_stanza(text)
    for s in doc.sentences:
        for w in s.words:
            print(f"Token: {w.text:>15} | Lemma: {w.lemma:>15} | POS: {w.upos}")


Processing 100 rows from df['Yorum'].
Embeddings shape: (100, 384)




TF-IDF shape: (100, 2416)
Vocab size: 2416
Feature sample (first 30): ['1' '1 lokma' '10' '10 adet' '10 numar' '100' '100 gr' '1010'
 '1010 mekan' '12' '12 adet' '1215' '1215 te' '150' '150 tl' '175tl'
 '175tl dene' '180' '180 tl' '2' '2 kilo' '20' '20 tane' '20 tl' '4'
 '4 kişi' '4 çeşit' '40' '40 derece' '5']

=== Row 1 ===
Ankarada denenmesi gereken burger’cilerin en başında geliyor
Çok başarılı
Token:        Ankarada | Lemma:          ankara | POS: NOUN
Token:       denenmesi | Lemma:            dene | POS: VERB
Token:         gereken | Lemma:           gerek | POS: VERB
Token:  burger’cilerin | Lemma:       burgeroci | POS: ADJ
Token:              en | Lemma:              en | POS: ADV
Token:         başında | Lemma:             baş | POS: NOUN
Token:         geliyor | Lemma:             gel | POS: VERB
Token:             Çok | Lemma:             çok | POS: ADV
Token:        başarılı | Lemma:        başarılı | POS: ADJ

=== Row 2 ===
Mükemmel bir mekan Güleryüz ve lezzet konusunda