## Gerekli kütüphaneleri yükleme

In [3]:
!pip install torch transformers datasets scikit-learn pandas



## GPU kontrolü

In [6]:
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device:{device} ")

Using device:cpu 


## Veri setini yükleme ve inceleme

In [9]:
import zipfile
import pandas as pd 

# Path of the ml-1m archive, relative to the notebook's working directory.
zip_path = "ml-1m.zip"

# Read movies.dat straight out of the archive. Column names are supplied here,
# and the multi-character "::" separator requires pandas' python parsing engine.
with zipfile.ZipFile(zip_path, "r") as zf, zf.open("ml-1m/movies.dat") as movies_file:
    movies = pd.read_csv(
        movies_file,
        sep="::",
        engine="python",
        names=["MovieID", "Title", "Genres"],
        encoding="ISO-8859-1",
    )

print(movies.head(10))

   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
5        6                         Heat (1995)         Action|Crime|Thriller
6        7                      Sabrina (1995)                Comedy|Romance
7        8                 Tom and Huck (1995)          Adventure|Children's
8        9                 Sudden Death (1995)                        Action
9       10                    GoldenEye (1995)     Action|Adventure|Thriller


### Gereksiz sütunları temizleme: Genres sütununu silme
### Eksik verileri kontrol etme
### Title sütunundaki yıl bilgisini (ör. "(1995)") silme; çünkü başlıklar yalnızca metinden oluşmalı

In [12]:
import re 

# Collect every title that contains no four-digit year in parentheses, e.g. "(1995)".
year_pattern = re.compile(r"\((\d{4})\)")
invalid_years = [title for title in movies["Title"] if year_pattern.search(title) is None]

# Report the results
print(f"Toplam film sayısı: {len(movies)}")
print(f"Yıl formatında olmayan film sayısı: {len(invalid_years)}")

if not invalid_years:
    print("Tüm filmlerde yıl bilgisi doğru formatta! ✅")
else:
    print("Örnek yanlış formatlar:")
    print(invalid_years[:5])  # show the first five offending titles

Toplam film sayısı: 3883
Yıl formatında olmayan film sayısı: 0
Tüm filmlerde yıl bilgisi doğru formatta! ✅


In [14]:
# Strip the trailing release year from every title, e.g.
# "Toy Story (1995)" -> "Toy Story". A regex is used instead of the literal
# "(1995)" so that titles from every year are cleaned, and the whitespace
# around the year is removed so no trailing space is left behind.
movies["Title"] = movies["Title"].str.replace(r"\s*\(\d{4}\)\s*$", "", regex=True)

print(movies.head())

   MovieID                         Title                        Genres
0        1                    Toy Story    Animation|Children's|Comedy
1        2                      Jumanji   Adventure|Children's|Fantasy
2        3             Grumpier Old Men                 Comedy|Romance
3        4            Waiting to Exhale                   Comedy|Drama
4        5  Father of the Bride Part II                         Comedy


In [16]:
import re

# Keep only the identifier and the title; every other column is dropped.
movies = movies.loc[:, ["MovieID", "Title"]]

# Count the missing values in each remaining column.
missing_per_column = movies.isna().sum()
print("Eksik veri sayisi:\n", missing_per_column)



Eksik veri sayisi:
 MovieID    0
Title      0
dtype: int64


### BERT modelini kullanarak embedding çıkarma

In [19]:
pip install transformers torch

Note: you may need to restart the kernel to use updated packages.


### BERT modelini ve tokenizer'ı yükleyerek film başlıklarını işlemeye hazır hale getirme
### bert-base-uncased modelini kullanma

In [22]:
from transformers import BertTokenizer, BertModel

import torch

# Checkpoint name shared by the tokenizer and the model.
BERT_CHECKPOINT = 'bert-base-uncased'

# Load the pretrained tokenizer and encoder for that checkpoint.
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Switch the model to evaluation mode (disables dropout and similar
# training-only behavior).
model.eval()



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [23]:
def get_bert_embeddings(text):
    """Return mean-pooled BERT embeddings for one text or a batch of texts.

    The text is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model`` without gradient tracking, and the final-layer
    token vectors are averaged over the sequence dimension. This is mean
    pooling, not the [CLS] vector.

    Args:
        text: A single string, or a list of strings to embed together.

    Returns:
        A 2-D tensor with one row per input text and one column per hidden
        unit of the model (a single string yields a single row).
    """
    # return_tensors='pt' makes the tokenizer return PyTorch tensors.
    # max_length is counted in tokens (not characters): longer inputs are
    # truncated, and shorter texts in a batch are padded to the longest one.
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=50)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)

    # Average only over real tokens. The attention mask is 0 on padding
    # positions, so padded batch entries are not diluted by padding vectors.
    # For a single string there is no padding and this equals a plain mean.
    hidden = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (hidden * mask).sum(dim=1) / token_counts
    return embeddings

# Embed a handful of titles as a quick sanity check.
sample_titles = movies['Title'].iloc[:5]  # first five titles
embeddings = list(map(get_bert_embeddings, sample_titles))

# Inspect the first embedding
print(embeddings[0])



tensor([[ 1.3750e-01, -1.0786e-01, -2.5206e-01, -1.3676e-01,  6.6372e-01,
          4.6134e-05,  2.3826e-01,  1.7072e-01, -4.9719e-01,  1.9115e-02,
          5.9916e-02, -1.8349e-01,  1.4102e-01,  1.7501e-01, -4.9600e-01,
         -1.3410e-01,  1.0248e-02,  3.5867e-01, -1.2135e-01, -4.5477e-02,
         -2.3480e-02, -2.2080e-01, -2.0359e-01,  2.6954e-02,  2.4596e-01,
         -8.1455e-02, -1.7963e-01,  1.5606e-01,  7.7688e-02, -1.8981e-01,
         -2.3555e-01, -1.6214e-01,  4.3008e-01,  1.4024e-01, -1.2392e-01,
         -1.3268e-01, -7.6439e-02,  1.1116e-01, -7.9166e-01, -2.8659e-01,
         -8.5754e-02, -2.3410e-01,  2.3850e-01, -8.2590e-02,  1.1459e-01,
         -3.5211e-01, -1.0808e-01, -1.4607e-01,  9.5167e-02,  3.5866e-01,
          1.1067e-01,  5.0732e-02,  4.2638e-02,  8.0183e-02,  2.7185e-01,
          5.9407e-01,  3.7081e-03, -1.8570e-01, -6.2255e-02,  1.7736e-01,
          5.8885e-03,  3.6821e-01,  2.6649e-01, -4.2125e-01, -3.6577e-03,
          2.2430e-01,  3.1332e-01,  4.

In [36]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import torch

# Compute one embedding per title and store it next to the movie row.
movies["Embeddings"] = movies["Title"].apply(get_bert_embeddings)

# Stack the per-movie embeddings row-wise into a single NumPy array.
per_movie_embeddings = movies["Embeddings"].to_numpy()
embedding_matrix = np.vstack(per_movie_embeddings)

# Pairwise cosine similarity between all rows of the embedding matrix.
similarity_matrix = cosine_similarity(embedding_matrix)

print("Benzerlik matrisi hesaplandı! 🎯")




Benzerlik matrisi hesaplandı! 🎯


In [38]:
import pandas as pd

# Label the top-left 5x5 corner of the similarity matrix with the first five titles.
first_titles = movies["Title"].head(5)
similarity_df = pd.DataFrame(similarity_matrix[:5, :5], index=first_titles, columns=first_titles)

# Show the similarity values
print(similarity_df)


Title                         Toy Story   Jumanji   Grumpier Old Men   \
Title                                                                   
Toy Story                       1.000000  0.515668           0.538995   
Jumanji                         0.515668  1.000000           0.423040   
Grumpier Old Men                0.538995  0.423040           1.000000   
Waiting to Exhale               0.526111  0.480692           0.484154   
Father of the Bride Part II     0.554487  0.472173           0.478333   

Title                         Waiting to Exhale   Father of the Bride Part II   
Title                                                                           
Toy Story                               0.526111                      0.554487  
Jumanji                                 0.480692                      0.472173  
Grumpier Old Men                        0.484154                      0.478333  
Waiting to Exhale                       1.000000                      0.441255  
Fa