<a href="https://colab.research.google.com/github/lucarenz1997/recommender_systems/blob/main/NCF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/lucarenz1997/recommender_systems/blob/main/NCF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Authors**: Rafaella and Luca
## Provisional Data Import

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

# Mount Google Drive so files under /content/drive are readable (requires the Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

# Load the preprocessed training data from Drive into a DataFrame
# NOTE(review): the frame displayed further down has an `Unnamed: 0` column, which
# suggests the CSV was written together with its index — confirm, and consider index_col=0
import pandas as pd
data = pd.read_csv("/content/drive/MyDrive/Recommender/preprocessed_train.csv")

import warnings
# Suppress all warnings from this point on
# NOTE(review): this also hides deprecation and dtype warnings — consider a narrower filter
warnings.filterwarnings("ignore")

Mounted at /content/drive


## 2. Data Preparation

In [9]:
# Encode `user_id` and `media_id` as integer labels (collaborative filtering only).
# Each encoder is fitted on the full column first and kept around so the
# original ids can be recovered later.
user_encoder = LabelEncoder().fit(data["user_id"])
item_encoder = LabelEncoder().fit(data["media_id"])

# Overwrite the raw id columns with their encoded counterparts
data["user_id"] = user_encoder.transform(data["user_id"])
data["media_id"] = item_encoder.transform(data["media_id"])

## Train-Test-Split

In [10]:
# 3. Train/test split: hold out 20% of the rows, with a fixed seed so the split is repeatable
train, test = train_test_split(data, random_state=42, test_size=0.2)
train.head(n=100)


Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,...,last_listen,days_since_release,genre_popularity,media_popularity,artist_popularity,album_popularity,songs_listened,song_popularity_7d,artist_popularity_7d,album_popularity_7d
141061,22,2016-11-11 10:39:29,28324,16590,0.220868,2012-11-19,0.175726,0.804247,0.336207,0.313345,...,0.091299,0.035789,0.072560,0.031216,0.032770,0.005134,0.394231,0.058252,0.033728,0.006652
101826,6,2016-11-08 09:35:14,5558,2976,0.009166,1993-12-31,0.141284,0.141284,0.556034,0.686655,...,0.220489,0.196966,0.252084,0.000000,0.009910,0.012569,0.099359,0.000000,0.007293,0.008869
39223,9,2016-11-03 16:51:50,14267,8058,0.127203,2010-05-07,0.682990,0.804247,0.590517,0.686655,...,0.043068,0.057271,0.074606,0.000000,0.002246,0.000885,0.022436,0.000000,0.003646,0.002217
42170,12,2016-11-03 20:10:01,33064,19297,0.434585,1994-04-12,0.175726,0.804247,0.788793,0.686655,...,0.115495,0.194465,0.048558,0.022605,0.026427,0.004957,0.016026,0.009709,0.021878,0.001109
104449,0,2016-11-08 13:28:38,40145,23651,0.434585,2015-03-30,0.682990,0.054469,0.418103,0.686655,...,0.004206,0.015592,1.000000,0.027987,0.037923,0.015401,0.336538,0.058252,0.041933,0.013304
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22933,0,2016-11-02 15:25:59,48566,28510,0.127203,2016-04-22,0.682990,0.804247,0.780172,0.686655,...,0.070007,0.006358,1.000000,0.059203,0.130814,0.049212,0.214744,0.097087,0.160438,0.052106
20482,169,2016-11-02 12:41:42,9442,5350,0.054653,2006-06-23,0.682990,0.804247,0.452586,0.686655,...,0.162471,0.090301,0.038209,0.000000,0.019688,0.000177,0.278846,0.000000,0.020966,0.000000
238103,3,2016-11-19 17:00:36,19510,11327,0.029810,2011-10-17,0.682990,0.804247,0.366379,0.313345,...,0.095882,0.045303,0.019046,0.022605,0.017178,0.003717,0.044872,0.077670,0.030994,0.008869
74249,0,2016-11-05 23:25:50,40727,23953,0.434585,2015-04-22,0.682990,0.804247,0.534483,0.686655,...,0.075809,0.014984,1.000000,0.023681,0.193050,0.003894,0.035256,0.038835,0.176846,0.004435


## 3. Create the PyTorch Dataset

In [14]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim

class ListenDataset(Dataset):
    """Tensor-backed dataset of (user, item, label) triples.

    Args:
        data: table indexable by column name that provides the columns
            ``user_id``, ``media_id`` and ``is_listened`` (each exposing
            ``.values``).

    Attributes are stored as 1-D tensors: ``users`` and ``items`` as
    ``torch.long``, ``labels`` as ``torch.float32``.
    """

    def __init__(self, data):
        def column_as_tensor(name, dtype):
            # Copy one column into a tensor of the requested dtype
            return torch.tensor(data[name].values, dtype=dtype)

        self.users = column_as_tensor("user_id", torch.long)
        self.items = column_as_tensor("media_id", torch.long)
        # The listen flag is cast to float so it can be used as a loss target
        self.labels = column_as_tensor("is_listened", torch.float32)

    def __len__(self):
        # One sample per label entry
        return self.labels.shape[0]

    def __getitem__(self, idx):
        # Same index into all three tensors -> (user, item, label)
        return tuple(column[idx] for column in (self.users, self.items, self.labels))

# Wrap both splits in a Dataset, then in a DataLoader for batched processing
train_data, test_data = ListenDataset(train), ListenDataset(test)

# Only the training batches are reshuffled; the test order stays fixed
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=512)
test_loader = DataLoader(dataset=test_data, shuffle=False, batch_size=512)


## 4. Define the Neural Collaborative Filtering (NCF) Model

In [15]:
class NCF(nn.Module):
    """Neural Collaborative Filtering model for binary interaction prediction.

    User and item indices are looked up in separate embedding tables, the two
    embeddings are concatenated, and the result is passed through an MLP that
    ends in a single sigmoid unit.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: dimensionality of each embedding vector.
        hidden_dims: output widths of the hidden ``Linear`` layers, each
            followed by ``ReLU``. Defaults to ``(128, 64, 32)``. An empty
            sequence connects the concatenated embeddings directly to the
            output unit.
    """

    def __init__(self, num_users, num_items, emb_size=64, hidden_dims=(128, 64, 32)):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)

        # Build Linear -> ReLU pairs in order; with the default widths the
        # module indices inside the Sequential are Linear at 0/2/4/6,
        # ReLU at 1/3/5 and Sigmoid at 7.
        layers = []
        in_features = emb_size * 2  # user and item embeddings are concatenated
        for out_features in hidden_dims:
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
            in_features = out_features
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())  # sigmoid for binary classification
        self.fc_layers = nn.Sequential(*layers)

    def forward(self, user, item):
        """Return one sigmoid score per (user, item) pair, with a trailing dimension of size 1."""
        user_embedded = self.user_emb(user)
        item_embedded = self.item_emb(item)
        # Combined embeddings: concatenate along the feature (last) dimension
        x = torch.cat([user_embedded, item_embedded], dim=-1)
        return self.fc_layers(x)


## 5. Initialize the Model

In [16]:
# Seed PyTorch's global RNG so weight initialisation and the shuffling order of
# the training loader are repeatable on a fresh top-to-bottom run
# (same seed value as the random_state used for the train/test split).
torch.manual_seed(42)

# Size the embedding tables by the number of distinct ids in the full dataset
# NOTE(review): this assumes the ids are already encoded as 0..n-1 — confirm the encoding cell ran first
num_users = data["user_id"].nunique()
num_items = data["media_id"].nunique()

model = NCF(num_users, num_items)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()  # binary cross-entropy on probabilities, for the 0/1 labels


## 6. Train the Model

In [17]:
# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(10):
    model.train()  # switch the model to training mode
    total_loss = 0
    for user, item, label in train_loader:
        user, item, label = user.to(device), item.to(device), label.to(device)

        optimizer.zero_grad()
        # Drop only the trailing dimension of the model output. A bare
        # squeeze() would also drop the batch dimension when the final batch
        # holds a single sample, and BCELoss would then reject the shape
        # mismatch against `label`.
        preds = model(user, item).squeeze(-1)
        loss = criterion(preds, label)  # binary cross-entropy loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean of the per-batch losses for this epoch
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Loss: 0.6082
Epoch 2, Loss: 0.5561
Epoch 3, Loss: 0.5083
Epoch 4, Loss: 0.4710
Epoch 5, Loss: 0.4355
Epoch 6, Loss: 0.3978
Epoch 7, Loss: 0.3583
Epoch 8, Loss: 0.3181
Epoch 9, Loss: 0.2782
Epoch 10, Loss: 0.2403


## 7. Test model

In [25]:
from torch.utils.data import Dataset, DataLoader

# Measure accuracy on the held-out split, without tracking gradients
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for user, item, label in test_loader:
        # Move the batch to the same device as the model
        user = user.to(device)
        item = item.to(device)
        label = label.to(device)

        preds = model(user, item).squeeze()
        predicted = (preds >= 0.5).float()  # 0.5 threshold for binary classification
        correct += predicted.eq(label).sum().item()
        total += label.shape[0]

print(f"Test Accuracy: {correct / total:.4f}")

Test Accuracy: 0.6946
