Transformer Based Rec System!

In [1]:
import pandas as pd
import torch
from tqdm import tqdm
import math
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import torch.nn as nn
import numpy as np

In [2]:
import torch.cuda

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
users = pd.read_csv(
    "../data/ml-1m/users.dat",
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    encoding="latin1",
    engine = "python"
)

ratings = pd.read_csv(
    "../data/ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    encoding="latin1",
    engine = "python"
)

movies = pd.read_csv(
    "../data/ml-1m/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"],
    encoding="latin1",
    engine = "python"
)

In [5]:
movies["year"] = movies["title"].apply(lambda x: x[-5:-1])
movies.year = pd.Categorical(movies.year)
movies["year"] = movies.year.cat.codes
## Users
users.sex = pd.Categorical(users.sex)
users["sex"] = users.sex.cat.codes


users.age_group = pd.Categorical(users.age_group)
users["age_group"] = users.age_group.cat.codes

users.occupation = pd.Categorical(users.occupation)
users["occupation"] = users.occupation.cat.codes


users.zip_code = pd.Categorical(users.zip_code)
users["zip_code"] = users.zip_code.cat.codes

#Ratings
ratings['unix_timestamp'] = pd.to_datetime(ratings['unix_timestamp'],unit='s')

In [6]:
movies_metadata = pd.read_csv("../data/ml-1m/movies_metadata.csv", low_memory=False)

In [7]:
movies["year_join"] = movies['title'].apply(lambda x : int(x[-5:-1]))
movies["title"] = movies["title"].apply(lambda x : x[:-6].strip())

In [8]:
movies_metadata = movies_metadata[movies_metadata.release_date.notnull()]

In [9]:
movies_metadata["year_join"] = movies_metadata["release_date"].apply(lambda x : int(x[:4]))

In [10]:
merged_movies = pd.merge(movies, movies_metadata[['title', 'year_join', 'overview']], on=['title', 'year_join'], how='inner')

In [11]:
movies = merged_movies

In [12]:
movies.dropna(inplace=True)

In [13]:
overview_dict = {}
for id, overview in zip(movies.movie_id, movies.overview):
    overview_dict[id] = overview

In [18]:
import json
with open("../data/ml-1m/over_view.json","w") as f:
    json.dump(overview_dict, f)

In [19]:
ratings["is_true"] = ratings['movie_id'].apply(lambda x : int(x) in overview_dict.keys())

In [20]:
ratings = ratings[ratings.is_true]

In [21]:
ratings.drop("is_true", axis = 1, inplace=True)

In [22]:
ratings.reset_index(inplace=True)

In [23]:
ratings.drop(["index"], axis=1, inplace = True)

In [24]:
if not os.path.exists('data'):
    os.makedirs('data')


users.to_csv("data/users.csv",index=False)
movies.to_csv("data/movies.csv",index=False)
ratings.to_csv("data/ratings.csv",index=False)

In [25]:
## Movies
movies["movie_id"] = movies["movie_id"].astype(str)
## Users
users["user_id"] = users["user_id"].astype(str)

##Ratings
ratings["movie_id"] = ratings["movie_id"].astype(str)
ratings["user_id"] = ratings["user_id"].astype(str)

In [26]:
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

for genre in genres:
    movies[genre] = movies["genres"].apply(
        lambda values: int(genre in values.split("|"))
    )

In [27]:
movies.reset_index(inplace=True)

In [28]:
movie_genere_dict = {}
for idx in range(len(movies)):
    row = movies.iloc[idx]
    movie_genere_dict[row.movie_id] = torch.tensor(list(row[genres].values), dtype=torch.float)

In [29]:
ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id")

ratings_data = pd.DataFrame(
    data={
        "user_id": list(ratings_group.groups.keys()),
        "movie_ids": list(ratings_group.movie_id.apply(list)),
        "ratings": list(ratings_group.rating.apply(list)),
        "timestamps": list(ratings_group.unix_timestamp.apply(list)),
    }
)

In [30]:
sequence_length = 8
step_size = 1


def create_sequences(values, window_size, step_size):
    sequences = []
    start_index = 0
    while True:
        end_index = start_index + window_size
        seq = values[start_index:end_index]
        if len(seq) < window_size:
            seq = values[-window_size:]
            if len(seq) == window_size:
                sequences.append(seq)
            break
        sequences.append(seq)
        start_index += step_size
    return sequences


ratings_data.movie_ids = ratings_data.movie_ids.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

ratings_data.ratings = ratings_data.ratings.apply(
    lambda ids: create_sequences(ids, sequence_length, step_size)
)

del ratings_data["timestamps"]

In [31]:
ratings_data_movies = ratings_data[["user_id", "movie_ids"]].explode(
    "movie_ids", ignore_index=True
)
ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True)
ratings_data_transformed = pd.concat([ratings_data_movies, ratings_data_rating], axis=1)
ratings_data_transformed = ratings_data_transformed.join(
    users.set_index("user_id"), on="user_id"
)
ratings_data_transformed.movie_ids = ratings_data_transformed.movie_ids.apply(
    lambda x: ",".join(x)
)
ratings_data_transformed.ratings = ratings_data_transformed.ratings.apply(
    lambda x: ",".join([str(v) for v in x])
)


ratings_data_transformed.rename(
    columns={"movie_ids": "sequence_movie_ids", "ratings": "sequence_ratings"},
    inplace=True,
)

In [32]:
ratings_data_transformed["overviews"] = ratings_data_transformed["sequence_movie_ids"].apply(lambda x : [overview_dict[int(i)] for i in x.split(',')])

In [33]:
random_selection = np.random.rand(len(ratings_data_transformed.index)) <= 0.85
train_data = ratings_data_transformed[random_selection]
test_data = ratings_data_transformed[~random_selection]

train_data.to_csv("data/train_data.csv", index=False, sep=",")
test_data.to_csv("data/test_data.csv", index=False, sep=",")

In [34]:
import json
with open('../data/ml-1m/overview_embeddings.json',"r") as f:
    embed = json.load(f)

In [35]:
embed_dict = {k["movies_id"] : torch.tensor(k["overviews"]) for k in embed}

In [36]:
import pandas as pd
import torch
import torch.utils.data as data
from torchvision import transforms
import ast
from torch.nn.utils.rnn import pad_sequence



class MovieDataset(data.Dataset):
    """Movie dataset."""

    def __init__(
        self, ratings_file,test=False
    ):
        """
        Args:
            csv_file (string): Path to the csv file with user,past,future.
        """
        self.ratings_frame = pd.read_csv(
            ratings_file,
            delimiter=",",
            # iterator=True,
        )
        self.test = test

    def __len__(self):
        return len(self.ratings_frame)

    def __getitem__(self, idx):
        data = self.ratings_frame.iloc[idx]
        user_id = data.user_id

        movie_history = eval(data.sequence_movie_ids)
        movie_history_ratings = eval(data.sequence_ratings)
        overview_embed = torch.cat([embed_dict[str(id)].unsqueeze(0) for id in movie_history], axis = 0)
        #print(overview_embed.shape)
        movie_genere = torch.cat([movie_genere_dict[str(id)].unsqueeze(0) for id in movie_history], axis = 0)

        target_movie_id = movie_history[-1:][0]
        target_movie_rating = movie_history_ratings[-1:][0]

        movie_history = torch.LongTensor(movie_history[:-1])
        movie_history_ratings = torch.LongTensor(movie_history_ratings[:-1])

        sex = data.sex
        age_group = data.age_group
        occupation = data.occupation
        return user_id, movie_history, target_movie_id,  movie_history_ratings, target_movie_rating, sex, age_group, occupation, overview_embed, movie_genere

In [37]:
class PositionalEmbedding(nn.Module):
    def __init__(self, max_len, d_model):
        super().__init__()
        self.pos_embed = nn.Embedding(max_len, d_model)
        self.to(device)
    def forward(self, x):
        batch_size = x.size(0)
        return self.pos_embed.weight.unsqueeze(0).repeat(batch_size, 1, 1)

In [38]:
users.user_id = users.user_id.apply(lambda x : int(x))
movies.movie_id = movies.movie_id.apply(lambda x : int(x))


In [39]:
USER_LEN = users.user_id.max() + 1
MOVIE_LEN = movies.movie_id.max() + 1
AGE_LEN = len(users.age_group.unique())
OCC_LEN = len(users.occupation.unique())
ZIP_LEN = len(users.zip_code.unique())
SEX_LEN = len(users.sex.unique())

In [40]:
class Embedding(nn.Module):
    def __init__(self, max_len):
        super().__init__()
        self.pos_embed = PositionalEmbedding(max_len, int(math.sqrt(MOVIE_LEN))+2).to(device)
        self.user_embed = nn.Embedding(USER_LEN, int(math.sqrt(USER_LEN))).to(device)
        self.age_embed = nn.Embedding(AGE_LEN, int(math.sqrt(AGE_LEN))).to(device)
        # self.zipcode_embed
        self.sex_embed = nn.Embedding(SEX_LEN, int(math.sqrt(SEX_LEN))).to(device)
        self.occupation_embed = nn.Embedding(OCC_LEN, int(math.sqrt(OCC_LEN))).to(device)
        self.movie_embed = nn.Embedding(MOVIE_LEN, int(math.sqrt(MOVIE_LEN))+2).to(device)
        self.overview_embed_pos = PositionalEmbedding(max_len, 768).to(device)#overview embed dim
        self.genere_embed_pos = PositionalEmbedding(max_len, 18).to(device)

    def forward(self, batch):
        user_id, movie_history, target_movie_id,  movie_history_ratings, _ , sex, age_group, occupation, overview_embed, movie_genere = batch
        movie_genere += self.genere_embed_pos(movie_genere)
        overview_embed = overview_embed + self.overview_embed_pos(overview_embed)
        user_embed = self.user_embed(user_id)
        # print(user_embed.shape)
        movie_history_embed = self.movie_embed(movie_history)
        # print(movie_history_embed.shape)
        movie_history_embed = movie_history_embed*movie_history_ratings.unsqueeze(-1)
        # print(movie_history_embed.shape)
        # print(movie_history_ratings.unsqueeze(-1).shape)

        movie_target_embed = self.movie_embed(target_movie_id)
        # print(movie_target_embed.unsqueeze(1).shape)
        movie_embed = torch.cat((movie_history_embed, movie_target_embed.unsqueeze(1)), axis = 1)
        # print(movie_embed.shape)

        sex_embed = self.sex_embed(sex)
        # print(sex_embed.shape)
        age_embed = self.age_embed(age_group)
        # print(age_embed.shape)
        occ_embed = self.occupation_embed(occupation)
        # print(occ_embed.shape)
        # print(self.pos_embed(movie_history).shape)
        user_features = (user_embed, sex_embed, age_embed, occ_embed)
        return movie_embed + self.pos_embed(movie_history), user_features, overview_embed, movie_genere

In [41]:
class BST(nn.Module):
    def __init__(self):
        super().__init__()
        self.transformer = nn.TransformerEncoderLayer(64, 4, dim_feedforward=128)
        self.embedding = Embedding(8)
        self.fc1 = nn.Linear(6884, 1024).to(device)
        self.fc2 = nn.Linear(1024, 512).to(device)
        self.fc3 = nn.Linear(512, 256).to(device)
        self.fc_out = nn.Linear(256, 1).to(device)
        self.leaky_relu1 = nn.LeakyReLU().to(device)
        self.leaky_relu2 = nn.LeakyReLU().to(device)
        self.leaky_relu3 = nn.LeakyReLU().to(device)
    def forward(self, batch):
        movie_embed, user_features, overview_embed, movie_genre = self.embedding(batch)
        transformer_out = self.transformer(movie_embed)
        BS = transformer_out.size(0)
        transformer_out = transformer_out.view(BS, -1, 1).squeeze(-1)
        overview_embed = overview_embed.view(BS, -1, 1).squeeze(-1)
        movie_genre = movie_genre.view(BS, -1, 1).squeeze(-1)
        #print(overview_embed.shape)
        user_out = torch.cat(user_features, axis = 1)
        # print(user_out.shape)
        # print(transformer_out.shape)
        flattened_vector = torch.cat((user_out, movie_genre, transformer_out, overview_embed), axis = 1)
        #print(flattened_vector.shape)
        out = self.fc1(flattened_vector)
        out = self.leaky_relu1(out)
        out = self.fc2(out)
        out = self.leaky_relu2(out)
        out = self.fc3(out)
        out = self.leaky_relu3(out)
        out = self.fc_out(out)
        return out

In [42]:
train_ds = MovieDataset("data/train_data.csv")
data = torch.utils.data.DataLoader(train_ds, batch_size=128, shuffle=True)
test_ds = MovieDataset("data/test_data.csv")
test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=128, shuffle=True)

In [43]:
model = BST().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

total_params = sum(p.numel() for p in model.parameters())
print(f'Total number of parameters: {total_params}')

Total number of parameters: 8465082


In [44]:
def evaluate(data_loader):
    model.eval()
    total_loss = 0
    total_mae = 0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False):
            batch = tuple(tensor.to(device) for tensor in batch)
            outputs = model(batch)
            y = batch[4]
            loss = criterion(outputs, y.float().view(-1, 1))
            mae = F.l1_loss(outputs, y.float().view(-1, 1))
            total_loss += loss.item()
            total_mae += mae
    return total_loss/len(data_loader), total_mae/len(data_loader)

In [45]:
from tqdm import tqdm
import torch.nn.functional as F

epochs = 4
for epoch in range(epochs):
    total_loss = 0
    total_mae = 0
    model.train()
    for batch in tqdm(data, desc=f'Epoch {epoch + 1}/{epochs}', leave=False):
        batch = tuple(tensor.to(device) for tensor in batch)
        outputs = model(batch)
        y = batch[4]
        loss = criterion(outputs, y.float().view(-1, 1))
        mae = F.l1_loss(outputs, y.float().view(-1, 1))
        total_mae += mae
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    val_loss, val_mae = evaluate(test_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Training, Validation Loss: {total_loss/len(data):.3f}, {val_loss:.3f}')
    print(f'Epoch {epoch + 1}/{epochs}, Training, Validation Loss: {total_mae/len(data):.3f}, {val_loss:.3f}')



Epoch 1/4, Training, Validation Loss: 0.962, 0.843
Epoch 1/4, Training, Validation Loss: 0.780, 0.843




Epoch 2/4, Training, Validation Loss: 0.838, 0.822
Epoch 2/4, Training, Validation Loss: 0.726, 0.822




Epoch 3/4, Training, Validation Loss: 0.819, 0.819
Epoch 3/4, Training, Validation Loss: 0.717, 0.819


                                                            

Epoch 4/4, Training, Validation Loss: 0.831, 0.810
Epoch 4/4, Training, Validation Loss: 0.720, 0.810




In [46]:
torch.save(model.state_dict(), 'model_weights.pth')

In [47]:
#Just select any row from test and you can see what model will predict and whats the ground truth

In [56]:
sample_1 = test_data.iloc[11]#change the index to get different outputs

In [57]:
user_id = torch.tensor([int(sample_1.user_id)])

movie_history = eval(sample_1.sequence_movie_ids)
movie_history_ratings = eval(sample_1.sequence_ratings)
overview_embed = torch.cat([embed_dict[str(id)].unsqueeze(0) for id in movie_history], axis = 0).unsqueeze(0)
movie_genere = torch.cat([movie_genere_dict[str(id)].unsqueeze(0) for id in movie_history], axis = 0).unsqueeze(0)

target_movie_id = torch.tensor([movie_history[-1:][0]])
target_movie_rating = torch.tensor([movie_history_ratings[-1:][0]])

movie_history = torch.LongTensor(movie_history[:-1]).unsqueeze(0)
movie_history_ratings = torch.LongTensor(movie_history_ratings[:-1]).unsqueeze(0)

sex = torch.tensor([int(sample_1.sex)])
age_group = torch.tensor([int(sample_1.age_group)])
occupation = torch.tensor([int(sample_1.occupation)])

In [58]:
with torch.no_grad():
    model.eval()
    print('predicted rating',model((user_id.to(device), movie_history.to(device), target_movie_id.to(device),  movie_history_ratings.to(device), target_movie_rating.to(device), sex.to(device), age_group.to(device), occupation.to(device), overview_embed.to(device), movie_genere.to(device))))

predicted rating tensor([[4.7005]], device='cuda:0')


In [59]:
print('Actual Rating:- ',(target_movie_rating))

Actual Rating:-  tensor([5])
