#### **MovieLens 20M Dataset**

The dataset was obtained from Kaggle: [MovieLens 20M Dataset](https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset?select=link.csv).

In [105]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Load the MovieLens movie table.
# NOTE(review): `df` is reassigned from `merged` in a later cell, and that cell
# reads "movie.csv" without the data/ prefix — confirm which path is intended.
df = pd.read_csv('data/movie.csv')

Augment the dataset with movie descriptions

In [106]:
def get_full_imdb_plot(imdb_id):
    """Fetch the longest plot text from a title's IMDb plot-summary page.

    Parameters
    ----------
    imdb_id : int or int-like
        Numeric IMDb id without the ``tt`` prefix; it is zero-padded to
        seven digits to build the title URL.

    Returns
    -------
    str or float
        The longest text found in the plot containers, with a trailing
        "... Read all" / "... Read full summary" teaser removed. When the
        page has no plot container, the result of
        ``get_plot_from_main_page`` is returned instead. ``np.nan`` is
        returned when the id cannot be converted to an integer, when the
        request or parsing fails, or when the fallback yields ``None``.
    """
    try:
        full_imdb_id = f"tt{int(imdb_id):07d}"
    except (TypeError, ValueError, OverflowError):
        # A missing id (None, NaN) or a non-numeric one cannot form a URL;
        # report it as missing instead of raising.
        return np.nan

    url = f"https://www.imdb.com/title/{full_imdb_id}/plotsummary"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": f"https://www.imdb.com/title/{full_imdb_id}/"
    }

    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        plot_containers = soup.select(".ipc-html-content-inner-div")

        if plot_containers:
            # Several containers may be present; keep the longest text.
            full_text = max(
                (p.get_text(separator=" ", strip=True) for p in plot_containers),
                key=len,
            )
            # Strip a truncation teaser such as "... Read all".
            return re.sub(
                r'\s*\.\.\.\s*Read (full )?(summary|all).*$',
                '',
                full_text,
                flags=re.IGNORECASE,
            )

        fallback = get_plot_from_main_page(full_imdb_id)
        # Keep a single missing-value marker (np.nan) for callers.
        return np.nan if fallback is None else fallback

    except Exception:
        # Best-effort scraping: any network or parsing failure means "no plot".
        return np.nan

def get_plot_from_main_page(imdb_id):
    """Fetch a plot blurb from a title's main IMDb page.

    Parameters
    ----------
    imdb_id : str
        Full IMDb title id as used in the URL path (e.g. ``"tt0114709"``).

    Returns
    -------
    str or float
        The stripped text of the first element found, tried in this order:
        ``span[data-testid=plot-xl]``, ``span[data-testid=plot-l]``,
        ``div.summary_text``. The literal string ``"Empty"`` is returned
        when the page loads but none of them is present. ``np.nan`` is
        returned when the request or parsing fails.
    """
    url = f"https://www.imdb.com/title/{imdb_id}/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept-Language": "en-US,en;q=0.9"
    }

    # (tag name, keyword arguments for soup.find), in priority order.
    lookups = (
        ("span", {"attrs": {"data-testid": "plot-xl"}}),
        ("span", {"attrs": {"data-testid": "plot-l"}}),
        ("div", {"class_": "summary_text"}),
    )

    try:
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for tag_name, find_kwargs in lookups:
            node = soup.find(tag_name, **find_kwargs)
            if node:
                return node.text.strip()

        return "Empty"

    except Exception:
        # Use np.nan (not None) so the failure marker matches the one the
        # other scraping helper returns and clean_description recognises.
        return np.nan


def clean_description(description):
    """Remove contributor residue from a scraped plot description.

    Parameters
    ----------
    description : str or missing value
        Raw plot text. Anything that is not a string (``np.nan``, ``None``,
        another NaN float, ...) is treated as missing.

    Returns
    -------
    str or float
        The cleaned, stripped text, or ``np.nan`` for missing input.
    """
    # An identity check against np.nan misses None and NaN floats that are
    # not the np.nan singleton; only real strings can be cleaned.
    if not isinstance(description, str):
        return np.nan

    # Drop one specific contributor signature with its <...> address.
    description = re.sub(r"—\s*Julian\s*Reischl\s*<.*?>", "", description, flags=re.IGNORECASE)

    # Drop the "Star Child" / "Star-Child" / "StarChild" token.
    description = re.sub(r"\bStar[ -]?Child\b", "", description, flags=re.IGNORECASE)

    # Drop anything that looks like an e-mail address.
    description = re.sub(r"\S+@\S+\.\S+", "", description)

    # Keep the text up to and including the last period, discarding whatever
    # trails it.
    if "." in description:
        description = description.rsplit(".", 1)[0] + "."

    return description.strip()

In [None]:
# Attach the external ids to every movie; the left join keeps all rows of movie.csv.
movies = pd.read_csv("movie.csv")
links = pd.read_csv("link.csv")
merged = movies.merge(links, on="movieId", how="left")

# Scrape and clean one plot per movie, in row order.
descriptions = [
    clean_description(get_full_imdb_plot(imdb_id))
    for imdb_id in merged["imdbId"]
]
merged["description"] = descriptions

# The external id columns are dropped from the working frame.
df = merged.drop(columns=['imdbId', 'tmdbId'])

In [12]:
# Text to embed: the description followed by the genres, with the
# pipe-separated genre field rewritten as a comma-separated list in parentheses.
df['text'] = (
    df['description']
    + ' ('
    + df['genres'].map(lambda genre_field: genre_field.replace('|', ', '))
    + ')'
)

In [None]:
# Encode every row's `text` with a pretrained sentence-embedding model.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# NOTE(review): `text` is NaN wherever `description` is NaN, so missing values
# may be passed to the encoder here — confirm how they are handled.
sentences = df['text'].to_numpy()

embeddings = model.encode(
    sentences,
    batch_size=32,
    convert_to_numpy=True
)

# Store one vector per row (list() splits the 2-D result along its first axis).
df['embedding'] = list(embeddings)

In [87]:
# Descriptions that still contain a "Read all" teaser are treated as missing.
df.loc[
    df['description'].apply(lambda x: "Read all" in str(x)),
    'description',
] = np.nan

In [164]:
# Rows with no text to embed. isna() also catches None and NaN values that are
# not the np.nan singleton, which an `is np.nan` identity test would miss.
df[df['text'].isna()]

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,description,text,embedding
708,720,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,118114,,,,
17126,86668,Louis Theroux: Law & Disorder (2008),Documentary,1347439,241620.0,,,


No description was found for these two movies, so their embeddings are treated as empty (all zeros):

In [237]:
# Give the two rows listed above an all-zero vector of the embedding size (768).
for missing_idx in (17126, 708):
    df.at[missing_idx, 'embedding'] = np.zeros(768)

PCA (retaining 99% of the variance)

In [238]:
# Stack the per-row vectors into a single 2-D matrix (one row per movie).
embeddings = df['embedding'].to_numpy()
embeddings_list = list(embeddings)
embeddings_matrix = np.vstack(embeddings_list)

In [239]:
# NOTE(review): `PCA` is not imported in any cell visible here — presumably
# sklearn.decomposition.PCA; confirm and add it to the imports cell.
# A fractional n_components is assumed to mean "keep enough components to
# explain that share of the variance" (0.99 here) — verify against the PCA docs.
pca = PCA(n_components=0.99)
compressed_embeddings = pca.fit_transform(embeddings_matrix)

From 768 to 409 dimensions

In [240]:
# Shape of the PCA output: (rows, retained components).
compressed_embeddings.shape

(27278, 409)

In [241]:
# Split the compressed matrix back into one vector per row and store it,
# replacing the full-size embeddings.
compressed_embeddings = list(compressed_embeddings)
df['embedding'] = compressed_embeddings

In [242]:
# Persist the preprocessed table.
# NOTE(review): with default arguments to_csv also writes the index, and the
# array-valued 'embedding' column is stored as text — confirm the loader
# parses both back as intended.
df.to_csv('data/preprocessed_movie.csv')