In [19]:
import json
import os
import pickle
from collections import Counter

from sentence_transformers import SentenceTransformer, util


class SimilarityRecommenderService:
    """Recommend movies by semantic similarity to stored conversations.

    Each line of the training JSONL file is turned into one corpus entry (the
    concatenated ``text`` of its ``messages``) paired with that line's
    ``movieMentions``. The corpus is embedded with the
    ``paraphrase-MiniLM-L6-v2`` sentence-transformer; a query is embedded the
    same way and the movie mentions of its nearest corpus entries are counted
    to produce recommendations.

    Environment variables:
        CUDA_ENABLED: set to ``1``/``true``/``yes``/``on`` (case-insensitive)
            to load the model and the corpus embeddings on ``"cuda"``.
        CACHE_DIR: directory for the model cache and the pickled corpus;
            defaults to ``.cache_dir``.
    """

    _TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"})

    def __init__(self):
        # os.getenv returns a string (or None), so comparing it with the
        # integer 1 could never be true; parse the flag textually instead.
        self._cuda_enabled = self._env_flag("CUDA_ENABLED")
        self._cache_dir = os.getenv("CACHE_DIR") or ".cache_dir"
        self.model = SentenceTransformer(
            "paraphrase-MiniLM-L6-v2",
            device="cuda" if self._cuda_enabled else None,
            cache_folder=self._cache_dir,
        )
        self._embedded_corpus = list()  # embeddings of the corpus texts
        self._corpus_mapped_data = list()  # movieMentions, aligned with _corpus
        self._corpus = list()  # one joined conversation text per JSONL line

    @staticmethod
    def _env_flag(name):
        """Return True when environment variable ``name`` holds a truthy value."""
        value = os.getenv(name)
        if value is None:
            return False
        return value.strip().lower() in SimilarityRecommenderService._TRUTHY_ENV_VALUES

    def _cache_path(self, filename):
        """Return the path of ``filename`` inside the cache directory."""
        return os.path.join(self._cache_dir, filename)

    def _train_from_jsonl(self, file_path=None):
        """Append the conversations of a JSONL file to the corpus and embed it.

        Args:
            file_path: path of the JSONL file; defaults to
                ``Data/train_data.jsonl``. Every non-blank line must be a JSON
                object with a ``messages`` list (items having a ``text`` key)
                and a ``movieMentions`` entry.
        """
        if not file_path:
            file_path = "Data/train_data.jsonl"
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                json_data = json.loads(line)
                joined_text = " ".join(d["text"] for d in json_data["messages"])
                self._corpus.append(joined_text)
                self._corpus_mapped_data.append(json_data["movieMentions"])
        self._embedded_corpus = self.model.encode(self._corpus, convert_to_tensor=True)

    def _first_run_setup(self):
        """Build the corpus from the default JSONL file and pickle it to the cache."""
        self._train_from_jsonl()
        # The cache directory is not guaranteed to exist yet.
        os.makedirs(self._cache_dir, exist_ok=True)
        with open(self._cache_path("corpus_embedded.pkl"), "wb") as f0, open(
            self._cache_path("corpus_mapped_data.pkl"), "wb"
        ) as f1:
            pickle.dump(self._embedded_corpus, f0)
            pickle.dump((self._corpus, self._corpus_mapped_data), f1)

    def init(self, *args, **kwargs):
        """Load the cached corpus, building the cache first if it is missing.

        A ``.initialized`` marker file in the cache directory records that the
        first-run setup has completed. Positional and keyword arguments are
        accepted and ignored.
        """
        marker = self._cache_path(".initialized")
        if not os.path.exists(marker):
            self._first_run_setup()
            with open(marker, mode="a"):
                pass
        # SECURITY: pickle.load can execute arbitrary code. These files are
        # written by _first_run_setup; never point CACHE_DIR at untrusted data.
        with open(self._cache_path("corpus_embedded.pkl"), "rb") as f0, open(
            self._cache_path("corpus_mapped_data.pkl"), "rb"
        ) as f1:
            self._embedded_corpus = pickle.load(f0)
            self._corpus, self._corpus_mapped_data = pickle.load(f1)
        # NOTE(review): the corpus is normalised only on the CUDA path, so
        # simi_top_k ranks by cosine similarity on CUDA but by raw dot product
        # otherwise -- confirm whether that difference is intended.
        if self._cuda_enabled:
            self._embedded_corpus = util.normalize_embeddings(
                self._embedded_corpus.to("cuda")
            )

    def _retrieve_top_k_similar_queries(self, query, k=5):
        """Return the ``k`` corpus entries most cosine-similar to ``query``.

        Legacy one-by-one scan kept for ``_deprecated_recommend``.

        Returns:
            A list of ``(corpus_index, corpus_text, movie_mentions, similarity)``
            tuples, best match first; ``similarity`` is a float.
        """
        embedded_query = self.model.encode(query, convert_to_tensor=True)
        similarities = []
        for i in range(len(self._corpus)):
            # Compare against the embedding (not the raw text) of entry i.
            sim = util.cos_sim(
                embedded_query.reshape(1, -1),
                self._embedded_corpus[i].reshape(1, -1),
            )
            # Store a plain float so the sort compares numbers, not tensors.
            similarities.append((float(sim), i))
        similarities.sort(reverse=True)  # descending similarity

        return [
            (idx, self._corpus[idx], self._corpus_mapped_data[idx], sim)
            for sim, idx in similarities[:k]
        ]

    def simi_top_k(self, query, k=5):
        """Return the ``k`` nearest corpus entries for ``query``.

        Returns:
            The first result list of ``util.semantic_search``: dicts that carry
            at least a ``corpus_id`` key, ordered best match first.
        """
        query_embeddings = self.model.encode(query, convert_to_tensor=True)
        if self._cuda_enabled:
            query_embeddings = query_embeddings.to("cuda")
            # The original swallowed the IndexError raised when normalising a
            # 1-D embedding (single string query) and left it unnormalised.
            # Same behaviour, made explicit: scaling the query does not change
            # the ranking of corpus entries.
            if query_embeddings.dim() > 1:
                query_embeddings = util.normalize_embeddings(query_embeddings)
        return util.semantic_search(
            query_embeddings,
            self._embedded_corpus,
            score_function=util.dot_score,
            top_k=k,
        )[0]

    def recommend(self, query, k=3, n_similar=5):
        """Recommend the movies mentioned most often near ``query``.

        Args:
            query: free-text query.
            k: maximum number of movies to return.
            n_similar: number of nearest corpus entries whose movie mentions
                are counted (previously hard-coded to 5).

        Returns:
            ``(titles, counts)``: two parallel tuples, most mentioned first.
            Both are empty when no neighbour mentions any movie.
        """
        similar_queries = self.simi_top_k(query, k=n_similar)
        movie_mentions_list = []
        for similar_query in similar_queries:
            movie_mentions = self._corpus_mapped_data[similar_query["corpus_id"]]
            if movie_mentions:
                movie_mentions_list.extend(movie_mentions.values())

        top_movies = Counter(movie_mentions_list).most_common(k)
        if not top_movies:
            # zip(*[]) yields nothing, so indexing its result would raise.
            return (), ()
        titles, counts = zip(*top_movies)
        return titles, counts

    def _deprecated_recommend(self, query, k=1):
        """Legacy recommender built on ``_retrieve_top_k_similar_queries``.

        Returns:
            A list of at most ``k`` movie titles, most mentioned first.
        """
        similar_queries = self._retrieve_top_k_similar_queries(query)
        movie_mentions_list = []
        for similar_query in similar_queries:
            movie_mentions = similar_query[2]
            # Skip empty mentions, consistent with recommend().
            if movie_mentions:
                movie_mentions_list.extend(movie_mentions.values())

        top_k_movies = Counter(movie_mentions_list).most_common(k)
        return [movie for movie, _ in top_k_movies]

In [2]:
import ssl

import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel, BertTokenizer

## load data
df = pd.read_csv("Data/IMDB_top_1000.csv")
# One searchable text per movie: the overview followed by genre, director and
# the four billed stars (separators kept exactly as before).
plots = [
    f"{row.Overview}  {row.Genre}  {row.Director} {row.Star1} {row.Star2} {row.Star3} {row.Star4}"
    for row in df.itertuples(index=False)
]

# Preprocessing
# Work around certificate-verification failures when fetching NLTK data by
# temporarily switching urllib's default HTTPS context to an unverified one.
# The override is restored once the downloads finish so that certificate
# checks are not left disabled for the rest of the kernel session.
_verified_https_context = getattr(ssl, "_create_default_https_context", None)
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build has no such hook: nothing to override.
    _create_unverified_https_context = None
else:
    ssl._create_default_https_context = _create_unverified_https_context
try:
    nltk.download("punkt")
    nltk.download("stopwords")
finally:
    if (
        _create_unverified_https_context is not None
        and _verified_https_context is not None
    ):
        ssl._create_default_https_context = _verified_https_context

stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))


def preprocess_text(text):
    """Normalise ``text`` for TF-IDF matching.

    The text is tokenised, tokens that are not purely alphanumeric or whose
    lowercase form is a stop word are dropped, and the remaining tokens are
    lower-cased and stemmed.

    Returns:
        The surviving stems joined by single spaces.
    """
    stems = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if not token.isalnum() or lowered in stop_words:
            continue
        stems.append(stemmer.stem(lowered))
    return " ".join(stems)


# Normalise every plot text with the same pipeline later applied to queries.
preprocessed_plots = list(map(preprocess_text, plots))

# Vectoriser that learns the vocabulary and IDF weights of the plot corpus.
tfidf_vectorizer = TfidfVectorizer()

# Fit on the preprocessed plots and keep their TF-IDF matrix (one row per plot).
tfidf = tfidf_vectorizer.fit_transform(preprocessed_plots)


def recomend_me(query, top_k=1):
    """Return the ``top_k`` movies whose plot text best matches ``query``.

    The query goes through ``preprocess_text``, is projected with the fitted
    ``tfidf_vectorizer`` and compared to every row of ``tfidf`` by cosine
    similarity.

    Args:
        query: free-text query.
        top_k: maximum number of movies to return; a value <= 0 yields no
            results.

    Returns:
        ``(titles, scores)``: two parallel lists, best match first. Titles come
        from the ``Series_Title`` column of ``df``.
    """
    # Guard first: with top_k == 0 the slice [-0:] would select every movie.
    if top_k <= 0:
        return [], []

    preprocessed_query = preprocess_text(query)
    tfidf_query = tfidf_vectorizer.transform([preprocessed_query])

    # One query row against all plots -> take that single row of similarities.
    similarities = cosine_similarity(tfidf_query, tfidf)[0]

    # argsort is ascending; reverse it and keep the best top_k indices.
    best_first = np.argsort(similarities)[::-1][:top_k]

    movie_name = [df.loc[idx, "Series_Title"] for idx in best_first]
    return movie_name, list(similarities[best_first])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shelt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shelt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Build the similarity recommender, then load (or create on first run) its
# cached corpus.
service = SimilarityRecommenderService()
service.init()

In [24]:
query = "Leonardo"

# Run the same query through both recommenders:
# r1/c1 come from the similarity service, m2/c2 from the TF-IDF recommender.
r1, c1 = service.recommend(query)
m2, c2 = recomend_me(query, top_k=3)

In [21]:
# Show the first value returned by service.recommend as a list.
list(r1)

['The Revenant  (2015)', 'The Wolf of Wall Street  (2013)', 'Inception (2010)']

In [22]:
# Pair each title from r1 with its count from c1, dropping the title's last
# space-separated token (the "(year)" suffix seen in the displayed titles).
L = [(title.rsplit(" ", 1)[0].strip(), count) for title, count in zip(r1, c1)]


L

[('The Revenant', 4), ('The Wolf of Wall Street', 3), ('Inception', 3)]

In [26]:
# Show the two values returned by recomend_me for the same query.
m2, c2

(['Shutter Island', 'Titanic', 'The Departed'],
 [0.22160209763092573, 0.2113956480183037, 0.208104277409475])

In [8]:
from collections import defaultdict

# Accumulator keyed by movie title; unseen titles start at 0.
score = defaultdict(lambda: 0)

In [9]:
# Sum of the second element (the count) over every pair in L.
total = sum(count for _, count in L)
total

0

In [10]:
# Store each title's count as a fraction of the overall total.
for title, count in L:
    score[title] = count / total
score

defaultdict(<function __main__.<lambda>()>, {})

In [11]:
# Add the TF-IDF similarities, weighted by 1.3, to each title's score.
# m2 holds the titles and c2 the matching similarities; indexing m2[0] / m2[1]
# would pair up the characters of the first two titles instead.
for movie, score2 in zip(m2, c2):
    score[movie] += score2 * 1.3

score

defaultdict(<function __main__.<lambda>()>,
            {'Shutter Island': 0.2880827269202035,
             'Titanic': 0.2748143424237948,
             'The Departed': 0.27053556063231754})

In [12]:
# Rank the (title, score) pairs from highest to lowest score.
sorted_score = sorted(score.items(), key=lambda x: x[1], reverse=True)
sorted_score

[('Shutter Island', 0.2880827269202035),
 ('Titanic', 0.2748143424237948),
 ('The Departed', 0.27053556063231754)]

In [13]:
k = 3
# Keep only the titles of the k best-scoring entries.
recommended_movies = [title for title, _ in sorted_score[:k]]

recommended_movies

['Shutter Island', 'Titanic', 'The Departed']

In [34]:
import re

# A four-digit year in parentheses at the end of a title, together with the
# whitespace around it, e.g. "  (2015)" or " (2010) ".
date_expr = re.compile(r"\s*\([0-9]{4}\)\s*$")


def remove_date_from_movie(string: str) -> str:
    """Return ``string`` without a trailing ``(YYYY)`` year suffix.

    Whitespace after the year is tolerated, so ``"Inception (2010) "`` is
    reduced to ``"Inception"``. Titles without such a suffix are returned
    unchanged apart from surrounding whitespace being stripped.
    """
    return date_expr.sub("", string).strip()


def mixture_recommend(query, k=5, n_candidates=3, tfidf_weight=1.5):
    """Blend the similarity-service and TF-IDF recommendations for ``query``.

    Each title from ``service.recommend`` is scored by its share of the
    returned mention counts; each title from ``recomend_me`` adds its cosine
    similarity multiplied by ``tfidf_weight``. Titles are then ranked by the
    combined score.

    Args:
        query: free-text query.
        k: maximum number of titles to return.
        n_candidates: number of candidates requested from each recommender.
        tfidf_weight: multiplier applied to the TF-IDF similarities.

    Returns:
        ``(movies, confidence)``: the ranked titles and the mean combined score
        of the titles actually returned (``0.0`` when there are none).
    """
    r1, c1 = service.recommend(query, k=n_candidates)
    r2, c2 = recomend_me(query, top_k=n_candidates)

    # Titles from the similarity service carry a "(year)" suffix; strip it so
    # they can match the TF-IDF titles.
    mention_counts = [(remove_date_from_movie(m), c) for m, c in zip(r1, c1)]
    total_mentions = sum(count for _, count in mention_counts)

    score = {}
    for title, count in mention_counts:
        # Accumulate rather than overwrite: two raw titles can collapse to the
        # same name once the year is removed.
        score[title] = score.get(title, 0) + count / total_mentions
    for title, similarity in zip(r2, c2):
        score[title] = score.get(title, 0) + similarity * tfidf_weight

    ranked = sorted(score.items(), key=lambda item: item[1], reverse=True)[:k]
    movies = [title for title, _ in ranked]
    if not movies:
        return movies, 0.0
    # Average over the titles actually returned; dividing by k would understate
    # the confidence whenever fewer than k titles are available.
    confidence = sum(value for _, value in ranked) / len(movies)
    return movies, confidence


In [35]:
# Example: blended recommendations and their mean confidence for one query.
mixture_recommend("Natalie Portman drama")

(['Black Swan', 'V for Vendetta', 'Léon', 'American Made', 'Jackie'],
 0.4735568372632484)