In [None]:
# Import Libraries

import pandas as pd
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the spaCy English pipeline once at module level. preprocess_text()
# calls this shared `nlp` object to tokenise text and read each token's
# lemma, is_alpha and is_stop attributes.
nlp = spacy.load("en_core_web_sm")

In [3]:
def preprocess_text(text):
    """
    Normalise raw text for vectorisation.

    The text is lowercased and passed through the shared spaCy pipeline (`nlp`).
    Only alphabetic tokens that are not stop words survive, each replaced by its
    lemma. The surviving lemmas are returned joined by single spaces.
    """
    lemmas = []
    for tok in nlp(text.lower()):
        # Drop anything that is not purely alphabetic, and all stop words
        if not tok.is_alpha or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)

In [4]:
def combine_features(row):
    """
    Build the single text field used for vectorisation from one movie record.

    The plot is included twice, separated by a space, so that it carries more
    weight than the other fields. Genre precedes it; directors and stars follow.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide the keys 'Genre', 'Plot', 'Directors' and 'Stars'.

    Returns
    -------
    str
        "<Genre> <Plot> <Plot> Directors: <Directors> Stars: <Stars>".
        Missing values (None / NaN / pd.NA) contribute an empty string instead
        of the literal words "nan" or "None".

    Raises
    ------
    KeyError
        If one of the four required keys is absent from `row`.
    """

    def _as_text(value):
        # Missing CSV cells arrive as NaN/None; str() would turn them into the
        # words "nan"/"None", which would then leak into the corpus.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value)

    plot = _as_text(row['Plot'])

    # Duplicate the plot to increase its weight. An explicit space is required:
    # `plot * 2` would glue the last word of the first copy to the first word of
    # the second ("...hero risesA hero..."), creating a bogus token.
    weighted_plot = f"{plot} {plot}" if plot else ""

    # Combine genre, weighted plot, directors and stars to broaden the search
    combined = (
        f"{_as_text(row['Genre'])} {weighted_plot} "
        f"Directors: {_as_text(row['Directors'])} Stars: {_as_text(row['Stars'])}"
    )

    return combined

In [5]:
def load_and_prepare_data(filepath):
    """
    Read the movie CSV at `filepath` and attach the text columns used downstream.

    Two columns are added to the loaded frame:
      * 'combined_features' - combine_features() applied to each row
      * 'processed_text'    - preprocess_text() applied to 'combined_features'

    Returns the resulting DataFrame.
    """
    movies = pd.read_csv(filepath)

    # Row-wise merge of the selected fields into one string per movie
    merged_text = movies.apply(combine_features, axis=1)
    movies['combined_features'] = merged_text

    # spaCy clean-up of every merged string
    cleaned_text = movies['combined_features'].apply(preprocess_text)
    movies['processed_text'] = cleaned_text

    return movies

In [6]:
def compute_tfidf_embeddings(corpus):
    """
    Fit a TF-IDF model over unigrams and bigrams on `corpus` and vectorise it.

    Returns a `(matrix, vectorizer)` pair: the TF-IDF matrix produced for the
    corpus and the fitted vectorizer, so new text can be transformed into the
    same feature space later.
    """
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    doc_term_matrix = tfidf.fit_transform(corpus)

    return doc_term_matrix, tfidf

In [7]:
def compute_embeddings(texts, model):
    """
    Encode `texts` with `model` and return whatever `model.encode` produces.

    The model is asked for NumPy output via `convert_to_numpy=True`.
    """
    return model.encode(texts, convert_to_numpy=True)

In [8]:
def cluster_agglomerative(embeddings, n_clusters=10):
    """
    Group the movie embeddings into `n_clusters` clusters with agglomerative
    clustering and return the label array from `fit_predict`.

    The labels are used later to diversify the recommendations.
    """
    clusterer = AgglomerativeClustering(n_clusters=n_clusters)
    labels = clusterer.fit_predict(embeddings)

    return labels

In [None]:
def get_recommendations(query,tfidf_vectorizer,tfidf_matrix, model, movie_embeddings, df, total_n=10,alpha=0.2):
    """
    Get top recommendations for a query using a hybrid TF-IDF / embedding score.

    The score of each movie is
    `alpha * tfidf_cosine + (1 - alpha) * embedding_cosine`.
    Movies are ranked by that score (ties broken by 'Rating', both descending).
    The cluster that appears most often among the 10 best-ranked movies is the
    "top cluster": up to `total_n * 2 // 3` results are taken from it, and the
    remaining slots are filled with the best-ranked movies from other clusters.

    Parameters
    ----------
    query : str
        Free-text user query.
    tfidf_vectorizer, tfidf_matrix
        Fitted vectorizer and the matrix it produced for the movies in `df`.
    model
        Object exposing `encode(list_of_str, convert_to_numpy=True)`.
    movie_embeddings
        Embeddings of the movies, row-aligned with `df`.
    df : pandas.DataFrame
        Must contain 'Rating' and 'cluster' columns.
    total_n : int
        Maximum number of recommendations to return.
    alpha : float
        Weight of the TF-IDF similarity in the hybrid score.

    Returns
    -------
    list of pandas.Series
        Top-cluster picks first, then picks from other clusters; at most
        `total_n` rows (fewer if `df` has fewer rows). Each row carries the
        'similarity' value. An empty `df` yields an empty list.

    Raises
    ------
    KeyError
        If `df` has no 'cluster' column.

    Side effects
    ------------
    Writes/overwrites the 'similarity' column on the caller's `df`.
    """

    # Fail early with an actionable message instead of deep inside pandas.
    if 'cluster' not in df.columns:
        raise KeyError("df must contain a 'cluster' column; assign cluster labels before requesting recommendations")

    # Nothing to rank: the cluster vote below would otherwise index an empty result.
    if len(df) == 0:
        return []

    # Lexical signal: TF-IDF cosine similarity on the preprocessed query
    query_processed = preprocess_text(query)
    query_tfidf = tfidf_vectorizer.transform([query_processed])
    sim_tfidf = cosine_similarity(query_tfidf, tfidf_matrix).flatten()

    # Semantic signal: the raw (unprocessed) query is embedded
    query_embedding = model.encode([query], convert_to_numpy=True)
    similarities = cosine_similarity(query_embedding, movie_embeddings).flatten()

    # Blend both signals and store the score alongside the movies
    hybrid_sim = alpha * sim_tfidf + (1-alpha)*similarities
    df['similarity'] = hybrid_sim

    # Best matches first; rating breaks ties
    df_sorted = df.sort_values(by=['similarity', 'Rating'], ascending=[False, False])

    # Cluster that appears most often among the 10 best matches. Taking the mode
    # of the 'cluster' column alone avoids computing a mode for every column
    # (including long text fields) just to read a single cell.
    vote_window = 10
    top_cluster = df_sorted['cluster'].head(vote_window).mode().iloc[0]

    in_top_cluster = df_sorted['cluster'] == top_cluster

    # ~66% of the slots go to the top cluster. Clamp at 0 because a negative
    # argument to head() means "all but the last n rows".
    top_quota = max(total_n * 2 // 3, 0)
    top_picks = df_sorted[in_top_cluster].head(top_quota)

    # Remaining slots are filled from the other clusters, still in rank order
    remaining_quota = max(total_n - len(top_picks), 0)
    other_picks = df_sorted[~in_top_cluster].head(remaining_quota)

    selected_top_cluster = [row for _, row in top_picks.iterrows()]
    selected_remaining = [row for _, row in other_picks.iterrows()]

    return selected_top_cluster + selected_remaining

In [10]:
def main():
    """
    Interactive entry point: read a query, build the recommender, print results.

    Steps: prompt for a query, load and preprocess the movie CSV, build TF-IDF
    vectors and sentence embeddings from the processed text, cluster the
    embeddings, then print the recommendations ordered by similarity.
    """
    user_query = input(""" Example Query :: I love thrilling space adventures with unexpected twists.""").strip()

    # Load and prepare the dataset. A forward-slash relative path is used because
    # it resolves on Windows as well as Linux/macOS, whereas a backslash path is
    # treated as a single file name outside Windows.
    df = load_and_prepare_data('Dataset/movies_subset_cleaned.csv')

    processed_texts = df['processed_text'].tolist()

    tfidf_matrix, tfidf_vectorizer = compute_tfidf_embeddings(processed_texts)

    # Load the pre-trained SentenceTransformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Compute embeddings for each processed movie text
    movie_embeddings = compute_embeddings(processed_texts, model)

    # Compute clusters to later ensure diversity in recommendations
    n_clusters = 10
    cluster_labels = cluster_agglomerative(movie_embeddings, n_clusters)
    df['cluster'] = cluster_labels

    # Get diverse recommendations for the user's query
    recommendations = get_recommendations(user_query,tfidf_vectorizer,tfidf_matrix, model, movie_embeddings, df,alpha=0.3)
    # The list arrives grouped by cluster; present it strictly by score instead
    recommendations = sorted(recommendations, key=lambda row: row['similarity'], reverse=True)

    print(f"\nTop recommendations for: '{user_query}'\n")
    for i, rec in enumerate(recommendations):
        movie_name = rec['Movie Name']
        rating = rec['Rating']
        similarity = rec['similarity']
        cluster = rec['cluster']
        print(f"{i+1}. {movie_name} (Rating: {rating}, Similarity: {similarity:.4f}, Cluster: {cluster})")

In [None]:
# Run the interactive recommender only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()