In [2]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import nltk
from nltk.tokenize import word_tokenize
import re
import gensim.downloader as api

# Download the tokenizer data that nltk's word_tokenize (imported above) relies on.
# Both resource names are requested — presumably to cover different NLTK releases;
# TODO confirm which one the installed version actually needs.
nltk.download('punkt_tab')
nltk.download('punkt')


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/Patron/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Patron/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the raw movie table from the CSV that sits next to the notebook.
DATAPATH = 'movies.csv'
df = pd.read_csv(DATAPATH, encoding='utf-8')
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() appends a stray "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [None]:

# Narrow the raw table down to the text fields used by the later cells.
movies_df = df.loc[:, [
    'genres', 'keywords', 'tagline', 'cast', 'overview', 'director', 'title',
]]


4803

In [20]:

def _split_words(value):
    """Split a text cell on whitespace; cells that are already lists pass through unchanged.

    Passing lists through keeps this cell safe to re-run: after the first run the
    columns hold lists, and calling ``.split()`` on them would raise AttributeError.
    """
    if isinstance(value, list):
        return value
    # ''.split() is already [], so no special case is needed for empty text.
    return str(value).split()


# Missing text becomes '' so that splitting yields an empty list.
movies_df = movies_df.fillna('')

# (source column, destination column): every field is tokenised in place except
# 'title', which stays a plain string while its tokens go to 'title_list'.
_TOKENISE_PLAN = [
    ('genres', 'genres'),
    ('keywords', 'keywords'),
    ('cast', 'cast'),
    ('director', 'director'),
    ('tagline', 'tagline'),
    ('overview', 'overview'),
    ('title', 'title_list'),
]
for _source_col, _target_col in _TOKENISE_PLAN:
    movies_df[_target_col] = movies_df[_source_col].apply(_split_words)

# 'soup' is every token list concatenated in this fixed order, joined by single spaces.
_SOUP_COLUMNS = ['genres', 'keywords', 'tagline', 'cast', 'director', 'overview', 'title_list']
_soup_tokens = movies_df[_SOUP_COLUMNS[0]]
for _col in _SOUP_COLUMNS[1:]:
    # Adding two Series of lists concatenates the lists row by row.
    _soup_tokens = _soup_tokens + movies_df[_col]
movies_df['soup'] = _soup_tokens.apply(' '.join)


movies_df.head()


Unnamed: 0,genres,keywords,tagline,cast,overview,director,title,title_list,soup
0,"[Action, Adventure, Fantasy, Science, Fiction]","[culture, clash, future, space, war, space, co...","[Enter, the, World, of, Pandora.]","[Sam, Worthington, Zoe, Saldana, Sigourney, We...","[In, the, 22nd, century,, a, paraplegic, Marin...","[James, Cameron]",Avatar,[Avatar],Action Adventure Fantasy Science Fiction cultu...
1,"[Adventure, Fantasy, Action]","[ocean, drug, abuse, exotic, island, east, ind...","[At, the, end, of, the, world,, the, adventure...","[Johnny, Depp, Orlando, Bloom, Keira, Knightle...","[Captain, Barbossa,, long, believed, to, be, d...","[Gore, Verbinski]",Pirates of the Caribbean: At World's End,"[Pirates, of, the, Caribbean:, At, World's, End]",Adventure Fantasy Action ocean drug abuse exot...
2,"[Action, Adventure, Crime]","[spy, based, on, novel, secret, agent, sequel,...","[A, Plan, No, One, Escapes]","[Daniel, Craig, Christoph, Waltz, L\u00e9a, Se...","[A, cryptic, message, from, Bond’s, past, send...","[Sam, Mendes]",Spectre,[Spectre],Action Adventure Crime spy based on novel secr...
3,"[Action, Crime, Drama, Thriller]","[dc, comics, crime, fighter, terrorist, secret...","[The, Legend, Ends]","[Christian, Bale, Michael, Caine, Gary, Oldman...","[Following, the, death, of, District, Attorney...","[Christopher, Nolan]",The Dark Knight Rises,"[The, Dark, Knight, Rises]",Action Crime Drama Thriller dc comics crime fi...
4,"[Action, Adventure, Science, Fiction]","[based, on, novel, mars, medallion, space, tra...","[Lost, in, our, world,, found, in, another.]","[Taylor, Kitsch, Lynn, Collins, Samantha, Mort...","[John, Carter, is, a, war-weary,, former, mili...","[Andrew, Stanton]",John Carter,"[John, Carter]",Action Adventure Science Fiction based on nove...


In [27]:
# Persist the preprocessed table. Only the path is given, so every other
# to_csv option keeps its default value.
movies_df.to_csv(path_or_buf='movies_cleaned_preprocessed.csv')

In [26]:
# Pretrained word embeddings (GloVe, 50 dimensions) obtained through gensim's downloader API.
embedding_model = api.load(name="glove-wiki-gigaword-50")

def preprocess_text(text):
    """
    Turn arbitrary input into a list of lowercase tokens.

    The input is coerced to a string and lowercased, every character that is
    neither a word character nor whitespace is deleted, and what remains is
    split with NLTK's word_tokenize.
    """
    lowered = str(text).lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return word_tokenize(without_punctuation)

def get_column_vector(text, model):
    """
    Embed a piece of text as the mean of its in-vocabulary word vectors.

    Tokens that are not present in ``model`` are skipped. When no token
    survives, a zero vector of length ``model.vector_size`` is returned.
    """
    known_vectors = []
    for token in preprocess_text(text):
        if token in model:
            known_vectors.append(model[token])

    # Guard first: nothing to average means the all-zero fallback.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

def get_weighted_movie_vector(row, weights, model):
    """
    Compute a weighted document vector for a movie from specific columns.

    Each column named in ``weights`` is embedded with ``get_column_vector`` and
    the results are combined as ``sum(weight * vector) / sum(weight)``.

    Args:
        row (pd.Series): A row from the DataFrame (any mapping indexable by
            column name works). List cells are joined with spaces; other
            cells are converted with ``str``.
        weights (dict): A dictionary mapping column names to their weights.
        model: Pretrained word embeddings model exposing ``vector_size``.

    Returns:
        A numpy array of length ``model.vector_size``. If the weights do not
        sum to a positive number the unscaled weighted sum is returned.

    Notes:
        Cells that are ``None`` or a float NaN are treated like empty text:
        they add nothing to the sum but their weight still counts towards the
        total. Without this, ``str()`` would turn them into the literal words
        "None" / "nan", which would then be looked up in the embedding model.
    """
    weighted_vector = np.zeros(model.vector_size)
    total_weight = 0
    # For each column, compute the column vector and add weight * column_vector.
    for col, weight in weights.items():
        value = row[col]
        total_weight += weight
        is_missing = value is None or (
            isinstance(value, (float, np.floating)) and np.isnan(value)
        )
        if is_missing:
            # Same effect as an empty string: a zero contribution.
            continue
        col_text = ' '.join(value) if isinstance(value, list) else str(value)
        weighted_vector += weight * get_column_vector(col_text, model)
    if total_weight > 0:
        return weighted_vector / total_weight
    return weighted_vector

def recommend_movies_weighted(user_query, df, weights, top_n=5, reg=True, model=None):
    """
    Recommend movies based on a user's query using a weighted system over specific columns.

    Every row of ``df`` is embedded with ``get_weighted_movie_vector``, the query
    is embedded with ``get_column_vector``, and rows are ranked by cosine
    similarity to the query.

    Args:
        user_query (str): User input description.
        df (pd.DataFrame): DataFrame containing movie data. Must provide the
            columns named in ``weights`` plus 'title', 'genres' and 'cast'.
        weights (dict): Column weights (e.g., {'overview': 2, 'keywords': 3, 'genres': 1, ...}).
        top_n (int): Number of recommendations. Values <= 0 yield an empty result.
        reg (bool): If True, L2-normalizes the vectors before comparing them.
            Cosine similarity is itself scale-invariant, so this is not
            expected to change the scores or the ranking.
        model: Word embedding model to use. Defaults to the module-level
            ``embedding_model`` when omitted.

    Returns:
        DataFrame with columns 'title', 'genres', 'cast', 'similarity_score',
        ordered from most to least similar, holding at most ``top_n`` rows.
        If no query token is known to the model the query vector is all zeros
        and every score is 0.0, so the ranking is not meaningful.
    """
    result_columns = ['title', 'genres', 'cast', 'similarity_score']

    # Nothing requested or nothing to rank. Handled up front because
    # argsort()[-0:] would select *every* row, and normalize() rejects an
    # empty matrix.
    if top_n <= 0 or len(df) == 0:
        empty = df.iloc[0:0].copy()
        empty['similarity_score'] = np.array([], dtype=float)
        return empty[result_columns]

    if model is None:
        model = embedding_model

    # Weighted document vector for each movie, in row order.
    doc_vectors = np.array([
        get_weighted_movie_vector(row, weights, model)
        for _, row in df.iterrows()
    ])

    # Query vector (simple average of its word vectors), shaped (1, dim).
    query_vector = get_column_vector(user_query, model).reshape(1, -1)

    if reg:
        doc_vectors = normalize(doc_vectors, norm='l2')
        query_vector = normalize(query_vector, norm='l2')

    similarities = cosine_similarity(query_vector, doc_vectors).flatten()
    # Descending order first, then truncate; same rows as the previous
    # "[-top_n:][::-1]" for every positive top_n.
    top_indices = similarities.argsort()[::-1][:top_n]

    recommendations = df.iloc[top_indices].copy()
    recommendations['similarity_score'] = similarities[top_indices]
    return recommendations[result_columns]

# Demo run: rank the movies against one sample query.
if __name__ == "__main__":
    # `movies_df` is expected to exist already, built by the preprocessing cell.
    # Relative importance of each column when the column embeddings are blended;
    # these are example values to tune.
    column_weights = dict(
        overview=2.0,
        genres=2.0,
        keywords=2.5,
        tagline=1.0,
        cast=2.0,
        director=1.0,
        title_list=2.0,
    )

    user_query = "mission impossible"

    recommendations = recommend_movies_weighted(
        user_query,
        movies_df,
        column_weights,
        top_n=5,
        reg=True,
    )

    print("Weighted Recommendations with Regularization:")
    print(recommendations.head())

Weighted Recommendations with Regularization:
                                     title  \
425                    Mission: Impossible   
213                 Mission: Impossible II   
1271                              Pandorum   
373                        Mission to Mars   
153   Mission: Impossible - Ghost Protocol   

                                                 genres  \
425                       [Adventure, Action, Thriller]   
213                       [Adventure, Action, Thriller]   
1271  [Action, Horror, Mystery, Science, Fiction, Th...   
373                                  [Science, Fiction]   
153                       [Action, Thriller, Adventure]   

                                                   cast  similarity_score  
425   [Tom, Cruise, Jon, Voight, Emmanuelle, B\u00e9...          0.822898  
213   [Tom, Cruise, Dougray, Scott, Thandie, Newton,...          0.811080  
1271  [Ben, Foster, Dennis, Quaid, Cam, Gigandet, An...          0.795581  
373   [Gary, Sinis