In [31]:
import argparse
import ast
import json
import re
import string

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [32]:
# Load the raw movie table for exploratory inspection in the next few cells.
df = pd.read_csv('tmdb_5000_movies.csv')

In [49]:
# Check the dataset dimensions as (rows, columns).
df.shape

(4803, 20)

In [40]:
# Preview the first two rows to see which columns are available.
df.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [42]:
# Preview the candidate text columns. The list-valued ones (genres, keywords,
# production_*) are stored as JSON-style strings of {"id"/"name"} records.
df[['overview','tagline','genres', 'keywords', 'production_companies', 'production_countries']].head(2)

Unnamed: 0,overview,tagline,genres,keywords,production_companies,production_countries
0,"In the 22nd century, a paraplegic Marine is di...",Enter the World of Pandora.,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o..."
1,"Captain Barbossa, long believed to be dead, ha...","At the end of the world, the adventure begins.","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o..."


In [50]:
def extract_names(x):
    """Flatten a serialized list of ``{"name": ...}`` records to a string.

    The cell value is parsed as data only: JSON first (so the literals
    ``true``/``false``/``null`` are understood), then as a Python literal for
    single-quoted input. Unlike ``eval``, neither parser can execute code
    embedded in the CSV.

    Args:
        x: Cell value, normally a string such as
            ``'[{"id": 28, "name": "Action"}]'``. Non-string values
            (e.g. NaN for a missing cell) are tolerated.

    Returns:
        The ``name`` values joined by single spaces, or ``''`` when the value
        cannot be parsed or is not a list of records with a string ``name``.
    """
    try:
        entries = json.loads(x)
    except (TypeError, ValueError, RecursionError):
        # Not valid JSON (or not a string at all): accept Python-literal
        # syntax as a fallback, still without evaluating arbitrary code.
        try:
            entries = ast.literal_eval(x)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return ''

    try:
        return ' '.join([d['name'] for d in entries])
    except (TypeError, KeyError, IndexError):
        # Parsed fine but has the wrong shape (scalar, missing "name", ...).
        return ''
            
def load_data(csv_path):
    """Load the movie CSV and build one combined text field per movie.

    Reads ``csv_path`` with ``pd.read_csv``, replaces missing free-text values
    with empty strings, flattens the serialized list columns to
    space-separated names via ``extract_names``, and concatenates the pieces
    into a single ``metadata`` string.

    Args:
        csv_path: Anything ``pd.read_csv`` accepts as its first argument.

    Returns:
        DataFrame with the columns ``title`` and ``metadata``. Rows with a
        missing title are dropped and the index is reset to ``0..n-1``.
    """
    df = pd.read_csv(csv_path)

    # NaN propagates through string concatenation, so a single missing piece
    # would turn the whole metadata value into NaN and the final dropna()
    # would silently discard the movie. Blank out every free-text input.
    for col in ['overview', 'tagline', 'original_language']:
        df[col] = df[col].fillna('')

    # Flatten the serialized list columns to plain space-separated names.
    # NOTE(review): production_countries is flattened but is not part of the
    # metadata string below - confirm whether it was meant to be included.
    for col in ['genres', 'keywords', 'production_companies', 'production_countries']:
        df[col] = df[col].apply(extract_names)

    # Create combined text feature
    df['metadata'] = (
        df['overview'] + ' ' +
        df['tagline'] + ' ' +
        df['genres'] + ' ' +
        df['keywords'] + ' ' +
        df['original_language'] + ' ' +
        df['production_companies']
    )

    return df[['title', 'metadata']].dropna().reset_index(drop=True)

def build_tfidf_matrix(df):
    """Fit a TF-IDF vectorizer on ``df['metadata']``.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the document-term matrix produced by
        ``fit_transform`` and the fitted vectorizer itself.
    """
    options = {
        'stop_words': 'english',   # drop common English words
        'ngram_range': (1, 2),     # single words and two-word phrases
        'max_features': 1000,      # cap on the vocabulary size
    }
    vectorizer = TfidfVectorizer(**options)
    matrix = vectorizer.fit_transform(df['metadata'])
    return matrix, vectorizer

In [34]:
def recommend_movies(query, df, tfidf_matrix, vectorizer, n=5):
    """Rank movies by cosine similarity between ``query`` and their TF-IDF rows.

    Args:
        query: Free-text search string.
        df: DataFrame whose positional rows line up with ``tfidf_matrix`` and
            which has a ``title`` column.
        tfidf_matrix: Document-term matrix to compare the query against.
        vectorizer: Fitted vectorizer used to embed the query.
        n: Maximum number of results.

    Returns:
        A list of ``(title, score)`` tuples, best match first. Entries with a
        similarity that is not greater than zero are left out, so the list may
        be shorter than ``n``.
    """
    # Embed the query in the same vector space as the movies.
    query_vector = vectorizer.transform([query])

    # One similarity score per movie.
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Positions of the n highest scores, in descending order.
    ranked = scores.argsort()[::-1][:n]

    results = []
    for position in ranked:
        score = scores[position]
        if score > 0:  # skip movies sharing no vocabulary with the query
            results.append((df.iloc[position]['title'], score))
    return results



In [35]:
# Build the modelling table (title + combined metadata text). load_data reads
# the CSV itself, so this cell does not depend on the exploratory `df` above.
movies_df = load_data('tmdb_5000_movies.csv')


In [36]:
# Fit TF-IDF on the combined metadata text. Keep both the document-term matrix
# and the fitted vectorizer: the vectorizer is needed later to embed queries.
tfidf_matrix, vectorizer = build_tfidf_matrix(movies_df)

In [47]:
# Densify the TF-IDF matrix for inspection only. toarray() returns a plain
# ndarray, whereas todense() on a sparse matrix returns the legacy numpy.matrix
# type. The dense copy holds every zero, so keep the vocabulary small.
dense_matrix = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_matrix, columns=vectorizer.get_feature_names_out())

In [48]:
# Display the TF-IDF weights: one row per movie, one column per vocabulary term.
tfidf_df

Unnamed: 0,000,1970s,2000,2000 pictures,3d,3d en,abuse,accident,accidentally,action,...,years,york,york city,young,young adult,young man,young woman,younger,youth,zombie
0,0.0,0.0,0.0,0.0,0.125959,0.135001,0.000000,0.0,0.0,0.061602,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.000000,0.189964,0.0,0.0,0.085256,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.078864,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.067993,...,0.103051,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.163598,0.175341,0.000000,0.0,0.0,0.080010,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.074682,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4799,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4800,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4801,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,...,0.000000,0.175435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# Example search: rank movies against a short free-text query.
query = "comedy action"
recommendations = recommend_movies(query, movies_df, tfidf_matrix, vectorizer, n=5)

In [56]:
# Print each recommended title with its similarity score to three decimals.
# NOTE(review): the "5" in the header is hard-coded; fewer lines may follow
# because recommend_movies leaves out zero-similarity matches.
print(f"\nTop {5} recommendations for '{query}':")
for title, score in recommendations:
    print(f"- {title} (score: {score:.3f})")



Top 5 recommendations for 'comedy action':
- The Helix... Loaded (score: 0.403)
- The Hebrew Hammer (score: 0.359)
- Khiladi 786 (score: 0.353)
- Lethal Weapon 4 (score: 0.349)
- Silver Medalist (score: 0.342)
