In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import joblib


In [None]:
def load_movie_dataset(file_path):
    data = pd.read_csv(file_path)
    essential_columns = ['genres', 'keywords', 'tagline', 'cast', 'director', 'overview']
    data[essential_columns] = data[essential_columns].fillna('')
    return data

df = load_movie_dataset("/content/drive/MyDrive/DATASETS/movies.csv")


In [3]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    return text.strip()

def create_feature_signature(row):
    weight_config = {
        'overview': 3,
        'keywords': 2,
        'genres': 1,
        'tagline': 1,
        'cast': 1,
        'director': 1
    }
    combined_text = ""
    for feature, weight in weight_config.items():
        cleaned = preprocess_text(row[feature])
        combined_text += (cleaned + ' ') * weight
    return combined_text.strip()

df['movie_signature'] = df.apply(create_feature_signature, axis=1)


In [4]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['movie_signature'])

# Reduce dimensions with TruncatedSVD (Latent Semantic Analysis)
svd = TruncatedSVD(n_components=250, random_state=42)
reduced_matrix = svd.fit_transform(tfidf_matrix)


In [5]:
def compute_similarity(movie_idx, reduced_matrix, top_k=30):
    similarity_vector = cosine_similarity([reduced_matrix[movie_idx]], reduced_matrix)[0]
    similar_indices = similarity_vector.argsort()[::-1][1:top_k+1]
    return similar_indices


In [None]:
movie_titles = df['title'].tolist()
title_to_index = pd.Series(df.index, index=df['title'])

def retrieve_recommendations(input_title, top_k=20):
    input_title = input_title.lower()
    matched_titles = [title for title in movie_titles if input_title in title.lower()]
    if not matched_titles:
        return f"Movie title similar to '{input_title}' not found.", pd.DataFrame()

    selected_title = matched_titles[0]
    movie_idx = title_to_index[selected_title]
    recommended_indices = compute_similarity(movie_idx, reduced_matrix, top_k)
    recommendations = df.loc[recommended_indices, ['title', 'vote_average', 'release_date']]
    return selected_title, recommendations.reset_index(drop=True)


In [7]:
movie_input = input("Enter a movie you like: ")
matched, results = retrieve_recommendations(movie_input)

if isinstance(results, pd.DataFrame):
    print(f"\nTop 30 recommendations similar to '{matched}':\n")
    for i, row in results.iterrows():
        print(f"{i+1}. {row['title']} ({row['release_date'][:4]}) - Rating: {row['vote_average']}")
else:
    print(matched)


Enter a movie you like: ironman

Top 30 recommendations similar to 'Movie title similar to 'ironman' not found.':



In [9]:
import pickle

# Bundle all components into a single dictionary
movie_rec_system = {
    'vectorizer': vectorizer,
    'svd': svd,
    'reduced_matrix': reduced_matrix,
    'title_to_index': title_to_index,
    'titles': df[['title']]
}

# Save to a single .sav file
with open('movie_recommendation_model.sav', 'wb') as f:
    pickle.dump(movie_rec_system, f)


In [None]:
from google.colab import files

# Download the saved model file
files.download('movie_recommendation_model.sav')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip install streamlit


Collecting streamlit
  Downloading streamlit-1.45.0-py3-none-any.whl.metadata (8.9 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.45.0-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m111.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m92.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hIns