In [None]:
# Mount Google Drive into the Colab runtime at /content/drive (requires the google.colab package).
# NOTE(review): the CSVs read later in this notebook use relative paths ('movies.csv', 'links.csv'),
# not paths under /content/drive — confirm whether this mount is actually needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import re
!pip install imdbpy
from imdb import IMDb
from difflib import get_close_matches
!pip install rapidfuzz
from rapidfuzz import process



In [None]:
# Load MovieLens movies and links for IMDb mapping
movies = pd.read_csv('movies.csv')      # columns: movieId, title, genres
links = pd.read_csv('links.csv')        # columns: movieId, imdbId

# Merge MovieLens IDs to get IMDb IDs
movies = movies.merge(links[['movieId', 'imdbId']], on='movieId', how='left')

# Format IMDb IDs as zero-padded 7+ digit strings.
# A left merge turns the column into float with NaN as soon as one movie has no link;
# a plain astype(str) would then produce '114709.0' / '0000nan'. Convert through int
# explicitly and keep missing IDs as None instead of a fake ID string.
movies['imdbId'] = movies['imdbId'].map(
    lambda raw_id: f"{int(raw_id):07d}" if pd.notna(raw_id) else None
)

In [None]:
# Shared IMDb client instance; fetch_movie_details() issues its search_movie/get_movie calls through it.
ia = IMDb()

def fetch_movie_details(title):
    """Look up `title` on IMDb and summarise the first search hit.

    Returns a dict with the keys 'title', 'plot', 'genres', 'cast' (at most
    three names) and 'director' (at most one name). Returns None when the
    search yields nothing or when any step raises; in the latter case the
    error is printed rather than propagated.
    """
    try:
        hits = ia.search_movie(title)
        if hits:
            top_hit = ia.get_movie(hits[0].movieID)

            # Prefer the short outline; otherwise the first plot entry, else ''.
            plot_text = top_hit.get('plot outline')
            if not plot_text:
                plot_text = (top_hit.get('plot') or [''])[0]

            lead_cast = [person['name'] for person in top_hit.get('cast', [])[:3]]
            lead_director = [person['name'] for person in top_hit.get('directors', [])[:1]]

            return {
                'title': top_hit.get('title', title),
                'plot': plot_text,
                'genres': top_hit.get('genres', []),
                'cast': lead_cast,
                'director': lead_director,
            }
        return None
    except Exception as exc:
        print(f"Error fetching {title}: {str(exc)}")
        return None

def get_movielens_genres(title):
    """Return the MovieLens genre list for an exact `title` match.

    Uses the first row of the global `movies` frame whose 'title' equals
    `title`; its pipe-separated 'genres' string is split into a list.
    Returns ["N/A"] when no row matches.
    """
    matching_genres = movies.loc[movies['title'] == title, 'genres']
    if matching_genres.empty:
        return ["N/A"]
    # MovieLens stores genres as one pipe-separated string.
    return matching_genres.iloc[0].split('|')

In [None]:
def normalize_title(title):
    """Return `title` without any "(YYYY)" year tag, trimmed and lowercased."""
    # Drop every four-digit parenthesised group together with the whitespace before it.
    without_year = re.sub(r'\s*\(\d{4}\)', '', title)
    trimmed = without_year.strip()
    return trimmed.lower()

def get_movie_index(user_input, min_score=60):
    """Fuzzy-match `user_input` against the titles in the global `movies` frame.

    Both the query and the stored titles are passed through normalize_title()
    before matching with rapidfuzz's process.extractOne.

    Args:
        user_input: Title text typed by the user.
        min_score: The best match must score strictly above this value to be
            accepted (default 60, the previously hard-coded threshold).

    Returns:
        The positional index of the best-matching row in `movies`, or None
        when the input is blank / not a string, when there are no titles to
        match against, or when the best score is not above `min_score`.
    """
    if not isinstance(user_input, str) or not user_input.strip():
        # Nothing meaningful to match; avoid reporting a bogus "best match".
        return None

    titles = movies['title'].tolist()
    titles_norm = [normalize_title(t) for t in titles]
    user_input_norm = normalize_title(user_input)

    best = process.extractOne(user_input_norm, titles_norm)
    if best is None:
        # extractOne yields None when it has no candidate to return;
        # unpacking it directly would raise TypeError.
        return None

    _, score, idx = best
    print(f"Best match: {titles[idx]} (Score: {score})")
    return idx if score > min_score else None


In [None]:
def get_local_candidates(user_genres, exclude_title, top_n=20):
    """Find movies in the global `movies` frame that share genres with `user_genres`.

    Args:
        user_genres: Iterable of genre names to match.
        exclude_title: Title to leave out of the result (the input movie).
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with the columns of `movies` plus a 'score' column
        (number of shared genres), restricted to rows with score > 0 and a
        title different from `exclude_title`, sorted by score descending and
        cut to `top_n` rows. The global `movies` frame is left unmodified.
    """
    wanted = set(user_genres)

    def genre_overlap(genres):
        # Non-string values (e.g. NaN) simply share no genres.
        if not isinstance(genres, str):
            return 0
        return len(wanted & set(genres.split('|')))

    # assign() returns a scored copy, so repeated calls do not leave a stale
    # 'score' column behind on the shared frame.
    scored = movies.assign(score=movies['genres'].map(genre_overlap))
    keep = (scored['score'] > 0) & (scored['title'] != exclude_title)
    return scored[keep].sort_values('score', ascending=False).head(top_n)


In [None]:
def _plot_preview(plot, limit=200):
    """Return `plot` cut to `limit` characters; add "..." only when text was cut, "N/A" when empty."""
    if not plot:
        return "N/A"
    if len(plot) <= limit:
        return plot
    return plot[:limit] + "..."


def recommend_movies_optimized(user_input, rec_n=5, shortlist_n=20):
    """Recommend movies similar to the one named by `user_input` and print them.

    Steps: fuzzy-match the input against the local table, fetch IMDb details
    for the match, shortlist local movies by genre overlap, then fetch IMDb
    details for shortlisted titles until `rec_n` recommendations are found.

    Args:
        user_input: Movie title typed by the user.
        rec_n: Maximum number of recommendations to return.
        shortlist_n: Number of genre-overlap candidates to consider.

    Returns:
        A list of detail dicts (as produced by fetch_movie_details), empty
        when the movie is not found locally or its details cannot be fetched.
    """
    # 1. Fuzzy match user input to local data
    idx = get_movie_index(user_input)
    if idx is None:
        print("Movie not found in local database.")
        return []

    local_movie = movies.iloc[idx]
    local_genres = local_movie['genres'].split('|')
    print(f"Found local match: {local_movie['title']}")

    # 2. Fetch IMDb data for input movie
    input_details = fetch_movie_details(local_movie['title'])
    if not input_details:
        print("Couldn't fetch details for input movie.")
        return []
    # Same fallback as for the candidates: use MovieLens genres when IMDb has none.
    if not input_details['genres']:
        input_details['genres'] = local_genres

    # 3. Get local candidates using MovieLens genres
    candidates = get_local_candidates(
        local_genres,
        local_movie['title'],
        top_n=shortlist_n
    )

    # 4. Fetch IMDb data only for top candidates
    recommendations = []
    for _, row in candidates.iterrows():
        # Check the limit before fetching so rec_n=0 triggers no lookup at all.
        if len(recommendations) >= rec_n:
            break
        details = fetch_movie_details(row['title'])
        if not details:
            continue
        # If IMDb genres are missing, use this row's own MovieLens genres
        # (a lookup by title could hit a different row with the same title).
        if not details['genres']:
            details['genres'] = row['genres'].split('|')
        recommendations.append(details)

    # 5. Display results
    print(f"\nInput movie: {input_details['title']}")
    print("Genres:", ", ".join(input_details['genres']))
    print("Plot:", _plot_preview(input_details['plot']) + "\n")

    print("Top recommendations:")
    for i, rec in enumerate(recommendations, 1):
        print(f"\n{i}. {rec['title']}")
        print("Genres:", ", ".join(rec['genres']) if rec['genres'] else "N/A")
        print("Plot:", _plot_preview(rec['plot']))

    return recommendations


In [None]:
!pip install requests



In [None]:
# Movie Plot Generation Engine using Hugging Face Transformers Model
import requests

import os

# Prefer the HF_API_TOKEN environment variable so the secret is not stored in the notebook;
# the placeholder below is only used when that variable is unset.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN", " INPUT YOUR HUGGING FACE API TOKEN HERE")  # Replace with your Hugging Face API token

def plot_gen(genres, model="HuggingFaceH4/zephyr-7b-beta", max_tokens=400, timeout=60):
    """Generate a structured movie plot for the given genres via the Hugging Face Inference API.

    Args:
        genres: Iterable of genre names; a single comma-separated string is
            also accepted. Blank entries are ignored.
        model: Model identifier appended to the inference endpoint URL.
        max_tokens: Value sent as `max_new_tokens`.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The generated text on success. On any failure (no usable genres,
        network error, non-200 status, unreadable JSON) a string starting
        with "Error:" is returned instead of raising. An unexpected but
        valid JSON payload is returned as its string representation.
    """
    if isinstance(genres, str):
        # Iterating a bare string would treat every character as a genre.
        genres = genres.split(',')

    genre_str = ', '.join(
        g.strip().capitalize()
        for g in (genres or [])
        if isinstance(g, str) and g.strip()
    )
    if not genre_str:
        # Avoid spending an API call on a prompt with no genre in it.
        return "Error: no genres provided"

    prompt = f"""Write a detailed movie plot for a {genre_str} film.
Describe the main characters, the setting, and the central conflict.
Present the story from the protagonist's point of view, line by line, and include a timeline of events.
The story should start with a shocking, exciting event and end with a quiet, melancholic resolution.
Structure your answer as follows:

TITLE: [Creative Title]
MAIN CHARACTERS:
- [Name]: [Description]
SETTING: [Time/Place Description]
CENTRAL CONFLICT: [1-2 sentence conflict]
STORYLINE (Protagonist's POV):
1. [Dramatic opening event]
2. [Key romantic or genre moment]
3. [Genre escalation]
4. [Climactic choice]
5. [Melancholic resolution]
TIMELINE:
- Day 1: [Event]
- Day 3: [Event]
- Day 7: [Final event]
"""
    API_URL = f"https://api-inference.huggingface.co/models/{model}"
    headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_tokens,
            "temperature": 0.9,
            "do_sample": True
        }
    }

    try:
        # Without a timeout a stalled connection would block the notebook indefinitely.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        return f"Error: request failed ({exc})"

    if response.status_code != 200:
        return f"Error: {response.status_code} {response.text}"

    try:
        result = response.json()
    except ValueError:
        return f"Error: invalid JSON response: {response.text}"

    # Guard against an empty list or non-dict first element before indexing into it.
    if isinstance(result, list) and result and isinstance(result[0], dict) \
            and "generated_text" in result[0]:
        return result[0]["generated_text"]
    if isinstance(result, dict) and "generated_text" in result:
        return result["generated_text"]
    return str(result)


In [None]:
def show_menu():
    """Print the main menu and return the user's choice with surrounding whitespace removed."""
    menu_lines = (
        "\nWelcome to the Movie Assistant!",
        "1. Get movie recommendations based on a movie name",
        "2. Generate a movie plot from genres",
        "3. Exit",
    )
    for line in menu_lines:
        print(line)
    return input("Enter your choice (1/2/3): ").strip()

def menu_recommend_movies():
    """Prompt for a movie name and print up to five recommendations from a 20-movie shortlist."""
    recommend_movies_optimized(
        input("Enter a movie name: "),
        rec_n=5,
        shortlist_n=20,
    )

def menu_generate_plot():
    """Prompt for comma-separated genres, generate a plot with plot_gen(), and print it.

    Blank entries (e.g. from "horror,, comedy" or an empty line) are dropped.
    When no genre remains, a hint is printed and plot_gen() is not called.
    """
    user_input = input("Enter genres for your movie plot (comma separated): ")
    genres = [part.strip() for part in user_input.split(',') if part.strip()]
    if not genres:
        # ''.split(',') gives [''], which would otherwise produce a genre-less prompt.
        print("No genres entered. Please provide at least one genre.")
        return
    plot = plot_gen(genres)
    print("\nGenerated Plot:\n")
    print(plot)

# Main menu loop: keep showing the menu until the user picks "3".
while True:
    choice = show_menu()

    if choice == '3':
        print("Goodbye!")
        break

    if choice == '1':
        menu_recommend_movies()
        continue

    if choice == '2':
        menu_generate_plot()
        continue

    # Anything else is not a menu entry.
    print("Invalid choice. Please enter 1, 2, or 3.")



Welcome to the Movie Assistant!
1. Get movie recommendations based on a movie name
2. Generate a movie plot from genres
3. Exit
Enter your choice (1/2/3): 3
Goodbye!


In [None]:
# Work on a manageable subset: the first N rows of `movies` (N = 3000 here).
# The similarity matrix computed further down is N x N, so memory grows quadratically with N.
N = 3000  # Lower this number if it is hitting RAM limits
movies_small = movies.head(N).copy()

#Combine features (adjust as needed for columns)
def combine_features(row):
    """Join a row's genres, keywords, cast and director into one space-separated string.

    'genres' is required; 'keywords', 'cast' and 'director' are optional and
    contribute an empty string when absent. Every value is passed through str().
    """
    optional_columns = ('keywords', 'cast', 'director')
    pieces = [str(row['genres'])]
    pieces.extend(str(row.get(column, '')) for column in optional_columns)
    return ' '.join(pieces)

# Build one text field per movie from whichever metadata columns the row has.
# NOTE(review): the frame loaded in this notebook only shows movieId/title/genres (+ imdbId),
# so 'combined' presumably holds just the genres string plus padding spaces — confirm.
movies_small['combined'] = movies_small.apply(combine_features, axis=1)

# Compute the similarity matrix (CountVectorizer bag-of-words only, no plot text)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies_small['combined'].fillna(''))

# Pairwise cosine similarity between all movies in the subset (one row/column per movie).
hybrid_sim = cosine_similarity(count_matrix, count_matrix)

#Evaluation metrics function (use this movies_small and hybrid_sim)
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, hamming_loss

def _split_genres(value):
    """Split a pipe-separated genre string into a list; empty or non-string values give []."""
    if isinstance(value, str) and value:
        return value.split('|')
    return []


def evaluate_model_subset(movies_df, sim_matrix):
    """Score a nearest-neighbour genre predictor and print the metrics.

    For every movie the predicted genres are the genres of its most similar
    other movie according to `sim_matrix` (ties go to the lowest row index).
    True and predicted genre lists are binarised with MultiLabelBinarizer and
    compared with micro-averaged precision/recall/F1, accuracy_score and
    hamming_loss, plus a per-genre classification report.

    Args:
        movies_df: DataFrame with a pipe-separated 'genres' column.
        sim_matrix: Square similarity matrix whose row/column order matches
            the row order of `movies_df`.

    Returns:
        Dict with keys 'accuracy', 'precision', 'recall', 'f1', 'hamming_loss'.
    """
    # Prepare true genres (same parsing is reused for the predictions below,
    # so empty genres are [] on both sides rather than [] vs ['']).
    true_genres_list = [_split_genres(g) for g in movies_df['genres'].tolist()]
    n_movies = len(true_genres_list)

    # Generate predictions: for each movie, use genres of most similar movie (excluding itself)
    predicted_genres_list = []
    for idx in range(n_movies):
        if n_movies < 2:
            # No other movie to borrow genres from.
            predicted_genres_list.append([])
            continue
        # Work on a float copy of one row so the caller's matrix is untouched,
        # mask the movie itself, then take the first maximum. This replaces a
        # full Python sort of every row while keeping the same tie-breaking.
        row_scores = np.array(sim_matrix[idx], dtype=float)
        row_scores[idx] = -np.inf
        most_sim_idx = int(row_scores.argmax())
        predicted_genres_list.append(list(true_genres_list[most_sim_idx]))

    # Binarize
    mlb = MultiLabelBinarizer()
    y_true = mlb.fit_transform(true_genres_list)
    y_pred = mlb.transform(predicted_genres_list)

    # Metrics (zero_division=0 matches the classification report below and
    # avoids undefined-metric warnings when nothing is predicted).
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='micro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
    hamming = hamming_loss(y_true, y_pred)
    report = classification_report(y_true, y_pred, target_names=mlb.classes_, zero_division=0)

    print("\n" + "="*40)
    print(" Model Evaluation Metrics ".center(40, "="))
    print("="*40)
    print(f"Accuracy: \t{accuracy:.4f}")
    print(f"Precision: \t{precision:.4f}")
    print(f"Recall: \t{recall:.4f}")
    print(f"F1-Score: \t{f1:.4f}")
    print(f"Hamming Loss: \t{hamming:.4f}")
    print("\nTop Genre Performance:")
    print(report)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'hamming_loss': hamming
    }

# Run the evaluation on the subset; prints the metrics and keeps the summary dict in `metrics`.
# NOTE(review): hybrid_sim is built from the 'combined' text, which includes each movie's own
# genres — the same field being predicted here. Scores are therefore likely optimistic and
# should not be read as generalisation performance — confirm intended use.
metrics = evaluate_model_subset(movies_small, hybrid_sim)



Accuracy: 	0.9270
Precision: 	0.9830
Recall: 	0.9793
F1-Score: 	0.9811
Hamming Loss: 	0.0041

Top Genre Performance:
              precision    recall  f1-score   support

      Action       0.98      0.99      0.98       416
   Adventure       0.96      0.98      0.97       335
   Animation       0.95      0.95      0.95        85
    Children       0.95      0.97      0.96       235
      Comedy       0.99      0.99      0.99      1033
       Crime       0.99      0.97      0.98       298
 Documentary       1.00      0.99      0.99        99
       Drama       0.99      1.00      0.99      1457
     Fantasy       0.96      0.94      0.95       170
   Film-Noir       1.00      0.97      0.99        38
      Horror       0.99      0.99      0.99       275
        IMAX       0.50      0.33      0.40         6
     Musical       0.99      0.93      0.96       111
     Mystery       0.97      0.94      0.96       155
     Romance       0.98      0.96      0.97       521
      Sci-Fi     