# IMDB Movie Recommender System

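This notebook implements a content-based movie recommendation system built from the IMDb dataset files `title.basics.tsv`, `title.ratings.tsv` and `title.crew.tsv`. Titles are described by their genres and directors, and recommendations are ranked by cosine similarity between those descriptions.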

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import logging
from pathlib import Path

import time

# Silence library warnings for the whole notebook session.
warnings.filterwarnings('ignore')

# Timestamped INFO-level logging; `logger` is shared by the cells below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Reference point for the total-runtime report in the final cell.
start_time = time.time()

In [None]:
class IMDBRecommender:
    """Content-based recommender built from IMDb TSV dataset files.

    ``prepare_data`` reads ``title.basics.tsv``, ``title.ratings.tsv`` and
    ``title.crew.tsv`` from ``data_dir``, cleans and merges them, and builds a
    dense title-by-title similarity matrix from each title's genres and
    directors. ``get_recommendations`` and ``explain_recommendations`` then
    query that prepared state.
    """

    # Marker the raw files use for a missing value: a literal backslash + N.
    MISSING_MARKER = '\\N'
    # Placeholder stored in genres/directors when the raw value is missing.
    UNKNOWN = 'Unknown'
    # similarity = TEXT_WEIGHT * cosine + SIMILARITY_OFFSET. The offset is a
    # constant (the year-based term it once scaled is disabled), so it shifts
    # every score equally and does not affect the ranking.
    TEXT_WEIGHT = 0.8
    SIMILARITY_OFFSET = 0.2

    # logging.getLogger returns one shared object per name, so this is the same
    # logger the setup cell binds to `logger`; holding it on the class keeps
    # the methods independent of that notebook-level variable.
    _logger = logging.getLogger(__name__)

    def __init__(self, data_dir="/content/data"):
        """Initialize the recommender system with data directory path"""
        self.data_dir = Path(data_dir)             # Path to data directory
        self.movies_processed = None                # Will store processed movie data
        self.feature_matrix = None                  # Will store movie features
        self.similarity_matrix = None               # Will store movie similarities
        self.vectorizer = None                      # Will store text vectorizer

    def _read_tsv(self, filename, usecols=None):
        """Read one tab-separated dataset file located in ``self.data_dir``."""
        return pd.read_csv(self.data_dir / filename, sep='\t', usecols=usecols)

    def _fill_missing(self, column):
        """Return ``column`` with NaN and raw missing markers set to ``UNKNOWN``."""
        missing = column.isna() | (column == self.MISSING_MARKER)
        return column.mask(missing, self.UNKNOWN)

    def extract_data(self):
        """Load the three raw dataset files.

        Returns:
            tuple: ``(basics, ratings, crew)`` DataFrames. ``basics`` holds
            tconst/primaryTitle/startYear/genres, ``ratings`` holds every
            column of the ratings file, ``crew`` holds tconst/directors.

        Raises:
            Exception: any read error is logged and re-raised unchanged.
        """
        try:
            self._logger.info("Starting data extraction...")

            basics = self._read_tsv(
                'title.basics.tsv',
                usecols=['tconst', 'primaryTitle', 'startYear', 'genres'],
            )
            ratings = self._read_tsv('title.ratings.tsv')
            crew = self._read_tsv('title.crew.tsv', usecols=['tconst', 'directors'])

            return basics, ratings, crew

        except Exception as e:
            self._logger.error(f"Error during data extraction: {str(e)}")
            raise

    def transform_data(self, basics, ratings, crew):
        """Clean the raw frames and add a weighted rating.

        The three frames are modified in place and also returned.

        * ``genres`` / ``directors``: NaN and the raw missing marker both
          become ``'Unknown'`` (previously only NaN was handled, so the marker
          slipped past the genre filter applied during integration).
        * ``startYear``: converted to numeric; anything unparseable, including
          the raw missing marker, becomes NaN.
        * ``weighted_rating``: ``v/(v+m) * R + m/(v+m) * C`` where ``R`` is the
          title's average rating, ``v`` its vote count, ``C`` the mean rating
          over all rows and ``m`` the 90th percentile of vote counts.

        Returns:
            tuple: ``(basics, ratings, crew)``.

        Raises:
            Exception: any error is logged and re-raised unchanged.
        """
        try:
            self._logger.info("Starting data transformation...")

            basics['genres'] = self._fill_missing(basics['genres'])
            crew['directors'] = self._fill_missing(crew['directors'])

            # errors='coerce' already maps the raw missing marker to NaN.
            basics['startYear'] = pd.to_numeric(basics['startYear'], errors='coerce')

            votes = ratings['numVotes']
            mean_rating = ratings['averageRating'].mean()
            vote_threshold = votes.quantile(0.90)
            ratings['weighted_rating'] = (
                (votes / (votes + vote_threshold) * ratings['averageRating']) +
                (vote_threshold / (votes + vote_threshold) * mean_rating)
            )

            return basics, ratings, crew

        except Exception as e:
            self._logger.error(f"Error during data transformation: {str(e)}")
            raise

    def integrate_data(self, basics, ratings, crew, min_votes=1000):
        """Merge the cleaned frames and keep one well-voted row per title.

        Args:
            basics, ratings, crew: frames as returned by ``transform_data``.
            min_votes: minimum ``numVotes`` a title needs to be kept
                (default 1000).

        Returns:
            DataFrame: inner join on ``tconst``, restricted to rows with enough
            votes and a known genre, sorted by ``weighted_rating`` descending,
            with only the highest-rated row kept for each ``primaryTitle``.

        Raises:
            Exception: any error is logged and re-raised unchanged.
        """
        try:
            self._logger.info("Starting data integration...")

            movies = basics.merge(ratings, on='tconst', how='inner')
            movies = movies.merge(crew, on='tconst', how='inner')

            # Only filter for essential quality criteria (no year-based filter).
            keep = (movies['numVotes'] >= min_votes) & (movies['genres'] != self.UNKNOWN)
            movies = movies.loc[keep]

            # Sorting first means drop_duplicates keeps the best-rated row
            # among titles that share a name.
            movies = movies.sort_values(
                'weighted_rating',
                ascending=False
            ).drop_duplicates('primaryTitle', keep='first')

            return movies

        except Exception as e:
            self._logger.error(f"Error during data integration: {str(e)}")
            raise

    def engineer_features(self, movies):
        """Build the text features and the similarity matrix.

        Adds a ``combined_features`` column to ``movies`` (genres followed by
        directors), fits a ``CountVectorizer`` on it, and stores the vectorizer,
        the resulting feature matrix and the rescaled cosine-similarity matrix
        on the instance. Row ``i`` of the similarity matrix corresponds to the
        ``i``-th row of ``movies`` by position.

        Returns:
            DataFrame: the same ``movies`` object, with the added column.

        Raises:
            Exception: any error is logged and re-raised unchanged.
        """
        try:
            self._logger.info("Starting feature engineering...")

            # The 'Unknown' placeholder is not a real director: leaving it in
            # would give every director-less title a shared token and inflate
            # their mutual similarity.
            directors = movies['directors'].where(movies['directors'] != self.UNKNOWN, '')
            movies['combined_features'] = movies['genres'] + ' ' + directors

            # Create feature matrix from text features
            self.vectorizer = CountVectorizer(stop_words='english')
            self.feature_matrix = self.vectorizer.fit_transform(movies['combined_features'])

            # The matrix is dense and n x n, so rescale it in place instead of
            # allocating further full-size temporaries.
            similarity = cosine_similarity(self.feature_matrix)
            similarity *= self.TEXT_WEIGHT
            similarity += self.SIMILARITY_OFFSET
            self.similarity_matrix = similarity

            return movies

        except Exception as e:
            self._logger.error(f"Error during feature engineering: {str(e)}")
            raise

    def prepare_data(self):
        """Run extract -> transform -> integrate -> feature engineering.

        On success ``self.movies_processed`` holds the processed frame and the
        similarity matrix is populated.

        Raises:
            Exception: any stage error is logged and re-raised unchanged.
        """
        try:
            self._logger.info("Starting ETL pipeline...")

            basics, ratings, crew = self.extract_data()
            basics, ratings, crew = self.transform_data(basics, ratings, crew)
            movies = self.integrate_data(basics, ratings, crew)
            self.movies_processed = self.engineer_features(movies)

            self._logger.info("ETL pipeline completed successfully")

        except Exception as e:
            self._logger.error(f"Error in ETL pipeline: {str(e)}")
            raise

    def get_recommendations(self, movie_title, n=5):
        """Return the ``n`` titles most similar to ``movie_title``.

        Args:
            movie_title: title to look up (case-insensitive exact match; the
                first match is used).
            n: number of recommendations; values below zero are treated as 0.

        Returns:
            DataFrame with columns primaryTitle, genres, startYear and
            weighted_rating, ordered from most to least similar, or ``None``
            when the title is not found or any error occurs (the error is
            logged and printed rather than raised).
        """
        try:
            if self.movies_processed is None or self.similarity_matrix is None:
                raise ValueError("Data not prepared. Call prepare_data() first.")

            # Similarity rows are positional, so keep the frame on a 0..n-1
            # index; only rebuild it when it is not already in that form.
            movies = self.movies_processed
            if not movies.index.equals(pd.RangeIndex(len(movies))):
                movies = movies.reset_index(drop=True)
                self.movies_processed = movies

            # Find the movie
            movie_mask = movies['primaryTitle'].str.lower() == movie_title.lower()
            if not movie_mask.any():
                print(f"Movie '{movie_title}' not found in database.")
                return None

            idx = int(np.flatnonzero(movie_mask.to_numpy())[0])

            # Stable sort keeps equal scores in row order. The query title is
            # removed by position rather than by assuming it sorts first: a
            # title with identical features ties with it at the top score.
            scores = np.asarray(self.similarity_matrix[idx])
            ranked = np.argsort(-scores, kind='stable')
            ranked = ranked[ranked != idx][:max(int(n), 0)]

            recommendations = movies.iloc[ranked]
            return recommendations[['primaryTitle', 'genres', 'startYear', 'weighted_rating']]

        except Exception as e:
            self._logger.error(f"Error getting recommendations: {str(e)}")
            print(f"Detailed error: {str(e)}")
            return None

    def explain_recommendations(self, movie_title, recommendations_df):
        """Describe why each recommended title relates to ``movie_title``.

        Args:
            movie_title: the title the recommendations were generated for.
            recommendations_df: frame returned by ``get_recommendations``;
                ``None`` yields an empty list.

        Returns:
            list[str]: one multi-line explanation per recommended row, covering
            shared genres, release-year gap and both weighted ratings.

        Raises:
            ValueError: if the data has not been prepared.
            IndexError: if ``movie_title`` is not in the processed data.
            Exception: any other error is logged and re-raised unchanged.
        """
        try:
            if self.movies_processed is None:
                raise ValueError("Data not prepared. Call prepare_data() first.")

            if recommendations_df is None:
                return []

            titles = self.movies_processed['primaryTitle'].str.lower()
            matches = self.movies_processed[titles == movie_title.lower()]
            if matches.empty:
                raise IndexError(f"Movie '{movie_title}' not found in processed data.")
            input_movie = matches.iloc[0]
            input_genres = set(input_movie['genres'].split(','))

            explanations = []
            for _, rec_movie in recommendations_df.iterrows():
                # Sorted so the text is deterministic (set order is not).
                rec_genres = set(rec_movie['genres'].split(','))
                shared_genres = sorted(input_genres & rec_genres)
                genres_text = ', '.join(shared_genres) if shared_genres else 'None'

                # startYear may be NaN when the source year was missing.
                year_diff = abs(input_movie['startYear'] - rec_movie['startYear'])
                year_text = f"{int(year_diff)} years" if pd.notna(year_diff) else "unknown"

                # Report both ratings: the two titles are not necessarily
                # rated alike, so quoting a single figure would mislead.
                rating_diff = abs(input_movie['weighted_rating'] - rec_movie['weighted_rating'])

                explanation = f"\nWhy '{rec_movie['primaryTitle']}' was recommended:\n"
                explanation += f"- Shared genres: {genres_text}\n"
                explanation += f"- Year difference: {year_text}\n"
                explanation += (
                    f"- Rating similarity: rated {rec_movie['weighted_rating']:.1f} "
                    f"vs {input_movie['weighted_rating']:.1f} "
                    f"(difference {rating_diff:.1f})\n"
                )

                explanations.append(explanation)

            return explanations

        except Exception as e:
            self._logger.error(f"Error explaining recommendations: {str(e)}")
            raise

In [None]:
# Create the recommender over the dataset files in /content/data and run the
# full pipeline (extract, transform, integrate, feature engineering).
recommender = IMDBRecommender("/content/data")
recommender.prepare_data()

# Sanity check: is the example title present in the processed data?
print(f"Is 'Saving Private Ryan' in dataset?: {recommender.movies_processed['primaryTitle'].str.contains('Saving Private Ryan').any()}")

Is 'Saving Private Ryan' in dataset?: True


In [None]:
movie_title = "Saving Private Ryan"
# Debug prints
print("Looking for movie:", movie_title)
# regex=False matches the title literally, so titles containing regex
# metacharacters (parentheses, '?', '+', ...) are searched exactly as typed;
# na=False keeps the mask boolean even if a title is missing.
matching_movies = recommender.movies_processed[
    recommender.movies_processed['primaryTitle'].str.contains(
        movie_title, case=False, regex=False, na=False
    )
]
print("\nFound these matching movies:")
print(matching_movies[['primaryTitle', 'startYear', 'genres']])

# Get recommendations
recommendations = recommender.get_recommendations(movie_title)
if isinstance(recommendations, pd.DataFrame):
    # Report the number actually returned rather than a fixed "5".
    print(f"\nTop {len(recommendations)} recommendations for '{movie_title}':")
    display(recommendations)

    explanations = recommender.explain_recommendations(movie_title, recommendations)
    for explanation in explanations:
        print(explanation)
else:
    print("No recommendations found.")


Looking for movie: Saving Private Ryan

Found these matching movies:
              primaryTitle  startYear     genres
92251  Saving Private Ryan     1998.0  Drama,War

Top 5 recommendations for 'Saving Private Ryan':


Unnamed: 0,primaryTitle,genres,startYear,weighted_rating
22556,Empire of the Sun,"Drama,War",1987.0,7.697411
38056,War Horse,"Adventure,Drama,War",2011.0,7.199303
22604,The Color Purple,Drama,1985.0,7.696444
28658,The Fabelmans,Drama,2022.0,7.497899
2327,Aguner Poroshmoni,"Drama,War",1994.0,8.841608



Why 'Empire of the Sun' was recommended:
- Shared genres: War, Drama
- Year difference: 11.0 years
- Rating similarity: Both rated around 7.7


Why 'War Horse' was recommended:
- Shared genres: War, Drama
- Year difference: 13.0 years
- Rating similarity: Both rated around 7.2


Why 'The Color Purple' was recommended:
- Shared genres: Drama
- Year difference: 13.0 years
- Rating similarity: Both rated around 7.7


Why 'The Fabelmans' was recommended:
- Shared genres: Drama
- Year difference: 24.0 years
- Rating similarity: Both rated around 7.5


Why 'Aguner Poroshmoni' was recommended:
- Shared genres: War, Drama
- Year difference: 4.0 years
- Rating similarity: Both rated around 8.8



In [None]:
# Total wall-clock time since start_time was recorded in the setup cell,
# so it covers every cell executed in between.
end_time = time.time()

execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")

Execution time: 175.54 seconds
