In [1]:
!pip install scikit-learn




[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
def load_data(path='movies.csv', max_rows=1000):
    """Load the movie dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to 'movies.csv' in the
            current working directory.
        max_rows: Maximum number of rows to read from the top of the file,
            or None to read the whole file. Defaults to 1000.

    Returns:
        A pandas DataFrame with at most ``max_rows`` rows, or None when the
        file does not exist (a message is printed in that case).
    """
    try:
        # nrows stops parsing after max_rows instead of reading the entire
        # file and slicing afterwards.
        movies = pd.read_csv(path, nrows=max_rows)
    except FileNotFoundError:
        print(f"Dataset not found. Ensure the file '{path}' is in the current directory.")
        return None

    print("Dataset loaded successfully.")
    return movies

In [4]:
def preprocess_data(movies):
    """Clean the movie dataset for genre-based comparison.

    Rows missing a title or genres are dropped, the '|' separators in the
    ``genres`` column are replaced by spaces, and genres are lowercased.
    The index is reset to 0..n-1 so that row labels and row positions agree
    after rows have been dropped.

    Args:
        movies: DataFrame with at least 'title' and 'genres' columns.

    Returns:
        A new, cleaned DataFrame; the input frame is left unmodified.
    """
    cleaned = movies.dropna(subset=['title', 'genres']).reset_index(drop=True)

    # regex=False: '|' is the regex alternation operator, so request a literal
    # match explicitly rather than relying on the pandas version's default.
    genres = cleaned['genres'].str.replace('|', ' ', regex=False).str.lower()

    # assign() builds a new frame, avoiding SettingWithCopyWarning from
    # writing a column into the result of dropna().
    return cleaned.assign(genres=genres)

In [5]:
def build_recommendation_system(movies):
    """Build a content-based recommender from movie genres.

    Genres are vectorized with TF-IDF and compared pairwise with cosine
    similarity.

    Args:
        movies: DataFrame with 'title' and 'genres' columns, where 'genres'
            holds whitespace-separated genre tokens.

    Returns:
        A tuple ``(cosine_sim, indices)``:
            cosine_sim: square matrix of pairwise similarities, one row and
                one column per row of ``movies``, in the same order.
            indices: Series mapping each title to the row position of its
                first occurrence in ``movies``.
    """
    # Convert genres to feature vectors using TF-IDF
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['genres'])

    # Compute cosine similarity between all movies
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Map titles to row *positions* rather than index labels: the similarity
    # matrix is positional, so labels would point at the wrong rows whenever
    # the frame's index is not exactly 0..n-1.
    indices = pd.Series(range(len(movies)), index=movies['title'])

    # Keep one entry per title. Series.drop_duplicates() would compare the
    # values (already unique), leaving duplicate titles in place.
    indices = indices[~indices.index.duplicated(keep='first')]

    return cosine_sim, indices


In [6]:
def get_recommendations(title, cosine_sim, indices, movies, top_n=10):
    """Return the titles most similar to the given movie.

    Args:
        title: Exact title to look up in ``indices``.
        cosine_sim: Square similarity matrix indexed by row position.
        indices: Series mapping titles to row positions in ``cosine_sim``.
        movies: DataFrame whose 'title' column is in the same row order as
            ``cosine_sim``.
        top_n: Maximum number of recommendations to return. Defaults to 10.

    Returns:
        A list of up to ``top_n`` titles ordered from most to least similar,
        never including the queried row itself. An empty list is returned
        (and a message printed) when the title is unknown.
    """
    try:
        idx = indices[title]
    except KeyError:
        print(f"Movie '{title}' not found in the dataset.")
        return []

    # A title that occurs more than once yields a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the queried movie by position. Dropping "the first sorted item"
    # is wrong when other movies tie with it at the top score, because an
    # earlier row with an identical score would sort ahead of it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Stable sort: movies with equal scores keep their dataset order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:top_n]]
    return movies['title'].iloc[movie_indices].tolist()


In [7]:
def main():
    """Run the interactive demo.

    Loads and preprocesses the dataset, builds the similarity model, prompts
    for a movie title, and prints up to ten recommendations. The title is
    matched case-insensitively. Returns early (after printing a message)
    when the dataset cannot be loaded or the title is not present.
    """
    movies = load_data()
    if movies is None:
        return

    # Preprocess the data
    movies = preprocess_data(movies)

    # Build recommendation system
    cosine_sim, indices = build_recommendation_system(movies)

    # Test the recommendation system
    query = input("Enter a movie title to get recommendations: ").strip()

    # Resolve the query to the title exactly as stored in the dataset.
    # Rebuilding the casing with str.title() does not work for titles such as
    # "James and the Giant Peach (1996)" or "NeverEnding Story III, The (1994)".
    stored_titles = {}
    for stored in movies['title']:
        stored_titles.setdefault(stored.casefold(), stored)

    matched_title = stored_titles.get(query.casefold())
    if matched_title is None:
        print(f"Movie '{query}' not found in the dataset.")
        return

    recommendations = get_recommendations(matched_title, cosine_sim, indices, movies)

    if recommendations:
        print("Top 10 movie recommendations:")
        for i, rec in enumerate(recommendations, 1):
            print(f"{i}. {rec}")
    else:
        print("No recommendations found.")

# Entry point: run the interactive demo only when this code executes as the
# main module (not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

Dataset loaded successfully.


Enter a movie title to get recommendations:  Toy Story (1995)


Top 10 movie recommendations:
1. Pagemaster, The (1994)
2. James and the Giant Peach (1996)
3. Balto (1995)
4. Space Jam (1996)
5. Kids of the Round Table (1995)
6. All Dogs Go to Heaven 2 (1996)
7. Jumanji (1995)
8. Indian in the Cupboard, The (1995)
9. NeverEnding Story III, The (1994)
10. Escape to Witch Mountain (1975)
