In [None]:
pip install numpy pandas scikit-learn




In [None]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os


In [None]:
# Read the movie metadata CSV into a DataFrame.
movies_data = pd.read_csv(filepath_or_buffer="/content/movies.csv")

# Show the leading rows (head() returns five rows by default) as a sanity check.
first_rows = movies_data.head()
print(first_rows)


   index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2         spy based on novel sec

In [None]:
# Count the NaN entries of every column and print the per-column totals.
missing_per_column = movies_data.isna().sum()
print("\nMissing values in each column:")
print(missing_per_column)


Missing values in each column:
index                      0
budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                    0
title                      0
vote_average               0
vote_count                 0
cast                       0
crew                       0
director                   0
dtype: int64


In [None]:
# Load the dataset (relative path: the CSV must be in the working directory).
movies_data = pd.read_csv("movies.csv")

# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(movies_data.head())

# Handle missing values.
# Text-like columns receive a placeholder string so later string handling
# never encounters NaN.
text_placeholders = {
    'homepage': '',
    'overview': 'No Overview',
    'release_date': 'Unknown',
    'keywords': 'no keywords',
    'tagline': 'no tagline',
    'cast': 'Unknown',
    'director': 'Unknown',
}
for column_name, placeholder in text_placeholders.items():
    movies_data[column_name] = movies_data[column_name].fillna(placeholder)

# `runtime` is kept numeric: writing a text placeholder into it would turn the
# whole column into dtype `object` and break any later arithmetic on it.
# Missing values are replaced by the column median instead.
# NOTE(review): median imputation is a modelling choice - confirm it suits the analysis.
runtime_numeric = pd.to_numeric(movies_data['runtime'], errors='coerce')
movies_data['runtime'] = runtime_numeric.fillna(runtime_numeric.median())

# Remove movies where 'genres' column is missing. `.copy()` makes the result an
# independent frame rather than a slice of `movies_data`.
movies_data_cleaned = movies_data[movies_data['genres'].notna()].copy()

print("\nMissing values in column after cleaning:")
print(movies_data_cleaned.isna().sum())

# Display the cleaned dataset preview (the cleaned frame, not the unfiltered one).
print("\nCleaned dataset preview:")
print(movies_data_cleaned.head())



First few rows of the dataset:
   index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2

In [None]:
# Count rows that are exact copies of an earlier row and report the total.
duplicate_row_count = movies_data.duplicated().sum()
print("\nNumber of duplicate rows:")
print(duplicate_row_count)

# Remove any such copies from the frame itself (first occurrence is kept).
movies_data.drop_duplicates(inplace=True)


Number of duplicate rows:
0


In [None]:
# Blank out NaN entries in the text columns so they can be concatenated later.
text_columns = ['genres', 'keywords', 'tagline', 'cast', 'director']
for column in text_columns:
    filled_column = movies_data[column].fillna(value='')
    movies_data[column] = filled_column

In [None]:
# Report the dtype pandas assigned to every column.
column_dtypes = movies_data.dtypes
print("\nData types of each column:")
print(column_dtypes)


Data types of each column:
index                     int64
budget                    int64
genres                   object
homepage                 object
id                        int64
keywords                 object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                   int64
runtime                  object
spoken_languages         object
status                   object
tagline                  object
title                    object
vote_average            float64
vote_count                int64
cast                     object
crew                     object
director                 object
dtype: object


In [None]:
# Print the first five rows of the frame after the cleaning steps above.
preview_rows = movies_data.head(n=5)
print("\nCleaned dataset preview:")
print(preview_rows)


Cleaned dataset preview:
   index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2     

In [None]:
# Columns whose text is used to describe each movie for the recommender.
selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director']

# Report how many NaN entries each of those columns still holds.
missing_in_features = movies_data[selected_features].isna().sum()
print(missing_in_features)

# Make sure none of them contains NaN before the strings are joined.
for feature in selected_features:
    feature_values = movies_data[feature]
    movies_data[feature] = feature_values.fillna(value='')


genres      0
keywords    0
tagline     0
cast        0
director    0
dtype: int64


In [None]:
# Build one space-separated text string per movie by appending the feature
# columns left to right: genres, keywords, tagline, cast, director.
feature_order = ['genres', 'keywords', 'tagline', 'cast', 'director']
combined_features = movies_data[feature_order[0]]
for column_name in feature_order[1:]:
    combined_features = combined_features + ' ' + movies_data[column_name]


In [None]:
# Fit a TF-IDF vectorizer with default settings on the combined text and
# transform that same text into feature vectors.
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(raw_documents=combined_features)

# Report the dimensions of the resulting matrix.
vector_shape = feature_vectors.shape
print(vector_shape)


(4803, 17320)


In [None]:
# Pairwise cosine similarity of every movie's feature vector against every other.
similarity = cosine_similarity(X=feature_vectors)

# Report the dimensions of the similarity matrix.
similarity_shape = similarity.shape
print(similarity_shape)


(4803, 4803)


In [None]:
# Interactive genre recommender: asks for a genre and a "mood", then lists up to
# MAX_TITLES_SHOWN titles from the more popular or the less popular half of the
# movies in that genre.
#
# Assuming movies_data is already loaded with columns like 'title', 'genres', 'popularity', etc.

MAX_TITLES_SHOWN = 15
DEFAULT_GENRE = 'Animation'

# Menu number -> label shown to the user.
genre_menu = {
    '1': 'Action',
    '2': 'Comedy',
    '3': 'Romance',
    '4': 'Horror',
    '5': 'Sci-Fi',
    '6': 'Drama',
    '7': 'Thriller',
    '8': 'Animation',
    '9': 'Adventure'
}

# Menu label -> text to search for in the `genres` column, for labels that are
# spelled differently there. The dataset preview shows "Science Fiction", so a
# literal search for "Sci-Fi" would never match.
genre_search_terms = {'Sci-Fi': 'Science Fiction'}

# Menu number -> text to search for in the `genres` column.
genre_map = {
    number: genre_search_terms.get(label, label)
    for number, label in genre_menu.items()
}


def resolve_genre(raw_choice):
    """Translate the user's answer into the genre text to search for.

    Accepts a menu number ('2'), a menu label ('comedy', 'Sci-Fi') or the
    dataset spelling ('science fiction'), ignoring case and surrounding
    whitespace.

    Returns:
        The matching value of ``genre_map``, or ``None`` when the answer is
        not recognised.
    """
    choice = raw_choice.strip().lower()
    if choice in genre_map:
        return genre_map[choice]
    for number, label in genre_menu.items():
        if choice in (label.lower(), genre_map[number].lower()):
            return genre_map[number]
    return None


def select_recommendations(movies, genre, mood, limit=MAX_TITLES_SHOWN):
    """Pick the titles to display for one genre and mood.

    Args:
        movies: DataFrame with at least 'genres' and 'popularity' columns.
        genre: Text that must occur (case-insensitively) in 'genres'.
        mood: 'less' selects the less popular half; anything else selects
            the more popular half.
        limit: Maximum number of rows to return.

    Returns:
        The first ``limit`` rows of the chosen half, ordered by descending
        popularity; an empty frame when no movie matches the genre.
    """
    # Cast on the fly so the caller's frame is left unmodified; regex=False
    # makes the genre a literal substring rather than a pattern.
    genre_text = movies['genres'].astype(str)
    matches = movies[genre_text.str.contains(genre, case=False, na=False, regex=False)]
    ranked = matches.sort_values(by='popularity', ascending=False)
    if ranked.empty:
        return ranked
    # At least one row, so a genre with a single movie still shows it.
    half = max(1, len(ranked) // 2)
    chosen_half = ranked.tail(half) if mood == 'less' else ranked.head(half)
    return chosen_half.head(limit)


# Display a menu with predefined genres for the user to choose from
print("\nSelect the type of movie you prefer:")
for number, label in genre_menu.items():
    print(f"{number}. {label}")

# Get user's genre choice (a menu number or a genre name)
genre_choice = input("Enter the number corresponding to your preferred genre: ").strip()

# Fall back to the default genre, but say so instead of doing it silently.
genre_preference = resolve_genre(genre_choice)
if genre_preference is None:
    print(f"Unrecognized genre choice '{genre_choice}'. Defaulting to {DEFAULT_GENRE}.")
    genre_preference = DEFAULT_GENRE

# Ask the user about their mood for recommendations
mood_choice = input("\nDo you want highly recommended movies or less recommended movies? (Enter 'highly' or 'less'): ").strip().lower()

# Check if 'genres' column exists in the dataset
if 'genres' not in movies_data.columns:
    print("The 'genres' column is missing from the dataset.")
else:
    # Normalise the mood first so every later branch sees a valid value.
    if mood_choice not in ('highly', 'less'):
        print("Invalid mood choice. Showing the highly recommended movies by default.")
        mood_choice = 'highly'

    movies_to_show = select_recommendations(movies_data, genre_preference, mood_choice)

    if movies_to_show.empty:
        print(f"No movies found for the genre: {genre_preference}")
    else:
        # The header names the genre actually used for filtering, not the raw input.
        mood_label = 'Highly' if mood_choice == 'highly' else 'Less'
        print(f"\nTop {mood_label} Recommended {genre_preference} Movies for You: \n")

        # Display the selected titles, numbered from 1.
        for i, movie in enumerate(movies_to_show['title'], start=1):
            print(f"{i}. {movie}")



Select the type of movie you prefer:
1. Action
2. Comedy
3. Romance
4. Horror
5. Sci-Fi
6. Drama
7. Thriller
8. Animation
9. Adventure
Enter the number corresponding to your preferred genre: comedy

Do you want highly recommended movies or less recommended movies? (Enter 'highly' or 'less'): highly

Top Highly Recommended Comedy Movies for You: 

1. Minions
2. Big Hero 6
3. Frozen
4. Despicable Me 2
5. Inside Out
6. Brave
7. Spirited Away
8. Despicable Me
9. Monsters, Inc.
10. How to Train Your Dragon 2
11. Ice Age
12. Aladdin
13. Up
14. The Lion King
15. The Adventures of Tintin


In [None]:
# Install nbconvert through the shell; the export cell below invokes it as the
# shell command `jupyter nbconvert`.
!pip install nbconvert




In [None]:
# NOTE(review): `drive` and `shutil` are imported but never used in this cell;
# the `google.colab` import presumably only succeeds inside Colab - confirm
# before running elsewhere.
from google.colab import drive
import shutil

# Source notebook and destination HTML file for the export. Both paths are under
# /content; nothing in this cell mounts or writes to Google Drive.
notebook_name = '/content/Untitled12.ipynb'
output_html = '/content/Untitled-12.html'  # Destination of the generated HTML; edit to change it

# Convert the notebook to HTML. IPython substitutes the two Python variables
# into the shell command; they are inserted unquoted, so paths containing
# spaces would be split into separate arguments.
!jupyter nbconvert --to html {notebook_name} --output {output_html}


[NbConvertApp] Converting notebook /content/Untitled12.ipynb to html
[NbConvertApp] Writing 324750 bytes to /content/Untitled-12.html
