<a href="https://colab.research.google.com/github/sohaniiiiii/CODSOFT/blob/main/recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd  # For handling datasets
import numpy as np  # For numerical computations
from sklearn.feature_extraction.text import TfidfVectorizer  # For text processing (genres)
from sklearn.metrics.pairwise import cosine_similarity  # For similarity calculations
import requests  # For downloading datasets
import zipfile  # For extracting files
import os  # For file handling

In [25]:
# URL for MovieLens dataset (small version)
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
dataset_path = "ml-latest-small.zip"

# Check if dataset exists, if not, download it
if not os.path.exists("ml-latest-small"):
    print("Downloading MovieLens dataset...")
    response = requests.get(url)

    with open(dataset_path, "wb") as file:
        file.write(response.content)

    # Extract the zip file
    with zipfile.ZipFile(dataset_path, "r") as zip_ref:
        zip_ref.extractall(".")
    print("Download complete.")

# Load movie details
movies = pd.read_csv("ml-latest-small/movies.csv")
ratings = pd.read_csv("ml-latest-small/ratings.csv")

# Display first 5 rows of each dataset
print("Movies Dataset:")
print(movies.head())

print("\nRatings Dataset:")
print(ratings.head())

Movies Dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings Dataset:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [26]:
# Step 1: Split genres into a better format
movies['genres'] = movies['genres'].str.replace('|', ' ')  # Replace '|' with space

# Step 2: Merge movies and ratings datasets
movie_ratings = pd.merge(ratings, movies, on="movieId")

# Step 3: Check for missing values
print("Missing Values in Dataset:")
print(movie_ratings.isnull().sum())

# Step 4: Display processed dataset
print("\nProcessed Movies Data:")
print(movie_ratings.head())

Missing Values in Dataset:
userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64

Processed Movies Data:
   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure Animation Children Comedy Fantasy  
1                               Comedy Romance  
2                        Action Crime Thriller  
3                             Mystery Thriller  
4                       Crime Mystery Thriller  


In [27]:
# Step 1: Convert genres to lowercase text
movies['genres'] = movies['genres'].fillna('')  # Fill NaN values with empty string

# Step 2: Apply TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Step 3: Compute Cosine Similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("Genre-based similarity matrix created!")
# content_sim_df = pd.DataFrame(cosine_sim_content, index=movies['title'], columns=movies['title'])

Genre-based similarity matrix created!


In [28]:
def recommend_movies(title, num_recommendations=5):
    # Step 1: Get movie index by title
    indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

    if title not in indices:
        return "Movie not found! Please check the title."

    idx = indices[title]  # Get index of the given movie

    # Step 2: Get similarity scores of all movies with this movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Step 3: Sort movies based on similarity scores (descending order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Step 4: Get top 'num_recommendations' similar movies
    sim_scores = sim_scores[1:num_recommendations + 1]

    # Step 5: Get movie indices
    movie_indices = [i[0] for i in sim_scores]

    return movies['title'].iloc[movie_indices]

# Test the recommendation system
print("Recommended movies similar to 'Toy Story (1995)':")
print(recommend_movies("Toy Story (1995)"))

Recommended movies similar to 'Toy Story (1995)':
1706                                       Antz (1998)
2355                                Toy Story 2 (1999)
2809    Adventures of Rocky and Bullwinkle, The (2000)
3000                  Emperor's New Groove, The (2000)
3568                             Monsters, Inc. (2001)
Name: title, dtype: object


In [29]:
# # Create a pivot table where rows = movies, columns = users, values = ratings
# movie_pivot = movie_ratings.pivot_table(index='title', columns='userId', values='rating')

# # Fill NaN values with 0 (unrated movies)
# movie_pivot = movie_pivot.fillna(0)

# print("User-Movie Rating Matrix Created!")
# print(movie_pivot.head())



# Create a pivot table where rows = movies, columns = users, values = ratings
movie_pivot = movie_ratings.pivot_table(index='title', columns='userId', values='rating')

# Fill NaN values with the average rating of each movie
movie_pivot = movie_pivot.apply(lambda row: row.fillna(row.mean()), axis=1)

print("User-Movie Rating Matrix Created!")
print(movie_pivot.head())

User-Movie Rating Matrix Created!
userId                                   1    2    3    4    5    6    7    \
title                                                                        
'71 (2014)                               4.0  4.0  4.0  4.0  4.0  4.0  4.0   
'Hellboy': The Seeds of Creation (2004)  4.0  4.0  4.0  4.0  4.0  4.0  4.0   
'Round Midnight (1986)                   3.5  3.5  3.5  3.5  3.5  3.5  3.5   
'Salem's Lot (2004)                      5.0  5.0  5.0  5.0  5.0  5.0  5.0   
'Til There Was You (1997)                4.0  4.0  4.0  4.0  4.0  4.0  4.0   

userId                                   8    9    10   ...  601  602  603  \
title                                                   ...                  
'71 (2014)                               4.0  4.0  4.0  ...  4.0  4.0  4.0   
'Hellboy': The Seeds of Creation (2004)  4.0  4.0  4.0  ...  4.0  4.0  4.0   
'Round Midnight (1986)                   3.5  3.5  3.5  ...  3.5  3.5  3.5   
'Salem's Lot (2004)          

In [30]:
# from sklearn.metrics.pairwise import cosine_similarity

# # Compute similarity between movies based on user ratings
# movie_similarity = cosine_similarity(movie_pivot)

# # Convert similarity into DataFrame
# movie_sim_df = pd.DataFrame(movie_similarity, index=movie_pivot.index, columns=movie_pivot.index)

# print("Movie similarity matrix created!")
# print(movie_sim_df.head())  # Display the first few rows of the similarity matrix



from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Create a pivot table where rows = movies, columns = users, values = ratings
movie_pivot = movie_ratings.pivot_table(index='title', columns='userId', values='rating')

# Fill NaN values with the average rating of each movie
movie_pivot = movie_pivot.apply(lambda row: row.fillna(row.mean()), axis=1)

# Filter movies that have a reasonable number of ratings (e.g., at least 10 ratings)
movie_pivot_filtered = movie_pivot.loc[movie_pivot.count(axis=1) > 10]

# Compute similarity between movies based on user ratings
movie_similarity = cosine_similarity(movie_pivot_filtered)

# Convert similarity into DataFrame
movie_sim_df = pd.DataFrame(movie_similarity, index=movie_pivot_filtered.index, columns=movie_pivot_filtered.index)

print("Movie similarity matrix created!")
print(movie_sim_df.head())  # Display the first few rows of the similarity matrix

Movie similarity matrix created!
title                                    '71 (2014)  \
title                                                 
'71 (2014)                                 1.000000   
'Hellboy': The Seeds of Creation (2004)    1.000000   
'Round Midnight (1986)                     1.000000   
'Salem's Lot (2004)                        1.000000   
'Til There Was You (1997)                  0.999898   

title                                    'Hellboy': The Seeds of Creation (2004)  \
title                                                                              
'71 (2014)                                                              1.000000   
'Hellboy': The Seeds of Creation (2004)                                 1.000000   
'Round Midnight (1986)                                                  1.000000   
'Salem's Lot (2004)                                                     1.000000   
'Til There Was You (1997)                                               0.999

In [31]:
def recommend_by_ratings(movie_title, num_recommendations=5):
    if movie_title not in movie_sim_df.index:
        return "Movie not found! Please check the title."

    # Get similarity scores for the movie
    similar_scores = movie_sim_df[movie_title].sort_values(ascending=False)

    # Get top recommended movies (excluding the input movie itself)
    recommended_movies = similar_scores.iloc[1:num_recommendations + 1].index

    return recommended_movies

# Test the recommendation system
print("Recommended movies based on 'Toy Story (1995)':")
print(recommend_by_ratings("Toy Story (1995)"))

Recommended movies based on 'Toy Story (1995)':
Index(['400 Blows, The (Les quatre cents coups) (1959)',
       'Station Agent, The (2003)', 'The Nice Guys (2016)',
       'Stranger Than Paradise (1984)', 'Untitled Spider-Man Reboot (2017)'],
      dtype='object', name='title')
