In [68]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

In [69]:
df = pd.read_csv(r"C:\Users\Rohan\Pictures\rohan\movies.csv")
df.drop(columns=['movieId'],inplace=True)
df.head()

Unnamed: 0,title,genres
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji (1995),Adventure|Children|Fantasy
2,Grumpier Old Men (1995),Comedy|Romance
3,Waiting to Exhale (1995),Comedy|Drama|Romance
4,Father of the Bride Part II (1995),Comedy


In [70]:
df.drop_duplicates(inplace=True)

In [71]:
df.dropna(inplace=True)

In [72]:
df.shape

(62409, 2)

In [73]:
df.isnull().sum()

title     0
genres    0
dtype: int64

In [78]:
df.head()

Unnamed: 0,title,genres
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji (1995),Adventure|Children|Fantasy
2,Grumpier Old Men (1995),Comedy|Romance
3,Waiting to Exhale (1995),Comedy|Drama|Romance
4,Father of the Bride Part II (1995),Comedy


In [79]:
df.shape

(62409, 2)

In [80]:
df =df[:10000]

In [81]:
df['genres'] = df['genres'].apply(lambda x: x.split('|'))

mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genres'])

genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_)

df = pd.concat([df[['title']], genre_df], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['genres'] = df['genres'].apply(lambda x: x.split('|'))


In [82]:
cosine_sim = cosine_similarity(genre_matrix, genre_matrix)

def get_recommendations(title, df, cosine_sim):
    idx = df[df['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    movie_indices = [i[0] for i in sim_scores[1:6]] 
    return df['title'].iloc[movie_indices]

print(get_recommendations('Toy Story (1995)', df, cosine_sim))


2203                                       Antz (1998)
3021                                Toy Story 2 (1999)
3653    Adventures of Rocky and Bullwinkle, The (2000)
3912                  Emperor's New Groove, The (2000)
4780                             Monsters, Inc. (2001)
Name: title, dtype: object


In [90]:
def get_recommendations(title, df, n=5):
    # First, try to find the exact title
    movie_row = df[df['title'] == title]
    
    # If not found, try to find a title that starts with the given title
    if movie_row.empty:
        movie_row = df[df['title'].str.startswith(title)]
    
    # If still not found, raise an error
    if movie_row.empty:
        raise ValueError(f"Movie '{title}' not found in the dataset")
    
    # Get the index of the movie
    idx = movie_row.index[0]
    
    # Get the genres of the movie (all columns except 'title' and 'year')
    movie_genres = df.iloc[idx, 2:]
    
    # Calculate the cosine similarity between this movie and all others
    sim_scores = cosine_similarity(movie_genres.values.reshape(1, -1), df.iloc[:, 2:].values)
    
    # Get the indices of the most similar movies
    sim_scores = list(enumerate(sim_scores[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]  # exclude the movie itself
    
    # Get the indices of the most similar movies
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top n most similar movies
    return df['title'].iloc[movie_indices]

# Example usage
try:
    print(get_recommendations('Toy ', df))
except ValueError as e:
    print(e)


2203                                       Antz (1998)
3021                                Toy Story 2 (1999)
3653    Adventures of Rocky and Bullwinkle, The (2000)
3912                  Emperor's New Groove, The (2000)
4780                             Monsters, Inc. (2001)
Name: title, dtype: object


In [102]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Additional libraries for collaborative filtering (if user ratings are available)
from scipy.sparse.linalg import svds
import numpy as np
import numpy as np
import re

df = pd.read_csv(r"C:\Users\Rohan\Pictures\rohan\movies.csv")
df.drop(columns=['movieId'],inplace=True)
df = df[:10000]

def remove_year(title):
    return re.sub(r'\s*\(\d{4}\)', '', title)

df['title'] = df['title'].apply(remove_year)


df['genres'] = df['genres'].str.split('|').apply(lambda x: ' '.join(x))
# Using TF-IDF Vectorizer to convert genres into a matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genres'])

# Compute the cosine similarity matrix based on TF-IDF
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = df[df['title'] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the 10 most similar movies
    sim_indices = [i[0] for i in sim_scores[1:11]]

    # Return the top 10 most similar movies
    return df['title'].iloc[sim_indices]

# Example Usage
recommended_movies = get_recommendations('Toy Story')
print(recommended_movies)



2203                                                Antz
3021                                         Toy Story 2
3653             Adventures of Rocky and Bullwinkle, The
3912                           Emperor's New Groove, The
4780                                      Monsters, Inc.
9949    DuckTales: The Movie - Treasure of the Lost Lamp
1944                                 Black Cauldron, The
2026                              Lord of the Rings, The
3305                      We're Back! A Dinosaur's Story
4261                           Atlantis: The Lost Empire
Name: title, dtype: object


In [104]:
print(get_recommendations('Antz'))

2203                                                Antz
3021                                         Toy Story 2
3653             Adventures of Rocky and Bullwinkle, The
3912                           Emperor's New Groove, The
4780                                      Monsters, Inc.
9949    DuckTales: The Movie - Treasure of the Lost Lamp
1944                                 Black Cauldron, The
2026                              Lord of the Rings, The
3305                      We're Back! A Dinosaur's Story
4261                           Atlantis: The Lost Empire
Name: title, dtype: object
