In [1]:
# get data from grouplens (movielens data)
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip

--2024-10-08 09:25:38--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2024-10-08 09:25:38 (23.1 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [2]:
!unzip ml-100k.zip

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
# File paths
ratings_path = '/content/ml-100k/u.data'
movies_path = '/content/ml-100k/u.item'

# Load ratings data
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings_df = pd.read_csv(ratings_path, sep='\t', names=ratings_columns)


# Remove the 'timestamp' column as it's not used
ratings_df = ratings_df.drop(columns=['timestamp'])

# Load movies data
movies_columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date',
                  'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
                  'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
                  'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                  'Thriller', 'War', 'Western']
movies_df = pd.read_csv(movies_path, sep='|', names=movies_columns, encoding='latin-1')


In [6]:
movies_df.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [8]:
# Clean movies data
movies_df = movies_df.drop(['release_date', 'video_release_date', 'IMDb_URL', 'unknown'], axis=1)
movies_df.set_index('movie_id', inplace=True)

In [9]:

# Handle missing values in genre columns
movies_df.iloc[:, 1:] = movies_df.iloc[:, 1:].fillna(0)

In [10]:
# Compute similarity matrix
genre_sim_matrix = cosine_similarity(movies_df.iloc[:, 1:])
genre_sim_df = pd.DataFrame(genre_sim_matrix, index=movies_df.index, columns=movies_df.index)

In [12]:
def recommend_movies(user_id, top_n=5):
    # Get user's ratings
    user_ratings = ratings_df[ratings_df['user_id'] == user_id].dropna(subset=['rating'])

    # Get movies rated 4 or higher by the user
    user_favorites = user_ratings[user_ratings['rating'] >= 4]

    # Dictionary to store movie recommendations with their scores
    movie_scores = {}

    # For each favorite movie
    for _, favorite in user_favorites.iterrows():  # Fixed: removed asterisk
        movie_id = favorite['movie_id']
        user_rating = favorite['rating']

        # Get similar movies
        similar_movies = genre_sim_df[movie_id].drop(movie_id).nlargest(top_n)


        # Add weighted scores to dictionary
        # It weighs recommendations based on the user's ratings
        for similar_movie_id, similarity in similar_movies.items():
            weighted_score = similarity * (user_rating / 5.0)
            movie_scores[similar_movie_id] = movie_scores.get(similar_movie_id, 0) + weighted_score

    # Remove movies the user has already watched
    watched_movies = set(user_ratings['movie_id'].values)
    movie_scores = {k: v for k, v in movie_scores.items() if k not in watched_movies}

    # Sort and get top recommendations
    recommended_movies = sorted(movie_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

    # Get movie titles
    recommended_titles = []
    for movie_id, _ in recommended_movies:
        # Handle missing movie titles with a fallback to "Unknown Movie"
        title = movies_df.loc[movie_id, 'movie_title'] if pd.notnull(movies_df.loc[movie_id, 'movie_title']) else "Unknown Movie"
        recommended_titles.append(title)

    return recommended_titles


In [13]:

# Example usage
try:
    user_id = 1  # Example user
    recommendations = recommend_movies(user_id, top_n=10)

    print(f"Recommended Movies for User {user_id}:")
    for i, title in enumerate(recommendations, 1):
        print(f"{i}. {title}")

except Exception as e:
    print(f"An error occurred: {str(e)}")

Recommended Movies for User 1:
1. As Good As It Gets (1997)
2. Deconstructing Harry (1997)
3. Apt Pupil (1998)
4. Crash (1996)
5. Mary Reilly (1996)
6. Kalifornia (1993)
7. Judge Dredd (1995)
8. Star Trek: Generations (1994)
9. Wag the Dog (1997)
10. Adventures of Priscilla, Queen of the Desert, The (1994)
