In [None]:
# get data from grouplens (movielens data)
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip

In [None]:
!unzip ml-100k.zip

In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Define file paths
ratings_file = '/content/ml-100k/u.data'
movies_file = '/content/ml-100k/u.item'

# Load ratings data
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
movies_ratings_df = pd.read_csv(ratings_file, sep='\t', names=ratings_columns)

# Load movies data
movies_columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown',
                  'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama',
                  'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
                  'War', 'Western']
movies_df = pd.read_csv(movies_file, sep='|', names=movies_columns, encoding='latin-1')


In [12]:
movies_ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [15]:
# Merge ratings and movie data
movies_ratings_df = pd.merge(movies_ratings_df, movies_df[['movie_id', 'movie_title']], on='movie_id')
movies_ratings_df.drop(['timestamp'], axis=1, inplace=True)
movies_ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,movie_title
0,196,242,3,Kolya (1996)
1,186,302,3,L.A. Confidential (1997)
2,22,377,1,Heavyweights (1994)
3,244,51,2,Legends of the Fall (1994)
4,166,346,1,Jackie Brown (1997)


In [17]:
# Create user-item matrix for the filtered popular movies
user_item_matrix = movies_ratings_df.pivot_table(index='user_id', columns='movie_title', values='rating')

user_item_matrix.head()

movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [22]:
# Fill missing ratings with 0
user_item_matrix.fillna(0, inplace=True)

# Compute user similarity matrix using cosine similarity (user-user matrix)
similarity_matrix = cosine_similarity(user_item_matrix)

# Convert to DataFrame for easier handling
similarity_df = pd.DataFrame(similarity_matrix, index=user_item_matrix.index, columns=user_item_matrix.index)

similarity_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.168937,0.048388,0.064561,0.37967,0.429682,0.443097,0.320079,0.078385,0.377733,...,0.372213,0.11986,0.26986,0.193343,0.197949,0.118722,0.315064,0.149086,0.181612,0.399432
2,0.168937,1.0,0.113393,0.179694,0.073623,0.242106,0.108604,0.104257,0.16247,0.161273,...,0.147095,0.310661,0.363328,0.410725,0.322713,0.231096,0.228793,0.162911,0.175273,0.106732
3,0.048388,0.113393,1.0,0.349781,0.021592,0.074018,0.067423,0.084419,0.062039,0.066217,...,0.033885,0.043453,0.16714,0.071288,0.126278,0.026758,0.164539,0.102899,0.136757,0.02699
4,0.064561,0.179694,0.349781,1.0,0.031804,0.068431,0.091507,0.18806,0.101284,0.060859,...,0.054615,0.036784,0.133619,0.196561,0.146058,0.030202,0.196858,0.152041,0.171538,0.058752
5,0.37967,0.073623,0.021592,0.031804,1.0,0.238636,0.374733,0.24893,0.056847,0.201427,...,0.340183,0.08058,0.095284,0.081053,0.148607,0.071612,0.239955,0.139595,0.153799,0.313941


In [25]:
# Function to recommend movies for a user (only from popular movies)
def recommend_movies_colab_user(user_id, n_recommendations=5, similarity_threshold=0.5):
    # Get the list of similar users
    similar_users = similarity_df[user_id].sort_values(ascending=False)
    similar_users = similar_users[similar_users > similarity_threshold]

    # Get watched movies for the user
    user_movies = user_item_matrix.loc[user_id]
    watched_movies = user_movies[user_movies > 0].index.tolist()

    # Get movies watched by similar users
    similar_user_ratings = user_item_matrix.loc[similar_users.index]
    similar_user_ratings = similar_user_ratings.mean().sort_values(ascending=False)

    # Recommend movies not yet watched by the user
    recommendations = [movie for movie in similar_user_ratings.index if movie not in watched_movies]

    return recommendations[:n_recommendations]


In [42]:
# Example: Recommend movies for a user (example for user with ID 5)
collaborative_recommendations_user = recommend_movies_colab_user(user_id=1, n_recommendations=5)

collaborative_recommendations_user

["Schindler's List (1993)",
 "One Flew Over the Cuckoo's Nest (1975)",
 'E.T. the Extra-Terrestrial (1982)',
 'Casablanca (1942)',
 "It's a Wonderful Life (1946)"]

In [28]:
# Get the titles of the movies User 10 has watched
watched_movies_user_1 = user_item_matrix.loc[1]
watched_movies_user_1 = watched_movies_user_1[watched_movies_user_1 > 0].index.tolist()

# Display the watched movies
print("Movies watched by User 1: ", watched_movies_user_1)

Movies watched by User 1:  ['101 Dalmatians (1996)', '12 Angry Men (1957)', '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)', 'Abyss, The (1989)', 'Ace Ventura: Pet Detective (1994)', 'Air Bud (1997)', 'Akira (1988)', 'Aladdin (1992)', 'Alien (1979)', 'Aliens (1986)', 'All Dogs Go to Heaven 2 (1996)', 'Amadeus (1984)', 'Angels and Insects (1995)', "Antonia's Line (1995)", 'Apocalypse Now (1979)', 'Apollo 13 (1995)', 'Aristocats, The (1970)', 'Army of Darkness (1993)', 'Austin Powers: International Man of Mystery (1997)', 'Babe (1995)', 'Back to the Future (1985)', 'Bad Boys (1995)', 'Basic Instinct (1992)', 'Batman & Robin (1997)', 'Batman Forever (1995)', 'Batman Returns (1992)', 'Beavis and Butt-head Do America (1996)', 'Bedknobs and Broomsticks (1971)', 'Belle de jour (1967)', 'Big Night (1996)', 'Billy Madison (1995)', 'Birdcage, The (1996)', 'Blade Runner (1982)', 'Blues Brothers, The (1980)', 'Bound (1996)', "Bram Stoker's Dracula (1992)", 'Braveheart (1995)'

In [35]:
# Create an item-user matrix (transpose of user-item matrix)
item_user_matrix = user_item_matrix.T

# Calculate item-item similarity using cosine similarity
item_similarity_matrix = cosine_similarity(item_user_matrix)

# Convert to DataFrame for easier handling
item_similarity_df = pd.DataFrame(item_similarity_matrix, index=item_user_matrix.index, columns=item_user_matrix.index)

item_similarity_df.head()


movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),1.0,0.0,0.024561,0.099561,0.185236,0.159265,0.0,0.052203,0.0,0.033326,...,0.0,0.0,0.0,0.027774,0.11884,0.142315,0.02907,0.0,0.110208,0.0
1-900 (1994),0.0,1.0,0.014139,0.009294,0.007354,0.004702,0.010055,0.067038,0.0,0.0,...,0.152499,0.015484,0.0,0.069284,0.018243,0.023408,0.006694,0.07964,0.042295,0.0
101 Dalmatians (1996),0.024561,0.014139,1.0,0.167006,0.061105,0.143878,0.203781,0.225803,0.027642,0.092337,...,0.0,0.021965,0.030905,0.274877,0.204267,0.101199,0.056976,0.172155,0.045714,0.0
12 Angry Men (1957),0.099561,0.009294,0.167006,1.0,0.056822,0.167235,0.304078,0.422506,0.072682,0.394854,...,0.060946,0.016502,0.0,0.40327,0.259436,0.145519,0.105226,0.038901,0.060101,0.081261
187 (1997),0.185236,0.007354,0.061105,0.056822,1.0,0.132327,0.042928,0.06506,0.043133,0.0273,...,0.0,0.141997,0.0,0.068257,0.067786,0.091293,0.09949,0.025184,0.142667,0.096449


In [38]:
# Function to recommend items based on item similarity
def recommend_movies_colab_item(user_id, n_recommendations=5):
    # Get the movies watched by the user
    user_ratings = user_item_matrix.loc[user_id]
    watched_movies = user_ratings[user_ratings > 0].index.tolist()

    # Create an empty series to store similarity scores
    movie_scores = pd.Series(dtype=float)

    # Loop through watched movies to calculate similarity scores for other movies
    for movie in watched_movies:
        similar_movies = item_similarity_df[movie]
        movie_scores = movie_scores.add(similar_movies, fill_value=0)

    # Remove already watched movies from recommendations
    movie_scores = movie_scores.drop(labels=watched_movies, errors='ignore')

    # Sort and return top N recommendations
    top_recommendations = movie_scores.sort_values(ascending=False).head(n_recommendations)

    return top_recommendations.index.tolist()


In [39]:
# Recommend movies for user (example for user with ID 1)
collaborative_recommendations_item = recommend_movies_colab_item(1)
collaborative_recommendations_item

['E.T. the Extra-Terrestrial (1982)',
 'Speed (1994)',
 'Batman (1989)',
 'True Lies (1994)',
 'Stand by Me (1986)']

In [40]:
from sklearn.decomposition import TruncatedSVD

# Apply SVD to the user-item matrix
svd = TruncatedSVD(n_components=20)  # Set the number of latent factors
svd_matrix = svd.fit_transform(user_item_matrix)

# Compute the similarity using the SVD-transformed matrix
similarity_matrix = cosine_similarity(svd_matrix)

# Convert to DataFrame for easier handling
similarity_df = pd.DataFrame(similarity_matrix, index=user_item_matrix.index, columns=user_item_matrix.index)

# Rest of the recommendation function remains the same
# Function to recommend movies for a user (only from popular movies)
def recommend_movies_svd(user_id, n_recommendations=5, similarity_threshold=0.5):
    # Get the list of similar users
    similar_users = similarity_df[user_id].sort_values(ascending=False)
    similar_users = similar_users[similar_users > similarity_threshold]

    # Get watched movies for the user
    watched_movies = user_item_matrix.loc[user_id]
    watched_movies = watched_movies[watched_movies > 0].index.tolist()

    # Get movies watched by similar users
    similar_user_ratings = user_item_matrix.loc[similar_users.index]
    similar_user_ratings = similar_user_ratings.mean().sort_values(ascending=False)

    # Recommend movies not yet watched by the user
    recommendations = [movie for movie in similar_user_ratings.index if movie not in watched_movies]

    return recommendations[:n_recommendations]

# Example: Recommend movies for a user (example for user with ID 1)
svd_recommendations = recommend_movies_svd(user_id=1)

svd_recommendations

["Schindler's List (1993)",
 "One Flew Over the Cuckoo's Nest (1975)",
 'E.T. the Extra-Terrestrial (1982)',
 'Casablanca (1942)',
 "It's a Wonderful Life (1946)"]

In [44]:
# ensemble learning
# Combine the results
combined_recommendations = collaborative_recommendations_user + collaborative_recommendations_item + svd_recommendations


["Schindler's List (1993)",
 "One Flew Over the Cuckoo's Nest (1975)",
 'E.T. the Extra-Terrestrial (1982)',
 'Casablanca (1942)',
 "It's a Wonderful Life (1946)",
 'E.T. the Extra-Terrestrial (1982)',
 'Speed (1994)',
 'Batman (1989)',
 'True Lies (1994)',
 'Stand by Me (1986)',
 "Schindler's List (1993)",
 "One Flew Over the Cuckoo's Nest (1975)",
 'E.T. the Extra-Terrestrial (1982)',
 'Casablanca (1942)',
 "It's a Wonderful Life (1946)"]

In [48]:
# Count the frequency of each movie in the combined list
from collections import Counter
recommendation_counts = Counter(combined_recommendations)

# Sort by the most frequent movies (appeared in both recommendations)
ensemble_recommendations = [movie for movie, count in recommendation_counts.most_common()]

# Display the ensemble recommendations
ensemble_recommendations[:5]


['E.T. the Extra-Terrestrial (1982)',
 "Schindler's List (1993)",
 "One Flew Over the Cuckoo's Nest (1975)",
 'Casablanca (1942)',
 "It's a Wonderful Life (1946)"]