In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-100k-dataset/ml-100k/u.occupation
/kaggle/input/movielens-100k-dataset/ml-100k/u1.base
/kaggle/input/movielens-100k-dataset/ml-100k/u.info
/kaggle/input/movielens-100k-dataset/ml-100k/u4.test
/kaggle/input/movielens-100k-dataset/ml-100k/u.item
/kaggle/input/movielens-100k-dataset/ml-100k/README
/kaggle/input/movielens-100k-dataset/ml-100k/u1.test
/kaggle/input/movielens-100k-dataset/ml-100k/ua.test
/kaggle/input/movielens-100k-dataset/ml-100k/u.data
/kaggle/input/movielens-100k-dataset/ml-100k/u5.test
/kaggle/input/movielens-100k-dataset/ml-100k/mku.sh
/kaggle/input/movielens-100k-dataset/ml-100k/u5.base
/kaggle/input/movielens-100k-dataset/ml-100k/u.user
/kaggle/input/movielens-100k-dataset/ml-100k/ub.base
/kaggle/input/movielens-100k-dataset/ml-100k/u4.base
/kaggle/input/movielens-100k-dataset/ml-100k/u2.test
/kaggle/input/movielens-100k-dataset/ml-100k/ua.base
/kaggle/input/movielens-100k-dataset/ml-100k/u3.test
/kaggle/input/movielens-100k-dataset/ml-100k/u.

In [2]:
import pandas as pd

# Load ratings
ratings = pd.read_csv('/kaggle/input/movielens-100k-dataset/ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])

# Load movie titles
movies = pd.read_csv('/kaggle/input/movielens-100k-dataset/ml-100k/u.item', sep='|', encoding='latin-1', header=None,
                     names=['item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL'] + [f'genre_{i}' for i in range(19)],
                     usecols=[0, 1])

In [3]:
user_item_matrix = ratings.pivot(index='user_id', columns='item_id', values='rating').fillna(0)

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Compute cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

In [6]:
def get_recommendations(user_id, user_item_matrix, user_similarity_df, n_recommendations=5):
    # Get the user's ratings
    user_ratings = user_item_matrix.loc[user_id]
    
    # Get similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)
    
    # Get weighted ratings from similar users
    weighted_ratings = np.zeros(user_item_matrix.shape[1])
    for similar_user, similarity_score in similar_users.items():  # Use items() instead of iteritems()
        if similar_user != user_id:  # Exclude the user themselves
            weighted_ratings += user_item_matrix.loc[similar_user] * similarity_score
    
    # Normalize the weighted ratings
    weighted_ratings /= similar_users.sum()
    
    # Get unseen movies
    unseen_movies = user_ratings[user_ratings == 0]
    
    # Get top N recommendations
    recommendations = pd.Series(weighted_ratings[unseen_movies.index]).sort_values(ascending=False)
    return recommendations.head(n_recommendations)

# Example usage
recommended_movies = get_recommendations(user_id=1, user_item_matrix=user_item_matrix, user_similarity_df=user_similarity_df)
print(recommended_movies)

item_id
318    2.035432
423    1.871222
357    1.792122
286    1.742431
288    1.696068
Name: 916, dtype: float64


In [7]:
def precision_at_k(recommended_movies, actual_movies, k):
    recommended_set = set(recommended_movies.index[:k])
    actual_set = set(actual_movies)
    
    # Calculate precision
    true_positives = len(recommended_set.intersection(actual_set))
    precision = true_positives / k
    return precision

# Example usage
actual_movies = [2, 3, 4]  # Movies that the user actually liked
precision = precision_at_k(recommended_movies, actual_movies, k=5)
print(f'Precision at 5: {precision}')

Precision at 5: 0.0


In [9]:
def get_item_based_recommendations(item_id, user_item_matrix, item_similarity_df, n_recommendations=5):
    # Get similar items
    similar_items = item_similarity_df[item_id].sort_values(ascending=False)
    
    # Get user's ratings for similar items
    user_ratings = user_item_matrix.loc[:, item_id]
    
    # Get weighted ratings from similar items
    weighted_ratings = np.zeros(user_item_matrix.shape[0])
    for similar_item, similarity_score in similar_items.items():  # Use items() instead of iteritems()
        if similar_item != item_id:  # Exclude the item itself
            weighted_ratings += user_item_matrix.loc[:, similar_item] * similarity_score
    
    # Normalize the weighted ratings
    weighted_ratings /= similar_items.sum()
    
    # Get top N recommendations
    recommendations = pd.Series(weighted_ratings).sort_values(ascending=False)
    return recommendations.head(n_recommendations)

# Example usage for item-based recommendations
item_recommendations = get_item_based_recommendations(item_id=1, user_item_matrix=user_item_matrix, item_similarity_df=item_similarity_df)
print(item_recommendations)

user_id
450    2.147032
416    2.073900
276    1.989963
13     1.886143
303    1.857540
Name: 50, dtype: float64


In [10]:
# Get actual liked movies (rated 4 or 5 stars) for a user
user_id = 1  # User to evaluate
actual_liked = ratings[
    (ratings['user_id'] == user_id) & 
    (ratings['rating'] >= 4)
]['item_id'].tolist()

print(f"User {user_id} liked these movies (rated ≥4):", actual_liked)


User 1 liked these movies (rated ≥4): [61, 33, 160, 20, 202, 171, 265, 47, 222, 253, 113, 227, 90, 64, 228, 121, 114, 132, 134, 98, 186, 221, 84, 60, 177, 174, 82, 56, 80, 229, 235, 6, 206, 76, 72, 185, 96, 258, 81, 212, 151, 51, 175, 107, 209, 108, 12, 14, 44, 163, 210, 184, 157, 150, 183, 248, 208, 128, 242, 193, 236, 250, 91, 129, 241, 267, 86, 196, 39, 230, 23, 224, 65, 190, 100, 154, 214, 161, 170, 9, 246, 22, 187, 135, 68, 146, 176, 166, 89, 249, 269, 32, 270, 133, 239, 194, 256, 93, 234, 1, 197, 173, 75, 268, 144, 119, 181, 257, 109, 182, 223, 46, 169, 162, 66, 77, 199, 57, 50, 192, 178, 87, 238, 156, 106, 115, 137, 127, 16, 79, 45, 48, 25, 251, 195, 168, 123, 191, 203, 55, 42, 7, 43, 165, 198, 124, 95, 58, 216, 204, 3, 207, 19, 18, 59, 15, 111, 52, 88, 13, 28, 172, 152]


In [11]:
recommendations = get_recommendations(
    user_id=user_id,
    user_item_matrix=user_item_matrix,
    user_similarity_df=user_similarity_df,
    n_recommendations=10
)

print("Top recommendations:", recommendations.index.tolist())

Top recommendations: [318, 423, 357, 286, 288, 483, 405, 294, 496, 655]
