In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split  

In [None]:
# Read the user-item matrix written by the preprocessing step.
# The 'user_id' column becomes the row index; the remaining columns are movies.
user_item_matrix = pd.read_csv(
    "C:/Users/Zenab/Desktop/inn assigment/res",
    index_col='user_id',
)

# Sanity check: overall dimensions and the top-left 5x5 corner.
print("User-Item Matrix Shape:", user_item_matrix.shape)
print("Sample (First 5 Users, First 5 Movies):")
print(user_item_matrix.iloc[0:5, 0:5])

User-Item Matrix Shape: (943, 1664)
Sample (First 5 Users, First 5 Movies):
         'Til There Was You (1997)  1-900 (1994)  101 Dalmatians (1996)  \
user_id                                                                   
1                              0.0           0.0                    2.0   
2                              0.0           0.0                    0.0   
3                              0.0           0.0                    0.0   
4                              0.0           0.0                    0.0   
5                              0.0           0.0                    2.0   

         12 Angry Men (1957)  187 (1997)  
user_id                                   
1                        5.0         0.0  
2                        0.0         0.0  
3                        0.0         2.0  
4                        0.0         0.0  
5                        0.0         0.0  


In [None]:
# Raw ratings in long format (one row per rating), as opposed to the
# wide user-item matrix loaded earlier.
ratings = pd.read_csv(
    'C:/Users/Zenab/Desktop/inn assigment/ml-100k',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Surprise needs the rating scale (1-5) to interpret the rating column.
reader = Reader(rating_scale=(1, 5))

# Wrap the (user, item, rating) triples in a Surprise Dataset.
data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']], reader)

# 80/20 train-test split with a fixed seed.
# NOTE: `train_test_split` is imported from both scikit-learn and Surprise in
# this notebook; the Surprise import comes last, so that is the one used here.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# The train set reports its size through `n_ratings`; the test set is sized with len().
print(f"Train set size: {trainset.n_ratings} ratings")
print(f"Test set size: {len(testset)} ratings")

Train set size: 80000 ratings
Test set size: 20000 ratings


In [23]:
# Pairwise cosine similarity between the rows (users) of the user-item matrix,
# labelled with user_id on both axes so it can be queried with .loc.
user_similarity = pd.DataFrame(
    cosine_similarity(user_item_matrix),
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

# Sanity check: the matrix should be square (users x users).
print("User Similarity Matrix Shape:", user_similarity.shape)
print("Sample (First 5 Users):")
print(user_similarity.iloc[0:5, 0:5])

User Similarity Matrix Shape: (943, 943)
Sample (First 5 Users):
user_id         1         2         3         4         5
user_id                                                  
1        1.000000  0.168937  0.048388  0.064561  0.379670
2        0.168937  1.000000  0.113393  0.179694  0.073623
3        0.048388  0.113393  1.000000  0.349781  0.021592
4        0.064561  0.179694  0.349781  1.000000  0.031804
5        0.379670  0.073623  0.021592  0.031804  1.000000


In [None]:
def predict_rating(user_id, movie_title, k=5):
    """
    Predict a user's rating for a movie with user-based collaborative filtering.

    The prediction is the similarity-weighted average of the ratings given to
    `movie_title` by the `k` users most similar to `user_id`, considering only
    users who actually rated the movie (rating > 0) and never the target user.

    Reads the module-level `user_similarity` (user x user DataFrame) and
    `user_item_matrix` (user x movie DataFrame, 0 meaning "not rated").

    Parameters
    ----------
    user_id : label in `user_similarity.index`
        The user to predict for.
    movie_title : label in `user_item_matrix.columns`
        The movie to predict a rating for.
    k : int, default 5
        Maximum number of neighbours to average over.

    Returns
    -------
    float
        The predicted rating, or NaN when no other user rated the movie or
        all candidate neighbours have zero total similarity.

    Raises
    ------
    KeyError
        If `user_id` or `movie_title` is not present in the matrices.
    """
    # Similarity of the target user to every other user
    sim_scores = user_similarity.loc[user_id]

    # Ratings every user gave to the target movie
    movie_ratings = user_item_matrix[movie_title]

    # Candidate neighbours: users who rated the movie, minus the target user.
    # Excluding by label (rather than dropping the first sorted entry) matters
    # when the target user has not rated the movie: the top entry is then a
    # genuine neighbour, not the user's own self-similarity.
    neighbour_ids = movie_ratings.index[movie_ratings > 0]
    neighbour_ids = neighbour_ids[neighbour_ids != user_id]

    # Keep the k most similar candidates
    top_k_users = sim_scores.loc[neighbour_ids].nlargest(k)

    # No usable neighbourhood -> no prediction (avoids a 0/0 division)
    norm = top_k_users.sum()
    if top_k_users.empty or norm == 0:
        return float("nan")

    # Similarity-weighted average of the neighbours' ratings
    weighted_sum = np.dot(
        top_k_users.values,
        movie_ratings.loc[top_k_users.index].values
    )
    return weighted_sum / norm

In [25]:
# Example: predict how User 1 would rate Toy Story using the 5 nearest neighbours
user_id, movie_title = 1, "Toy Story (1995)"
predicted_rating = predict_rating(user_id, movie_title, k=5)

print(f"Predicted rating for User {user_id} on '{movie_title}': {predicted_rating:.2f}")

Predicted rating for User 1 on 'Toy Story (1995)': 4.00


In [None]:
def recommend_movies(user_id, n=5, k=5):
    """
    Recommend the top-N movies a user hasn't rated yet.

    Every movie the user has not rated (entry <= 0 in the module-level
    `user_item_matrix`) is scored with `predict_rating`; movies for which no
    prediction can be made (NaN) are skipped, because NaN values make the
    sort order unreliable.

    Parameters
    ----------
    user_id : label in `user_item_matrix.index`
        The user to recommend for.
    n : int, default 5
        Maximum number of recommendations to return.
    k : int, default 5
        Neighbourhood size forwarded to `predict_rating`.

    Returns
    -------
    list of (movie_title, predicted_rating)
        At most `n` tuples, sorted by predicted rating in descending order.
    """
    # Look the user's row up once; 0 marks "not rated"
    user_ratings = user_item_matrix.loc[user_id]
    unrated_movies = user_ratings.index[~(user_ratings > 0)]

    # Predict ratings for unrated movies, keeping only usable predictions
    predictions = []
    for movie in unrated_movies:
        pred = predict_rating(user_id, movie, k=k)
        if pd.notna(pred):
            predictions.append((movie, pred))

    # Highest predicted rating first
    predictions.sort(key=lambda item: item[1], reverse=True)

    return predictions[:n]

# Example: show the five best-scoring unseen movies for User 1
user_id = 1
recommendations = recommend_movies(user_id, n=5)

print(f"Top 5 Recommendations for User {user_id}:")
for i, (movie, rating) in enumerate(recommendations, start=1):
    print(f"{i}. {movie} (Predicted Rating: {rating:.2f})")

In [None]:
def batch_predict(testset, user_similarity, user_item_matrix, movies, k=5):
    """
    Predict a rating for every (user, item) pair in a test set.

    Each prediction is the similarity-weighted average of the ratings given to
    the movie by the `k` most similar other users who rated it. Fallbacks:

    * item id missing from `movies`                      -> global mean rating
    * user or movie title missing from the matrices      -> global mean rating
    * nobody else rated the movie, or zero similarity    -> the user's mean rating

    Means are computed over observed ratings only (entries > 0), since 0 in
    `user_item_matrix` marks "not rated" rather than a rating of zero.

    NOTE(review): if `user_item_matrix` / `user_similarity` were built from
    data that includes the test ratings, the similarities have already seen
    them - confirm the matrices come from the training split.

    Parameters
    ----------
    testset : iterable of (user_id, item_id, rating)
        Pairs to predict; the rating element is ignored.
    user_similarity : pandas.DataFrame
        User x user similarity matrix indexed by user id on both axes.
    user_item_matrix : pandas.DataFrame
        User x movie-title rating matrix with 0 for unrated entries.
    movies : pandas.DataFrame
        Must contain 'item_id' and 'title' columns (item id -> title mapping).
    k : int, default 5
        Maximum number of neighbours per prediction.

    Returns
    -------
    list
        One predicted rating per element of `testset`, in the same order.
    """
    # Item id -> title lookup (the rating matrix is keyed by title)
    movie_id_to_title = dict(zip(movies['item_id'], movies['title']))

    # Mask unrated entries so they do not drag the fallback means towards 0
    observed = user_item_matrix.where(user_item_matrix > 0)
    n_observed = int(observed.count().sum())
    global_mean = float(observed.sum().sum() / n_observed) if n_observed else 0.0
    # Users without any rating fall back to the global mean
    user_mean_ratings = observed.mean(axis=1).fillna(global_mean)

    predictions = []

    for user, movie, _ in testset:
        # Unknown item id: nothing to look up
        if movie not in movie_id_to_title:
            predictions.append(global_mean)
            continue

        movie_title = movie_id_to_title[movie]

        try:
            sim_scores = user_similarity.loc[user]
            movie_ratings = user_item_matrix[movie_title]

            # Users who rated this movie, excluding the target user
            rated_users = movie_ratings.index[movie_ratings > 0]
            rated_users = rated_users[rated_users != user]

            if len(rated_users) == 0:
                prediction = user_mean_ratings.loc[user]
            else:
                # Top-k most similar raters and their similarity weights
                top_k_sims = sim_scores.loc[rated_users].nlargest(k)
                norm = top_k_sims.sum()

                if norm != 0:
                    weighted_sum = (movie_ratings.loc[top_k_sims.index] * top_k_sims).sum()
                    prediction = weighted_sum / norm
                else:
                    prediction = user_mean_ratings.loc[user]
        except KeyError:
            # User or title absent from the matrices; other errors are real
            # bugs and are allowed to propagate instead of being hidden.
            prediction = global_mean

        predictions.append(prediction)

    return predictions


In [None]:
def precision_recall_at_k(user_id, ratings_df, movies_df, k=5, threshold=3.5):
    """
    Compute Precision@k and Recall@k for one user's recommendations.

    Relevant movies are those the user rated at or above `threshold` in
    `ratings_df`. Recommendations come from `recommend_movies(user_id, n=k)`,
    which returns titles, so the comparison is done at title level: several
    item ids sharing one title count as the same movie.

    NOTE(review): `recommend_movies` only returns movies the user has not
    rated in `user_item_matrix`. If `ratings_df` holds the same ratings that
    matrix was built from, the overlap is empty and both metrics are 0 -
    presumably `ratings_df` should be held-out ratings; confirm.

    Parameters
    ----------
    user_id : user identifier
        The user to evaluate.
    ratings_df : pandas.DataFrame
        Must contain 'user_id', 'item_id' and 'rating' columns.
    movies_df : pandas.DataFrame
        Must contain 'item_id' and 'title' columns.
    k : int, default 5
        Number of recommendations to evaluate.
    threshold : float, default 3.5
        Minimum rating for a movie to count as relevant.

    Returns
    -------
    (precision, recall) : tuple of float
        precision = hits / k (0.0 when k <= 0);
        recall = hits / number of relevant movies (0.0 when there are none).
    """
    # Ground truth: item ids this user rated at or above the threshold
    user_ratings = ratings_df[ratings_df['user_id'] == user_id]
    highly_rated = set(user_ratings.loc[user_ratings['rating'] >= threshold, 'item_id'])

    # Translate ground truth to titles so that every item id of a title counts.
    # Ids without a known title keep their id as key: they can never be
    # recommended, but still count towards the recall denominator.
    id_to_title = dict(zip(movies_df['item_id'], movies_df['title']))
    relevant_titles = {id_to_title.get(item_id, item_id) for item_id in highly_rated}

    # Titles recommended to the user
    recommended_titles = {title for title, _ in recommend_movies(user_id, n=k)}

    # Calculate metrics
    hits = len(relevant_titles & recommended_titles)
    precision = hits / k if k > 0 else 0.0
    recall = hits / len(relevant_titles) if relevant_titles else 0.0

    return precision, recall

# Example usage: load the raw ratings and the item-id -> title table,
# then score the recommender for a single user.

# Tab-separated ratings: one (user, item, rating, timestamp) row per rating
ratings = pd.read_csv(
    'C:/Users/Zenab/Desktop/inn assigment/ml-100k',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Pipe-separated movie table; only the first two columns (id, title) are kept
movies = pd.read_csv(
    'C:/Users/Zenab/Desktop/inn assigment/ml-100k',
    sep='|',
    encoding='latin-1',
    usecols=[0, 1],
    names=['item_id', 'title'],
)

# Evaluate the top-5 recommendations for User 1
precision, recall = precision_recall_at_k(user_id=1, ratings_df=ratings, movies_df=movies, k=5)
print(f"Precision@5: {precision:.2f}, Recall@5: {recall:.2f}")