In [2]:
import sys
# Print the path of the Python interpreter running this kernel
# (useful for checking which environment packages get installed into).
print(sys.executable)


c:\Users\basne\anaconda3\envs\basnet\python.exe


In [3]:
!{sys.executable} -m pip install scikit-learn


Collecting numpy>=1.24.1 (from scikit-learn)
  Downloading numpy-2.4.1-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Downloading numpy-2.4.1-cp311-cp311-win_amd64.whl (12.6 MB)
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
   -- ------------------------------------- 0.8/12.6 MB 5.6 MB/s eta 0:00:03
   ----- ---------------------------------- 1.8/12.6 MB 5.9 MB/s eta 0:00:02
   --------- ------------------------------ 3.1/12.6 MB 6.0 MB/s eta 0:00:02
   -------------- ------------------------- 4.5/12.6 MB 6.0 MB/s eta 0:00:02
   ------------------ --------------------- 5.8/12.6 MB 6.0 MB/s eta 0:00:02
   ---------------------- ----------------- 7.1/12.6 MB 6.0 MB/s eta 0:00:01
   ------------------------- -------------- 8.1/12.6 MB 6.0 MB/s eta 0:00:01
   ----------------------------- ---------- 9.4/12.6 MB 6.0 MB/s eta 0:00:01
   ---------------------------------- ----- 10.7/12.6 MB 6.0 MB/s eta 0:00:01
   -------------------------------------- - 12.1/12.6 MB 6.0 

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
matplotlib 3.10.1 requires pillow>=8, which is not installed.

[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import numpy as np
import pickle

import warnings
# NOTE(review): this installs a global "ignore" filter, so every warning raised
# after this cell (from any library) is suppressed — consider narrowing it to
# specific categories or modules.
warnings.filterwarnings("ignore")

In [5]:
# All input files live in one directory; change it here instead of in every path.
DATA_DIR = "../data"


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserialising,
    so only use this on files produced by this project's own pipeline.
    """
    with open(path, "rb") as f:
        return pickle.load(f)


# Load filtered ratings
ratings = pd.read_csv(f"{DATA_DIR}/ratings_processed.csv")

# Load sparse user-movie matrix
user_movie_sparse = _load_pickle(f"{DATA_DIR}/user_movie_sparse.pkl")

# Load mappings (used later in the notebook as: internal matrix index -> actual ID)
user_mapping = _load_pickle(f"{DATA_DIR}/user_mapping.pkl")
movie_mapping = _load_pickle(f"{DATA_DIR}/movie_mapping.pkl")

# Load movie metadata
movies = pd.read_csv(f"{DATA_DIR}/movie.csv")

print("Data loaded successfully")


Data loaded successfully


In [6]:
from sklearn.neighbors import NearestNeighbors

# Brute-force cosine nearest-neighbour index over the rows of the user-movie matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = NearestNeighbors(
    n_neighbors=20,     # default neighbour count; individual queries may override it
    metric="cosine",    # cosine distance between rating vectors
    algorithm="brute",  # compare the query row against every other row
    n_jobs=-1,          # use all CPU cores for parallel processing
).fit(user_movie_sparse)

# Example query: the 10 rows closest to internal row 0.
user_index = 0  # internal index
distances, indices = knn.kneighbors(user_movie_sparse[user_index], n_neighbors=10)

# Translate internal row indices back to actual user IDs.
similar_users = [user_mapping[row_idx] for row_idx in indices[0]]
similar_users


[1, 81275, 62235, 110069, 2595, 75328, 34101, 102073, 76156, 118249]

In [7]:
def recommend_movies(user_id, top_n=5, n_neighbors=10, evaluation_mode=False):
    """Recommend movies for ``user_id`` via user-based collaborative filtering.

    Every movie rated by one of the user's nearest neighbours receives the score
    ``sum(similarity * rating)`` over those neighbours; the ``top_n`` highest
    scoring movies are returned.

    Relies on the notebook globals ``knn``, ``user_movie_sparse``,
    ``user_mapping``, ``movie_mapping`` and ``movies``.

    Parameters
    ----------
    user_id : actual user ID (a value of ``user_mapping``).
    top_n : maximum number of movies to return.
    n_neighbors : number of similar users to aggregate over.
    evaluation_mode : when True, movies the user has already rated may also be
        scored and returned; when False they are excluded.

    Returns
    -------
    pandas.DataFrame with columns ``movieId``, ``title``, ``genres``, ordered
    from best to worst score (row labels are those of ``movies``), or the string
    ``"User not found"`` / ``"No recommendations available"``.
    """
    # 1. Map actual userId to internal index
    user_index_map = {uid: idx for idx, uid in user_mapping.items()}
    if user_id not in user_index_map:
        return "User not found"
    u_idx = user_index_map[user_id]
    user_row = user_movie_sparse[u_idx]

    # 2. Find similar users. One extra neighbour is requested because the query
    #    user is itself part of the fitted data; the request is clamped so it
    #    never exceeds the number of rows in the matrix.
    k = min(n_neighbors + 1, user_movie_sparse.shape[0])
    distances, indices = knn.kneighbors(user_row, n_neighbors=k)

    # Drop the query user by index rather than by position: with tied distances
    # (e.g. proportional rating vectors) it is not guaranteed to come first.
    not_self = indices[0] != u_idx
    similar_user_indices = indices[0][not_self][:n_neighbors]
    similarity_scores = (1 - distances[0])[not_self][:n_neighbors]

    # 3. Get user's existing interactions
    rated_movies = set(user_row.indices)

    # 4. Aggregate similarity-weighted ratings from neighbors
    scores = {}
    for neighbor_idx, sim_score in zip(similar_user_indices, similarity_scores):
        neighbor_row = user_movie_sparse[neighbor_idx]
        for movie_idx, rating in zip(neighbor_row.indices, neighbor_row.data):
            # Already rated movies are allowed ONLY during evaluation
            if evaluation_mode or (movie_idx not in rated_movies):
                scores[movie_idx] = scores.get(movie_idx, 0) + (sim_score * rating)

    if not scores:
        return "No recommendations available"

    # 5. Get Top N movie IDs (best first)
    top_movies_idx = sorted(scores, key=scores.get, reverse=True)[:top_n]
    actual_movie_ids = [movie_mapping[i] for i in top_movies_idx]

    # 6. Look up metadata only for the selected IDs, then restore the ranking:
    #    the isin() filter alone would return rows in `movies` order.
    rank = {movie_id: position for position, movie_id in enumerate(actual_movie_ids)}
    recs = movies.loc[movies['movieId'].isin(actual_movie_ids), ['movieId', 'title', 'genres']]
    return recs.sort_values('movieId', key=lambda col: col.map(rank), kind='stable')

In [8]:
# Example: up to 5 recommendations for userId 1, aggregated over 10 nearest neighbours.
recommend_movies(1, top_n=5, n_neighbors=10)

Unnamed: 0,movieId,title,genres
108,110,Braveheart (1995),Action|Drama|War
843,858,"Godfather, The (1972)",Crime|Drama
1184,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
2486,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller
3487,3578,Gladiator (2000),Action|Adventure|Drama


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rating rows at random; the fixed seed makes the split repeatable.
# NOTE(review): `knn` was fitted on the full user_movie_sparse before this split,
# so the held-out ratings are presumably already part of what the model has seen —
# confirm, and refit on a train-only matrix if unbiased metrics are needed.
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

print("Train size:", len(train_ratings))
print("Test size:", len(test_ratings))


Train size: 14434099
Test size: 3608525


In [10]:
# Length of the recommendation list; also the denominator of precision@K.
TOP_K = 10

sample_users = test_ratings['userId'].unique()[:100]
precisions = []
recalls = []

# Collect every sampled user's held-out movies in one pass instead of
# re-scanning the whole test frame once per user.
sampled_test = test_ratings[test_ratings['userId'].isin(sample_users)]
held_out_movies = {
    uid: set(movie_ids)
    for uid, movie_ids in sampled_test.groupby('userId')['movieId']
}

print(f"Evaluating {len(sample_users)} users with Evaluation Mode enabled...")

# NOTE(review): recommendations come from the model fitted before the train/test
# split, and evaluation_mode=True lets already-rated movies be recommended, so
# these figures are presumably optimistic — confirm how user_movie_sparse was built.
for u in sample_users:
    actual_movies = held_out_movies.get(u, set())
    if not actual_movies: continue

    # evaluation_mode=True allows movies the user has already rated to be scored
    recs = recommend_movies(u, top_n=TOP_K, evaluation_mode=True)

    # Error strings and empty frames count as a complete miss for this user.
    if isinstance(recs, str) or recs.empty:
        precisions.append(0.0)
        recalls.append(0.0)
        continue

    recommended_movies = set(recs['movieId'])
    hits = len(recommended_movies & actual_movies)

    precisions.append(hits / TOP_K)
    recalls.append(hits / len(actual_movies))

print("-" * 30)
print(f"Average Precision@{TOP_K}: {np.mean(precisions):.4f}")
print(f"Average Recall@{TOP_K}:    {np.mean(recalls):.4f}")
print("-" * 30)

Evaluating 100 users with Evaluation Mode enabled...
------------------------------
Average Precision@10: 0.2100
Average Recall@10:    0.0516
------------------------------


In [11]:
# Inverse lookups: actual ID -> internal matrix index
# (user_mapping / movie_mapping themselves go from internal index to actual ID).
reverse_user_map = {user_id: idx for idx, user_id in user_mapping.items()}
reverse_movie_map = {movie_id: idx for idx, movie_id in movie_mapping.items()}


In [12]:
import numpy as np
import warnings
from sklearn.metrics import mean_squared_error

warnings.filterwarnings('ignore')

# 1. Faster Mappings: actual ID -> internal matrix index
u_to_idx = {v: k for k, v in user_mapping.items()}
m_to_idx = {v: k for k, v in movie_mapping.items()}

# 2. Smaller Sample for Immediate Results
# 200 rows should finish in about 30-60 seconds
RMSE_SAMPLE_SIZE = 200   # number of held-out ratings to score
RMSE_N_NEIGHBORS = 10    # neighbours used for each prediction
sample = test_ratings.sample(RMSE_SAMPLE_SIZE, random_state=42)
y_true, y_pred = [], []

# Group by user to avoid repeating the expensive KNN search
grouped = sample.groupby('userId')
total_users = len(grouped)

print(f"Starting RMSE calculation for {total_users} unique users...")

for i, (user_id, user_data) in enumerate(grouped):
    if user_id not in u_to_idx: continue

    u_idx = u_to_idx[user_id]

    # One search per user instead of one search per rating!
    # One extra neighbour is requested because the query user is in the fitted data.
    distances, indices = knn.kneighbors(
        user_movie_sparse[u_idx], n_neighbors=RMSE_N_NEIGHBORS + 1
    )

    # Exclude the query user by index rather than by position: with tied
    # distances it is not guaranteed to be the first result.
    not_self = indices[0] != u_idx
    neighbor_indices = indices[0][not_self][:RMSE_N_NEIGHBORS]
    similarities = (1 - distances[0])[not_self][:RMSE_N_NEIGHBORS]

    # itertuples keeps column dtypes and is cheaper than iterrows.
    for row in user_data[['movieId', 'rating']].itertuples(index=False):
        m_id = row.movieId
        if m_id not in m_to_idx: continue
        m_idx = m_to_idx[m_id]

        # Fast vectorized rating lookup
        neighbor_ratings = user_movie_sparse[neighbor_indices, m_idx].toarray().flatten()

        # A stored value of 0 is treated as "neighbour has not rated this movie".
        mask = neighbor_ratings > 0
        if mask.any():
            weight_sum = np.sum(similarities[mask])
            # Skip predictions whose weights sum to zero: dividing would produce
            # NaN/inf and make the RMSE computation below fail.
            if weight_sum > 0:
                weighted_avg = np.sum(neighbor_ratings[mask] * similarities[mask]) / weight_sum
                y_true.append(row.rating)
                y_pred.append(weighted_avg)

    # Progress Tracker
    if (i + 1) % 10 == 0:
        print(f"Progress: {i + 1}/{total_users} users processed...")

# 3. Final Output
if y_true:
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print("\n" + "="*30)
    print(f"DONE! Final RMSE: {rmse:.4f}")
    print(f"Evaluated on {len(y_true)} ratings.")
    print("="*30)
else:
    print("\nNo predictions could be made. Try a larger sample.")

Starting RMSE calculation for 200 unique users...
Progress: 10/200 users processed...
Progress: 20/200 users processed...
Progress: 30/200 users processed...
Progress: 40/200 users processed...
Progress: 50/200 users processed...
Progress: 60/200 users processed...
Progress: 70/200 users processed...
Progress: 80/200 users processed...
Progress: 90/200 users processed...
Progress: 100/200 users processed...
Progress: 110/200 users processed...
Progress: 120/200 users processed...
Progress: 130/200 users processed...
Progress: 140/200 users processed...
Progress: 150/200 users processed...
Progress: 160/200 users processed...
Progress: 170/200 users processed...
Progress: 180/200 users processed...
Progress: 190/200 users processed...
Progress: 200/200 users processed...

DONE! Final RMSE: 1.0085
Evaluated on 178 ratings.


In [13]:
import pickle

# Bundle everything needed to produce recommendations into a single pickle:
# the fitted neighbour index, the rating matrix it was fitted on, both
# index -> ID mappings and the movie metadata frame (titles/genres).
model_data = {
    'knn': knn,
    'user_movie_sparse': user_movie_sparse,
    'user_mapping': user_mapping,
    'movie_mapping': movie_mapping,
    'movies_df': movies,
}

# Written one level above the notebook directory.
with open('../movie_model.pkl', 'wb') as model_file:
    pickle.dump(model_data, model_file)