In [3]:
import pandas as pd
import numpy as np
import os
os.chdir('../')

In [29]:
df = pd.read_csv('data/ml-latest-small/movies.csv')

ratings = pd.read_csv('data/ml-latest-small/ratings.csv')

In [5]:
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [30]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
def collaborative_filtering(R,id):
    neighbour_ids = []
    k=5
    kNN = NearestNeighbors(n_neighbors=k+1, algorithm="brute", metric='cosine')
    kNN.fit(R)
    neighbour = kNN.kneighbors(R[id].reshape(1,-1), return_distance=False)
    neighbour = neighbour[0][1:]
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(n)
    return neighbour_ids

In [69]:
movie_mapper = np.sort(np.array(df.movieId.unique()))

def get_movie_hash(id):
    return np.where(movie_mapper == id)[0][0]

def get_movie_id(id):
    return movie_mapper[id]

In [208]:
from sklearn.neighbors import NearestNeighbors
import random

def collaborative_filtering(R, id, k=10):
    # Get k nearest neighbors
    kNN = NearestNeighbors(n_neighbors=20+1, algorithm="brute", metric='cosine')
    kNN.fit(R)
    # Get indices of neighbors (excluding the user themselves)
    distances, indices = kNN.kneighbors(R[id].reshape(1,-1), return_distance=True)
    neighbor_indices = indices[0][1:]
    neighbor_distances = distances[0][1:]
    
    # Convert distances to weights (similarity scores)
    # Using inverse distance weighting
    similarities = 1 / (neighbor_distances + 1e-8)  # add small number to avoid division by zero
    # Normalize weights
    similarities = similarities / np.sum(similarities)
    
    # Get user's current ratings
    user_ratings = R[id]
    unrated_movies = user_ratings == 0
    
    # Calculate predicted ratings
    predicted_ratings = np.zeros_like(user_ratings)
    for idx, neighbor_idx in enumerate(neighbor_indices):
        neighbor_ratings = R[neighbor_idx]
        # Only consider movies the neighbor has rated
        rated_by_neighbor = neighbor_ratings != 0
        # Weight the neighbor's ratings by their similarity
        predicted_ratings += similarities[idx] * neighbor_ratings
    
    # Only keep predictions for movies the user hasn't rated
    predicted_ratings[~unrated_movies] = 0
    
    # Get top 10 movie recommendations
    top_movies_indices = np.argsort(predicted_ratings)[::-1][:k]
    
    # Create recommendations list with (movie_id, predicted_rating)
    recommendations = [idx for idx in top_movies_indices]
    
    return recommendations

def display_movie(id):
    print(f"{str(df[df['movieId'] == get_movie_id(id)].title.iloc[0])}")

    
class ReflexAgent:
    def __init__(self,id,k=50):
        self.id = id
        mh = list(ratings[ratings['userId'] == id].sort_values('timestamp').head(k).movieId)
        self.movie_history = list(map(get_movie_hash,mh))
    
    def get_recs(self,R):
        self.recs = collaborative_filtering(R,self.id)
        return self.recs
    
    def action(self,ratings):
        self.chosen = random.sample(self.recs, 1)[0]
        self.movie_history.append(self.chosen)

        # Create a new row as a DataFrame
        new_rating = pd.DataFrame({
        'userId': [self.id],
        'movieId': [get_movie_id(self.chosen)],
        'rating': [random.randint(0,10)/2],
        'timestamp': [82983923]
        })
    
        # Concatenate the new rating with the existing DataFrame
        ratings = pd.concat([ratings, new_rating], ignore_index=True)
        return ratings,self.chosen


def create_R(agents):
    m = len(df.movieId)
    R = []
    for agent in agents:
        mh = agent.movie_history
        mh_idx = list(map(get_movie_id,mh))
        user_ratings = ratings[(ratings['userId'] == agent.id) & (ratings['movieId'].isin(mh_idx))]
        row = [float(user_ratings[user_ratings['movieId'] == get_movie_id(i)].rating.iloc[0]) if i in mh else 0 for i in range(m)]
         # Convert to numpy array for easier manipulation
        row_array = np.array(row)
        
        # Calculate mean of only non-zero elements
        non_zero_mask = row_array != 0
        if non_zero_mask.any():  # check if there are any non-zero elements
            mean = np.mean(row_array[non_zero_mask])
            
            # Subtract mean only from non-zero elements
            row_array[non_zero_mask] = row_array[non_zero_mask] - mean
        
        R.append(row_array.tolist())

    return np.array(R)

In [209]:
# First pick 200 users -> Initialize 200 reflex agents -> Build R


users = np.array(pd.Series(ratings.userId.unique()).sample(n=200, random_state=42))
agents = [ReflexAgent(list(users).index(i)) for i in users]

for t in range(5):
    R = create_R(agents)
    for agent in agents:
        rec = agent.get_recs(R)
        ratings, _ = agent.action(ratings)
    print(f"timestep: {t}")




        






timestep: 0
timestep: 1
timestep: 2
timestep: 3
timestep: 4


In [133]:
R

array([[ 0.        , -0.16      ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.41      ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        , -0.65217391,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.125     ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [134]:
R.shape

(200, 100836)

In [171]:
list(map(display_movie,rec))

One Flew Over the Cuckoo's Nest (1975)
Godfather, The (1972)
Matrix, The (1999)
Monty Python and the Holy Grail (1975)
Usual Suspects, The (1995)
Lord of the Rings: The Two Towers, The (2002)
Saving Private Ryan (1998)
Blade Runner (1982)
Breakfast Club, The (1985)
Requiem for a Dream (2000)


[None, None, None, None, None, None, None, None, None, None]

In [202]:
random.randint(0,10)/2

2.5