In [2]:
import pandas as pd

path_rating = r"C:\Users\R\Downloads\archive(12)\ml-latest-small\ratings.csv"
path_movies = r"C:\Users\R\Downloads\archive(12)\ml-latest-small\movies.csv"

ratings = pd.read_csv(path_rating)
movies = pd.read_csv(path_movies)


In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [None]:
ratings.isnull().sum()
movies.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [None]:
user_item_matrix = ratings.pivot(index='userId',columns='movieId',values='rating') #userxmovies
user_item_matrix.fillna(0, inplace=True)

print(user_item_matrix.shape)

(610, 9724)


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

item_similarity = cosine_similarity(user_item_matrix.T) #1~0
item_similarity_df = pd.DataFrame(item_similarity,
                                  index=user_item_matrix.columns,
                                  columns=user_item_matrix.columns) #movieIDxmovieID (value=similarity)

In [20]:
import numpy as np

def predict_ratings_item_based(user_id,user_item_matrix,similarity_matrix):
  user_ratings = user_item_matrix.loc[user_id].values
  nominator = similarity_matrix.dot(user_ratings) 
  dominator = np.abs(similarity_matrix).sum(axis=1)
  weighted_ratings = nominator/dominator
  return pd.Series(weighted_ratings,index=user_item_matrix.columns)

In [None]:
def recommend_movies_item_based(user_id,user_item_matrix,similarity_matrix,movies,top_n=10):
  predicts = predict_ratings_item_based(user_id, user_item_matrix, similarity_matrix)
  seen_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
  predicts = predicts.drop(seen_movies)
  top_recs = predicts.sort_values(ascending=False).head(top_n)
  
  return movies[movies['movieId'].isin(top_recs.index)]

In [22]:
print(recommend_movies_item_based(1, user_item_matrix, item_similarity_df, movies, top_n=5))


      movieId                                           title  \
414       476                             Inkwell, The (1994)   
666       876  Supercop 2 (Project S) (Chao ji ji hua) (1993)   
2116     2812                              In Too Deep (1999)   
4224     6145                                    Venom (1982)   
6846    61697                           Righteous Kill (2008)   

                            genres  
414                   Comedy|Drama  
666   Action|Comedy|Crime|Thriller  
2116               Action|Thriller  
4224               Horror|Thriller  
6846        Crime|Mystery|Thriller  


In [26]:
def precision_at_k_item_based(user_id, user_item_matrix, similarity_matrix, k=10, threshold=3.5):
    # Get item-based predictions
    preds = predict_ratings_item_based(user_id, user_item_matrix, similarity_matrix)
    
    # Sort predictions
    user_preds = preds.sort_values(ascending=False).head(k)
    
    # Movies the user has already rated
    seen_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    
    # Relevant items: actual ratings > threshold
    relevant_items = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] >= threshold].index
    
    # Recommended items (unseen, top-K)
    recommended = [i for i in user_preds.index if i not in seen_movies]
    
    # Precision@K
    hits = len(set(recommended) & set(relevant_items))
    return hits / k if k > 0 else 0


In [27]:
print("Precision@10 for User 1 (Item-Based CF):", precision_at_k_item_based(1, user_item_matrix, item_similarity_df, k=10))


Precision@10 for User 1 (Item-Based CF): 0.0


In [30]:
from scipy.sparse.linalg import svds

R = user_item_matrix.values
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1,1)

U, sigma, Vt = svds(R_demeaned, k=50)
sigma = np.diag(sigma)

all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1,1)
pred_df = pd.DataFrame(all_user_predicted_ratings,columns=user_item_matrix.columns,index=user_item_matrix.index)

In [31]:
def recommend_movies_svd(user_id, preds_df, user_item_matrix, movies, top_n=10):
    user_row = preds_df.loc[user_id].sort_values(ascending=False)
    seen_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    recs = user_row.drop(seen_movies).head(top_n)
    return movies[movies['movieId'].isin(recs.index)]

In [32]:
print(recommend_movies_svd(1, pred_df, user_item_matrix, movies, top_n=5))

      movieId                           title                 genres
659       858           Godfather, The (1972)            Crime|Drama
793      1036                 Die Hard (1988)  Action|Crime|Thriller
922      1221  Godfather: Part II, The (1974)            Crime|Drama
1067     1387                     Jaws (1975)          Action|Horror
1445     1968      Breakfast Club, The (1985)           Comedy|Drama


In [33]:
def precision_at_k_svd(user_id, preds_df, user_item_matrix, k=10, threshold=3.5):
    # Get SVD-based predictions for this user
    user_preds = preds_df.loc[user_id].sort_values(ascending=False).head(k)
    
    # Movies the user has already rated
    seen_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    
    # Relevant items: actual ratings > threshold
    relevant_items = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] >= threshold].index
    
    # Recommended items (unseen, top-K)
    recommended = [i for i in user_preds.index if i not in seen_movies]
    
    # Precision@K
    hits = len(set(recommended) & set(relevant_items))
    return hits / k if k > 0 else 0


In [34]:
print("Precision@10 for User 1 (SVD):", 
      precision_at_k_svd(1, pred_df, user_item_matrix, k=10))


Precision@10 for User 1 (SVD): 0.0
