In [1]:
import numpy as np
import pandas as pd


In [3]:
ratings = pd.read_csv('ratings.csv')
print("Ratings Data Info:")
print(ratings.info())
print("\nRatings Data Head:")
print(ratings.head())

Ratings Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None

Ratings Data Head:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [5]:
movies = pd.read_csv('movies.csv')
print("Movies Data Info:")
print(movies.info())
print("\nMovies Data Head:")
print(movies.head())

Movies Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None

Movies Data Head:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [6]:
movies_sample = movies.sample(n=100, random_state=42)


In [7]:
ratings_sample = ratings[ratings.movieId.isin(movies_sample.movieId)].groupby('movieId').head(100)

In [8]:
merged_df = pd.merge(ratings_sample, movies_sample, on='movieId')

In [9]:
print(merged_df.head())

   userId  movieId  rating  timestamp  \
0       1      423     3.0  964982363   
1       4     1266     4.0  986849037   
2       4     1597     1.0  945079906   
3       4     2324     1.0  964622590   
4       5      290     5.0  847435311   

                                        title                          genres  
0                           Blown Away (1994)                 Action|Thriller  
1                           Unforgiven (1992)                   Drama|Western  
2                    Conspiracy Theory (1997)  Drama|Mystery|Romance|Thriller  
3  Life Is Beautiful (La Vita Ã¨ bella) (1997)        Comedy|Drama|Romance|War  
4                   Once Were Warriors (1994)                     Crime|Drama  


In [10]:
user_item_matrix = merged_df.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)


In [11]:
R = user_item_matrix.values


In [12]:
print("User-Item Matrix Info:")
print(user_item_matrix.info())


User-Item Matrix Info:
<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 1 to 610
Data columns (total 100 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   43      300 non-null    float64
 1   108     300 non-null    float64
 2   290     300 non-null    float64
 3   389     300 non-null    float64
 4   416     300 non-null    float64
 5   423     300 non-null    float64
 6   522     300 non-null    float64
 7   556     300 non-null    float64
 8   694     300 non-null    float64
 9   835     300 non-null    float64
 10  1040    300 non-null    float64
 11  1266    300 non-null    float64
 12  1373    300 non-null    float64
 13  1398    300 non-null    float64
 14  1432    300 non-null    float64
 15  1446    300 non-null    float64
 16  1564    300 non-null    float64
 17  1597    300 non-null    float64
 18  1964    300 non-null    float64
 19  2071    300 non-null    float64
 20  2324    300 non-null    float64
 21  2358    300 non-null

In [13]:
print("\nUser-Item Matrix Head:")
print(user_item_matrix.head())


User-Item Matrix Head:
movieId  43      108     290     389     416     423     522     556     \
userId                                                                    
1           0.0     0.0     0.0     0.0     0.0     3.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           0.0     0.0     5.0     0.0     0.0     0.0     0.0     0.0   
6           4.0     0.0     0.0     0.0     3.0     0.0     0.0     0.0   
7           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  694     835     ...  130518  134019  141994  149011  157130  174479  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
5           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
6           3.0     4.0  ...     0.0     0.0     0

In [14]:
def matrix_factorization(R, K, steps=50, alpha=0.0002, beta=0.02):
    N = len(R)
    M = len(R[0])
    P = np.random.rand(N, K)
    Q = np.random.rand(M, K)
    
    for step in range(steps):
        for i in range(N):
            for j in range(M):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i, :], Q[j, :].T)
                    for k in range(K):
                        P[i][k] += alpha * (2 * eij * Q[j][k] - beta * P[i][k])
                        Q[j][k] += alpha * (2 * eij * P[i][k] - beta * Q[j][k])
        
        error = 0
        for i in range(N):
            for j in range(M):
                if R[i][j] > 0:
                    error += pow(R[i][j] - np.dot(P[i, :], Q[j, :].T), 2)
                    for k in range(K):
                        error += (beta/2) * (pow(P[i][k], 2) + pow(Q[j][k], 2))
        if error < 0.001:
            break
    
    return P, Q


In [15]:
K = 2

P, Q = matrix_factorization(R, K, steps=50)

In [16]:
predicted_R = np.dot(P, Q.T)

In [17]:
print("\nPredicted user-item rating matrix:")
print(predicted_R)


Predicted user-item rating matrix:
[[1.6526217  0.65418784 2.49060257 ... 0.92768591 1.29594733 0.73813085]
 [1.53098671 0.57259655 2.3580745  ... 0.85671036 1.27469475 0.75028439]
 [1.4425826  0.51173422 2.26412515 ... 0.8049995  1.26271067 0.76222254]
 ...
 [1.82036525 0.71791084 2.74746912 ... 1.02163146 1.43342424 0.81837596]
 [1.53797866 0.45791692 2.54696009 ... 0.85116449 1.54052073 0.98688447]
 [1.91930056 0.80590104 2.82242537 ... 1.08110531 1.40277367 0.76550039]]
