In [4]:
# Import libraries
import numpy as np
import pandas as pd

# Reading ratings file
ratings = pd.read_csv('H:\\255\\ml-latest-small\\ratings.csv')

# Reading users file
Tags = pd.read_csv('H:\\255\\ml-latest-small\\tags.csv')

# Reading movies file
movies = pd.read_csv('H:\\255\\ml-latest-small\\movies.csv')

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [20]:
n_movies = ratings.movieId.unique().shape[0]
n_users = ratings.userId.unique().shape[0]
print('Number of movies = ' + str(n_movies)) 
print('Number of users = ' + str(n_users))

Number of movies = 9724
Number of users = 610


In [21]:
Ratings = ratings.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)
Ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
R = Ratings.values
user_ratings_mean = np.mean(R, axis = 1)
Ratings_demeaned = R - user_ratings_mean.reshape(-1, 1)

In [12]:
## Model-Based Collaborative Filtering

In [23]:
#sparsity = round(1.0 - (len(ratings)/float(n_movies,3))
sparsity = round(1.0 - len(ratings) / float(n_users*n_movies), 3)
print('The sparsity level of MovieLens100k dataset is ' +  str(sparsity * 100) + '%')

The sparsity level of MovieLens100k dataset is 98.3%


In [24]:
## setting up svd

In [25]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(Ratings_demeaned, k = 50)

In [26]:
sigma = np.diag(sigma)

In [27]:
## making predictions from the decomposed matrix

In [28]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

In [29]:
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
0,2.167328,0.402751,0.840184,-0.076281,-0.551337,2.504091,-0.890114,-0.026443,0.196974,1.593259,...,-0.023453,-0.019967,-0.026939,-0.026939,-0.023453,-0.026939,-0.023453,-0.023453,-0.023453,-0.058732
1,0.211459,0.006658,0.033455,0.017419,0.18343,-0.062473,0.083037,0.024158,0.04933,-0.15253,...,0.019498,0.016777,0.022219,0.022219,0.019498,0.022219,0.019498,0.019498,0.019498,0.032281
2,0.003588,0.030518,0.046393,0.008176,-0.006247,0.107328,-0.012416,0.003779,0.007297,-0.059362,...,0.005909,0.006209,0.00561,0.00561,0.005909,0.00561,0.005909,0.005909,0.005909,0.008004
3,2.051549,-0.387104,-0.252199,0.087562,0.130465,0.27021,0.477835,0.040313,0.025858,-0.017365,...,0.004836,0.004172,0.0055,0.0055,0.004836,0.0055,0.004836,0.004836,0.004836,-0.023311
4,1.344738,0.778511,0.065749,0.111744,0.273144,0.584426,0.25493,0.128788,-0.085541,1.023455,...,-0.008042,-0.007419,-0.008664,-0.008664,-0.008042,-0.008664,-0.008042,-0.008042,-0.008042,-0.010127


In [41]:
def recommend_movies(predictions, userID, movies, original_ratings, num_recommendations):
    
    # Get and sort the user's predictions
    user_row_number = userID - 1 # User ID starts at 1, not 0
    sorted_user_predictions = preds.iloc[user_row_number].sort_values(ascending=False) # User ID starts at 1
    
    # Get the user's data and merge in the movie information.
    user_data = original_ratings[original_ratings.userId == (userID)]
    user_full = (user_data.merge(movies, how = 'left', left_on = 'movieId', right_on = 'movieId').
                     sort_values(['rating'], ascending=False)
                 )

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet.
    recommendations = (movies[~movies['movieId'].isin(user_full['movieId'])].
         merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',
               left_on = 'movieId',
               right_on = 'movieId').
         rename(columns = {user_row_number: 'Predictions'}).
         sort_values('Predictions', ascending = False).
                       iloc[:num_recommendations, :-1]
                      )

    return user_full, recommendations

In [44]:
already_rated, predictions = recommend_movies(preds, 10, movies, ratings, 20)

User 10 has already rated 140 movies.
Recommending highest 20 predicted ratings movies not already rated.


In [45]:
# Top 20 movies that User 1310 has rated 
already_rated.head(20)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
139,10,140110,5.0,1455356776,The Intern (2015),Comedy
48,10,8869,5.0,1455303064,First Daughter (2004),Comedy|Romance
117,10,96079,5.0,1455302172,Skyfall (2012),Action|Adventure|Thriller|IMAX
110,10,91529,5.0,1455302120,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
42,10,7458,5.0,1455356656,Troy (2004),Action|Adventure|Drama|War
100,10,81845,5.0,1455302591,"King's Speech, The (2010)",Drama
44,10,8533,5.0,1455301847,"Notebook, The (2004)",Drama|Romance
97,10,79091,5.0,1455306124,Despicable Me (2010),Animation|Children|Comedy|Crime
86,10,71579,5.0,1455301869,"Education, An (2009)",Drama|Romance
57,10,33794,5.0,1455302031,Batman Begins (2005),Action|Crime|IMAX


In [46]:
# Top 20 movies that User 1310 hopefully will enjoy
predictions

Unnamed: 0,movieId,title,genres
4394,6539,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy
7274,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
5123,8368,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy|IMAX
3553,4896,Harry Potter and the Sorcerer's Stone (a.k.a. ...,Adventure|Children|Fantasy
3547,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
3593,4963,Ocean's Eleven (2001),Crime|Thriller
4050,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
7207,76093,How to Train Your Dragon (2010),Adventure|Animation|Children|Fantasy|IMAX
3601,4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",Comedy|Romance
6003,40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller|IMAX


In [63]:
## model evaluation

In [77]:
# Import libraries from Surprise package
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Load Reader library
reader = Reader()

# Load ratings dataset with Dataset library
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the dataset for 5-fold evaluation
#data.split(n_folds=5)
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE', 'FCP'], cv=5, verbose=True)

# Use the SVD algorithm.
#svd = SVD()

# Compute the RMSE of the SVD algorithm.
#evaluate(svd, data, measures=['RMSE'])

Evaluating RMSE, MAE, FCP of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8879  0.8626  0.8683  0.8744  0.8753  0.8737  0.0085  
MAE (testset)     0.6829  0.6633  0.6719  0.6694  0.6723  0.6720  0.0063  
FCP (testset)     0.6574  0.6556  0.6633  0.6602  0.6633  0.6600  0.0031  
Fit time          4.84    4.92    4.79    5.23    5.10    4.98    0.17    
Test time         0.26    0.13    0.11    0.24    0.22    0.19    0.06    


{'test_rmse': array([0.88788202, 0.86257907, 0.86826269, 0.87436087, 0.8752528 ]),
 'test_mae': array([0.68288653, 0.66334684, 0.67189016, 0.66937459, 0.67227885]),
 'test_fcp': array([0.65742686, 0.65560858, 0.66331804, 0.66019733, 0.66334723]),
 'fit_time': (4.838450193405151,
  4.9220452308654785,
  4.787558317184448,
  5.231145620346069,
  5.097562074661255),
 'test_time': (0.2613818645477295,
  0.12824320793151855,
  0.10982155799865723,
  0.24229168891906738,
  0.22030925750732422)}

In [80]:
trainset = data.build_full_trainset()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x26048f663a0>

In [86]:
ratings[ratings['userId'] == 131]

Unnamed: 0,userId,movieId,rating,timestamp
19850,131,31,3.0,1349838570
19851,131,47,4.0,1349840567
19852,131,50,4.0,1349840171
19853,131,110,2.5,1349840503
19854,131,111,4.0,1349840160
...,...,...,...,...
19914,131,64614,3.5,1349840720
19915,131,68237,3.0,1349840757
19916,131,79132,4.0,1349840241
19917,131,89745,4.0,1349840436


In [87]:
svd.predict(1310, 1994)

Prediction(uid=1310, iid=1994, r_ui=None, est=3.546547068992359, details={'was_impossible': False})