# Data Streaming Algorithms and Online Learning
## HW 3 - Netflix Problem Revisited
by Millis Sahar

In [2]:
import numpy as np
import pandas as pd



# Download the MovieLens 1M dataset 
[https://grouplens.org/datasets/movielens/1m/]   
whichcontains 1 million ratings from 6000 users on 4000 movies.


##### ratings df

In [99]:
ratings_list = [i.strip().split("::") for i in open('ratings.dat', 'r').readlines()]
rating_df = pd.DataFrame(ratings_list, columns = ['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype = int)
rating_df['Rating']=rating_df['Rating'].apply(pd.to_numeric)
rating_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


###### users df

In [None]:
users_list = [i.strip().split("::") for i in open('users.dat', 'r').readlines()]
users_df = pd.DataFrame(users_list, columns = ['UserID', 'Gender', 'Age', 'Occupation','Zip-code'], dtype = int)
# users_df.head()

##### movies df

In [None]:
movies_list = [i.strip().split("::") for i in open('movies.dat', 'r').readlines()]
movies_df = pd.DataFrame(movies_list, columns = ['MovieID', 'Title', 'Genres'], dtype = int)
# movies_df.head()

##### Your goal: recover the cleared ratings with highest RMSE accuracy using a MATRIX COMPLETION method.
Sample 1000 of the given ratings matrix using seed of 123 and “clear them”. i.e., create a new ratings matrix without these samples



In [100]:
# random seed
np.random.seed(123)

# sample 1k
test_df = rating_df.sample(n=10)

# clear info
rating_df.loc[test_df.index,'Rating'] = 0 

test_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
382276,2233,440,4,974597580
712314,4274,587,5,965301765
415052,2498,454,3,974089255
465169,2868,2336,5,992717615
270413,1636,2686,5,974962900


##### Additional instructions

- You may use the methods from class OR any other method you would like,but it must be a matrix completion method, namely a method which completes the given partially-observed matrix.  
- You may use the jupyter notebook uploaded in Lecture 8 for the pre-processing  
- You may use any programming language.  


#### Requirements for submission
- Your full implementation code (including the pre-processing steps above).  
- Resulting predictions and RMSE.  
- A short report summarizing your efforts and insights of the problem.  
- Bonus points will be given for creative approaches and ideas.  
- You are highly encouraged to consult, implement and use the SVT method, which was taught in class:
    - [CCS10] Jian-Feng Cai, Emmanuel J. Candes, and Zuowei Shen. A singular value threshold- ing algorithm for matrix completion. SIAM J. on Optimization, 20(4):1956–1982, March 2010.  
    - As mentioned in class- it may be very interesting to study and analyze the effect of the various parameters in this algorithm (e.g., delta) on the convergence, accuracy, etc.

# Exploratory  Analysis

##### unique users and movies

In [101]:
n_users = rating_df.UserID.unique().shape[0]
n_movies = rating_df.MovieID.unique().shape[0]
print('Number of users = ' + str(n_users))
print('Number of movies = ' + str(n_movies))

Number of users = 6040
Number of movies = 3706


##### ratings matrix
one row per user and one column per movie. 
To do so, I'll pivot ratings to get that and call the new variable Ratings.

In [102]:
Ratings = rating_df.pivot(index = 'UserID', columns ='MovieID', values = 'Rating').fillna(0)
# Ratings.head()

##### convert matrix from a dataframe to a numpy array

In [103]:
R = Ratings.values

##### normalize by each users mean

In [104]:
Ratings_means = np.mean(R, axis = 1)
Ratings_demeaned = R - Ratings_means.reshape(-1, 1)

##### Sparse?

In [105]:
str(round((1-np.count_nonzero(Ratings)/Ratings.size)*100,2))+'%'

'95.53%'

# SVD

In [106]:
from scipy.sparse.linalg import svds

#parameters 
param_k = 150

# UIV
U, sigma, Vt = svds(Ratings_demeaned, k = param_k)

# diagonal
sigma = np.diag(sigma)


In [107]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + Ratings_means.reshape(-1, 1)

In [108]:
preds = pd.DataFrame(all_user_predicted_ratings, columns = Ratings.columns)
preds.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.560131,0.299027,0.292506,-0.004703,-0.014791,-0.379549,-0.077952,0.063209,0.035995,-0.363389,...,0.003811,-0.00976,-0.001084,-0.084464,-0.084585,0.177947,0.100308,0.026234,0.097888,-0.062766
1,0.327708,-0.006495,0.414506,0.058106,0.067796,1.250657,-0.194928,0.073968,0.044295,1.872875,...,-0.101543,0.000932,-0.020406,0.010098,0.083853,-0.674775,-0.20375,0.040747,0.062167,0.129198
2,1.148962,0.613345,0.05126,-0.044536,-0.012455,0.068508,0.063193,-0.015942,0.01317,1.208501,...,0.04275,-0.008885,0.025181,0.067866,-0.000627,0.00446,0.242074,0.047154,0.029182,-0.060645
3,-0.167782,0.360357,0.060695,0.02564,-0.065263,-0.345853,-0.047359,0.003655,-0.022004,-0.272025,...,0.009402,0.009354,0.020106,-0.015052,-0.002313,-0.060564,-0.019349,0.024983,0.063655,-0.079681
4,-0.225662,-0.43,-0.007721,0.135879,-0.211104,1.317622,-0.102276,-0.020951,-0.201545,-0.011815,...,0.143578,0.07215,-0.004047,-0.039035,-0.162658,-0.27039,0.400715,0.019016,0.100691,0.248009


In [109]:
np.count_nonzero(np.round(preds))/preds.size

0.10330241276898389

In [142]:
d = pd.DataFrame(columns=['target','prediction'])


for r in test_df.iterrows():
    d = d.append({'UserID':r[1][0],'MovieID':r[1][1],'target':r[1][2]},ignore_index=True)
    break
d.head()

preds.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.560131,0.299027,0.292506,-0.004703,-0.014791,-0.379549,-0.077952,0.063209,0.035995,-0.363389,...,0.003811,-0.00976,-0.001084,-0.084464,-0.084585,0.177947,0.100308,0.026234,0.097888,-0.062766
1,0.327708,-0.006495,0.414506,0.058106,0.067796,1.250657,-0.194928,0.073968,0.044295,1.872875,...,-0.101543,0.000932,-0.020406,0.010098,0.083853,-0.674775,-0.20375,0.040747,0.062167,0.129198
2,1.148962,0.613345,0.05126,-0.044536,-0.012455,0.068508,0.063193,-0.015942,0.01317,1.208501,...,0.04275,-0.008885,0.025181,0.067866,-0.000627,0.00446,0.242074,0.047154,0.029182,-0.060645
3,-0.167782,0.360357,0.060695,0.02564,-0.065263,-0.345853,-0.047359,0.003655,-0.022004,-0.272025,...,0.009402,0.009354,0.020106,-0.015052,-0.002313,-0.060564,-0.019349,0.024983,0.063655,-0.079681
4,-0.225662,-0.43,-0.007721,0.135879,-0.211104,1.317622,-0.102276,-0.020951,-0.201545,-0.011815,...,0.143578,0.07215,-0.004047,-0.039035,-0.162658,-0.27039,0.400715,0.019016,0.100691,0.248009
