# User CF + Item CF + SVD

In [165]:
import pandas as pd
import os

### Import data
The file "u.data" which contains the full dataset.
- 100,000 rating(1-5) from 943 users on 1682 movies
- Each user has rated at least 20 movies

In [166]:
data_dir = os.path.join('data', 'ml-100k')

In [167]:
data_dir

'data/ml-100k'

In [168]:
file_name = os.path.join(data_dir, 'u.data')

df = pd.read_csv(file_name, sep='\t', header = -1)

### full u dataset
- user id || item id || rating || timestape

Give the columns name to the dataset

In [25]:
df.columns = ['user_id', 'item_id', 'rating', 'itemstape']

In [26]:
df.head()

Unnamed: 0,user_id,item_id,rating,itemstape
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### Split dataset(train 25%, test 75%) 

In [80]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size = 0.1)

### Rebuild the matrix(n x m) which n is number of users and m is the number of movies

In [81]:
n_users = df.user_id.unique().shape[0]

n_items = df.item_id.unique().shape[0]

print('Number of users = ' + str(n_users) + '| number of movies = '+ str(n_items))

Number of users = 943| number of movies = 1682


### Create two user-item matrix, one for training and another for testing

### Training dataset

In [82]:
import numpy as np

train_data_matrix = np.zeros((n_users, n_items))

train_data_matrix.shape

(943, 1682)

### Testing dataset

In [83]:
test_data_matrix = np.zeros((n_users, n_items))

In [84]:
train_data.head()

Unnamed: 0,user_id,item_id,rating,itemstape
73607,709,402,3,879849185
3683,59,405,3,888203578
73508,721,582,3,877140490
69129,711,134,5,876278804
24108,20,181,4,879667904


### insert the data into the train data matrix and test data matrix

Becuase user_id and item_id are start from 1, so the value in the matrix should be decread by 1

In [85]:
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]

for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

## Collaborative Flitering(Memory based)

There are two main parts of CF: User CF and Item CF. 

User CF: choose a specific user and based on the similarity of two users to recommend items(like: the person like these items also like that one)

Item CF: Find the persons who like the same item and these persons other like items. Calculate the similarity base on that


![title](data/User_base_and_item_base.jpeg)

### compute user and item similarity


In [86]:
import sklearn

##### The normal distance matrix in recommendation system we use the cosine similarity

User similarity:

$ S_{u}^{cosine}(U_{k},U_{a}) = \frac{U_{k}*U_{a}}{|U_{k}| |U_{a}|} = \frac{\sum{X_{k,m} * X_{a,m}}}{\sqrt{\sum{X_{k}^{2}}\sum{X^{2}_{a,m}}}} $

$ S_{u}^{cosine}(I_{m},I_{b}) = \frac{I_{m}*I_{b}}{|I_{m}| |I_{b}|} = \frac{\sum{X_{a,m} * X_{a,b}}}{\sqrt{\sum{X_{a,m}^{2}}\sum{X^{2}_{a,b}}}} $

In [87]:
user_similarity = sklearn.metrics.pairwise_distances(train_data_matrix, metric = "cosine")

item_similarity = sklearn.metrics.pairwise_distances(train_data_matrix.T, metric = "cosine")

#### The shape of the user similarity matrix becomes 943 x 943 

In [88]:
user_similarity.shape

(943, 943)

#### The shape of the item similarity matrix becomes 1682 x 1682

In [89]:
item_similarity.shape

(1682, 1682)

### Prediction based on either user or items

Prediction function is based on:
    $x_{k,m} = \overline{x_{k}} + \frac{\sum_{u_{a}}{sim_{u}(u_{k}, u_{a})(x_{a,m} - \overline{x_{u_{a}}})}}{\sum_{u_{a}}|sim_{u}(u_{k}, u_{a})|}$

![title](data/user_base.png)

##### The intuition here is that not everyone rating standard is same. 
##### For example, if user A like a movie he might give the movie 5 stars and others 3 or 4. But some picky people rarely give 5 star to the movie. 
##### So the algorithm use the similarity as the weight to predict rating

In [108]:
a = 0
n = 0
for i in train_data_matrix[0]:
    if i != 0:
        a = a + i
        n += 1
a / n

3.60082304526749

In [119]:
train_data_matrix.shape

(943, 1682)

In [120]:
len(train_data_matrix)

943

In [126]:
def c_mean(mat):
    r = []
    for i in range(len(mat)):
        num = 0
        a = 0
        for n in mat[i]:
            if n != 0:
                a = a + n
                num += 1
        r.append(a/num)
    return r

In [134]:
def predict(ratings, similarity, type='user'):
        if type == 'user':
            mean_user_rating = np.array(c_mean(ratings))
            #You use np.newaxis so that mean_user_rating has same format as ratings
            ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
            pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
        elif type == 'item':
            pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
        return pred

In [135]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')
# user_prediction = predict(train_data_matrix, item_similarity, type='user')

### Evaluation

##### The most popular way to evaluate the model is using RMSE(root-mean-square error)

$RMSE = \sqrt{\frac{1}{N}\sum(x_{i} - \hat{x_{i}})^{2}}$

In [136]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [137]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [138]:
print ('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print ('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.2063910213408042
Item-based CF RMSE: 3.4038842195708194


#### Conclusion:
Memory-based CF is easy to implement but the real problem is cold start, which means when the new users or data come

### Collabrative flitering base on SVD

##### Calculate the sparse level of the matrix

In [95]:
sparsity = round(1.0 - len(df)/float(n_users*n_items),3)

print('The sparsity level of the MovieLens 100K is ' + str(sparsity * 100) + '%')

The sparsity level of the MovieLens 100K is 93.7%


#### Use scipy svd

In [96]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

In [216]:
u, s, vt = svds(train_data_matrix, k = 200)

In [217]:
u.shape

(943, 200)

In [218]:
vt.shape 

(200, 1682)

In [219]:
s_diag_matrix = np.diag(s)

In [220]:
X_pred = np.dot(u,vt)

In [221]:
print('User-based CF RMSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF RMSE: 3.710126967168523


## Some Math notes

### SVD Algorithm

In linear algebra, the singular-value decomposition is a factorization of a real and complex matrix. 

##### Intuition of the SVD: Each operation of the matrix is rotation and streching

### Example

X = $\begin{bmatrix}
1\\
3
\end{bmatrix}
$
and 
$
A = \begin{bmatrix}
2 & 1\\
-1 & 1
\end{bmatrix}
$

$
Y = Ax = 
\begin{bmatrix}
2 & 1\\
-1 & 1
\end{bmatrix}
$
$
\begin{bmatrix}
1 \\
3
\end{bmatrix}
$
= $
\begin{bmatrix}
5 \\
2
\end{bmatrix}
$

##### rotation function

$A = \begin{bmatrix}
cos\Theta & -sin\Theta\\
sin\Theta & cos\Theta
\end{bmatrix}
$
which is rotated by $\Theta$

##### Streching function

$A = \begin{bmatrix}
\alpha & 0\\
0 & \alpha
\end{bmatrix}$
which $\alpha$ = "streching"

##### General function:

$AV = U\sum$
which $u$ is the unit matrix and $\sum$ is streching matrix

It is like a enignvalues function which is :
$A\overline{x} = \lambda\overline{x}$

When both sides of the equation multiply by $V^{-1}$

$A = U\sum V^{-1}$

And we let $V^{-1} = V^{*}$

$A = U\sum V^{*}$ which is the SVD function

#### Solve this function

$A^{T}A = (U\sum V^{*})^{T}(U\sum V^{*})$

$= (V\sum U^{*})(U\sum V^{*})$

$=V\sum^{2}V^{*}$

##### Final equation:
$A^{T}AV = V\sum^{2}V^{*}V$

$A^{T}AV = V\sum^{2}$ which is the enigvalue problem:

$A^{T}A = A$


$\sum^{2}$ is $\lambda$
##### enignvector is: V

#### To get U use the same way:
The both side of equation right multyplied by $A^{T}$<br>
which is $AA^{T} = (U\sum V^{*})(U\sum V^{*})^{T}$

$AA^{T}U = U\sum^{2}$

### Solve eignvalues and eignvector problem

##### Defination:
Let $A$ be an $n*n$ matrix. A scalar $lambda$ is called an eigenvalue of $A$ if there is a nonzero vector $\overline{x}$ such that $A\overline{x} = \lambda\overline{x}$. So sunch a vector $\overline{x}$ is called an eigenvector of $A$ corresponding to $\lambda$

### Example

Show that $\overline{x} = \begin{bmatrix}
3 & 2\\
3 & -2
\end{bmatrix}
$
corresponding to 
$\lambda$ = 4

##### note:
If $\lambda$ is an eigenvalue of A, and $\overline{x}$ is an eigenvector belonging to $\lambda$,<br>
any nonzero multiple of $\overline{x}$ will be an eigenvector

### Finding Eigenvalues and Eigenvector

#### 2x2 matrix example 

The process: <br>
To solve for the eigenvalues, $\lambda_{i}$ and the corresponding eigenvectors, $\overline{x_{i}}$, of an $n x n$ matrix $A$, do the following:<br>
- Multiply an $n x n$ identity matrix by the scalar
- subtract the identity matrix multiple from the matrix A
- Find the determined of the matrix and the difference
- solve for the values of $\lambda$ that satisfy the equation: $det(A - \lambda I) = \overline{0}$
- Solve for the corresponding vector to each $\lambda$

#### find the eigenvalues and eigenvectors of the matrix

$A = \begin{bmatrix}
7 & 3\\
3 & -1
\end{bmatrix}$

1. $\lambda I = \lambda \begin{bmatrix}
1 & 0\\
0 & 1
\end{bmatrix}
$ = 
$
\begin{bmatrix}
\lambda & 0\\
0 & \lambda
\end{bmatrix}
$

2.  $A - \lambda I = \begin{bmatrix}
7-\lambda & 3\\
3 & -1-\lambda
\end{bmatrix}
$

Solve this equation we can get a $\lambda$ = 8 which is the eigenvalue

To find the corresponding eigenvector: <br>
$A - \lambda I$ = B<br>
And let $B\overline{x} = \overline{0}$

Solve the equation to get the eigenvector $\overline{x}$

## Try to use larger dataset

In [140]:
data_dir2 = os.path.join('data', 'ml-latest-small')

In [141]:
file_name2 = os.path.join(data_dir2, 'ratings.csv') 

In [142]:
df = pd.read_csv(file_name2)

In [143]:
df.shape

(100836, 4)

In [145]:
n_users2 = df.userId.unique().shape[0]

n_items2 = df.movieId.unique().shape[0]

print('Number of users = ' + str(n_users2) + '| number of movies = '+ str(n_items2))

Number of users = 610| number of movies = 9724


In [146]:
train_data2, test_data2 = train_test_split(df, test_size = 0.1)

In [152]:
train_data_matrix2 = np.zeros((n_users2, n_items2))

train_data_matrix2.shape

(610, 9724)

In [153]:
test_data_matrix2 = np.zeros((n_users2, n_items2))

In [157]:
train_data_matrix2.shape

(610, 9724)

9417

In [158]:
for line in train_data2.itertuples():
    train_data_matrix2[line[1] - 1, line[2] - 1] = line[3]

for line in test_data2.itertuples():
    test_data_matrix2[line[1] - 1, line[2] - 1] = line[3]

IndexError: index 87306 is out of bounds for axis 1 with size 9724

In [170]:
df.shape

(100000, 4)

In [171]:
pwd

'/Users/mac/Desktop/UCD/Code/RecSys_Preparation'

In [173]:
cd data/

/Users/mac/Desktop/UCD/Code/RecSys_Preparation/data


In [174]:
ls

User_base_and_item_base.jpeg  [34mml-latest-small[m[m/
[34mml-100k[m[m/                      user_base.png


In [175]:
cd ml-100k/

/Users/mac/Desktop/UCD/Code/RecSys_Preparation/data/ml-100k


In [176]:
ls

README        u.info        u1.test       u4.base       ua.test
[31mallbut.pl[m[m*    u.item        u2.base       u4.test       ub.base
[31mmku.sh[m[m*       u.occupation  u2.test       u5.base       ub.test
u.data        u.user        u3.base       u5.test
u.genre       u1.base       u3.test       ua.base


In [177]:
data_dir

'data/ml-100k'

In [184]:
cd ..

/Users/mac/Desktop/UCD/Code/RecSys_Preparation


In [None]:
from sklearn.metrics import mean_absolute_error
data_shape = (943, 1682)
df = pd.read_csv(data_dir + "/ua.base", sep="\t", header=-1)
values = df.values
values[:, 0:2] -= 1

In [201]:
values.shape

(9430, 4)

In [199]:
X_train = sp.csr_matrix((values[:, 2], (values[:, 0], values[:, 1])), dtype=np.float, shape=data_shape)

df = pd.read_csv(data_dir + "/ua.test", sep="\t", header=-1)
values = df.values
values[:, 0:2] -= 1
X_test = sp.csr_matrix((values[:, 2], (values[:, 0], values[:, 1])), dtype=np.float, shape=data_shape)

# Compute means of nonzero elements
X_row_mean = np.zeros(data_shape[0])
X_row_sum = np.zeros(data_shape[0])

train_rows, train_cols = X_train.nonzero()

# Iterate through nonzero elements to compute sums and counts of rows elements
for i in range(train_rows.shape[0]):
    X_row_mean[train_rows[i]] += X_train[train_rows[i], train_cols[i]]
    X_row_sum[train_rows[i]] += 1

# Note that (X_row_sum == 0) is required to prevent divide by zero
X_row_mean /= X_row_sum + (X_row_sum == 0)

# Subtract mean rating for each user
for i in range(train_rows.shape[0]):
    X_train[train_rows[i], train_cols[i]] -= X_row_mean[train_rows[i]]

test_rows, test_cols = X_test.nonzero()
for i in range(test_rows.shape[0]):
    X_test[test_rows[i], test_cols[i]] -= X_row_mean[test_rows[i]]

X_train = np.array(X_train.toarray())
X_test = np.array(X_test.toarray())

ks = np.arange(2, 50)
train_mae = np.zeros(ks.shape[0])
test_mae = np.zeros(ks.shape[0])
train_scores = X_train[(train_rows, train_cols)]
test_scores = X_test[(test_rows, test_cols)]

# Now take SVD of X_train
U, s, Vt = np.linalg.svd(X_train, full_matrices=False)

for j, k in enumerate(ks):
    X_pred = U[:, 0:k].dot(np.diag(s[0:k])).dot(Vt[0:k, :])

    pred_train_scores = X_pred[(train_rows, train_cols)]
    pred_test_scores = X_pred[(test_rows, test_cols)]

    train_mae[j] = mean_absolute_error(train_scores, pred_train_scores)
    test_mae[j] = mean_absolute_error(test_scores, pred_test_scores)

    print(k,  train_mae[j], test_mae[j])

2 0.7632419407797969 0.8062531226330467
3 0.7545135187578939 0.8048283869532032
4 0.7461815082010831 0.803729719563713
5 0.7386400369670652 0.8011145270503263
6 0.7306512701022605 0.798606159176064
7 0.7245430691752702 0.7950942508422514
8 0.7184225229432085 0.7947004912749904
9 0.7131587228174484 0.7944223883210804
10 0.7078434814689719 0.7941923052812567
11 0.7031113156493303 0.7943444364320172
12 0.6981481528577465 0.7943605493580395
13 0.6929835821073164 0.7934440095633654
14 0.6884743963590056 0.7929708627863534
15 0.6836396562235089 0.7922081894793233
16 0.6790602923886585 0.7920361382802814
17 0.6747128004581309 0.7918711839004922
18 0.6703847818696135 0.7922283928765901
19 0.6660369335968889 0.7929622216206689
20 0.6618914209928964 0.7932268361329607
21 0.6575440316880845 0.7933534111379542
22 0.653316116100171 0.7936182056562894
23 0.6494722231973431 0.7941444714887819
24 0.6453853056023461 0.7942901022234803
25 0.6412792909620335 0.7951338725132165
26 0.6373078537624587 0.795

In [188]:
df.shape

(90570, 4)