### Importing Libraries

In [91]:
import pandas as pd
import numpy as np

### Importing Datasets

In [92]:
# Load the user table; the file is pipe-separated and column names are supplied here.
u_cols = ["user_id", "age", "sex", "occupation", "zip_code"]
users = pd.read_csv(
    "01_AV_MovieLens/ml-100k/u.user",
    sep="|",
    names=u_cols,
    encoding="latin-1",
)

In [93]:
# Load the ratings table; the file is tab-separated and column names are supplied here.
r_cols = ["user_id", "movie_id", "rating", "unix_timestamp"]
ratings = pd.read_csv(
    "01_AV_MovieLens/ml-100k/u.data",
    sep="\t",
    names=r_cols,
    encoding="latin-1",
)

In [94]:
# Load the movie table: five descriptive columns followed by one flag column per genre.
# NOTE(review): these names use spaces ('movie id') while the ratings table uses
# underscores ('movie_id') — keep that in mind when joining the two.
i_cols = [
    "movie id", "movie title", "release date", "video release date", "IMDb URL",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
items = pd.read_csv(
    "01_AV_MovieLens/ml-100k/u.item",
    sep="|",
    names=i_cols,
    encoding="latin-1",
)

In [95]:
# Row counts of the three loaded tables.
print("No. of users:", len(users))
print("No. of Ratings:", len(ratings))
print("No. of items:", len(items))

No. of users: 943
No. of Ratings: 100000
No. of items: 1682


In [96]:
# Preview the first rows of the users table.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [97]:
# Preview the first rows of the ratings table (one row per user/movie rating).
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [98]:
# Preview the first rows of the movie table.
items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Creating User-item Rating matrix

In [278]:
# Number of distinct users and distinct movies that appear in the ratings table.
n_users = len(ratings["user_id"].unique())
n_items = len(ratings["movie_id"].unique())

In [279]:
# Users as rows, movies as columns, ratings as cell values; pairs with no rating are NaN.
UI_mtx = ratings.pivot_table(index="user_id", columns="movie_id", values="rating")

In [280]:
# Preview the user-item matrix; NaN marks movies the user has not rated.
UI_mtx.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [281]:
# (number of users, number of movies): rows are user_id, columns are movie_id.
UI_mtx.shape

(943, 1682)

In [282]:
# Total number of non-missing cells, i.e. how many ratings the matrix holds.
UI_mtx.count().sum()

100000

# 1. Noisy Ratings Detection

In [303]:
## Classify each rating of the User-Item Rating Matrix into Weak/Average/Strong Preference
## Input: User-item rating matrix 
## Output: Matrix with ratings classified

def rating_classification(UI_mtx, k, v):
    """
    Label every rating as a weak, average or strong preference.

    Each entry ``r`` is tested in this order and receives the first label
    whose condition holds:

    * ``'W'`` (weak)    when ``0 < r < k``
    * ``'A'`` (average) when ``k <= r < v``
    * ``'S'`` (strong)  when ``r >= v``
    * ``''``            otherwise (missing/NaN ratings match none of the above)

    :param: UI_mtx (pandas.DataFrame or 2-D array-like) : The user-item rating
            matrix; NaN means "not rated".
    :param: k (number) : Threshold separating weak from average ratings.
    :param: v (number) : Threshold separating average from strong ratings.

    :returns: Array of labels with the same shape as ``UI_mtx``.
    :rvalue: numpy.ndarray of str
    """
    values = np.asarray(UI_mtx, dtype=float)

    # NaN compares False against every threshold, so unrated cells end up ''.
    with np.errstate(invalid='ignore'):
        conditions = [
            (values > 0) & (values < k),
            (values >= k) & (values < v),
            values >= v,
        ]

    # np.select takes the first matching condition, mirroring the if/elif order.
    return np.select(conditions, ['W', 'A', 'S'], default='')

In [233]:
## User-class classification
## Input: Classified-rating matrix
## Output: User-class list

def user_classification(rss):
    """
    Assign each user (row) a class from the mix of rating labels in that row.

    With W, A and S the number of 'W', 'A' and 'S' labels in the row, the
    first matching rule wins:

    * ``'Critical'``   when ``W >= A + S``
    * ``'Average'``    when ``A >= W + S``
    * ``'Benevolent'`` when ``S >= W + A``
    * ``'Variable'``   otherwise

    A row with no labels at all has W = A = S = 0 and is therefore 'Critical'.

    :param: rss (pandas.DataFrame or 2-D array-like) : The classified-rating
            matrix holding 'W' / 'A' / 'S' / '' labels, users as rows.

    :returns: One class name per row, in row order.
    :rvalue: list of str
    """
    labels = np.asarray(rss, dtype=object)

    # Count each label independently: a label that never occurs in a row must
    # yield 0 for that label only, not reset the other two counts.
    weak_counts = (labels == 'W').sum(axis=1)
    average_counts = (labels == 'A').sum(axis=1)
    strong_counts = (labels == 'S').sum(axis=1)

    user_class = []
    for W, A, S in zip(weak_counts, average_counts, strong_counts):
        if W >= A + S:
            user_class.append('Critical')
        elif A >= W + S:
            user_class.append('Average')
        elif S >= W + A:
            user_class.append('Benevolent')
        else:
            user_class.append('Variable')

    return user_class

In [238]:
## Item-class classification
## Input: Classified-rating matrix
## Output: Item-class list

def item_classification(rss):
    """
    Assign each item (column) a class from the mix of rating labels in that column.

    With W, A and S the number of 'W', 'A' and 'S' labels in the column, the
    first matching rule wins:

    * ``'Wp'`` when ``W >= A + S``
    * ``'Ap'`` when ``A >= W + S``
    * ``'Sp'`` when ``S >= W + A``
    * ``'Vp'`` otherwise

    A column with no labels at all has W = A = S = 0 and is therefore 'Wp'.

    :param: rss (pandas.DataFrame or 2-D array-like) : The classified-rating
            matrix holding 'W' / 'A' / 'S' / '' labels, items as columns.

    :returns: One class name per column, in column order.
    :rvalue: list of str
    """
    labels = np.asarray(rss, dtype=object)

    # Count each label independently: a label that never occurs in a column
    # must yield 0 for that label only, not reset the other two counts.
    weak_counts = (labels == 'W').sum(axis=0)
    average_counts = (labels == 'A').sum(axis=0)
    strong_counts = (labels == 'S').sum(axis=0)

    item_class = []
    for W, A, S in zip(weak_counts, average_counts, strong_counts):
        if W >= A + S:
            item_class.append('Wp')
        elif A >= W + S:
            item_class.append('Ap')
        elif S >= W + A:
            item_class.append('Sp')
        else:
            item_class.append('Vp')

    return item_class

In [247]:
## Noisy rating identification
## Input: classified rating mtx, user classes list, item classes list
## Output: identified noisy/non-noisy ratings mtx

def noise_identification(UI_mtx, rss, uc, ic):
    """
    Mark ratings that contradict both their user's and their item's class.

    A cell is replaced by the string ``'N'`` when one of these holds:

    * user is 'Critical',   item is 'Wp', rating label is 'A' or 'S'
    * user is 'Average',    item is 'Ap', rating label is 'W' or 'S'
    * user is 'Benevolent', item is 'Sp', rating label is 'W' or 'A'

    All other cells keep their value from ``UI_mtx``. Inputs are matched by
    position (row i of ``rss`` / ``uc[i]`` with row i of ``UI_mtx``, and the
    same for columns), not by index label. ``UI_mtx`` itself is not modified.

    :param: UI_mtx (pandas.DataFrame) : The user-item rating matrix.
    :param: rss (pandas.DataFrame or 2-D array-like) : 'W' / 'A' / 'S' / ''
            label for every cell of ``UI_mtx``.
    :param: uc (sequence of str) : User class per row.
    :param: ic (sequence of str) : Item class per column.

    :returns: Copy of ``UI_mtx`` with noisy cells set to 'N'. Columns that
              received an 'N' have object dtype; the others keep a numeric dtype.
    :rvalue: pandas.DataFrame
    :raises ValueError: if ``rss`` does not have the same shape as ``UI_mtx``.
    """
    rating_cls = np.asarray(rss, dtype=object)
    if rating_cls.shape != UI_mtx.shape:
        raise ValueError(
            "rss shape {} does not match UI_mtx shape {}".format(
                rating_cls.shape, UI_mtx.shape))

    # Column vector of user classes and row vector of item classes, so the
    # comparisons below broadcast to the full (users x items) grid.
    user_cls = np.asarray(uc, dtype=object)[:, None]
    item_cls = np.asarray(ic, dtype=object)[None, :]

    weak = rating_cls == 'W'
    average = rating_cls == 'A'
    strong = rating_cls == 'S'

    noisy = (
        ((user_cls == 'Critical') & (item_cls == 'Wp') & (average | strong))
        | ((user_cls == 'Average') & (item_cls == 'Ap') & (weak | strong))
        | ((user_cls == 'Benevolent') & (item_cls == 'Sp') & (weak | average))
    )

    # Work on an object copy: writing the string 'N' straight into float
    # columns depends on implicit dtype upcasting, which pandas has deprecated.
    values = UI_mtx.to_numpy(dtype=object, copy=True)
    values[noisy] = 'N'
    nmtx = pd.DataFrame(values, index=UI_mtx.index, columns=UI_mtx.columns)

    # Give columns that contain no 'N' their numeric dtype back.
    return nmtx.infer_objects()

In [283]:
# Thresholds k=2 and v=4: ratings below 2 are weak, from 2 up to (but not
# including) 4 average, and 4 or above strong.
rss = pd.DataFrame(data=rating_classification(UI_mtx, k=2, v=4))
rss.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,S,A,S,A,A,S,S,W,S,A,...,,,,,,,,,,
1,S,,,,,,,,,A,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,S,A,,,,,,,,,...,,,,,,,,,,


In [284]:
# One class label per user (rows of rss) and one per item (columns of rss).
uc = user_classification(rss)
ic = item_classification(rss)

In [285]:
# Copy of UI_mtx in which ratings judged noisy are replaced by the string 'N'.
nmtx = noise_identification(UI_mtx, rss, uc, ic)

In [286]:
# Preview the matrix with noisy ratings marked 'N'.
nmtx.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,N,3.0,5.0,4.0,N,5.0,N,...,,,,,,,,,,
2,4.0,,,,,,,,,N,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [290]:
# Replace missing ratings (NaN) with 0. Cells marked 'N' are left as the
# string 'N' — this step does not correct or remove them.
# NOTE(review): despite the name, the matrix still contains 'N' strings, so it
# is not purely numeric yet — confirm how downstream steps handle them.
noise_correct_UI_mtx = nmtx.fillna(0)
noise_correct_UI_mtx.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5,3,4,N,3,5,4,N,5,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,0,0,0,0,0,0,0,0,N,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4,3,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Naive Approach 
Predict every rating as the global mean rating (taken over all users and all items) and report the resulting RMSE as a baseline.

In [289]:
# Baseline error: how far the observed ratings are from the global mean rating.
mean_rating = ratings["rating"].mean()
squared_errors = (ratings["rating"] - mean_rating) ** 2
base_rmse = np.sqrt(squared_errors.mean())
print("RMSE = {}".format(base_rmse))

RMSE = 1.125667970762062


#### User-User CF 

In [256]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

In [None]:
def User_User_CF(rss):
    """
    Obtains the RMSE error in the user-user collaborative
    filtering algorithms using all the similar users.

    Every rating a user has given is predicted from the ratings the other
    users gave to the same product, weighted by cosine similarity, and the
    prediction is compared with the actual rating.

    :param: rss (numpy.ndarray or array-like) : The dense user-item matrix,
            users as rows and products as columns. Entries > 0 are ratings;
            0 means "not rated". Must be fully numeric (no 'N' markers).
            NOTE(review): missing ratings should be filled with 0 beforehand —
            NaN is presumably rejected by cosine_similarity; confirm.

    :returns: RMSE of predictions, or NaN when the matrix holds no ratings.
    :rvalue: float
    """
    reviews = np.asarray(rss, dtype=float)
    rated = reviews > 0

    # Get the user-user similarity matrix
    user_user_similarity = cosine_similarity(reviews)

    sqdiffs = 0.0
    num_preds = 0

    # to protect against divide by zero issues
    eps = 1e-6

    # loop over the users
    for user_i, u in enumerate(reviews):

        # products user HAS rated
        i_rated = np.flatnonzero(rated[user_i])
        if i_rated.size == 0:
            # nothing to predict for this user
            continue

        # average of the ratings this user gave (not of the column indices)
        user_avg = u[i_rated].mean()

        # loop over all the products that each user reviewed
        for i_product in i_rated:

            # all OTHER users (indices) that have also rated this product
            i_has_rated = np.flatnonzero(rated[:, i_product])
            i_others_have_rated = i_has_rated[i_has_rated != user_i]

            # Ratings of this product by the other users, centred on the
            # target user's average.
            # NOTE(review): the usual formulation centres each neighbour on
            # its own average; kept as written in the original — confirm.
            neighbour_ratings = reviews[i_others_have_rated, i_product] - user_avg

            # cosine similarity between the user and those other users
            similarities = user_user_similarity[user_i, i_others_have_rated]

            # denominator of user_i's product prediction
            norm = np.sum(similarities)
            if norm == 0:
                norm = eps

            # prediction of user u on product i: p_{u,i}
            predicted_rating = user_avg + np.sum(
                neighbour_ratings * similarities) / norm

            # actual rating by user u on product i
            actual_rating = reviews[user_i, i_product]

            # accumulate the squared error of EVERY prediction
            sqdiffs += (predicted_rating - actual_rating) ** 2
            num_preds += 1

    if num_preds == 0:
        return float('nan')

    # root of the mean squared error over all predictions
    rmse_cossim = np.sqrt(sqdiffs / num_preds)
    return rmse_cossim

In [274]:
# user similarity 
b = cosine_similarity(UI_mtx.fillna(0))
#np.fill_diagonal(b, 0 )
similarity_with_user = pd.DataFrame(b,index=UI_mtx.index)
similarity_with_user.columns = UI_mtx.index
similarity_with_user.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
3,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [292]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows of UI_mtx (i.e. whole users) as the test set.
# A fixed random_state makes the split identical on every re-run.
train, test = train_test_split(UI_mtx, test_size=0.2, random_state=42)