# Author: Pradeep Sathyamurthy
# Project: Case-2
# Topic Covered: Item Based Recommender Modelling Using Pandas and Numpy
# Start Date: 25 Oct 2017
# Last Modified: 25 Nov 2017
# Dataset: Movie Lens 100K <a href='https://grouplens.org/datasets/movielens/100k/'>Link</a>
# Citation: The recommender wrapper was incorporated from Prof. Bamshad's course (CSC-478), and the necessary changes were made in order to customize it for the MovieLens dataset and to make it work as expected.

## <font color='green'> Abstract: </font>
#### This is part 3 of my Recommender Modelling case study for the main project. In this part I build an item-based collaborative filtering model manually using NumPy and Pandas.
#### With the Part-2 implementation through GraphLab we were able to infer that this MovieLens dataset works better with an item-based recommender model.
#### However, in GraphLab the actual algorithm is encapsulated and the user has less control over it.
#### In order to get used to the actual algorithm behind it, I built this implementation.
#### For this part I will be using the same Movie Lens dataset (u.data) and try to build item based collaborative filtering recommendation model and compare the results generated from models built in earlier parts and provide my conclusion.

In [40]:
# importing the necessary packages
import numpy as np
import pandas as pd
import math
#import prady_Item_CF as ICF

## Step-1: Reading the rating data

In [13]:
# Load the MovieLens ratings file (tab-separated, read without a header row).
# The dataset speaks of "items"; every item here is a movie, so the column is
# named movie_id instead.
# Each row is one rating given by one user to one movie.
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
data_rating = pd.read_csv(
    'u.data',
    delimiter='\t',
    header=None,
    names=rating_cols,
    encoding='utf-8',
)
print(data_rating.shape)
data_rating.head(3)

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116


In [14]:
# Only user_id, movie_id and rating feed the rating matrix built below,
# so the timestamp column is discarded here.
data_rating = data_rating.drop(labels=['timestamp'], axis='columns')
data_rating.head(3)

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1


## Step-2: Converting the dataframe to a sparse matrix:

In [30]:
# Pivot the long (user_id, movie_id, rating) table into a dense
# user x movie matrix; cells stay 0 where a user has not rated a movie.
ratingArray = np.array(data_rating)

# np.unique returns the sorted distinct ids and, for every input row, the
# position of its id in that sorted list -- i.e. its row / column in the matrix.
rows, row_pos = np.unique(ratingArray[:, 0], return_inverse=True)
cols, col_pos = np.unique(ratingArray[:, 1], return_inverse=True)

pivot_table = np.zeros((len(rows), len(cols)), dtype=ratingArray.dtype)
pivot_table[row_pos, col_pos] = ratingArray[:, 2]

# Label the axes with the ids actually present in the data rather than
# hard-coded ranges, so the cell also works for a ratings file of another size.
data_sparsed = pd.DataFrame(pivot_table, index=rows, columns=cols)

In [31]:
data_sparsed.head(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5,3,4,3,3,5,4,1,5,3,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Back up the user x movie rating matrix to a CSV file.
data_sparsed.to_csv('Prady_Sparsed_Rating_Matrix.csv')

#### 1. In the above dataframe each row signifies a user_id and each column signifies a movie_id
#### 2. The cells of this matrix hold the rating given by each user for the corresponding movie (0 where the user has not rated it)
#### 3. Just as a backup, I have stored the rating matrix in a CSV file

In [33]:
# Keep the rating matrix as a plain NumPy array as well. The functions below
# index it by position, so row 0 is user_id 1 and column 0 is movie_id 1.
rating_array = np.array(data_sparsed)

In [34]:
rating_array.shape

(943, 1682)

## Step-3: Creating the Function for calculating the distance metric - Used Cosine Similarity Measure

In [46]:
def cosineSim(inA, inB):
    """Return the cosine similarity of two equal-length numeric vectors.

    Computed directly with NumPy as ``dot(a, b) / (|a| * |b|)``. The previous
    body called ``spatial.distance.cosine``, but ``spatial`` is never imported
    in this file, so every call raised NameError.

    Parameters
    ----------
    inA, inB : array-like
        Vectors of the same length.

    Returns
    -------
    float
        Similarity in [-1, 1] (up to floating-point rounding). If either
        vector has zero length the cosine is undefined and 0.0 is returned
        instead of nan.
    """
    vec_a = np.asarray(inA, dtype=float).ravel()
    vec_b = np.asarray(inB, dtype=float).ravel()

    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if norm_product == 0:
        # An all-zero vector has no direction; treat it as "no similarity".
        return 0.0
    return float(np.dot(vec_a, vec_b) / norm_product)

## Step-4: Function to Train the dataset for Item Based Collaborative Filtering Recommender Model:

In [47]:
def getsim(dataMat, simMeas = cosineSim):
    """Build the item-item similarity lookup used by the estimators below.

    For every pair of items the Pearson correlation (``np.corrcoef``) is
    computed over the users who rated BOTH items; a rating of 0 means
    "not rated".

    Parameters
    ----------
    dataMat : 2-D array, users x items
        Rating matrix with 0 for missing ratings.
    simMeas : callable, optional
        Kept for backward compatibility only. It is NOT used: the similarity
        is always the Pearson correlation, as in the original implementation.

    Returns
    -------
    dict[int, dict[int, float]]
        ``result[i][j]`` is the correlation between item ``i`` and item ``j``
        (zero-based column positions). The value is 0 when fewer than two
        users rated both items, or when the correlation is undefined because
        one of the two rating vectors is constant.

    Notes
    -----
    The correlation is symmetric, so each pair is computed once and stored
    in both directions. ``result[i][j]`` and ``result[j][i]`` are therefore
    exactly equal; computed separately they could differ in the last
    floating-point digit.
    """
    item_ratings = np.asarray(dataMat).T   # one row per item, one column per user
    n = item_ratings.shape[0]              # number of items
    rated = item_ratings != 0              # rated[i, u] -> user u rated item i

    simL = {item: {} for item in range(n)}

    for item in range(n):
        for j in range(item, n):
            both = rated[item] & rated[j]  # users who rated both items

            if np.count_nonzero(both) < 2:
                # Correlation needs at least two co-ratings; np.corrcoef
                # would only produce nan plus a warning here.
                correlation = 0
            else:
                # A constant rating vector has zero variance -> nan; silence
                # the floating-point warning and map nan to 0 below.
                with np.errstate(divide='ignore', invalid='ignore'):
                    correlation = np.corrcoef(item_ratings[item][both],
                                              item_ratings[j][both])[0][1]
                if math.isnan(correlation):
                    correlation = 0

            simL[item][j] = correlation
            simL[j][item] = correlation

    return simL

## Step-5: Function to estimate the rating for sparsed data

In [52]:
def standEstTest(dataMat, user, similarity, item):
    """Estimate the rating `user` would give `item`.

    The estimate is the similarity-weighted average of the ratings the user
    has already given, where the weight of each rated item is its similarity
    to `item`. Non-positive similarities are treated as weight 0.

    dataMat    -- 2-D rating array (users x items), 0 meaning "not rated"
    user       -- row position of the user in dataMat
    similarity -- nested lookup, similarity[item][other] -> similarity value
    item       -- column position of the item to estimate

    Returns the weighted average, or 0 when the total weight is 0.
    """
    item_count = np.shape(dataMat)[1]
    weight_sum = 0.0
    weighted_ratings = 0.0

    for other in range(item_count):
        rating = dataMat[user, other]
        if rating != 0:
            weight = similarity[item][other]
            # Negatively (or un-) correlated items must not pull the estimate.
            if weight <= 0:
                weight = 0
            weight_sum += weight
            weighted_ratings += weight * rating

    return weighted_ratings / weight_sum if weight_sum != 0 else 0

## Step-6: Function to recommend based on item similarity by taking user interest into consideration

In [57]:
def recommend(dataMat, user, simMeas, N=3, estMethod=standEstTest):
    """Return the N unrated items with the highest estimated rating for `user`.

    Parameters
    ----------
    dataMat : 2-D array, users x items
        Rating matrix with 0 for "not rated".
    user : int
        Row position of the user in dataMat.
    simMeas
        Passed unchanged to `estMethod` as its similarity argument; the call
        in this file passes the lookup built by getsim.
    N : int, optional
        Number of recommendations to return (default 3).
    estMethod : callable, optional
        Called as ``estMethod(dataMat, user, simMeas, item)`` and expected to
        return the estimated rating.

    Returns
    -------
    list of (item, estimated_score)
        Sorted by score, highest first, at most N entries; ``item`` is the
        column position in dataMat. If the user has no unrated item the
        string 'you rated everything' is returned instead.
    """
    # Plain ndarray view of the user's row; avoids the discouraged np.matrix.
    user_ratings = np.asarray(dataMat[user, :]).ravel()
    unratedItems = np.flatnonzero(user_ratings == 0)

    if len(unratedItems) == 0:
        return 'you rated everything'

    itemScores = [(item, estMethod(dataMat, user, simMeas, item))
                  for item in unratedItems]

    return sorted(itemScores, key=lambda jj: jj[1], reverse=True)[:N]

## Step-7: Function to perform evaluation on a single user based on the test_ratio
### For example, with test_ratio = 0.2, a randomly selected 20 percent of rated items by the user are withheld and the rest are used to estimate the withheld ratings

In [76]:
def cross_validate_user(dataMat, user, test_ratio, simMeas, estMethod=standEstTest):
    """Hold out part of one user's ratings and measure the estimation error.

    A random ``test_ratio`` share of the items rated by `user` is temporarily
    set to 0 in `dataMat`, each withheld rating is re-estimated with
    `estMethod`, and the absolute errors are summed. `dataMat` is restored
    before returning, including when `estMethod` raises.

    Parameters
    ----------
    dataMat : 2-D array, users x items
        Rating matrix with 0 for "not rated". Modified in place during the
        call and restored afterwards.
    user : int
        Row position of the user in dataMat.
    test_ratio : float
        Fraction of the user's rated items to withhold (e.g. 0.2).
    simMeas
        Passed unchanged to `estMethod` as its similarity argument.
    estMethod : callable, optional
        Called as ``estMethod(dataMat, user, simMeas, item)``.

    Returns
    -------
    (error_u, count_u)
        Sum of absolute errors over the withheld items and the number of
        withheld items. Accumulate both over all users to obtain the MAE.

    Notes
    -----
    The withheld items are drawn from NumPy's global random state, so results
    vary between runs unless ``np.random.seed`` is set beforehand.
    """
    original_user_profile = np.copy(dataMat[user])
    rated_items_by_user = np.flatnonzero(original_user_profile > 0)

    # Never ask for more items than the user has rated.
    test_size = min(int(test_ratio * len(rated_items_by_user)),
                    len(rated_items_by_user))

    # Sample WITHOUT replacement: np.random.randint could pick the same item
    # several times, scoring it repeatedly and withholding fewer distinct items.
    withheld_items = np.random.choice(rated_items_by_user, size=test_size,
                                      replace=False)

    # Hide the withheld ratings so they cannot be used by the estimator.
    dataMat[user, withheld_items] = 0
    error_u = 0.0
    count_u = len(withheld_items)

    try:
        # Accumulate the absolute error over all withheld items.
        for item in withheld_items:
            estimatedScore = estMethod(dataMat, user, simMeas, item)
            error_u += abs(estimatedScore - original_user_profile[item])
    finally:
        # Always give the caller's matrix back unchanged.
        dataMat[user, withheld_items] = original_user_profile[withheld_items]

    return error_u, count_u

## Step-8: Function to compute the Model Evaluation - Mean Absolute Error

In [71]:
def test(dataMat, test_ratio, simMeas):
    """Compute and print the Mean Absolute Error of the recommender.

    Runs cross_validate_user for every row (user) of `dataMat`, sums the
    absolute errors and the number of withheld ratings, and divides the two.

    Parameters
    ----------
    dataMat : 2-D array, users x items
        Rating matrix with 0 for "not rated".
    test_ratio : float
        Fraction of each user's ratings to withhold.
    simMeas
        Passed unchanged to cross_validate_user as its similarity argument.

    Returns
    -------
    float
        The MAE over all withheld ratings.

    Raises
    ------
    ZeroDivisionError
        If no rating was withheld for any user (for example an empty matrix
        or a test_ratio too small to select a single item).
    """
    error = 0.0
    count = 0
    for user in range(np.shape(dataMat)[0]):
        er, ct = cross_validate_user(dataMat, user, test_ratio, simMeas)
        error += er
        count += ct

    if count == 0:
        # Same exception type as the bare division, but with an explanation.
        raise ZeroDivisionError(
            "no ratings were withheld for any user; cannot compute the MAE")

    MAE = error / count
    # The model evaluated in this file is item-based; the label used to say
    # "user-based".
    print("The MAE for item-based collaborative filtering is: %0.5f" % MAE)
    return MAE

## Step-9: Computing the item-item similarity (Pearson correlation, via getsim) for each pair of items

In [48]:
# Build the item-item similarity lookup for every pair of movies.
# NOTE: getsim scores each pair with np.corrcoef (Pearson correlation over the
# users who rated both movies); its simMeas argument is not used.
simMat = getsim(rating_array)

  c = cov(x, y, rowvar)


In [49]:
print (len(simMat))

1682


In [50]:
simMat[0]

{0: 0.99999999999999989,
 1: 0.22178411018797145,
 2: 0.17579429803514823,
 3: 0.10313529410109291,
 4: 0.38647516379283531,
 5: 0.52940092554398921,
 6: 0.15924799761320829,
 7: 0.24736715740480411,
 8: 0.090176037480161592,
 9: 0.18750154919627687,
 10: 0.31045375453461849,
 11: 0.086473046817654303,
 12: 0.013655367616886739,
 13: 0.12445664945873652,
 14: 0.096277919210685139,
 15: 0.3091579884856358,
 16: 0.28786626943656068,
 17: 0.45652173913043481,
 18: 0.14436417532667772,
 19: 0.070117462970397268,
 20: 0.29043036154297142,
 21: 0.30148137816807946,
 22: 0.10966983236230071,
 23: 0.17746272042633648,
 24: 0.11287513826792017,
 25: 0.079519992828935362,
 26: 0.026839370987441505,
 27: 0.35071558140929671,
 28: 0.30613175197210551,
 29: -0.16826090464408092,
 30: 0.26523380349053388,
 31: -0.028256770241451921,
 32: 0.11619700061650476,
 33: -0.28912165479145596,
 34: 0.44610890810959708,
 35: 0.50404864909094493,
 36: -0.045037734911104486,
 37: 0.34410854979507666,
 38: 0.240

## Step-10: Testing Rating Estimation Function

In [54]:
# Arguments are zero-based positions in rating_array:
# user 0 is user_id 1 and item 300 is movie_id 301.
standEstTest(rating_array, 0, simMat, 300)

3.2575875274689814

## Step-11: Evaluating the Model using MAE

In [77]:
# Withhold 20% of each user's ratings and report the mean absolute error.
# The withheld items are drawn with np.random and no seed is set in this
# notebook, so the value changes from run to run.
test(rating_array, 0.2, simMat)

The MAE  is for user-based collaborative filtering is: 0.71953


0.71952644555939593

## Step-12: Model Recommendation

In [58]:
# Top-3 recommendations (N defaults to 3) for row 1 of rating_array, i.e.
# user_id 2. The returned item numbers are zero-based column positions, so
# add 1 to obtain the movie_id.
recommend(rating_array, 1, simMat)

[(441, 5.0), (599, 5.0), (776, 5.0)]

## <font color='red'>      End of the Book!!!   </font>