In [1]:
import numpy as np
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import pairwise_distances as pwd
from sklearn.cross_validation import KFold


def uMat(df, shape):
    'Utility Matrix'
    uMat = np.zeros(shape)
    for line in df.itertuples(index = False):
        uMat[line[3]-1,line[4]-1] = line[2]
    return uMat

def sim(mat):
    'Similarity Matrix'
    sim = 1-pwd(mat, metric = 'cosine')
    eps = np.finfo(float).eps
    sim[sim <= 0] = eps
    return sim

def isn(mat):
    '''Number of nan values'''
    return np.isnan(mat).sum()


def ztn(mat):
    '''Zero to nan'''
    mat[mat == 0] = np.nan
    return mat

def ntz(mat):
    '''Nan to zero'''
    mat = np.nan_to_num(mat)
    return mat

def nmean(mat, axis = None):
    '''Return mean considering only elements differents from zero.
    Transform nan values to zero.'''
    m = np.true_divide(mat.sum(axis), (mat != 0).sum(axis))
    return ntz(m)

def normU(uMat):
    '''Normalize the utility matrix with the base rating.
    The base ratings is obtained as OverallMean+(UserMean - OverallMean)+
    +(ItemMean - OverallMean)'''
    mat = uMat.copy()
    OverallMean = ntz(nmean(mat))
    ItemMean = ntz(nmean(mat, 0))
    UserMean = ntz(nmean(mat, 1))
    baseR = UserMean[:,np.newaxis] + ItemMean[np.newaxis,:] - OverallMean 
    mat = ztn(mat)
    normMat = mat  - baseR
    mat = ntz(mat)
    normMat = ntz(normMat)
    return normMat, baseR


def pred(input_mat, cf_mode = 'u'):
    mat = input_mat.copy()
    Norm, baseR = normU(mat)
    
    if cf_mode == 'u':
        Usim = sim(Norm)
        Usimsim = (np.abs(Usim).sum(1))[:, np.newaxis]
        Upred = baseR + Usim.dot(Norm)/Usimsim
        return {'pred':Upred, 'sim':Usim}
    
    if cf_mode == 'i':
        Isim = sim(Norm.T)
        Isimsim = (np.abs(Isim).sum(1))[:, np.newaxis]
        Ipred = baseR.T + Isim.dot(mat.T)/Isimsim
        return {'pred':Ipred.T, 'sim':Isim}
    
def rmse(pred, truth):
    '''Compute the root mean squared error.
    '''
    xHat = pred[truth.nonzero()].flatten()
    x = truth[truth.nonzero()].flatten()
    return np.sqrt(mse(xHat,x))

def kfoldCV(data, shape, nf = "5", mode = 'u', verbose = False):
    kf = KFold(data.shape[0], n_folds = nf, shuffle=True)

    rmse_list = []
    print('Performing '+str(nf)+'-foldCV '+ str(mode)+'-i'+' CF...\n')
    cycle = 0
    
    for train, test in kf:
        if verbose: print('Iteration '+str(cycle))
        trMat = uMat(data.iloc[train], shape)
        teMat = uMat(data.iloc[test], shape)
        
        prediction = pred(trMat, mode)['pred']
        
        rmse_cycle = rmse(prediction, teMat)
        rmse_list.append(rmse_cycle)
        
        if verbose: print('  -->'+str(rmse_cycle)+'\n')
        cycle += 1

    kfold_rmse = np.mean(rmse_list)
    
    if verbose: print(str(nf)+'-foldCV rmse: ', str(kfold_rmse))
    
    return {"rmse_list":rmse_list, "rmse":kfold_rmse}


def train_test_validation(ratings, shape, testSize = 0.2):
    from sklearn.cross_validation import train_test_split as tts
    trSet, teSet = tts(ratings, test_size = testSize)
    trMat, teMat = uMat(trSet, shape), uMat(teSet, shape)

    print('User-Item rmse :'+str( rmse(pred(trMat)['pred'], teMat) ) )
    print('Item-Item rmse :'+str( rmse(pred(trMat,'i')['pred'], teMat) ))



In [2]:
import pandas as pd

ratings = pd.read_csv('./BX-Book-Ratings.csv',\
           header = 0,
           names = ['UserID', 'ISBN', 'BookRating'],
           encoding = 'ISO-8859-1', 
           delimiter = ';')

In [3]:
# DATA PREPROCESSING

ratings = ratings[ratings['BookRating'] != 0]   
ratings = ratings.sample(int(ratings.shape[0]*0.8))

URateNum = ratings.groupby('UserID')['BookRating'].count()
IRateNum = ratings.groupby('ISBN')['BookRating'].count()

Uidx = URateNum[URateNum < 8].index
Iidx = IRateNum[IRateNum < 8].index

ratings.drop(ratings[ratings['UserID'].isin(Uidx)].index, axis = 0, inplace = True)
ratings.drop(ratings[ratings['ISBN'].isin(Iidx)].index, axis = 0, inplace = True)

ratings['UserID'] = pd.Categorical(ratings['UserID'])
ratings['ISBN'] = pd.Categorical(ratings['ISBN'])

ratings['U'] = ratings['UserID'].cat.codes
ratings['I'] = ratings['ISBN'].cat.codes


shape = (ratings['U'].unique().shape[0], 
        ratings['I'].unique().shape[0])




In [4]:
# Import Rating Table
clust_rat_col = ['UserID', 'ISBN', 'BookRatingC','UClusterID','IClusterID']
ratings_cluster = pd.read_csv('ratings.csv', index_col = False)
ratings_cluster = ratings_cluster[clust_rat_col] 
shape_clust = (ratings_cluster.UClusterID.unique().shape[0], 
         ratings_cluster.IClusterID.unique().shape[0])    # shape

In [5]:
# Train/Test validation
train_test_validation(ratings_cluster, shape_clust)

User-Item rmse :0.268792714629
Item-Item rmse :0.896388712476


In [6]:
# kFold Cross Validation
n_fold = 10
kfoldrmse_user = kfoldCV(ratings_cluster, shape_clust,  n_fold, 'u', verbose = False)
print(str(n_fold)+'-fold rmse user-item CF: ', str(round(kfoldrmse_user['rmse'],2)))


kfoldrmse_item = kfoldCV(ratings_cluster, shape_clust, n_fold, 'i', verbose = False)
print(str(n_fold)+'-fold rmse item-item CF: ', str(round(kfoldrmse_item['rmse'],2)))

Performing 10-foldCV u-i CF...

10-fold rmse user-item CF:  0.22
Performing 10-foldCV i-i CF...

10-fold rmse item-item CF:  0.95


In [None]:
# ONLINE RECOMMENDATION

In [7]:
from random import shuffle

def makeUser(ratings, filename = 'user.txt', books_num = 10):
    ISBN_list = ratings['ISBN'].sample(books_num)
    with open(filename, 'w') as f:    # prepare book list
        for ISBN in ISBN_list[:books_num+1]:
            f.write(str(ISBN)+'\n')


def readBooklist(file_path):
    df = pd.read_csv(file_path, 
                     header = None, 
                     names = ['ISBN'], 
                    dtype = 'str')
    vector = df.values.flatten().tolist()
    
    return vector

def User(vector, ratings, shape):
    user_vec = np.zeros(shape[1])
    for isbn in vector:
        i = (ratings_cluster[ratings_cluster.ISBN == isbn]
             ['IClusterID'].unique().tolist())
        user_vec[i] = 1
    return user_vec.astype(np.int)

def predJac(ratings_cluster, shape_clust, cf_mode = 'u'):
    prediction = pred(uMat(ratings_cluster, shape_clust), cf_mode='u')['pred']
    predMean = round(prediction.mean(), 2)
    prediction[prediction <= prediction.mean()-1] = 0
    prediction[prediction>prediction.mean()-1] = 1
    
    return prediction


def recommend(ratings_cluster, vector, Y, n_rec = 3, sim_threshold = 0.95):
    similarity = recCluster(user_vec, Y)
    books = []
    sim = similarity[similarity >= sim_threshold]
    for clusterID in sim.index:
        cluster = ratings_cluster[ratings_cluster['UClusterID'] == clusterID]
        if cluster['BookRatingC'].unique().flatten() > 8 :
            books.extend(cluster[cluster['BookRating']>8]['ISBN'].tolist())
    
    books = set(books)                           
    m = list(books.intersection(vector))
    for i in m: books.discard(i)
    books = list(books)
    shuffle(books)
    return list(books[:n_rec])

from sklearn.metrics import jaccard_similarity_score as jaccard

def recCluster(user_vec, utility):
    similarity = np.zeros(utility.shape[0])
    for i in range(utility.shape[0]):
        similarity[i] = jaccard(user_vec, utility[i,:])
    sim = pd.Series(similarity)
    sim.sort_values(ascending = False, inplace = True)
    return sim

import isbnlib

def isbn2title(isbn_list):
        titles = []
        for isbn in isbn_list:
            titles.append(isbnlib.meta(isbn)['Title'])
        return titles

def titles(user_isbn, rec_isbn):
    book_titles = isbn2title(rec_isbn)
    read_books = isbn2title(user_isbn)

    book_titles = set(book_titles)
    common = list(book_titles.intersection(set(read_books)))
    for i in common: book_titles.remove(i)

    print('You have read:', 20*'-', *read_books, '\n',sep = '\n')
    print('You could like:',20*'-',*book_titles,sep = '\n')
    return (read_books,book_titles)

In [13]:
## Examples

# Random
makeUser(ratings, books_num=20) # create random user

user_file = './user.txt'
HPExample = './HPexample.txt'




## Read book list in memory

# Read from examples
example_num = 'HP'
example = pd.read_csv('example'+str(example_num)+'.txt', sep='#', header = None)
vector = example[0]    # set vector from example


#vector = readBooklist(user_file)
#vector = readBooklist(HPExample) 

user_vec = User(vector, ratings_cluster, shape_clust)    # create the user row 
Y = predJac(ratings_cluster, shape_clust, cf_mode = 'u')    # create 0/1 matrix from User
                                                            # Collaborative Filter predictions
similarity = recCluster(user_vec, Y) # calculate user row jaccard similarity with user clusters
full_rat = pd.read_csv('./ratings.csv', index_col = 0)    # load in memory the full ratings

# Find books to recommend
books = recommend(full_rat, 
                  similarity, 
                  Y,
                  n_rec = 5,    # number of books to recommend
                  sim_threshold = 0.99)    # cluster similarity threshold

In [10]:
# Output ISBN (as original program)

print('You have read:', *vector, sep = '\n')
print('We suggest you:',*books, sep = '\n')
titles(vector, books)

You have read:
0525946233    
034540288X    
0807083054    
0316693324    
0743203178    
0375412824    
0440487617    
0440184053    
0425116840    
1573227331    
1558746226    
0553562738    
0373834284    
0345413350    
0451167317    
0842329269    
0345426037    
2266023039    
0671722751    
0446356875    
We suggest you:
0836218655
0446364282
0345339711
0440998050
0440904196


KeyboardInterrupt: 

In [14]:
# INPUT AND ANSWER
print(example)
print(pd.read_csv('answer'+str(example_num)+'.txt', sep = '#', header = None))

               0                                         1
0  9780747560722   Harry Potter And The Chamber Of Secrets
1  9780439708180     Harry Potter And The Sorcerer's Stone
2  9780439136365  Harry Potter And The Prisoner Of Azkaban
3  9780618260300      The Hobbit, Or, There And Back Again
           0                                                  1
0   60928336                                 Bel Canto: A Novel
1   60188731                        The MacGregors: Daniel, Ian
2  373483902    Divine secrets of the Ya-Ya Sisterhood: a novel
3  590846280                                 A bend in the road
4  446611867  The Adventures Of Captain Underpants: An Epic ...
