### Preprocessing data


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

In [2]:
ratings=pd.read_csv('/Users/aditya16.narula/Sites/BookReco/goodbooks-10k/ratings.csv')
books=pd.read_csv('/Users/aditya16.narula/Sites/BookReco/goodbooks-10k/books.csv')
print(ratings.columns)
#books['elo']=1200.0
print(books.columns)
print(ratings.head(1))

Index(['user_id', 'book_id', 'rating'], dtype='object')
Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',
       'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',
       'original_title', 'title', 'language_code', 'average_rating',
       'ratings_count', 'work_ratings_count', 'work_text_reviews_count',
       'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
       'image_url', 'small_image_url'],
      dtype='object')
   user_id  book_id  rating
0        1      258       5


### Get books liked by a particular User ID

In [3]:

userBooks = ratings.loc[ratings['user_id'] == 1]
userbookdetails = pd.merge(userBooks, books, left_on = 'book_id', right_on = 'goodreads_book_id')
print(userbookdetails)

    user_id  book_id_x  rating  book_id_y  goodreads_book_id  best_book_id  \
0         1         11       5         54                 11        386162   
1         1         33       4        189                 33            33   
2         1         10       4       3753                 10            10   
3         1         36       4       4229                 36            36   
4         1        119       3       3230                119           119   
5         1         13       4        337                 13            13   
6         1       2002       5       9048               2002          2002   
7         1         67       3       3504                 67            67   
8         1        378       3        561                378           378   
9         1         98       3       7683                 98            98   
10        1       5191       4       1311               5191          5191   
11        1        231       3       4081                231    

### Sparse Matrix representation

In [4]:
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


from scipy.sparse import coo_matrix, csr_matrix
def read_data(filename):
    """ Reads in the last.fm dataset, and returns a tuple of a pandas dataframe
    and a sparse matrix of song/user/playcount """
    # read in triples of user/song/playcount from the input dataset
    data = pd.read_csv(filename,
                             usecols=[0,1,2],        #[36, 11, 10] vrk_pat_primkey,prd_atc_primkey,vdp_aantal
                             names=['song', 'user','plays'],skiprows=1) #[:1000000]   # user = patient, or prescriptionnr song=atc

    data=data.dropna(axis=0, how='any')  #drop nan
    data['plays']=data['plays']+1
    print(data.head())
    # map each song and user to a unique numeric value
    data['user'] = data['user'].astype("category")
    data['song'] = data['song'].astype("category")

    # create a sparse matrix of all the users/plays
    plays = coo_matrix((data['plays'].astype(float),
                       (data['song'].cat.codes.copy(),
                        data['user'].cat.codes.copy())))
    data['song_nr']=data['song'].cat.codes.copy()
    return data, plays,data.groupby(['song_nr','song']).plays.sum(),data['user'].cat.codes.copy()

data,matrix,songsd,user=read_data('/Users/aditya16.narula/Sites/BookReco/goodbooks-10k/ratings.csv')
data.head()

   song  user  plays
0     1   258      6
1     2  4081      5
2     2   260      6
3     2  9296      6
4     2  2318      4


Unnamed: 0,song,user,plays,song_nr
0,1,258,6,0
1,2,4081,5,1
2,2,260,6,1
3,2,9296,6,1
4,2,2318,4,1


### Normalize

In [5]:
from sklearn.preprocessing import normalize


def cosine(plays):
    normalized = normalize(plays)
    return normalized.dot(normalized.T)


def bhattacharya(plays):
    plays.data = np.sqrt(plays.data)
    return cosine(plays)


def ochiai(plays):
    plays = csr_matrix(plays)
    plays.data = np.ones(len(plays.data))
    return cosine(plays)


def bm25_weight(data, K1=1.2, B=0.8):
    """ Weighs each row of the matrix data by BM25 weighting """
    # calculate idf per term (user)
    N = float(data.shape[0])
    idf = np.log(N / (1 + np.bincount(data.col)))

    # calculate length_norm per document (artist)
    row_sums = np.squeeze(np.asarray(data.sum(1)))
    average_length = row_sums.sum() / N
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    ret = coo_matrix(data)
    ret.data = ret.data * (K1 + 1.0) / (K1 * length_norm[ret.row] + ret.data) * idf[ret.col]
    return ret


def bm25(plays):
    plays = bm25_weight(plays)
    return plays.dot(plays.T)

def get_largest(row, N=10):
    if N >= row.nnz:
        best = zip(row.data, row.indices)
    else:
        ind = np.argpartition(row.data, -N)[-N:]
        best = zip(row.data[ind], row.indices[ind])
    return sorted(best, reverse=True)


def calculate_similar_artists(similarity, artists, artistid):
    neighbours = similarity[artistid]
    top = get_largest(neighbours)
    return [(artists[other], score, i) for i, (score, other) in enumerate(top)]


songsd = dict(enumerate(data['song'].cat.categories))
user_count = data.groupby('song').size()
to_generate = sorted(list(songsd), key=lambda x: -user_count[x])

similarity = bm25(matrix)

### SVD

In [None]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
Xr=svd.fit_transform(bm25(matrix))  
print(svd.explained_variance_ratio_)  
print(svd.explained_variance_ratio_.sum())

### Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
Udf=pd.DataFrame(cosine_similarity(Xr))


In [None]:
booknr=4536
print(Udf[booknr].sort_values(ascending=False)[:10])
books[books['id'].isin( Udf[booknr].sort_values(ascending=False)[:10].index )]

In [None]:
# get the books of user 1
userBooks = ratings.loc[ratings['user_id'] == 1]
userbookdetails = pd.merge(userBooks, books, left_on = 'book_id', right_on = 'goodreads_book_id')
# get the book ids liked by user 1
bookids = userbookdetails['goodreads_book_id']
print(type(bookids))


In [None]:
for bookid in bookids[:10]:
    booknr=bookid #4536
    print(booknr)
    print(Udf[booknr].sort_values(ascending=False)[:10])
    books[books['id'].isin( Udf[booknr].sort_values(ascending=False)[:10].index )]
