In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csc_matrix, lil_matrix, save_npz, hstack

In [4]:
# Load the ratings table, using the first CSV column as the DataFrame index.
ratings = pd.read_csv("../ratings_meta_info.csv", index_col=0)

  mask |= (ar1 == a)


In [5]:
# Preview the first rows of the loaded ratings table.
ratings.head()

Unnamed: 0,user_id,book_id,rating,goodreads_book_id,authors,original_publication_year,language_code,Art,Biography,Business,...,Romance,Science,Science Fiction,Self Help,Spirituality,Sports,Suspense,Thriller,Travel,Young Adult
0,1,258,5,1232,"Carlos Ruiz Zafón, Lucia Graves",2001.0,eng,0.0,0.0,0.0,...,355.0,0.0,0.0,0.0,0.0,0.0,230.0,311.0,0.0,0.0
1,2,4081,4,231,Tom Wolfe,2004.0,en-US,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
2,2,260,5,4865,Dale Carnegie,1936.0,eng,0.0,0.0,113.0,...,0.0,0.0,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,9296,5,4887,"Alice Miller, Ruth Ward",1979.0,en-GB,0.0,0.0,0.0,...,0.0,6.0,0.0,82.0,4.0,0.0,0.0,0.0,0.0,0.0
4,2,2318,3,998,"Thomas J. Stanley, William D. Danko",1995.0,eng,0.0,0.0,513.0,...,0.0,7.0,0.0,184.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Count how many rating rows fall under each language code.
ratings.language_code.value_counts()

eng      4239947
en-US    1179179
en-GB     111743
en-CA      47106
spa        25033
fre        10815
ara        10543
ger         3837
ind         2649
jpn         1190
en          1171
por         1043
nl          1017
nor          963
pol          961
per          700
dan          479
ita          307
mul          195
vie          179
swe          129
fil           75
tur           67
rus           67
rum           64
Name: language_code, dtype: int64

In [7]:
# List the column names available in the ratings table.
ratings.columns

Index([u'user_id', u'book_id', u'rating', u'goodreads_book_id', u'authors',
       u'original_publication_year', u'language_code', u'Art', u'Biography',
       u'Business', u'Chick Lit', u'Children's', u'Christian', u'Classics',
       u'Comics', u'Contemporary', u'Cookbooks', u'Crime', u'Ebooks',
       u'Fantasy', u'Fiction', u'Gay and Lesbian', u'Graphic Novels',
       u'Historical Fiction', u'History', u'Horror', u'Humor and Comedy',
       u'Manga', u'Memoir', u'Music', u'Mystery', u'Nonfiction', u'Paranormal',
       u'Philosophy', u'Poetry', u'Psychology', u'Religion', u'Romance',
       u'Science', u'Science Fiction', u'Self Help', u'Spirituality',
       u'Sports', u'Suspense', u'Thriller', u'Travel', u'Young Adult'],
      dtype='object')

In [6]:
# Number of distinct user ids present in the ratings.
ratings["user_id"].unique().size

53424

In [7]:
# Largest user id; compare with the distinct-user count to see whether ids are contiguous.
ratings.user_id.max()

53424

In [8]:
# Raw user ids as an array, one entry per rating row.
user_ids = ratings["user_id"].values

In [9]:
# Confirm the container type of user_ids.
type(user_ids)

numpy.ndarray

In [10]:
# Raw book ids, one entry per rating row.
book_ids = ratings["book_id"].values

In [56]:
# Target vector: the rating given in each row.
y = ratings["rating"].values

### Book ID mapping

In [12]:
# Keep only the book-level columns by discarding the per-rating ones.
book_id_mapping = ratings.drop(labels=['user_id', 'rating'], axis='columns')

In [13]:
# Collapse repeated rows so that each distinct combination of the remaining columns appears once.
book_id_mapping = book_id_mapping.drop_duplicates()

In [14]:
# One contiguous integer code (0..n-1) per row of the de-duplicated book table.
# The length is derived from the table instead of a hard-coded book count, so
# the later column assignment cannot fail on a length mismatch.
mapping = np.arange(len(book_id_mapping))

In [15]:
# Attach the integer codes as a new 'mapping' column (positional assignment from the array).
book_id_mapping['mapping'] = mapping

In [16]:
# Inspect the last rows to check the assigned mapping values.
book_id_mapping.tail()

Unnamed: 0,book_id,mapping
5835494,3885,9995
5837194,9569,9996
5837312,9580,9997
5858335,8892,9998
5876842,9548,9999


In [17]:
# Persist the book-id -> code table to CSV in the working directory.
book_id_mapping.to_csv('mapping_goodreads_ids.csv')

In [18]:
# Attach each rating's integer book code. Only the key and the 'mapping' column
# are merged in: merging the whole mapping frame would duplicate every column it
# shares with `ratings` (with _x/_y suffixes) across all rating rows.
ratings_with_map = ratings.merge(
    book_id_mapping[['book_id', 'mapping']],
    how='left',
    on='book_id',
)

In [19]:
# Shape of the merged table.
ratings_with_map.shape

(5976479, 4)

In [20]:
# Replace the raw book ids with their integer codes from the 'mapping' column.
book_ids = ratings_with_map["mapping"].values

### Converting to sparse one-hot matrices

In [21]:
# Number of book codes that will be encoded (one per rating row).
book_ids.size

5976479

In [39]:
def numpy_get_dummies(array):
    """One-hot encode a 1-D array of non-negative integer codes as a sparse matrix.

    Row ``i`` of the result holds a single 1.0 in column ``array[i]``.
    The shape of the result is printed before it is returned.

    Parameters
    ----------
    array : array-like of int
        Non-negative integer category codes, one per output row.

    Returns
    -------
    scipy.sparse.csc_matrix
        float64 matrix of shape ``(array.size, array.max() + 1)``.

    Raises
    ------
    ValueError
        If ``array`` is empty or contains a negative code.
    """
    codes = np.asarray(array)
    if codes.size == 0:
        raise ValueError("cannot one-hot encode an empty array")
    if codes.min() < 0:
        # Negative values would otherwise be interpreted as wrap-around
        # column indices and silently land in the wrong column.
        raise ValueError("category codes must be non-negative")

    n_rows = codes.size
    n_cols = int(codes.max()) + 1
    # Build the matrix in one shot from (value, (row, col)) triplets.
    # Assigning into an empty CSC matrix changes its sparsity structure,
    # which is the slow path for compressed formats.
    b = csc_matrix(
        (np.ones(n_rows, dtype=np.float64), (np.arange(n_rows), codes)),
        shape=(n_rows, n_cols),
    )
    print(b.shape)
    return b

In [40]:
# One-hot encode the remapped book codes (one row per rating, one column per code).
book_ids_dummies = numpy_get_dummies(book_ids)

(5976479, 10000)


In [43]:
# Check which sparse format the encoder returned.
type(book_ids_dummies)

scipy.sparse.csc.csc_matrix

In [46]:
# Persist the book one-hot matrix to an .npz file via scipy.sparse.save_npz.
save_npz('book_ids_dummies.npz', book_ids_dummies)

In [47]:
# One-hot encode the raw user ids; the id itself is used as the column index.
# NOTE(review): ids appear to start at 1 (max id equals the distinct-user count),
# which would leave column 0 always empty — confirm whether that is intended.
user_ids_dummies = numpy_get_dummies(user_ids)

(5976479, 53425)


In [50]:
# Sanity check: with exactly one 1 per row, the total equals the number of rating rows.
user_ids_dummies.sum()

5976479.0

In [51]:
# Persist the user one-hot matrix to an .npz file via scipy.sparse.save_npz.
save_npz('user_ids_dummies.npz', user_ids_dummies)

In [57]:
# Write the rating targets to a plain-text file.
np.savetxt('y.txt', y)

### Concatenating the user and book one-hot matrices

In [59]:
# Shape of the user block before concatenation.
user_ids_dummies.shape

(5976479, 53425)

In [61]:
# Shape of the book block before concatenation; the row count must match the user block.
book_ids_dummies.shape

(5976479, 10000)

In [65]:
def concatenate_csc_matrices_by_columns(matrix1, matrix2):
    """Horizontally stack two CSC matrices without converting formats.

    The columns of ``matrix2`` are appended to the right of ``matrix1`` by
    concatenating the underlying CSC arrays directly.

    Parameters
    ----------
    matrix1, matrix2 : scipy.sparse.csc_matrix
        Matrices with the same number of rows.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix of shape ``(rows, matrix1.shape[1] + matrix2.shape[1])``.

    Raises
    ------
    ValueError
        If the two matrices have different row counts.
    """
    if matrix1.shape[0] != matrix2.shape[0]:
        raise ValueError(
            "row counts differ: %d vs %d" % (matrix1.shape[0], matrix2.shape[0])
        )

    # The last indptr entry is the number of stored entries; only that many
    # leading elements of data/indices belong to the matrix.
    nnz1 = int(matrix1.indptr[-1])
    nnz2 = int(matrix2.indptr[-1])

    new_data = np.concatenate((matrix1.data[:nnz1], matrix2.data[:nnz2]))
    new_indices = np.concatenate((matrix1.indices[:nnz1], matrix2.indices[:nnz2]))

    # Shift matrix2's column pointers past matrix1's entries. Its leading
    # pointer is dropped because matrix1.indptr already ends at nnz1. The sum
    # is computed in int64 so the offset cannot overflow a 32-bit index.
    shifted_ptr = matrix2.indptr[1:].astype(np.int64) + nnz1
    new_ind_ptr = np.concatenate((matrix1.indptr, shifted_ptr))

    # Pass the shape explicitly: otherwise the row count is inferred from the
    # largest stored row index and trailing all-zero rows are lost.
    new_shape = (matrix1.shape[0], matrix1.shape[1] + matrix2.shape[1])
    return csc_matrix((new_data, new_indices, new_ind_ptr), shape=new_shape)

In [66]:
# Stack the two one-hot blocks side by side: user columns first, then book columns.
users_books_dummies = concatenate_csc_matrices_by_columns(user_ids_dummies, book_ids_dummies)

In [67]:
# The column count should equal the sum of the user and book column counts.
users_books_dummies.shape

(5976479, 63425)

In [68]:
# Persist the concatenated user+book matrix (not the user-only block).
save_npz('user_books_dummies.npz', users_books_dummies)