This notebook contains code to create the matrix used for our FM model. It will contain, from left to right, for every rating:
- user (in dummy format)
- book (in dummy format)
- author (in dummy format)
- language (in dummy format)
- year of publication
- NLP features
- genre tags

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import csc_matrix, lil_matrix, save_npz, hstack
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.datasets import dump_svmlight_file

In [2]:
ratings = pd.read_csv('../ratings_us.csv') # pure ratings data
books_info = pd.read_csv('books_meta_info.csv') # book metadata
nlp = pd.read_pickle('books_with_summaries.p') # book nlp features

## Ratings

In [3]:
y = ratings.rating.values

## User Dummies

In [4]:
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [5]:
len(ratings.user_id.unique())

53428

In [6]:
def numpy_get_dummies(array, zero_index=False):
    sparse = csc_matrix((array.size, array.max()+1)) # creates one extra column for zero index
    sparse[np.arange(array.size), array] = 1         # so it can use the ids 1-10000 as indices
    if zero_index:
        return sparse # don't get rid of the zero column
    else:
        return sparse[:,1:] # get rid of the zero column

In [7]:
user_ids = ratings.user_id.values

In [8]:
users_dummies = numpy_get_dummies(user_ids)



In [9]:
users_dummies.shape # n_ratings x n_users

(5976695, 53428)

In [10]:
users_dummies.sum() # sanity check

5976695.0

## Book Dummies

In [11]:
len(ratings.book_id.unique())

10000

In [12]:
book_ids = ratings.book_id.values

In [13]:
book_dummies = numpy_get_dummies(book_ids)



In [14]:
book_dummies.shape # n_ratings x n_books

(5976695, 10000)

In [15]:
book_dummies.sum() # sanity check

5976695.0

## Author Dummies

In [94]:
books_info.head()

Unnamed: 0.1,Unnamed: 0,book_id,goodreads_book_id,authors,original_publication_year,language_code,Art,Biography,Business,Chick Lit,...,Romance,Science,Science Fiction,Self Help,Spirituality,Sports,Suspense,Thriller,Travel,Young Adult
0,0,1,2767052,Suzanne Collins,2008.0,eng,0.0,0.0,0.0,0.0,...,3341.0,0.0,8772.0,0.0,0.0,0.0,641.0,800.0,0.0,25968.0
1,1,2,3,"J.K. Rowling, Mary GrandPré",1997.0,eng,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14984.0
2,2,3,41865,Stephenie Meyer,2005.0,en-US,0.0,0.0,0.0,716.0,...,0.0,0.0,253.0,0.0,0.0,0.0,0.0,0.0,0.0,16640.0
3,3,4,2657,Harper Lee,1960.0,eng,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2436.0
4,4,5,4671,F. Scott Fitzgerald,1925.0,eng,0.0,0.0,0.0,0.0,...,2090.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,397.0


###### Label Encode

In [95]:
books_authors = books_info[['book_id', 'authors']]

In [96]:
len(books_authors.authors.unique())

4664

In [97]:
le = LabelEncoder()
books_authors['author_id'] = le.fit_transform(books_authors.authors)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [98]:
books_authors.head()

Unnamed: 0,book_id,authors,author_id
0,1,Suzanne Collins,4222
1,2,"J.K. Rowling, Mary GrandPré",1869
2,3,Stephenie Meyer,4153
3,4,Harper Lee,1652
4,5,F. Scott Fitzgerald,1351


###### Map to Ratings and Get Dummies

In [99]:
ratings_authors = ratings.merge(books_authors[['book_id', 'author_id']], on='book_id', how='left')

In [100]:
authors_dummies = numpy_get_dummies(np.array(ratings_authors.author_id.values), zero_index=True)



In [101]:
authors_dummies.shape # n_ratings x n_authors

(5976695, 4664)

In [102]:
authors_dummies.sum() # sanity check

5976695.0

## Language Dummies

###### Fix English codes and Missing Values

In [25]:
books_languages = books_info[['book_id', 'language_code']]

In [26]:
english = ['eng', 'en-US', 'en-GB', 'en-CA', 'en']
def map_lang(row): # maps all english codes to one, and labels NA values as unknown
    if isinstance(row.language_code, str):
        if row.language_code in english:
            return 'eng'
        else:
            return row.language_code
    else:
        if np.isnan(row.language_code):
            return 'unk'
        else:
            print(row.language_code)

In [27]:
books_languages['language_mapped'] = books_languages.apply(lambda x: map_lang(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


###### Label Encode

In [28]:
le = LabelEncoder()
books_languages['language_id'] = le.fit_transform(books_languages.language_mapped)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [29]:
books_languages.head()

Unnamed: 0,book_id,language_code,language_mapped,language_id
0,1,eng,eng,2
1,2,eng,eng,2
2,3,en-US,eng,2
3,4,eng,eng,2
4,5,eng,eng,2


In [30]:
sorted(books_languages.language_id.unique()) # NOTE THESE START AT ZERO!

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

###### Map to Ratings and Get Dummies

In [31]:
ratings_languages = ratings.merge(books_languages[['book_id', 'language_id']], on='book_id', how='left')

In [32]:
language_dummies = numpy_get_dummies(np.array(ratings_languages.language_id.values), zero_index=True)



In [33]:
language_dummies.shape # n_ratings x n_languages

(5976695, 22)

In [34]:
language_dummies.sum() # sanity check

5976695.0

## Year of Publication

In [35]:
books_years = books_info[['book_id', 'original_publication_year']]

###### Fill NA values with median

In [36]:
year_median = books_years.original_publication_year.median()

In [37]:
year_median

2004.0

In [38]:
books_years.original_publication_year.fillna(value=year_median, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


###### Set values < 1700 to 1700

In [39]:
only_above_1700 = pd.concat([books_years.book_id, 
                             books_years.original_publication_year.where(
                                books_years.original_publication_year > 1700, other=1700)], axis=1)

In [40]:
only_above_1700.head()

Unnamed: 0,book_id,original_publication_year
0,1,2008.0
1,2,1997.0
2,3,2005.0
3,4,1960.0
4,5,1925.0


In [82]:
# Normalize the year to a value from 0 to 1 (1 is 2017, close to 0 (but never 0) is 1700)
def normalize_year(x):
    if np.isnan(x):
        return np.nan
    if int(x)<1700:
        x=1700
    return 1-((2017-x)/318.0)
only_above_1700['original_publication_year']=only_above_1700.original_publication_year.apply(normalize_year)

In [83]:
only_above_1700.head()

Unnamed: 0,book_id,original_publication_year
0,1,0.971698
1,2,0.937107
2,3,0.962264
3,4,0.820755
4,5,0.710692


###### Map to Ratings

In [84]:
ratings_years = ratings.merge(only_above_1700, on='book_id', how='left').drop(['user_id', 'book_id', 'rating'], axis=1)

In [85]:
ratings_years.shape # n_ratings x 1

(5976695, 1)

###### Make Sparse Matrix

In [86]:
years_sparse = csc_matrix(ratings_years)

## NLP Features

In [44]:
books_summaries = pd.read_pickle('books_with_summaries.p')

###### Merge with Ratings

In [45]:
ratings_nlp = ratings.merge(books_summaries, on='book_id', how='left').drop(['user_id', 'book_id', 'rating'], axis=1)

In [46]:
ratings_nlp.shape # n_ratings x n_nlp_features

###### Make Sparse Matrix

In [47]:
nlp_sparse = csc_matrix(ratings_nlp)

## Genre Mapping

###### Combine Genres into Broader Categories

In [48]:
def map_genres(row):
    new_genres = {}
    new_genres['book_id'] = row.book_id
    new_genres['art_music'] = row.Art + row.Music + row.Poetry
    new_genres['biography'] = row.Biography + row.Memoir
    new_genres['business'] = row.Business + row.Psychology
    new_genres['romance'] = row['Chick Lit'] + row.Romance
    new_genres['children'] = row["Children's"] + row['Young Adult']
    new_genres['religion_philosophy'] = row.Christian + row.Religion + row.Philosophy + row.Spirituality
    new_genres['classics'] = row.Classics
    new_genres['comics'] = row.Comics + row['Graphic Novels'] + row.Manga
    new_genres['contemporary'] = row.Contemporary
    new_genres['cookbooks'] = row.Cookbooks
    new_genres['mystery'] = row.Crime + row.Horror + row.Mystery + row.Suspense + row.Thriller
    new_genres['scifi_fantasy'] = row.Fantasy + row.Paranormal + row['Science Fiction']
    new_genres['fiction'] = row.Fiction
    new_genres['gay_lesbian'] = row['Gay and Lesbian']
    new_genres['hist_fiction'] = row['Historical Fiction']
    new_genres['history'] = row.History
    new_genres['humor'] = row['Humor and Comedy']
    new_genres['nonfiction'] = row.Nonfiction
    new_genres['science'] = row.Science
    new_genres['self_help'] = row['Self Help']
    new_genres['sports'] = row.Sports
    new_genres['travel'] = row.Travel
    return new_genres

In [49]:
books_genres = pd.DataFrame(list(books_info.apply(lambda x: map_genres(x), axis=1)))

In [50]:
books_genres.head()

Unnamed: 0,art_music,biography,book_id,business,children,classics,comics,contemporary,cookbooks,fiction,...,humor,mystery,nonfiction,religion_philosophy,romance,science,scifi_fantasy,self_help,sports,travel
0,0.0,0.0,1,0.0,25968.0,0.0,0.0,382.0,0.0,13819.0,...,0.0,1441.0,0.0,0.0,3341.0,0.0,19608.0,0.0,0.0,0.0
1,0.0,0.0,2,0.0,16755.0,1898.0,0.0,397.0,0.0,13239.0,...,0.0,657.0,0.0,0.0,0.0,0.0,48401.0,0.0,0.0,0.0
2,0.0,0.0,3,0.0,16640.0,0.0,0.0,503.0,0.0,6298.0,...,0.0,462.0,0.0,0.0,716.0,0.0,19721.0,0.0,0.0,0.0
3,0.0,0.0,4,0.0,2436.0,31068.0,0.0,253.0,0.0,879.0,...,0.0,555.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,5,0.0,397.0,32061.0,0.0,0.0,0.0,15360.0,...,0.0,0.0,0.0,0.0,2090.0,0.0,0.0,0.0,0.0,0.0


###### L1 Normalize By Row

In [51]:
new_genres = books_genres.drop('book_id', axis=1).columns

In [52]:
books_genres_norm = pd.DataFrame(Normalizer(norm='l1').fit_transform(books_genres.drop('book_id', axis=1)), columns=new_genres)

In [53]:
books_genres_norm.head()

Unnamed: 0,art_music,biography,business,children,classics,comics,contemporary,cookbooks,fiction,gay_lesbian,...,humor,mystery,nonfiction,religion_philosophy,romance,science,scifi_fantasy,self_help,sports,travel
0,0.0,0.0,0.0,0.402237,0.0,0.0,0.005917,0.0,0.214052,0.0,...,0.0,0.022321,0.0,0.0,0.051751,0.0,0.303722,0.0,0.0,0.0
1,0.0,0.0,0.0,0.205969,0.023332,0.0,0.00488,0.0,0.162747,0.0,...,0.0,0.008077,0.0,0.0,0.0,0.0,0.594994,0.0,0.0,0.0
2,0.0,0.0,0.0,0.375282,0.0,0.0,0.011344,0.0,0.142039,0.0,...,0.0,0.010419,0.0,0.0,0.016148,0.0,0.444768,0.0,0.0,0.0
3,0.0,0.0,0.0,0.060441,0.770842,0.0,0.006277,0.0,0.021809,0.0,...,0.0,0.01377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.007591,0.612998,0.0,0.0,0.0,0.293679,0.0,...,0.0,0.0,0.0,0.0,0.03996,0.0,0.0,0.0,0.0,0.0


###### Drop Values < 0.1

In [54]:
only_above_10 = books_genres_norm.where(books_genres_norm > 0.1, other=0)

###### Normalize Again

In [55]:
books_genres_norm2 = pd.concat([books_genres.book_id, 
                                pd.DataFrame(Normalizer(norm='l1').fit_transform(only_above_10), 
                                             columns=new_genres)], axis=1)

In [56]:
books_genres_norm2.head()

Unnamed: 0,book_id,art_music,biography,business,children,classics,comics,contemporary,cookbooks,fiction,...,humor,mystery,nonfiction,religion_philosophy,romance,science,scifi_fantasy,self_help,sports,travel
0,1,0.0,0.0,0.0,0.437209,0.0,0.0,0.0,0.0,0.232663,...,0.0,0.0,0.0,0.0,0.0,0.0,0.330129,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.213725,0.0,0.0,0.0,0.0,0.168876,...,0.0,0.0,0.0,0.0,0.0,0.0,0.617399,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.39007,0.0,0.0,0.0,0.0,0.147636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.462294,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.867749,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.676093,0.0,0.0,0.0,0.323907,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


###### Map to Ratings

In [57]:
ratings_genres = ratings.merge(books_genres_norm2, on='book_id', how='left').drop(['user_id', 'book_id', 'rating'], axis=1)

In [58]:
ratings_genres.shape # n_ratings x n_genres

(5976695, 22)

In [59]:
ratings_genres.sum().sum() # sanity check - rounding errors?

5975780.000002879

###### Make Sparse Matrix

In [60]:
genres_sparse = csc_matrix(ratings_genres)

## Concatenate Everything!

In [61]:
def concatenate_csc_matrices_by_columns(matrix1, matrix2):
    new_data = np.concatenate((matrix1.data, matrix2.data))
    new_indices = np.concatenate((matrix1.indices, matrix2.indices))
    new_ind_ptr = matrix2.indptr + len(matrix1.data)
    new_ind_ptr = new_ind_ptr[1:]
    new_ind_ptr = np.concatenate((matrix1.indptr, new_ind_ptr))

    return csc_matrix((new_data, new_indices, new_ind_ptr))

In [103]:
temp1 = concatenate_csc_matrices_by_columns(users_dummies, book_dummies)
temp2 = concatenate_csc_matrices_by_columns(temp1, authors_dummies)
#temp3 = concatenate_csc_matrices_by_columns(temp2, language_dummies)
temp3 = concatenate_csc_matrices_by_columns(temp2, language_dummies)
temp4 = concatenate_csc_matrices_by_columns(temp3, years_sparse)
temp5 = concatenate_csc_matrices_by_columns(temp4, nlp_sparse)
all_features = concatenate_csc_matrices_by_columns(temp5, genres_sparse)
#all_features = concatenate_csc_matrices_by_columns(temp4, genres_sparse)

In [105]:
#n_users + n_books + n_authors + n_languages + 1 years column + 40 nlp features + n_genres
53428 + 10000 + 4664 + 22 + 1 + 40 + 22
#53428 + 10000 + 4664+22 + 1  + 22

68137

In [104]:
all_features.shape # BOOM

(5976695, 68137)

## Train/Test Sets

In [75]:
val_test_size = round(0.15 * all_features.shape[0])

In [106]:
X_temp, X_test, y_temp, y_test = train_test_split(all_features, y, test_size=val_test_size, random_state=42)

In [107]:
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=val_test_size, random_state=42)

## Write Files

In [108]:
train_file = 'libfm_train_nonlp.txt'
val_file = 'libfm_val_nonlp.txt'
test_file = 'libfm_test_nonlp.txt'

In [109]:
with open(train_file, 'wb') as f:
    dump_svmlight_file(X_train, y_train, f=f)

In [110]:
with open(val_file, 'wb') as f:
    dump_svmlight_file(X_val, y_val, f=f)

In [111]:
with open(test_file, 'wb') as f:
    dump_svmlight_file(X_test, y_test, f=f)