In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
import keras
import tensorflow as tf

from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.sequence import pad_sequences
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
book = pd.read_csv('kaggle_book_tagid.csv')
ratings = pd.read_csv('ratings.csv')
author = pd.read_csv('bookid_author_tags.csv')

In [3]:
book.rename(columns = {'GROUP_CONCAT(tag_name)':'tags'}, inplace = True)

In [4]:
df = pd.merge(author, book, left_on='book_id', right_on='book_id', how='inner')

In [5]:
df.drop(columns = ['GROUP_CONCAT(tag_name)'])

Unnamed: 0,book_id,authors,tags
0,1,"J.K. Rowling, Mary GrandPr?","""fantasy\r""\r,""young-adult\r""\r,""fiction\r""\r,..."
1,2,"J.K. Rowling, Mary GrandPr?","""fantasy\r""\r,""children\r""\r,""children-s\r""\r,..."
2,3,"J.K. Rowling, Mary GrandPr?","""fantasy\r""\r,""young-adult\r""\r,""fiction\r""\r,..."
3,5,"J.K. Rowling, Mary GrandPr?, Rufus Beck","""fantasy\r""\r,""young-adult\r""\r,""fiction\r""\r,..."
4,6,"J.K. Rowling, Mary GrandPr?","""fantasy\r""\r,""young-adult\r""\r,""fiction\r""\r,..."
...,...,...,...
9410,31538614,"J.K. Rowling, MinaLima","""fantasy\r""\r,""short-stories\r""\r,""harry-potte..."
9411,31538635,"J.K. Rowling, MinaLima","""fantasy\r""\r,""short-stories\r""\r,""harry-potte..."
9412,31845516,Glennon Doyle Melton,"""memoir\r""\r,""non-fiction\r""\r,""nonfiction\r""\..."
9413,32075671,Angie Thomas,"""young-adult\r""\r,""contemporary\r""\r,""fiction\..."


In [6]:
from nltk.corpus import stopwords
from gensim.models.doc2vec import LabeledSentence
from gensim import utils
import re
import nltk
import string

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
def constructLabeledSentences(data):
    sentences=[]
    for index, row in data.iteritems():
        sentences.append(LabeledSentence(utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)]))
    return sentences

def textClean(text):
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = text.lower().split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]    
    text = " ".join(text)
    return(text)

def cleanup(text):
    text = textClean(text)
    text= text.translate(str.maketrans("","", string.punctuation))
    return text

In [8]:
tag = df['tags'].apply(cleanup)
sentences = constructLabeledSentences(tag)
tag.head()

0    fantasy  youngadult  fiction  harrypotter  ya ...
1    fantasy  children  childrens  fiction  youngad...
2    fantasy  youngadult  fiction  harrypotter  ya ...
3    fantasy  youngadult  fiction  harrypotter  ya ...
4    fantasy  youngadult  fiction  harrypotter  ya ...
Name: tags, dtype: object

In [9]:
author = df['authors'].apply(cleanup)
sentences2 = constructLabeledSentences(author)
author.head()

0                jk rowling mary grandpr
1                jk rowling mary grandpr
2                jk rowling mary grandpr
3    jk rowling mary grandpr  rufus beck
4                jk rowling mary grandpr
Name: authors, dtype: object

In [10]:
from gensim.models import Doc2Vec

Text_INPUT_DIM=100


text_model=None
filename='docEmbeddings_1_clean.d2v'
if os.path.isfile(filename):
    text_model = Doc2Vec.load(filename)
else:
    text_model = Doc2Vec(min_count=1, window=5, size=Text_INPUT_DIM, sample=1e-4, negative=5, workers=4, iter=5,seed=1)
    text_model.build_vocab(sentences)
    text_model.train(sentences, total_examples=text_model.corpus_count, epochs=text_model.iter)
    text_model.save(filename)

In [11]:
Text_INPUT_DIM2=20


text_model2=None
filename='docEmbeddings_2_clean.d2v'
if os.path.isfile(filename):
    text_model2 = Doc2Vec.load(filename)
else:
    text_model2 = Doc2Vec(min_count=1, window=5, size=Text_INPUT_DIM2, sample=1e-4, negative=5, workers=4, iter=5,seed=1)
    text_model2.build_vocab(sentences2)
    text_model2.train(sentences2, total_examples=text_model2.corpus_count, epochs=text_model2.iter)
    text_model2.save(filename)

In [12]:
vector = np.zeros((9415, Text_INPUT_DIM)).astype(np.float32)

for i in range(9415):
    vector[i] = text_model.docvecs['Text_'+str(i)]

In [13]:
vector2 = np.zeros((9415, Text_INPUT_DIM2)).astype(np.float32)

for i in range(9415):
    vector2[i] = text_model2.docvecs['Text_'+str(i)]

In [14]:
vector_merge = np.hstack((vector, vector2))

In [15]:
df['vector'] = np.stack(vector_merge).tolist()

In [16]:
ratings_rmv_duplicates = ratings.drop_duplicates()
unwanted_users = ratings_rmv_duplicates.groupby('user_id')['user_id'].count()
unwanted_users = unwanted_users[unwanted_users < 35]
unwanted_ratings = ratings_rmv_duplicates[ratings_rmv_duplicates.user_id.isin(unwanted_users.index)]
new_ratings = ratings_rmv_duplicates.drop(unwanted_ratings.index)

In [17]:
def encoding(x):
  if x>=1:
    return 1
  else:
    return 0
new_ratings['rating'] = new_ratings['rating'].apply(encoding)
new_ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,1
1,1,439,1
2,1,588,1
3,1,1169,1
4,1,1185,1


In [18]:
df2 = pd.merge(new_ratings, df, left_on='book_id', right_on='book_id', how='inner')

In [19]:
user_ids = df2["user_id"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoede2user = {i: x for i, x in enumerate(user_ids)}

book_ids = df2["book_id"].unique().tolist()
book2book_encoded = {x: i for i, x in enumerate(book_ids)}
book_encoded2book = {i: x for i, x in enumerate(book_ids)}
df2["encoded_user_id"] = df2["user_id"].map(user2user_encoded)
df2["encoded_book_id"] = df2["book_id"].map(book2book_encoded)

In [20]:
df_rating=df2[['encoded_book_id', 'encoded_user_id', 'rating','vector', 'book_id']]

In [21]:
df3 = df_rating.groupby(['book_id', 'encoded_user_id'])['rating'].max().unstack().fillna(0)

In [22]:
data = df3.stack().reset_index(name = 'rating')

In [23]:
check=df[['book_id','vector']]

In [25]:
new_data = pd.merge(data, check, left_on='book_id', right_on='book_id', how='inner')

In [26]:
train, test = train_test_split(new_data,test_size = 0.2)
print(train.shape , test.shape)
train.head(-1)

(5161504, 4) (1290376, 4)


Unnamed: 0,book_id,encoded_user_id,rating,vector
5586134,8647,4619,0.0,"[0.04888492077589035, -0.07791770994663239, -0..."
3482745,5204,1285,0.0,"[-0.00507347472012043, -0.016497120261192322, ..."
3729697,5417,702,0.0,"[0.020230021327733994, -0.028136828914284706, ..."
3312034,5159,6244,0.0,"[0.0015050944639369845, -0.0340639166533947, -..."
2912153,4894,5613,0.0,"[-0.016363980248570442, -0.01728922873735428, ..."
...,...,...,...,...
3070558,4983,4318,0.0,"[0.016490070149302483, -0.025128550827503204, ..."
3627154,5351,1964,0.0,"[0.011294916272163391, -0.0436568409204483, -0..."
1888152,3384,3692,0.0,"[0.011801958084106445, -0.025697041302919388, ..."
5156532,7682,6207,0.0,"[0.012981892563402653, -0.03738074004650116, -..."


In [27]:
X=np.array([np.array(xi) for xi in train.vector])

In [32]:
number_of_unique_user = len(new_data.encoded_user_id.unique())
number_of_unique_book_id = len(new_data.book_id.unique())
print(number_of_unique_user, number_of_unique_book_id)

7985 808


Recall, precision, F1

In [28]:
from keras import backend as K

In [29]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [38]:
n_latent_factors_user = 32
n_latent_factors_book = 32
n_latent_factors_mf = 8
n_users, n_books = number_of_unique_user, number_of_unique_book_id

book_input = keras.layers.Input(shape=[120],name='Book')
#book_embedding_mlp = keras.layers.Embedding(n_books + 1, n_latent_factors_book, name='Book-Embedding-MLP')(book_input)
book_vec_mlp = keras.layers.Flatten(name='FlattenBooks-MLP')(book_input)

#book_embedding_mf = keras.layers.Embedding(n_books + 1, n_latent_factors_mf, name='Book-Embedding-MF')(book_input)
book_vec_mf = keras.layers.Flatten(name='FlattenBooks-MF')(book_input)


user_input = keras.layers.Input(shape=[1],name='User')
user_vec_mlp = keras.layers.Flatten(name='FlattenUsers-MLP')(keras.layers.Embedding(n_users + 1, n_latent_factors_user,name='User-Embedding-MLP')(user_input))

user_vec_mf = keras.layers.Flatten(name='FlattenUsers-MF')(keras.layers.Embedding(n_users + 1, 120,name='User-Embedding-MF')(user_input))

#concat = keras.layers.merge([book_vec_mlp, user_vec_mlp], mode='concat',name='Concat')
concat = keras.layers.concatenate([book_vec_mlp, user_vec_mlp], name='Concat')
dense = keras.layers.Dense(524,name='FullyConnected',activation='relu')(concat)
#dense_batch = keras.layers.BatchNormalization(name='Batch')(dense)
#dropout_1 = keras.layers.Dropout(0.1,name='Dropout-1')(dense_batch)
dense_2 = keras.layers.Dense(256,name='FullyConnected-1',activation='relu')(dense)
#dense_batch_2 = keras.layers.BatchNormalization(name='Batch-2')(dense_2)

dense_3 = keras.layers.Dense(128,name='FullyConnected-2', activation='relu')(dense_2)
#dropout_3 = keras.layers.Dropout(0.1,name='Dropout-3')(dense_3)
#dense_batch_3 = keras.layers.BatchNormalization(name='Batch-3')(dense_3)
dense_4 = keras.layers.Dense(64,name='FullyConnected-3', activation='relu')(dense_3)
#dropout_4 = keras.layers.Dropout(0.1,name='Dropout-4')(dense_4)


#pred_mf = keras.layers.merge([book_vec_mf, user_vec_mf], mode='dot',name='Dot')
pred_mf = keras.layers.Dot(axes=1)([book_vec_mf, user_vec_mf])



pred_mlp = keras.layers.Dense(1, activation='relu',name='Activation')(dense_4)

#combine_mlp_mf = keras.layers.merge([pred_mf, pred_mlp], mode='concat',name='Concat-MF-MLP')
combine_mlp_mf = keras.layers.concatenate([pred_mf, pred_mlp] ,name='Concat-MF-MLP')
result_combine = keras.layers.Dense(100,name='Combine-MF-MLP')(combine_mlp_mf)
deep_combine = keras.layers.Dense(50,name='FullyConnected-4')(result_combine)


result = keras.layers.Dense(1,activation = 'sigmoid', name='Prediction')(deep_combine)


model = keras.Model([user_input, book_input], result)
opt = tf.keras.optimizers.Adam(lr =0.01)
model.compile(optimizer='adam',loss= 'mse', metrics=['accuracy', f1_m,precision_m, recall_m])
#model.compile(loss="mse", optimizer="adam", metrics=["accuracy"])

In [39]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 User (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 Book (InputLayer)              [(None, 120)]        0           []                               
                                                                                                  
 User-Embedding-MLP (Embedding)  (None, 1, 32)       255552      ['User[0][0]']                   
                                                                                                  
 FlattenBooks-MLP (Flatten)     (None, 120)          0           ['Book[0][0]']                   
                                                                                            

In [40]:
checkpoint = keras.callbacks.ModelCheckpoint('beta_model{epoch:08d}.h5', period=10) 



In [None]:
history = model.fit([train.encoded_user_id, X],train.rating,batch_size = 64,epochs=100,callbacks=[checkpoint], verbose=1,validation_split=0.2) 

Epoch 1/100