References: https://github.com/awarebayes/RecNN

# Text preprocessing

## Google's BERT

In [None]:
import pandas as pd
import pickle
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from tqdm import tqdm_notebook as tqdm
import numpy as np
# from tqdm import tqdm_notebook as tqdm
# import torch
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm
import pickle
from collections import OrderedDict
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pickle
import torch
from sklearn.preprocessing import MinMaxScaler

%matplotlib inline
cuda = torch.device('cuda')

# Text embedding

In [None]:
# Load the pretrained 'bert-base-uncased' encoder and its matching tokenizer.
cuda = torch.device('cuda')
# The encoder weights are moved to the GPU device defined above.
bert = BertModel.from_pretrained('bert-base-uncased').to(cuda)
# Switch the module to evaluation mode; the notebook only runs inference with it.
bert.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Read the cleaned books table and keep its 'summary' column as a Series.
text = pd.read_csv('../data_raw/books_cleaned.csv')['summary']

In [None]:
# Display the summary stored under label 1 as a quick sanity check.
text[1]

In [None]:
# Convert every summary into a tensor of BERT vocabulary ids, keyed by row position.
infos_tensor = {}
# Wrap the Series itself (not the enumerate object) so tqdm can show the total.
for i, summary in enumerate(tqdm(text)):
    # A missing summary would otherwise be embedded as the literal word "nan".
    summary = '' if pd.isna(summary) else str(summary)[:512]
    # BERT accepts at most 512 positions; leave room for the two special tokens.
    tokens = tokenizer.tokenize(summary)[:510]
    # The pooled output read later in the notebook is computed from the first
    # position, which BERT expects to be [CLS]; [SEP] closes the segment.
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    v = tokenizer.convert_tokens_to_ids(tokens)
    v = torch.tensor(v, dtype=torch.long).to(cuda)
    infos_tensor[i] = v

In [None]:
# Order the entries by tensor size, smallest first. OrderedDict.popitem()
# removes from the end, so consuming this dict yields the longest sequences first.
infos_sorted = OrderedDict(sorted(infos_tensor.items(), key=lambda t: t[1].size()))

In [None]:
# Run BERT over the tokenised summaries and keep the pooled vector of each one.
infos_bert = {}
batch = []
indexes = []
max_size = 0  # not read in this cell; kept in case another cell refers to it
batch_size = 1

# Inference only: without autograd bookkeeping the activations are freed at once.
with torch.no_grad():
    for _ in tqdm(range(len(infos_sorted))):
        # popitem() takes from the end of the size-sorted dict.
        key, tensor = infos_sorted.popitem()
        batch.append(tensor)
        indexes.append(key)

        if len(batch) >= batch_size:
            seq_lengths = [len(seq) for seq in batch]
            # Zero-padded id matrix, plus a mask so that padding positions are
            # ignored by attention whenever batch_size > 1.
            seq_tensor = torch.zeros((len(batch), max(seq_lengths)),
                                     dtype=torch.long, device=cuda)
            attention_mask = torch.zeros_like(seq_tensor)

            for row, (seq, seqlen) in enumerate(zip(batch, seq_lengths)):
                seq_tensor[row, :seqlen] = seq.long()
                attention_mask[row, :seqlen] = 1

            # The second return value is the pooled, per-sequence output.
            _, output = bert(seq_tensor, attention_mask=attention_mask)

            output = output.detach().cpu()
            for row, key in enumerate(indexes):
                infos_bert[key] = output[row]

            batch = []
            indexes = []

# Any incomplete final batch is left in `batch` / `indexes` for the next cell.

In [None]:
# Embed whatever is left in `batch` when the number of summaries is not a
# multiple of batch_size. With an empty remainder there is nothing to do
# (taking max() of zero lengths would raise).
if batch:
    seq_lengths = [len(seq) for seq in batch]
    seq_tensor = torch.zeros((len(batch), max(seq_lengths)),
                             dtype=torch.long, device=cuda)
    attention_mask = torch.zeros_like(seq_tensor)

    # Fill the padded matrix completely before calling the model once.
    for row, (seq, seqlen) in enumerate(zip(batch, seq_lengths)):
        seq_tensor[row, :seqlen] = seq.long()
        attention_mask[row, :seqlen] = 1

    with torch.no_grad():
        # The second return value is the pooled, per-sequence output.
        _, output = bert(seq_tensor, attention_mask=attention_mask)

    output = output.detach().cpu()
    for row, key in enumerate(indexes):
        infos_bert[key] = output[row]

    # Clear the buffers so re-running the cell does not embed them twice.
    batch = []
    indexes = []

In [None]:
# Convert each pooled tensor to a NumPy array and pickle the resulting dict.
# The context manager guarantees the file is flushed and closed.
with open('../data_raw/books_bert.p', 'wb') as f:
    pickle.dump({key: emb.numpy() for key, emb in infos_bert.items()}, f)

In [None]:
# Reload the pickled embeddings (closing the file afterwards) and show the
# shape of the vector stored under key 0 as a sanity check.
with open('../data_raw/books_bert.p', 'rb') as f:
    embs = pickle.load(f)
embs[0].shape

# Label Encoding

In [None]:
def multicat_label_encode(infos, col, trim=1):
    """Replace a bracketed, comma-separated string column with lists of integer ids.

    Every cell of ``infos[col]`` is read as ``"[a,b,c]"``: the first and last
    characters are dropped and the rest is split on ``,``. Labels that occur
    more than ``trim`` times over the whole column receive the ids ``0..k-1``
    in descending-frequency order; every other label maps to the shared id
    ``k``. The column is overwritten in place with the de-duplicated id list
    of each row, and ``k`` is printed. Nothing is returned.

    NOTE(review): the pieces are not stripped, so ``"a"`` and ``" a"`` are
    counted as different labels -- confirm this matches the data format.
    """
    pieces = [piece for cell in infos[col] for piece in cell[1:-1].split(',')]
    frequency = pd.Series(pieces).value_counts()
    # value_counts() is ordered by frequency, and filtering keeps that order.
    kept = frequency[frequency > trim].index.tolist()
    print(col, 'unique:', len(kept))

    unknown_id = len(kept)
    label_to_id = {label: position for position, label in enumerate(kept)}

    def encode(cell):
        codes = [label_to_id.get(piece, unknown_id)
                 for piece in cell[1:-1].split(',')]
        return list(set(codes))

    infos[col] = infos[col].apply(encode)

In [None]:
def single_label_encode(infos, col, trim=1):
    """Replace the values of ``infos[col]`` with integer ids, in place.

    Values that occur more than ``trim`` times receive the ids ``0..k-1`` in
    descending-frequency order; every other value maps to the shared id ``k``.
    ``k`` is printed and nothing is returned.
    """
    frequency = infos[col].value_counts()
    # value_counts() is ordered by frequency, and filtering keeps that order.
    kept = frequency[frequency > trim].index.tolist()
    print(col, 'unique:', len(kept))

    unknown_id = len(kept)
    value_to_id = dict(zip(kept, range(unknown_id)))

    infos[col] = infos[col].apply(lambda value: value_to_id.get(value, unknown_id))

In [None]:
# Reload the full cleaned table and integer-encode its categorical columns in place.
# The last argument of each call is the `trim` threshold: a label must occur more
# often than that to get its own id; all rarer labels share one fallback id.
books  = pd.read_csv('../data_raw/books_cleaned.csv')
multicat_label_encode(books, 'authors', 5)
multicat_label_encode(books, 'similar_books', 25)
single_label_encode(books, 'publisher', 2)

In [None]:
def convert_year(year):
    """Bucket a publication year: 0 for 2010 and later, 1 for 2000-2009, 2 otherwise.

    A value for which both comparisons are false (for example NaN) also
    falls into bucket 2.
    """
    lower_bounds = (2010, 2000)  # bucket 0 starts at 2010, bucket 1 at 2000
    for bucket, bound in enumerate(lower_bounds):
        if year >= bound:
            return bucket
    return len(lower_bounds)
# Overwrite the column with the bucket id (0, 1 or 2) returned by convert_year.
books['publication_year'] = books['publication_year'].apply(convert_year)

In [None]:
# Show how many books fall into each value of the column.
books['publication_year'].value_counts()

## Standardization

In [None]:
def standardize(infos, col, plot=True, bins=100):
    """Rescale ``infos[col]`` in place with scikit-learn's ``StandardScaler``.

    Args:
        infos: DataFrame holding the column; it is modified in place.
        col: Name of the numeric column to standardise.
        plot: Currently unused; kept so existing calls keep working.
        bins: Currently unused; kept so existing calls keep working.
    """
    # StandardScaler expects a 2-D (n_samples, n_features) array.
    values = np.array(infos[col]).reshape(-1, 1)
    # Fit and transform the whole column in one call instead of invoking the
    # scaler once per row; the per-element arithmetic is the same.
    scaled = preprocessing.StandardScaler().fit_transform(values)
    infos[col] = scaled.ravel()
        

In [None]:
# Standardise the three numeric columns in place.
standardize(books, 'ratings_count')
standardize(books, 'average_rating')
standardize(books, 'num_pages')

In [None]:
# Inspect the first rows after encoding and standardisation.
books.head()

## PyTorch integration

In [None]:
# Load the pickled BERT embeddings (key -> NumPy vector), closing the file afterwards.
with open('../data_raw/books_bert.p', 'rb') as f:
    texts = pickle.load(f)

In [None]:
# Build one flat feature vector per book: indicator blocks for similar books,
# authors and publisher, three numeric features, then the BERT text embedding.
tensor_dict = {}

# NOTE(review): `i` is used as a positional index into `books` via .iloc, so the
# keys of `texts` are presumably row positions of the same table -- confirm.
for i in tqdm(texts.keys()):
    
    
    # Block widths are hard-coded as <count> + 1.
    # NOTE(review): presumably the 'unique:' counts printed by the label encoders
    # plus one slot for the shared fallback id -- confirm they still match the data.
    similar = torch.zeros(1, 504+1)
    authors = torch.zeros(1, 466+1)
    publisher = torch.zeros(1, 513+1)

    # Set the position(s) named by the encoded id(s) of this book to 1.
    similar[0][[books['similar_books'].iloc[i]]] = 1
    authors[0][[books['authors'].iloc[i]]] = 1
    publisher[0][[books['publisher'].iloc[i]]] = 1


    # Numeric features as a 1 x 3 row.
    misc = torch.tensor([[books['num_pages'].iloc[i],
                          books['ratings_count'].iloc[i],
                          books['average_rating'].iloc[i]
                       ]])
    
    # NOTE: this rebinds `text`, which earlier in the notebook named the summaries Series.
    text = torch.from_numpy(texts[i])
    # Add a leading dimension so the embedding can be concatenated along dim 1.
    text = text.unsqueeze(0)
    item = torch.cat([similar, authors, publisher, misc, text], 1)
    
    # The concatenated row may contain NaNs; replace them with 0.
    item[torch.isnan(item)] = 0
    # Drop the leading dimension before storing the vector.
    tensor_dict[i] = item.squeeze()

In [None]:
# Pickle the per-book feature tensors; the context manager flushes and closes the file.
with open("../data_raw/books_embs_raw_dict.p", "wb") as f:
    pickle.dump(tensor_dict, f)

## Principal component analysis

In [None]:
# Reload the raw per-book feature tensors, closing the file afterwards.
with open('../data_raw/books_embs_raw_dict.p', 'rb') as f:
    books_embs = pickle.load(f)

In [None]:
def reduce_dim(x):
    """Project the raw book embeddings onto ``x`` principal components and save them.

    Reads the module-level ``books_embs`` dict (key -> 1-D tensor), replaces
    NaNs with 0, fits a PCA with ``x`` components, min-max scales every
    component to the range [-0.01, 0.01] and writes the result to
    ``../data/books_embs_{x}.npy``, where row ``i`` holds the vector of key ``i``.

    Args:
        x: Number of principal components to keep.

    Raises:
        KeyError: If the keys of ``books_embs`` are not exactly ``0..n-1``.
    """
    keys = list(books_embs)
    data = torch.stack([books_embs[key] for key in keys]).cpu().numpy()
    # torch.stack made a copy, so cleaning NaNs here leaves books_embs untouched.
    data[np.isnan(data)] = 0

    pca = PCA(n_components=x)
    pca.fit(data)
    # Transform the whole matrix in one call instead of one row per call.
    reduced = pca.transform(data)

    scaler = MinMaxScaler(feature_range=(-0.01, 0.01))
    reduced = scaler.fit_transform(reduced)

    # Rows currently follow dict order; the saved array must be indexed by key.
    row_of = {key: row for row, key in enumerate(keys)}
    ordered = reduced[[row_of[i] for i in range(len(keys))]]
    np.save(f'../data/books_embs_{x}.npy', ordered)

In [None]:
# Write the 16-component embeddings to ../data/books_embs_16.npy.
reduce_dim(16)

In [None]:
# Write the 32-component embeddings to ../data/books_embs_32.npy.
reduce_dim(32)

In [None]:
# Write the 64-component embeddings to ../data/books_embs_64.npy.
reduce_dim(64)

In [None]:
# Write the 128-component embeddings to ../data/books_embs_128.npy.
reduce_dim(128)