# Prepare data set for Amazon book reviews

In [1]:
import pandas as pd
import xml.etree.ElementTree as et
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertModel
from timeit import default_timer as timer

# bert
BERT_CONFIG = 'distilbert-base-multilingual-cased'

def parse_XML(xml_file, df_cols): 
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    The first element of df_cols is supposed to be the identifier 
    variable, which is an attribute of each node element in the 
    XML data; other features will be parsed from the text content 
    of each sub-element. 
    """
    
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()
    rows = []
    id=1
    for node in xroot: 
        res = []
        res.append(id)
        id+=1
        for el in df_cols[1:]: 
            if node is not None and node.find(el) is not None:
                res.append(node.find(el).text.strip())
            else: 
                res.append(None)
        rows.append({df_cols[i]: res[i] 
                     for i, _ in enumerate(df_cols)})
    
    out_df = pd.DataFrame(rows, columns=df_cols)
    out_df['rating'] = out_df.rating.astype(float)
    out_df['norm_rating'] = (out_df['rating']/2 -1.5)/2
    return out_df

def get_data(file_paths):
    df = None
    for file_path in file_paths:
        if df is None:
            df = parse_XML(file_path, ['id', 'summary', 'rating', 'text', 'category'])
        else:
            df_new = parse_XML(file_path, ['id', 'summary', 'rating', 'text', 'category'])
            df = df.append(df_new)
    return df


# tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(BERT_CONFIG)
SUMMARY_MAX_LENGTH = 50
REVIEW_MAX_LENGTH = 256

class AmazonReviewDataset(Dataset):
    def __init__(self, df):
        self.rating = torch.tensor(list(df.norm_rating), dtype=torch.float)
        
        text = list(df.text)
        summary = list(df.summary)
        
        self.summary = tokenizer.batch_encode_plus(summary,
                                                   return_tensors='pt',
                                                   max_length=SUMMARY_MAX_LENGTH,
                                                   pad_to_max_length=True)
        
        self.text = tokenizer.batch_encode_plus(text,
                                                return_tensors='pt',
                                                max_length=REVIEW_MAX_LENGTH,
                                                pad_to_max_length=True)
        
        
    
    def __len__(self):
        return len(self.rating)
    
    def __getitem__(self, index):
        summary_input_ids = self.summary['input_ids'][index]
        summary_attention_mask = self.summary['attention_mask'][index]
        text_input_ids = self.text['input_ids'][index]
        text_attention_mask = self.text['attention_mask'][index]
        rating = self.rating[index]
        
        return summary_input_ids, summary_attention_mask, text_input_ids, text_attention_mask, rating

files = [r'./data/amazon-dataset/english/books/train.review',
         r'./data/amazon-dataset/english/books/test.review',
         r'./data/amazon-dataset/french/books/train.review',
         r'./data/amazon-dataset/french/books/test.review',
         r'./data/amazon-dataset/german/books/train.review',
         r'./data/amazon-dataset/german/books/test.review']

files_en_only = [r'./data/amazon-dataset/english/books/train.review',
                 r'./data/amazon-dataset/english/books/test.review']

start = timer()
print('Load dataset')

#data_frame = get_data(files)
data_frame = get_data(files_en_only)

# test data
train_df, test_df = train_test_split(data_frame, test_size=0.2)

# validation data
train_df, val_df = train_test_split(train_df, test_size = 0.25)

train_dataset = AmazonReviewDataset(train_df)
test_dataset = AmazonReviewDataset(test_df)
val_dataset = AmazonReviewDataset(val_df)

end = timer()
print(f'Data loaded in {int(end-start)} seconds')
print(f'Training data size: {len(train_dataset)}')
print(f'Validation data size: {len(val_dataset)}')
print(f'Test data size: {len(test_dataset)}')

Load dataset
Data loaded in 11 seconds
Training data size: 2400
Validation data size: 800
Test data size: 800


# Create network using BERT

In [2]:
BERT_DIM = 768

# network
OUT_DIM = 1

class BookReviewNet(torch.nn.Module):
    def __init__(self):
        super(BookReviewNet, self).__init__()
                        
        # bert layers
        self.summary_bert_layer = DistilBertModel.from_pretrained(BERT_CONFIG)
        self.text_bert_layer = DistilBertModel.from_pretrained(BERT_CONFIG)
                
        # output layer
        self.output_layer = torch.nn.Linear(2*BERT_DIM, OUT_DIM)       
    
    def forward(self, summary_input_ids, summary_attention_mask, text_input_ids, text_attention_mask):
        
        # input to bert & get first output of hidden layer, this is your sentence vector
        bert_summary_out = self.summary_bert_layer(input_ids = summary_input_ids,
                                             attention_mask = summary_attention_mask)[0][:,0,:]
        
        bert_text_out = self.text_bert_layer(input_ids = text_input_ids,
                                       attention_mask = text_attention_mask)[0][:,0,:]
                
        # concatenate sentence vector for summary and text
        bert_out = torch.cat((bert_summary_out, bert_text_out), 1)
        
        # send to output layer
        return self.output_layer(bert_out)

if torch.cuda.is_available():
    gpu = torch.device("cuda:0")
    print("Running on the GPU")

cpu = torch.device("cpu")

Running on the GPU


In [3]:
import tqdm.notebook as tqdm


# training
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 3e-5

# create network and load it to GPU
torch.cuda.empty_cache()
net = BookReviewNet().to(gpu)

# init optimizer
optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)
loss_function = torch.nn.MSELoss()

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=1)

def train():
    total_loss = 0
    total_error = 0
    n = 0
    
    net.train()
    
    for summary_input_ids, summary_attention_mask, text_input_ids, text_attention_mask, rating in tqdm.tqdm(train_dataloader, desc="Training", leave=False):

        x1 = summary_input_ids.to(gpu)
        x2 = summary_attention_mask.to(gpu)
        x3 = text_input_ids.to(gpu)
        x4 = text_attention_mask.to(gpu)
        y = rating.unsqueeze(1).to(gpu)
        
        y_pred = net(x1, x2, x3 ,x4)
        loss = loss_function(y_pred, y)
        
        k = len(y)
        
        total_loss += loss*k
        n += k
        
        total_error += (y_pred-y).abs().sum()*4
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return total_loss/n, total_error/n


def cv():
    total_loss = 0
    total_error = 0
    n = 0
    
    with torch.no_grad():
        net.eval()

        for summary_input_ids, summary_attention_mask, text_input_ids, text_attention_mask, rating in tqdm.tqdm(val_dataloader, desc="Validating", leave=False):

            x1 = summary_input_ids.to(gpu)
            x2 = summary_attention_mask.to(gpu)
            x3 = text_input_ids.to(gpu)
            x4 = text_attention_mask.to(gpu)
            y = rating.unsqueeze(1).to(gpu)

            y_pred = net(x1, x2, x3 ,x4)
            loss = loss_function(y_pred, y)

            k = len(y)

            total_loss += loss*k
            n += k
            total_error += (y_pred-y).abs().sum()*4

    return total_loss/n, total_error/n


print(f"Train for {EPOCHS} epochs")
history = []
for epoch in range(EPOCHS):
    start = timer()
    train_loss, train_error = train()
    val_loss, val_error = cv()    
    end = timer()
    secs = int(end-start)
    mins = int(secs/60)
    secs = int(secs%60)
    print(f"Epoch: {epoch} completed in {mins} minutes & {secs} seconds")
    print(f"Training: loss:{train_loss}, error:{train_error}")
    print(f"Validation: loss:{val_loss}, error:{val_error}")
    history.append((train_loss, train_error, val_loss, val_error))

Train for 2 epochs


HBox(children=(FloatProgress(value=0.0, description='Training', max=75.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=800.0, style=ProgressStyle(description_w…

Epoch: 0 completed in 1 minutes & 17 seconds
Training: loss:0.14630064368247986, error:1.2971007823944092
Validation: loss:0.08843161165714264, error:0.9630958437919617


HBox(children=(FloatProgress(value=0.0, description='Training', max=75.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=800.0, style=ProgressStyle(description_w…

Epoch: 1 completed in 1 minutes & 18 seconds
Training: loss:0.07772058248519897, error:0.887874960899353
Validation: loss:0.10118740051984787, error:1.0102969408035278
