# Prepare data set for Amazon book reviews

In [1]:
import pandas as pd
import xml.etree.ElementTree as et
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertModel
from timeit import default_timer as timer

# bert
BERT_CONFIG = 'distilbert-base-multilingual-cased'

def parse_XML(xml_file, df_cols): 
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    The first element of df_cols is supposed to be the identifier 
    variable, which is an attribute of each node element in the 
    XML data; other features will be parsed from the text content 
    of each sub-element. 
    """
    
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()
    rows = []
    id=1
    for node in xroot: 
        res = []
        res.append(id)
        id+=1
        for el in df_cols[1:]: 
            if node is not None and node.find(el) is not None:
                res.append(node.find(el).text.strip())
            else: 
                res.append(None)
        rows.append({df_cols[i]: res[i] 
                     for i, _ in enumerate(df_cols)})
    
    out_df = pd.DataFrame(rows, columns=df_cols)
    out_df['rating'] = out_df.rating.astype(float)
    out_df['norm_rating'] = (out_df['rating']/2 -1.5)/2
    return out_df

def get_data(file_paths):
    df = None
    for file_path in file_paths:
        if df is None:
            df = parse_XML(file_path, ['id', 'summary', 'rating', 'text', 'category'])
        else:
            df_new = parse_XML(file_path, ['id', 'summary', 'rating', 'text', 'category'])
            df = df.append(df_new)
    return df


# tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(BERT_CONFIG)
SUMMARY_MAX_LENGTH = 50
REVIEW_MAX_LENGTH = 256

class AmazonReviewDataset(Dataset):
    def __init__(self, df):
        self.rating = torch.tensor(list(df.norm_rating), dtype=torch.float)
        
        text = list(df.text)
        summary = list(df.summary)
        
        self.summary = tokenizer.batch_encode_plus(summary,
                                                   return_tensors='pt',
                                                   max_length=SUMMARY_MAX_LENGTH,
                                                   pad_to_max_length=True)
        
        self.text = tokenizer.batch_encode_plus(text,
                                                return_tensors='pt',
                                                max_length=REVIEW_MAX_LENGTH,
                                                pad_to_max_length=True)
        
        
    
    def __len__(self):
        return len(self.rating)
    
    def __getitem__(self, index):
        summary_input_ids = self.summary['input_ids'][index]
        summary_attention_mask = self.summary['attention_mask'][index]
        text_input_ids = self.text['input_ids'][index]
        text_attention_mask = self.text['attention_mask'][index]
        rating = self.rating[index]
        
        return summary_input_ids, summary_attention_mask, text_input_ids, text_attention_mask, rating

files = [r'./data/amazon-dataset/english/books/train.review',
         r'./data/amazon-dataset/english/books/test.review',
         r'./data/amazon-dataset/french/books/train.review',
         r'./data/amazon-dataset/french/books/test.review',
         r'./data/amazon-dataset/german/books/train.review',
         r'./data/amazon-dataset/german/books/test.review']

files_en_only = [r'./data/amazon-dataset/english/books/train.review',
                 r'./data/amazon-dataset/english/books/test.review']

start = timer()
print('Load dataset')

data_frame = get_data(files)
#data_frame = get_data(files_en_only)

# test data
train_df, test_df = train_test_split(data_frame, test_size=0.2)

# validation data
train_df, val_df = train_test_split(train_df, test_size = 0.25)

train_dataset = AmazonReviewDataset(train_df)
test_dataset = AmazonReviewDataset(test_df)
val_dataset = AmazonReviewDataset(val_df)

end = timer()
print(f'Data loaded in {int(end-start)} seconds')
print(f'Training data size: {len(train_dataset)}')
print(f'Validation data size: {len(val_dataset)}')
print(f'Test data size: {len(test_dataset)}')

Load dataset
Data loaded in 30 seconds
Training data size: 7200
Validation data size: 2400
Test data size: 2400


# Create network using BERT

In [2]:
BERT_DIM = 768

# network
OUT_DIM = 1
N_HIDDEN = 256

class BookReviewNet(torch.nn.Module):
    def __init__(self):
        super(BookReviewNet, self).__init__()
                        
        # bert layers
        self.summary_bert_layer = DistilBertModel.from_pretrained(BERT_CONFIG)
        self.text_bert_layer = DistilBertModel.from_pretrained(BERT_CONFIG)
        
        # hidden layer
        self.hidden_layer = torch.nn.Linear(2*BERT_DIM, N_HIDDEN)
        
        # output layer
        self.output_layer = torch.nn.Linear(N_HIDDEN, OUT_DIM)       
    
    def forward(self, summary_input_ids, summary_attention_mask, text_input_ids, text_attention_mask):
        
        # input to bert & get first output of hidden layer, this is your sentence vector
        bert_summary_out = self.summary_bert_layer(input_ids = summary_input_ids,
                                             attention_mask = summary_attention_mask)[0][:,0,:]
        
        bert_text_out = self.text_bert_layer(input_ids = text_input_ids,
                                       attention_mask = text_attention_mask)[0][:,0,:]
                
        # concatenate sentence vector for summary and text
        bert_out = torch.cat((bert_summary_out, bert_text_out), 1)
        
        # hidden out
        hidden_out = self.hidden_layer(bert_out)
        hidden_out = torch.nn.functional.relu(hidden_out)
        
        # send to output layer
        return self.output_layer(hidden_out)

if torch.cuda.is_available():
    gpu = torch.device("cuda:0")
    print("Running on the GPU")

cpu = torch.device("cpu")

Running on the GPU


In [3]:
import tqdm.notebook as tqdm
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

# training
BATCH_SIZE = 32
EPOCHS = 20
TRAINING_STEPS_PER_EPOCH = 225
LEARNING_RATE = 3e-5
num_training_steps = EPOCHS*TRAINING_STEPS_PER_EPOCH
num_warmup_steps = num_training_steps/10

# create network and load it to GPU
torch.cuda.empty_cache()
net = BookReviewNet().to(gpu)

optimizer = AdamW(net.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
#scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

loss_function = torch.nn.MSELoss()

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, batch_size=1)

def train(dataloader):
    total_loss = 0
    total_error = 0
    n = 0
    
    net.train()
    
    for summary_input_ids, summary_attention_mask, text_input_ids, text_attention_mask, rating in tqdm.tqdm(dataloader, desc="Training", leave=False):

        x1 = summary_input_ids.to(gpu)
        x2 = summary_attention_mask.to(gpu)
        x3 = text_input_ids.to(gpu)
        x4 = text_attention_mask.to(gpu)
        y = rating.unsqueeze(1).to(gpu)
        
        y_pred = net(x1, x2, x3 ,x4)
        loss = loss_function(y_pred, y)
        
        k = len(y)
        
        total_loss += loss*k
        n += k
        
        total_error += (y_pred-y).abs().sum()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return total_loss/n, total_error/n


def cv(dataloader):
    total_loss = 0
    total_error = 0
    n = 0
    
    with torch.no_grad():
        net.eval()

        for summary_input_ids, summary_attention_mask, text_input_ids, text_attention_mask, rating in tqdm.tqdm(dataloader, desc="Validating", leave=False):

            x1 = summary_input_ids.to(gpu)
            x2 = summary_attention_mask.to(gpu)
            x3 = text_input_ids.to(gpu)
            x4 = text_attention_mask.to(gpu)
            y = rating.unsqueeze(1).to(gpu)

            y_pred = net(x1, x2, x3 ,x4)
            loss = loss_function(y_pred, y)

            k = len(y)

            total_loss += loss*k
            n += k
            total_error += (y_pred-y).abs().sum()

    return total_loss/n, total_error/n


print(f"Train {EPOCHS} epochs")
history = []
start = timer()

for epoch in range(EPOCHS+1):
    if(epoch==0):
        train_loss, train_error = cv(train_dataloader)
        val_loss, val_error = cv(val_dataloader)
        print(f"Initial Training: loss:{train_loss}, error:{train_error} Validation: loss:{val_loss}, error:{val_error}")
    else:
        train_loss, train_error = train(train_dataloader)
        val_loss, val_error = cv(val_dataloader)   
        print(f"Epoch: {epoch}/{EPOCHS} Training: loss:{train_loss}, error:{train_error} Validation: loss:{val_loss}, error:{val_error}")
    history.append((train_loss, train_error, val_loss, val_error))
 
end = timer()
secs = int(end-start)
mins = int(secs/60)
secs = int(secs%60)
print(f"{epoch} epochs completed in {mins} minutes & {secs} seconds")

Train 20 epochs


HBox(children=(FloatProgress(value=0.0, description='Validating', max=225.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Initial Training: loss:0.1852995753288269, error:0.4070536494255066 Validation: loss:0.18386772274971008, error:0.40555864572525024


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 1/20 Training: loss:0.1168837919831276, error:0.2808656692504883 Validation: loss:0.08150845766067505, error:0.2199486345052719


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 2/20 Training: loss:0.0567927323281765, error:0.17484670877456665 Validation: loss:0.1134311854839325, error:0.24078448116779327


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 3/20 Training: loss:0.032974980771541595, error:0.13110186159610748 Validation: loss:0.07100590318441391, error:0.19137313961982727


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 4/20 Training: loss:0.023687124252319336, error:0.11315606534481049 Validation: loss:0.09218796342611313, error:0.21247613430023193


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 5/20 Training: loss:0.01729070022702217, error:0.09821941703557968 Validation: loss:0.06894295662641525, error:0.1836334466934204


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 6/20 Training: loss:0.01151254866272211, error:0.08193361014127731 Validation: loss:0.0650070533156395, error:0.1772390902042389


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 7/20 Training: loss:0.008120810613036156, error:0.07004198431968689 Validation: loss:0.06472143530845642, error:0.17587503790855408


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 8/20 Training: loss:0.006173513829708099, error:0.0603792667388916 Validation: loss:0.06341039389371872, error:0.17362181842327118


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 9/20 Training: loss:0.005408432800322771, error:0.055945686995983124 Validation: loss:0.0660490095615387, error:0.1739116907119751


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 10/20 Training: loss:0.004883971530944109, error:0.05317292362451553 Validation: loss:0.06634633988142014, error:0.17572449147701263


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 11/20 Training: loss:0.005335190799087286, error:0.05562980845570564 Validation: loss:0.06609403342008591, error:0.17117837071418762


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 12/20 Training: loss:0.005808497779071331, error:0.05813925713300705 Validation: loss:0.0665312334895134, error:0.18028096854686737


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 13/20 Training: loss:0.006774582900106907, error:0.062451425939798355 Validation: loss:0.06958405673503876, error:0.18413075804710388


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Validating', max=2400.0, style=ProgressStyle(description_…

Epoch: 14/20 Training: loss:0.005725529510527849, error:0.058214858174324036 Validation: loss:0.06382070481777191, error:0.17593152821063995


HBox(children=(FloatProgress(value=0.0, description='Training', max=225.0, style=ProgressStyle(description_wid…

KeyboardInterrupt: 

In [11]:
history[0:]

[(tensor(0.1853, device='cuda:0'),
  tensor(0.4071, device='cuda:0'),
  tensor(0.1839, device='cuda:0'),
  tensor(0.4056, device='cuda:0')),
 (tensor(0.1169, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.2809, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.0815, device='cuda:0'),
  tensor(0.2199, device='cuda:0')),
 (tensor(0.0568, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.1748, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.1134, device='cuda:0'),
  tensor(0.2408, device='cuda:0')),
 (tensor(0.0330, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.1311, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.0710, device='cuda:0'),
  tensor(0.1914, device='cuda:0')),
 (tensor(0.0237, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.1132, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.0922, device='cuda:0'),
  tensor(0.2125, device='cuda:0')),
 (tensor(0.0173, device='cuda:0', grad_fn=<DivBackward0>),
  tensor(0.0982, device='cuda:0', grad_fn=<D