# load dataset into a data frame

In [1]:
import pandas as pd
import xml.etree.ElementTree as et
import torch
from torch.utils.data import DataLoader, Dataset

def parse_XML(xml_file, df_cols): 
    """Parse the input XML file and store the result in a pandas 
    DataFrame with the given columns. 
    
    The first element of df_cols is supposed to be the identifier 
    variable, which is an attribute of each node element in the 
    XML data; other features will be parsed from the text content 
    of each sub-element. 
    """
    
    xtree = et.parse(xml_file)
    xroot = xtree.getroot()
    rows = []
    id=1
    for node in xroot: 
        res = []
        res.append(id)
        id+=1
        for el in df_cols[1:]: 
            if node is not None and node.find(el) is not None:
                res.append(node.find(el).text.strip())
            else: 
                res.append(None)
        rows.append({df_cols[i]: res[i] 
                     for i, _ in enumerate(df_cols)})
    
    out_df = pd.DataFrame(rows, columns=df_cols)
    out_df['rating'] = out_df.rating.astype(float)
    out_df['norm_rating'] = out_df['rating']/2 -1.5
    return out_df



class AmazonReviewDataset(Dataset):
    def __init__(self, file_path):
        df = parse_XML(file_path, ['id', 'summary', 'rating', 'text', 'category'])
        self.rating = list(df.norm_rating)
        self.text = list(df.text)
        self.summary = list(df.summary)
    
    def __len__(self):
        return len(self.rating)
    
    def __getitem__(self, index):
        return self.summary[index], self.text[index] , self.rating[index]

dataset = AmazonReviewDataset(r'./data/amazon-dataset/english/books/train.review')

# create a simple regression model

In [3]:
from transformers import DistilBertTokenizer, DistilBertModel

# bert
BERT_CONFIG = 'distilbert-base-multilingual-cased'
BERT_DIM = 768

# network
OUT_DIM = 1

class BookReviewNet(torch.nn.Module):
    def __init__(self):
        super(BookReviewNet, self).__init__()
                        
        # bert layers
        self.summary_bert_layer = DistilBertModel.from_pretrained(BERT_CONFIG)
        self.text_bert_layer = DistilBertModel.from_pretrained(BERT_CONFIG)
                
        # output layer
        self. = torch.nn.Linear(2*BERT_DIM, OUT_DIM)       
    
    def forward(self, summary, text):
        
        # input to bert & get first output of hidden layer, this is your sentence vector
        bert_summary_out = self.summary_bert_layer(input_ids = summary['input_ids'],
                                             attention_mask = summary['attention_mask'])[0][:,0,:]
        
        bert_text_out = self.text_bert_layer(input_ids = text['input_ids'],
                                       attention_mask = text['attention_mask'])[0][:,0,:]
                
        # concatenate sentence vector for summary and text
        bert_out = torch.cat((x_summary, x_text), 1)
        
        # send to output layer
        return self.output_layer(bert_out)

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Running on the GPU")
else:
    device = torch.device("cpu")
    print("Running on the CPU")
    
# create network and load it to GPU
net = BookReviewNet().to(device)
net

Running on the GPU


In [18]:
# tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(BERT_CONFIG)
SUMMARY_MAX_LENGTH = 50
REVIEW_MAX_LENGTH = 256

# training
BATCH_SIZE = 8
EPOCHS = 1
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)

for epoch in range(EPOCHS):
    for summary, text, rating in dataloader:
        
        x_summary = tokenizer.batch_encode_plus(summary,
                                                return_tensors='pt',
                                                max_length=SUMMARY_MAX_LENGTH,
                                                pad_to_max_length=True).to(device)
        
        x_text = tokenizer.batch_encode_plus(text,
                                             return_tensors='pt',
                                             max_length=REVIEW_MAX_LENGTH,
                                             pad_to_max_length=True).to(device)
        
        y = rating.unsqueeze(1).to(device)
        
        net.zero_grad()
        optimizer.zero_grad()
               
        y_pred = net(x_summary, x_text)
        
        print(y)
        print(y_pred)
        
        break






UnboundLocalError: local variable 'x_summary' referenced before assignment