# load dataset into a data frame

In [13]:
import pandas as pd
import xml.etree.ElementTree as et
import torch
from torch.utils.data import DataLoader, Dataset

def parse_XML(xml_file, df_cols):
    """Parse an XML file into a pandas DataFrame with the given columns.

    Every direct child of the XML root becomes one row.

    Args:
        xml_file: Path or file object accepted by ``ElementTree.parse``.
        df_cols: Column names. The first entry names the identifier
            column, which is filled with a running 1-based row number
            (it is NOT read from the XML). Each remaining entry is the
            tag of a sub-element whose stripped text becomes the cell
            value.

    Returns:
        A DataFrame with the columns in ``df_cols``. A sub-element that
        is missing yields ``None``; one that is present but empty yields
        ``''``. When a ``'rating'`` column is requested it is converted
        to float and a ``'norm_rating'`` column is added.

    Raises:
        ValueError: If a rating value cannot be converted to float.
    """
    xroot = et.parse(xml_file).getroot()
    id_col, feature_cols = df_cols[0], df_cols[1:]

    rows = []
    for row_id, node in enumerate(xroot, start=1):
        row = {id_col: row_id}
        for col in feature_cols:
            # findtext() returns None when the tag is absent and '' when
            # the tag is present without text, so strip() is always safe
            # (the previous `.find(col).text.strip()` crashed on `<tag/>`).
            text = node.findtext(col)
            row[col] = text.strip() if text is not None else None
        rows.append(row)

    out_df = pd.DataFrame(rows, columns=df_cols)
    if 'rating' in out_df.columns:
        out_df['rating'] = out_df['rating'].astype(float)
        # Linear rescale: rating 1 -> -1.0, rating 5 -> +1.0.
        out_df['norm_rating'] = out_df['rating'] / 2 - 1.5
    return out_df



class AmazonReviewDataset(Dataset):
    """Map-style dataset over the reviews parsed from one XML file.

    Each item is a ``(summary, review_text, normalized_rating)`` tuple.
    """

    def __init__(self, file_path):
        """Parse ``file_path`` and keep the needed columns as plain lists."""
        columns = ['id', 'summary', 'rating', 'text', 'category']
        frame = parse_XML(file_path, columns)
        self.summary = list(frame['summary'])
        self.review = list(frame['text'])
        self.rating = list(frame['norm_rating'])

    def __len__(self):
        """Return the number of reviews."""
        return len(self.rating)

    def __getitem__(self, index):
        """Return the ``(summary, review, rating)`` triple at ``index``."""
        sample = (self.summary[index], self.review[index], self.rating[index])
        return sample


In [14]:
# Build the dataset from the English "books" training review file.
dataset = AmazonReviewDataset(
    file_path=r'./data/amazon-dataset/english/books/train.review',
)

# create a simple regression model

In [15]:

from transformers import DistilBertTokenizer, DistilBertModel

# Pretrained checkpoint name shared by the tokenizer and both encoders.
bert_config = 'distilbert-base-multilingual-cased'

tokenizer = DistilBertTokenizer.from_pretrained(bert_config)
# Two separately loaded encoder instances from the same checkpoint:
# the first for review summaries, the second for review bodies.
summary_model, review_model = (
    DistilBertModel.from_pretrained(bert_config) for _ in range(2)
)

In [19]:
summary_max_length = 50
review_max_length = 512
batch_size = 64
dataloader = DataLoader(dataset, batch_size=batch_size)

# Smoke test: push a single batch through both encoders and print the
# shapes of the resulting hidden states.
for summaries, reviews, y in dataloader:
    # Tokenize each text field separately. The raw strings keep their own
    # names so they are never overwritten by an encoding: previously the
    # review encoding was built from the already-tokenized summary instead
    # of the review texts. list() normalizes whatever sequence type the
    # DataLoader hands over for string fields.
    # NOTE(review): `pad_to_max_length` is the older padding keyword; newer
    # transformers releases prefer `padding='max_length', truncation=True`
    # -- confirm against the installed version before switching.
    x_summary = tokenizer.batch_encode_plus(list(summaries), return_tensors='pt', max_length=summary_max_length, pad_to_max_length=True)
    x_review = tokenizer.batch_encode_plus(list(reviews), return_tensors='pt', max_length=review_max_length, pad_to_max_length=True)

    # Index 0 of the model output holds the hidden states; slicing them
    # with [:, 0, :] would keep only the first token's vector, which is
    # used as the sentence representation.
    summary_hidden_state = summary_model(input_ids=x_summary['input_ids'], attention_mask=x_summary['attention_mask'])[0]
    review_hidden_state = review_model(input_ids=x_review['input_ids'], attention_mask=x_review['attention_mask'])[0]

    print(summary_hidden_state.shape)
    print(review_hidden_state.shape)

    # Only one batch is needed for the shape check.
    break

AttributeError: 'tuple' object has no attribute 'shape'

In [57]:
# Inspect the shape of the tokenized summary ids from the last processed batch.
x_summary['input_ids'].shape

torch.Size([16, 50])