In [1]:
import os
import pickle

from torch.utils.data import DataLoader

from LoadData import LoadDataAndProcessing, LoadTestDataAndProcessing
from RumourDataSet import RumourDataset

In [2]:
# Input files for each split: the data file paired with its label file.
train_file, train_label_file = "data/train_data_all.json", "data/train.label.txt"
dev_file, dev_label_file = "data/dev_data_all.json", "data/dev.label.txt"

# One LoadDataAndProcessing instance per split; the datasets themselves are
# produced later by calling prepareDataset() on these objects.
load_twitter_train_data = LoadDataAndProcessing(train_file, train_label_file)
load_twitter_dev_data = LoadDataAndProcessing(dev_file, dev_label_file)

In [42]:
# Run prepareDataset() on the train and dev loaders and keep one result per split.
# NOTE(review): the structure of the returned objects is defined in LoadData — confirm there.
train_input = load_twitter_train_data.prepareDataset()
dev_input = load_twitter_dev_data.prepareDataset()

In [None]:
# Locations handed to the test-split loader: a directory path and a data file.
# NOTE(review): tweets_file_path points outside this project directory
# ("../../"), so it depends on the local checkout layout — confirm before sharing.
tweets_file_path = "../../project-data/tweet-objects/tweet-objects/"
tweet_data = "data/test.data.txt"

load_twitter_test_data = LoadTestDataAndProcessing(
    tweets_file_path,
    tweet_data,
)

In [120]:
# Run prepareDataset() on the test loader and keep the result for the test split.
# NOTE(review): return structure is defined in LoadData — confirm there.
test_input = load_twitter_test_data.prepareDataset()

In [100]:
class TrainDataset(Dataset):
    """Map-style dataset that turns rows of a TSV file into fixed-length BERT inputs.

    The TSV file is read with pandas and must provide a ``content`` column
    (the text to tokenize) and a ``label`` column. Each item is tokenized with
    the ``bert-base-uncased`` tokenizer, wrapped in ``[CLS]`` / ``[SEP]`` and
    then padded or truncated to exactly ``input_size`` tokens.
    """

    # Upper bound on the sequence length; larger requests are clamped to it.
    MAX_INPUT_SIZE = 512
    # Smallest usable length: one slot for [CLS] and one for [SEP].
    MIN_INPUT_SIZE = 2

    def __init__(self, filename, input_size):
        """Load the TSV file and set up the tokenizer.

        Args:
            filename: Path of the tab-separated file to read.
            input_size: Desired number of tokens per example. Values above
                ``MAX_INPUT_SIZE`` (512) are clamped to it.

        Raises:
            ValueError: If ``input_size`` is smaller than 2. Such a size cannot
                hold both special tokens, and a size of 0 or less would make
                the truncation slice return variable-length sequences.
        """
        # Validate before doing any file / model I/O so bad arguments fail fast.
        if input_size < self.MIN_INPUT_SIZE:
            raise ValueError(
                "input_size must be at least {} to hold [CLS] and [SEP], got {}".format(
                    self.MIN_INPUT_SIZE, input_size
                )
            )

        # The input length for the BERT model, capped at the maximum of 512.
        self.input_size = min(input_size, self.MAX_INPUT_SIZE)

        # Store the contents of the file in a pandas dataframe.
        self.df = pd.read_csv(filename, delimiter='\t')

        # Initialize the BERT tokenizer.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, label)`` for the row at ``index``.

        ``token_ids`` and ``attention_mask`` are 1-D integer tensors of length
        ``self.input_size``; the mask is 1 for real tokens and 0 for padding.

        Raises:
            IndexError: If ``index`` is out of range. Raising ``IndexError``
                (rather than the ``KeyError`` a label lookup would give) lets
                plain ``for item in dataset`` loops terminate cleanly.
        """
        # Positional lookup: indices handed out by samplers are positions
        # in [0, len(self)), not dataframe labels.
        tweet = self.df['content'].iloc[index]
        label = self.df['label'].iloc[index]

        # Tokenize the text and insert the CLS and SEP markers.
        tokens = ['[CLS]'] + self.tokenizer.tokenize(tweet) + ['[SEP]']

        # Too long: keep the first input_size - 1 tokens and close with SEP.
        if len(tokens) > self.input_size:
            tokens = tokens[:self.input_size - 1] + ['[SEP]']

        # Pad up to the fixed length, remembering how many tokens are real.
        real_len = len(tokens)
        pad_len = self.input_size - real_len
        tokens = tokens + ['[PAD]'] * pad_len

        # Convert the tokens to a tensor of vocabulary ids.
        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # Build the mask from the known lengths instead of comparing ids with
        # a hard-coded PAD id, so it does not depend on the vocabulary layout.
        attn_mask = torch.tensor([1] * real_len + [0] * pad_len, dtype=torch.long)

        return tokens_ids_tensor, attn_mask, label

In [102]:
# TSV files for the train and dev splits, read by TrainDataset.
train_tsv = "data/train_loadData.tsv"
dev_tsv = "data/dev_loadData.tsv"

# Token length shared by both splits, defined once so train and dev inputs
# cannot drift apart. TrainDataset clamps values above 512.
INPUT_SIZE = 350

train_set = TrainDataset(filename=train_tsv, input_size=INPUT_SIZE)
dev_set = TrainDataset(filename=dev_tsv, input_size=INPUT_SIZE)

In [108]:
# Wrap both splits in DataLoaders. DataLoader is already imported in the
# notebook's first cell, so it is not re-imported here.
# batch_size=1 yields one example per step; num_workers=0 keeps loading in
# the main process.
train_loader = DataLoader(train_set, batch_size=1, num_workers=0)
dev_loader = DataLoader(dev_set, batch_size=1, num_workers=0)

In [86]:
# Persist the prepared inputs of every split so later sessions can reload
# them instead of re-running the preprocessing.
saved_dir = os.path.join("data", "saved")
# Create the target directory on first use; open(..., "wb") would otherwise
# fail with FileNotFoundError on a fresh checkout.
os.makedirs(saved_dir, exist_ok=True)

for split_name, split_input in (
    ("train", train_input),
    ("dev", dev_input),
    ("test", test_input),
):
    pickle_path = os.path.join(saved_dir, "{}_input.pickle".format(split_name))
    with open(pickle_path, "wb") as file_:
        # HIGHEST_PROTOCOL is what the previous literal -1 selected.
        pickle.dump(split_input, file_, pickle.HIGHEST_PROTOCOL)