In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
import csv
import nltk
from nltk.tokenize import word_tokenize
from torchtext.vocab import build_vocab_from_iterator, Vocab
import re
import pickle
from typing import List, Tuple, Optional
from torch import swapaxes
from torch.nn.utils.rnn import pad_sequence
# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(13)

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device  # bare expression: the notebook echoes the selected device

device(type='cuda')

In [3]:
def load_data(file_path, encoding='utf-8'):
    """Read the second column of a CSV file, skipping the header row.

    Args:
        file_path: Path of the CSV file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale.

    Returns:
        A list holding the value at column index 1 of every data row, in file
        order. The list is empty when the file is empty or contains only the
        header row.

    Raises:
        IndexError: If a non-blank data row has fewer than two columns.
    """
    # newline='' is what the csv module expects: without it, line breaks
    # embedded in quoted fields are translated before the reader sees them.
    with open(file_path, 'r', newline='', encoding=encoding) as file:
        reader = csv.reader(file)
        # Drop the header row; the default keeps an empty file from raising.
        next(reader, None)
        # csv.reader yields [] for blank lines, which have no column to read.
        return [row[1] for row in reader if row]

In [4]:
# Load the train and test CSV files. load_data keeps only column index 1 of
# each row and drops the header row, so each variable is a flat list of values.
train_data = load_data('./ANLP-2/train.csv')
test_data = load_data('./ANLP-2/test.csv')

In [None]:
# Special token registered as the default index when POSDataset builds its own
# vocabulary.
OUT_OF_VOCAB = "<OOV>"
# Special token whose vocabulary index POSDataset.format uses as padding value.
PAD_TAG = "<PAD>"

class POSDataset(Dataset):
    """Dataset of tokenised sentences paired with labels, indexed via a vocabulary.

    Args:
        sentences: Sequence of sentences, each a list of string tokens (they
            are passed to ``build_vocab_from_iterator`` and
            ``Vocab.lookup_indices``).
        labels: Sequence of labels aligned with ``sentences``; each one is
            wrapped with ``torch.tensor`` on access.
        vocabulary: Optional pre-built vocabulary. When omitted, one is built
            from ``sentences`` with ``OUT_OF_VOCAB`` and ``PAD_TAG`` as special
            tokens and ``OUT_OF_VOCAB`` as the default index.
    """

    def __init__(self, sentences, labels, vocabulary: Optional[Vocab] = None):
        self.sentences = sentences
        self.labels = labels
        if vocabulary is None:
            self.vocab = build_vocab_from_iterator(self.sentences, specials=[OUT_OF_VOCAB, PAD_TAG])
            # Tokens missing from the vocabulary resolve to the <OOV> index.
            self.vocab.set_default_index(self.vocab[OUT_OF_VOCAB])
        else:
            self.vocab = vocabulary

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(token_indices, label)`` for sentence ``idx`` as two tensors."""
        return torch.tensor(self.vocab.lookup_indices(self.sentences[idx])), torch.tensor(self.labels[idx])

    def format(self, batch, encodings) -> List[Tuple[torch.Tensor, torch.Tensor]]:
        """Pad a batch of samples and encode every padded sentence.

        Args:
            batch: Iterable of ``(token_indices, label)`` pairs as produced by
                ``__getitem__``.
            encodings: Callable applied to each padded 1-D index tensor.
                NOTE(review): presumably an embedding / encoder module that
                returns one vector per token - confirm against the caller.

        Returns:
            A list with one ``(encoded_sentence, label)`` pair per sample, in
            batch order.
        """
        sentences, pos = zip(*batch)
        # Pad every sentence to the longest one in the batch; batch_first=True
        # gives the (batch, max_len) layout directly, no axis swap needed.
        padded = pad_sequence(list(sentences), batch_first=True, padding_value=self.vocab[PAD_TAG])

        # Collect the encoder outputs in a fresh list. Writing them back into
        # `padded` (an integer index tensor holding one scalar per token) fails
        # on a shape mismatch, or silently truncates to integers, as soon as
        # `encodings` returns vectors.
        encoded = [encodings(row) for row in padded]

        return list(zip(encoded, pos))

        


In [6]:
class Preprocess():
    """Text normalisation, tokenisation and vocabulary lookup helpers."""

    def __init__(self, data):
        # Stored as-is; the methods below work on the data passed to them.
        self.data = data

    def tokenize(self, data):
        """Normalise every text in ``data`` and split it with ``word_tokenize``.

        Per text, in this order: backslashes become spaces, digit runs become
        ``<NUMBER>``, the text is lower-cased, and every character that is
        neither a word character nor whitespace becomes a space. Because the
        last two steps also hit the placeholder, what reaches the tokenizer
        for a digit run is the bare word ``number``.

        Returns:
            A list with one token list per input text.
        """
        def _normalise(raw):
            cleaned = re.sub(r'\\', ' ', raw)
            cleaned = re.sub(r'\d+', '<NUMBER>', cleaned).lower()
            # remove punctuation
            return re.sub(r'[^\w\s]', ' ', cleaned)

        return [word_tokenize(_normalise(text)) for text in data]

    def text_to_indices(self, tokenized_data, vocab):
        """Map every token of every sentence through ``vocab[token]``.

        Returns:
            A list of index lists with the same nesting as ``tokenized_data``.
        """
        return [[vocab[token] for token in sentence] for sentence in tokenized_data]

In [8]:
# Read the vocabulary object from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only
# load vocab.pkl when it comes from a trusted source.
# NOTE(review): no visible cell writes vocab.pkl; presumably it was produced by
# an earlier run or another notebook - confirm so the notebook re-runs cleanly.
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [9]:
# Convert the train and test texts to lists of vocabulary indices.
preprocess = Preprocess(train_data)
tokenized_data = preprocess.tokenize(train_data)
indexed_data_train = preprocess.text_to_indices(tokenized_data, vocab)

# tokenized_data is rebound here: from this point on it holds the test tokens.
tokenized_data = preprocess.tokenize(test_data)
indexed_data_test = preprocess.text_to_indices(tokenized_data, vocab)

In [None]:
class News_Classifier