In [16]:
import re
import string

import nltk
import numpy as np
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import word_tokenize

In [27]:
class Dataset(torch.utils.data.Dataset):
    """
        Sentence-pair dataset built from a tab-separated text file and a score file.

        Each line of the sentence file holds two tab-separated sentences; the
        first goes to the ``EN*`` attributes and the second to the ``ES*``
        attributes. Each line of the score file holds one number.

        Construction runs the whole pipeline: read -> tokenize -> clean ->
        build vocabulary -> replace rare / digit-bearing tokens with ``<OOV>``
        -> rebuild vocabulary -> pair the sentences up.

        Attributes set per language (prefix ``EN`` or ``ES``):
            <prefix>dataset        list of token lists, one per sentence
            <prefix>word2Index     dict token -> integer id
            <prefix>index2Word     list where position = id, value = token
            <prefix>vocab_size     number of distinct tokens
            <prefix>vocab          set of distinct tokens
            <prefix>wordFrequency  dict token -> occurrence count
            <prefix>words          all tokens of all sentences, flattened
            <prefix>words_indexes  ids of ``<prefix>words``

        Other attributes:
            scores         list of numbers, one per line of the score file
            combined_data  list of (EN token list, ES token list) tuples
    """

    def __init__(self, x_file_location, y_file_location):
        """
            Reads both files and runs the full preprocessing pipeline.

            Arguments:
                x_file_location (str): path of the tab-separated sentence file
                y_file_location (str): path of the score file
        """
        self.x_file_location = x_file_location
        self.y_file_location = y_file_location
        # Misspelled alias kept so code that reads the old attribute name still works.
        self.y_file_locaiton = y_file_location
        self.initialize_data()
        self.modify()
        self.combined_data = list()
        self.combineData()

    @staticmethod
    def _parse_score(raw_score):
        """Parses one score line: int when the text is an integer, float otherwise."""
        try:
            return int(raw_score)
        except ValueError:
            # Decimal scores such as "4.5" are not valid int literals.
            return float(raw_score)

    def _index_corpus(self, dataset):
        """
            Builds every vocabulary-derived structure for one language.

            Arguments:
                dataset (list): cleaned, tokenized sentences

            Returns:
                word2Index, index2Word, vocab_size, vocab, wordFrequency,
                flattened word list, flattened word-id list
        """
        word2Index, index2Word, vocab_size, vocab, wordFrequency = self.vocabBuilder(dataset)
        words = [word for sentence in dataset for word in sentence]
        words_indexes = [word2Index[word] for word in words]
        return (
            word2Index,
            index2Word,
            vocab_size,
            vocab,
            wordFrequency,
            words,
            words_indexes,
        )

    @staticmethod
    def _mask_rare_tokens(dataset, wordFrequency):
        """Replaces, in place, tokens seen fewer than 2 times or containing a digit with '<OOV>'."""
        for sentence in dataset:
            for position, token in enumerate(sentence):
                if wordFrequency[token] < 2 or any(
                    character.isdigit() for character in token
                ):
                    sentence[position] = '<OOV>'

    def initialize_data(self):
        """
            Reads the sentence and score files, then tokenizes, cleans and
            indexes both languages.

            Raises:
                ValueError: if a score line is not a number
                IndexError: if a sentence line has no tab-separated second field
        """
        # Explicit UTF-8 so accented characters decode the same way on every
        # platform instead of depending on the locale's default encoding.
        with open(self.x_file_location, "r", encoding="utf-8") as inFile:
            data = inFile.readlines()

        with open(self.y_file_location, "r", encoding="utf-8") as inFile:
            self.scores = [self._parse_score(score) for score in inFile.readlines()]

        enData = list()
        esData = list()

        for line in data:
            fields = line.strip().split("\t")
            enData.append(fields[0])
            esData.append(fields[1])

        self.ENdataset = self.cleaner(self.tokenizer(enData))
        (
            self.ENword2Index,
            self.ENindex2Word,
            self.ENvocab_size,
            self.ENvocab,
            self.ENwordFrequency,
            self.ENwords,
            self.ENwords_indexes,
        ) = self._index_corpus(self.ENdataset)

        self.ESdataset = self.cleaner(self.tokenizer(esData))
        (
            self.ESword2Index,
            self.ESindex2Word,
            self.ESvocab_size,
            self.ESvocab,
            self.ESwordFrequency,
            self.ESwords,
            self.ESwords_indexes,
        ) = self._index_corpus(self.ESdataset)

    def tokenizer(self, corpus):
        """
            tokenizes the corpus

            Every text is lower-cased, URLs / hashtags / mentions are replaced
            by the placeholders <URL> / <HASHTAG> / <MENTION>, runs of the same
            non-word character are collapsed to one, and every remaining
            non-alphanumeric character becomes its own token. Each sentence is
            wrapped in <SOS> ... <EOS>.

            Arguments:
                corpus (list)

            Returns:
                tokenized corpus (list)
        """
        hashtag_regex = "#[a-zA-Z0-9]+"
        url_regex = "((http|https)://)(www.)?[a-zA-Z0-9@:%._\\+~#?&//=]{2,256}\\.[a-z]{2,6}\\b([-a-zA-Z0-9@:%._\\+~#?&//=]*)"
        mention_regex = r"@\w+"
        placeholders = ["<HASHTAG>", "<URL>", "<MENTION>", "<OOV>"]
        # The capturing group makes re.split keep the placeholders in its result.
        placeholder_splitter = re.compile(r"(<HASHTAG>|<URL>|<MENTION>|<OOV>)")

        processed_corpus = list()

        for tweet in corpus:
            text = tweet.lower()
            # URLs go first: a URL may contain '#' or '@', which the hashtag and
            # mention patterns would otherwise cut out of the middle of it.
            text = re.sub(url_regex, "<URL>", text)
            text = re.sub(hashtag_regex, "<HASHTAG>", text)
            text = re.sub(mention_regex, "<MENTION>", text)
            text = re.sub(r"(\W)(?=\1)", "", text)

            cleaned_tokenized_tweet = ['<SOS>']
            for chunk in text.split():
                # A placeholder can be glued to other characters ("<MENTION>,");
                # split around it so it survives as a single token.
                for piece in placeholder_splitter.split(chunk):
                    if piece in placeholders:
                        cleaned_tokenized_tweet.append(piece)
                    else:
                        cleaned_tokenized_tweet.extend(
                            "".join(
                                (char if char.isalpha() or char.isnumeric() else f" {char} ")
                                for char in piece
                            ).split()
                        )
            cleaned_tokenized_tweet.append('<EOS>')
            processed_corpus.append(cleaned_tokenized_tweet)

        return processed_corpus

    def cleaner(self, corpus):
        """
            replacing !,?,. with . and removing other punctuations

            A token is dropped when it is empty or made up only of characters
            from string.punctuation; tokens that mix punctuation with other
            characters (e.g. <OOV>) are kept.

            Arguments:
                tokenized corpus (list)

            Returns:
                cleaned corpus (list)
        """
        sentence_enders = ("!", ".", "?")
        # Character set, so the test below is per character rather than a
        # substring search inside the punctuation string.
        punctuation_chars = frozenset(string.punctuation)

        cleaned_corpus = list()

        for sentence in corpus:
            new_sentence = list()
            for token in sentence:
                if token in sentence_enders:
                    new_sentence.append(".")
                elif all(char in punctuation_chars for char in token):
                    continue
                else:
                    new_sentence.append(token)

            cleaned_corpus.append(new_sentence)

        return cleaned_corpus

    def vocabBuilder(self, corpus):
        """
            Builds the vocabulary of the input dataset.

            Ids are assigned in order of first appearance, starting at 0.

            Arguments:
                The cleaned tokenized dataset

            Returns:
                Word to Index dict, Index to Word list, Number of Unique Words,
                Set of Vocab, Word to Frequency dict
        """
        word2Index = dict()
        index2Word = list()
        vocab = set()
        wordFrequency = dict()

        n_unique_words = 0

        for sentence in corpus:
            for word in sentence:
                vocab.add(word)
                if word not in word2Index:
                    word2Index[word] = n_unique_words
                    index2Word.append(word)
                    n_unique_words += 1
                    wordFrequency[word] = 1
                else:
                    wordFrequency[word] += 1

        return word2Index, index2Word, n_unique_words, vocab, wordFrequency

    def modify(self):
        """
            Replaces rare and digit-bearing tokens with <OOV> in both languages
            and rebuilds every vocabulary-derived attribute.

            Prints two vocabulary sizes as a side effect.
        """
        self._mask_rare_tokens(self.ENdataset, self.ENwordFrequency)

        # NOTE(review): this prints the EN size from BEFORE the rebuild below,
        # while the ES size at the end is printed AFTER its rebuild. Output is
        # kept as it was; confirm which of the two was intended.
        print(self.ENvocab_size)

        self.ENdataset = self.cleaner(self.ENdataset)
        (
            self.ENword2Index,
            self.ENindex2Word,
            self.ENvocab_size,
            self.ENvocab,
            self.ENwordFrequency,
            self.ENwords,
            self.ENwords_indexes,
        ) = self._index_corpus(self.ENdataset)

        self._mask_rare_tokens(self.ESdataset, self.ESwordFrequency)

        self.ESdataset = self.cleaner(self.ESdataset)
        (
            self.ESword2Index,
            self.ESindex2Word,
            self.ESvocab_size,
            self.ESvocab,
            self.ESwordFrequency,
            self.ESwords,
            self.ESwords_indexes,
        ) = self._index_corpus(self.ESdataset)

        print(self.ESvocab_size)

    def combineData(self):
        """Appends one (EN token list, ES token list) tuple per sentence pair to combined_data."""
        for idx in range(len(self.ENdataset)):
            self.combined_data.append((self.ENdataset[idx], self.ESdataset[idx]))

    def __len__(self):
        """Returns the number of sentences in the ES dataset."""
        return len(self.ESdataset)

    def __getitem__(self, index):
        """
            Returns the sample at ``index``.

            Returns:
                (numpy array of EN tokens, numpy array of ES tokens, score)
        """
        return (
            np.array(self.ENdataset[index]),
            np.array(self.ESdataset[index]),
            self.scores[index]
        )

In [21]:
# Build the dataset from the sentence-pair file and its matching score file.
# Re-run this cell after re-executing the Dataset class cell: an instance created
# from an earlier class definition keeps that definition's methods.
data = Dataset(
    x_file_location='../sts-2017-en-es/En_Es_STS/STS.input.en-es.train.txt',
    y_file_location='../sts-2017-en-es/En_Es_STS/STS.input.en-es.train_scores.txt',
)

4426
2066


In [28]:
# Pull the first sample through the iterator protocol as a quick sanity check.
sample_iterator = iter(data)
next(sample_iterator)

AttributeError: 'Dataset' object has no attribute 'sequence_length'