In [1]:
import nltk
import math
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
nltk.download('treebank')
nltk.download('punkt')
nltk.download('stopwords')
!pip install datasets

[nltk_data] Downloading package treebank to
[nltk_data]     /home/soumodiptab/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /home/soumodiptab/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/soumodiptab/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.




In [6]:
from datasets import load_dataset
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/soumodiptab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

NameError: name 'torch' is not defined

In [4]:
from nltk.corpus import treebank

In [7]:
dataset = load_dataset('sst')

No config specified, defaulting to: sst/default
Found cached dataset sst (/home/soumodiptab/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 2210
    })
})


In [9]:
train_sentences_raw = [nltk.word_tokenize(sentence) for sentence in dataset['train'][:]['sentence']]
train_labels = [label for label in dataset['train'][:]['label']]

In [10]:
val_sentences_raw = [nltk.word_tokenize(sentence) for sentence in dataset['validation'][:]['sentence']]
val_labels = [label for label in dataset['validation'][:]['label']]

In [11]:
test_sentences_raw = [nltk.word_tokenize(sentence) for sentence in dataset['test'][:]['sentence']]
test_labels = [label for label in dataset['test'][:]['label']]

In [12]:
stop_words = set(stopwords.words('english'))

In [13]:
def remove_stopwords(sentences):
    filtered_sentences = []

    for sentence in sentences:
        filtered_sentence = [word for word in sentence if word.lower() not in stop_words]
        filtered_sentences.append(filtered_sentence)

    return filtered_sentences

In [14]:
stemmer = PorterStemmer()

In [15]:
def stem(filtered_sentences):
    stemmed_sentences = []

    for sentence in filtered_sentences:
        stemmed_sentence = [stemmer.stem(word.lower()) for word in sentence]
        stemmed_sentences.append(stemmed_sentence)

    return stemmed_sentences

In [16]:
def pre_process(sentences):
    fil_sentances = remove_stopwords(sentences)
    stemmed_sentences = stem(fil_sentances)
    return stemmed_sentences

In [17]:
train_sentences = pre_process(train_sentences_raw)
val_sentences = pre_process(val_sentences_raw)
test_sentences = pre_process(test_sentences_raw)

In [18]:
def get_max_len(sentences):
    maxx = 0
    for sentance in sentences:
        maxx = max(len(sentance),maxx)
    return maxx

In [19]:
max_sentence_size = get_max_len(train_sentences)

In [20]:
max_sentence_size

38

In [21]:
def add_padding_normalize(list_list_words,max_sentence_size):
    for list_words in list_list_words:
        diff = max_sentence_size - len(list_words)
        if diff > 0:
            padding = [ "<pad>" for i in range(diff)]
            list_words.extend(padding)
        else:
            list_words[:] = list_words[:max_sentence_size]
    return list_list_words

In [22]:
train_sentences = add_padding_normalize(train_sentences,max_sentence_size)
val_sentences = add_padding_normalize(val_sentences,max_sentence_size)
test_sentences = add_padding_normalize(test_sentences,max_sentence_size)

In [23]:
print(len(train_sentences))
print(len(val_sentences))
print(len(test_sentences))

8544
1101
2210


In [24]:
from google.colab import drive
drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google.colab'

In [25]:
import torchtext
import torch
import torchtext.vocab as vocab
import numpy as np

In [26]:
vocab = torchtext.vocab.build_vocab_from_iterator(train_sentences, min_freq=1,specials=["<unk>"]) 
vocab.set_default_index(vocab["<unk>"])       

In [27]:
def read_emb(path):
    with open(path, encoding="utf-8") as file:
        word_embedding_dim = len(file.readline().split(" ")) - 1
    embeddings = np.zeros((len(vocab), word_embedding_dim))
    last_idx = 0

    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            word,*vector  = line.split()
            if word in vocab:
                  indx = vocab[word]
                  embeddings[indx] = vector
        return torch.FloatTensor(embeddings)

In [28]:
embeddings = read_emb('embeddings/glove/glove.6B.100d.txt')

In [29]:
word_to_idx = vocab.get_stoi()
idx_to_word = [word for word, idx in sorted(word_to_idx.items(), key=lambda x: x[1])]

In [30]:
len(vocab)

12059

In [31]:
len(word_to_idx)

12059

In [32]:
def vectorize_tokens(list_words):
    return [vocab[word] for word in list_words] 

In [33]:
def vectorize_dataset(list_list_words):
    x = []
    for list_words in list_list_words:
        words = vectorize_tokens(list_words)
        x.append(words)  
    return x

In [34]:
train_vector = vectorize_dataset(train_sentences)
val_vector = vectorize_dataset(val_sentences)
test_vector = vectorize_dataset(test_sentences)

In [38]:
len(train_vector[0])

38

AttributeError: 'list' object has no attribute 'shape'

In [39]:

import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [40]:
class ELMoDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.backward_context = [sentance[:-1] for sentance in self.data]
        self.forward_context = [sentance[1:] for sentance in self.data]
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.backward_context[idx], self.forward_context[idx]

In [41]:
train_dataset = ELMoDataset(torch.tensor(train_vector))
test_dataset = ELMoDataset(torch.tensor(test_vector))

In [42]:
batch_size = 32

In [43]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [44]:
embeddings.size()[1]

100

In [45]:
class ELMo(nn.Module):
    def __init__(self, vocab_size, embeddings, hidden_dim):
        super(ELMo, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim =  embeddings.size()[1]
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(embeddings)
        self.embedding.weight = nn.Parameter(self.embedding.weight, requires_grad=True)
        self.lstm1 = nn.LSTM(self.embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(hidden_dim*2, hidden_dim, batch_first=True, bidirectional=True)
        self.linear1 = nn.Linear(self.embedding_dim, hidden_dim)
        self.linear_out = nn.Linear(hidden_dim*2, vocab_size)

    def forward(self,back_data):
        back_embed = self.embedding(back_data)
        back_lstm1, _ = self.lstm1(back_embed)   #output : batch_size x max_len x hidden_dim*2
        back_lstm2, _ = self.lstm2(back_lstm1)
        linear_out = self.linear_out(back_lstm2)
        return linear_out

In [46]:
hidden_dim = 256
num_layers = 2
vocab_size = len(word_to_idx)

In [48]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(0)

<torch._C.Generator at 0x7f024c7f0bb0>

In [49]:
elmo = ELMo(vocab_size, embeddings, 100)
elmo.to(device)
print(elmo)

ELMo(
  (embedding): Embedding(12059, 100)
  (lstm1): LSTM(100, 100, batch_first=True, bidirectional=True)
  (lstm2): LSTM(200, 100, batch_first=True, bidirectional=True)
  (linear1): Linear(in_features=100, out_features=100, bias=True)
  (linear_out): Linear(in_features=200, out_features=12059, bias=True)
)


In [50]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(elmo.parameters(), lr=0.001)

In [51]:
elmo_losses = {'train': [], 'val': [], 'epoch' : []}
epochs = 4

In [52]:
for epoch in range(epochs):
    train_loss = 0
    elmo.train()
    for i, (forward_context, backward_context) in enumerate(train_loader):
        forward_context, backward_context = forward_context.to(device), backward_context.to(device)
        optimizer.zero_grad()
        output = elmo(backward_context)
        output = output.view(-1, len(vocab))
        target = forward_context.view(-1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        if i%500 == 0:
            print('Epoch: {}/{}'.format(epoch+1, epochs), 'Step: {}'.format(i), 'Loss: {}'.format(loss.item()), 'Train Loss: {}'.format(train_loss/(i+1)))

Epoch: 1/4 Step: 0 Loss: 9.3722505569458 Train Loss: 9.3722505569458
