In [2]:
import torch 
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import sys 
sys.path.append('../ELMo')
from ELMO_used import ELMo
import wandb
import re
import pandas as pd 
from preprocessing import tokenize, CharLevelVocab, WordLevelVocab

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
char_vocab = torch.load('../hindi_nli/Marathi_ELMo/char_vocab_marathi.pt')
word_vocab = torch.load('../hindi_nli/Marathi_ELMo/word_vocab_marathi.pt')

In [5]:
elmo = ELMo(cnn_config = {'character_embedding_size': 16, 
                           'num_filters': 32, 
                           'kernel_size': 5, 
                           'max_word_length': 10, 
                           'char_vocab_size': char_vocab.num_chars}, 
             elmo_config = {'num_layers': 3,
                            'word_embedding_dim': 150,
                            'vocab_size': word_vocab.num_words}, 
             char_vocab_size = char_vocab.num_chars).to(device)

In [6]:
elmo.load_state_dict(torch.load('../hindi_nli/Marathi_ELMo/elmo_marathi.pt'))

<All keys matched successfully>

In [7]:
m_train_df = pd.read_table('train_iob.txt')
m_train_df

Unnamed: 0,words,labels,sentence_id
0,यादरम्यान,O,1.0
1,वर्षानुवर्षे,O,1.0
2,विसर्जनानंतर,O,1.0
3,स्वच्छता,O,1.0
4,करणाऱ्यांच्या,O,1.0
...,...,...,...
199251,पोलिसांना,O,21501.0
199252,क्वॉरंटाइन,O,21501.0
199253,करण्यात,O,21501.0
199254,आले,O,21501.0


In [8]:
# make a df with sentence (list of words), labels (list of labels) based on the sentence_id
# loop throughb the df, and append words to a temp sentence_list and labels to a temp label_list as long as sentence_id is same 
# append that to a new df and reset the temp lists
# do this for the entire df

def make_sentence_df(df):
    sentence_list = []
    labels_list = []
    temp_sentence = []
    temp_labels = []
    for i in range(1, len(df)):
        if df['sentence_id'][i] == df['sentence_id'][i-1]:
            temp_sentence.append(df['words'][i])
            temp_labels.append(df['labels'][i])
        else:
            sentence_list.append(temp_sentence)
            labels_list.append(temp_labels)
            temp_sentence = []
            temp_labels = []
    return pd.DataFrame({'sentence': sentence_list, 'labels': labels_list})

sentence_m_train_df = make_sentence_df(m_train_df)
sentence_m_train_df

Unnamed: 0,sentence,labels
0,"[वर्षानुवर्षे, विसर्जनानंतर, स्वच्छता, करणाऱ्य...","[O, O, O, O, O, O, O, O, O, O, O]"
1,"[रैना, त्याला, मिळालेल्या, हॉटेलमधील, रुमबद्दल...","[BNEP, O, O, O, O, O, O]"
2,"[एकूणच, स्थलांतरितांच्या, भावना, स्पष्ट, करणार...","[O, O, O, O, O, O]"
3,"[ग्रामीण, मधील, ५०५, शहरातील, १९०, मालेगाव, शह...","[O, O, BNEM, O, BNEM, BNEL, O, BNEM, O, O, O]"
4,"[त्याला, अर्जुन, पुरस्कार, मिळाल्याची, बातमीही...","[O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
21488,"[नवीन, फोनसाठी, ई-कॉमर्स, वेबसाइटवर, मायक्रो, ...","[O, O, O, O, O, O, O, O]"
21489,"[लसीच्या, घोषणेनंतर, जागतिक, कमॉडिटी, बाजारात,...","[O, O, O, O, O, O, O, O, O, O, O, O]"
21490,"[पुलवामा, हल्ल्यातील, आत्मघातकी, हल्लेखोराच्या...","[BNEL, O, O, O, O, O, O, O, O]"
21491,"[सुशांत, प्रियांकाविरुद्ध, रियाला, सांगताना, द...","[BNEP, BNEP, BNEP, O, O, O]"


In [9]:
train_df = sentence_m_train_df

In [10]:
# train_df = pd.read_json('train.json')
# train_df

In [11]:
# get unique values in ner_tags column
ner_tags = set()
for tags in train_df['labels']:
    for tag in tags:
        ner_tags.add(tag)

ner_tags = list(ner_tags)
ner_tags

['BNEO',
 'INETI',
 'BNEM',
 'BED',
 'BNEL',
 'INEP',
 'BNETI',
 'IED',
 'INEM',
 'BNEP',
 'INED',
 'INEL',
 'INEO',
 'O',
 'BNED']

In [12]:
# repeat the process for validation and test data
m_val_df = pd.read_table('valid_iob.txt')
m_val_df

sentence_m_val_df = make_sentence_df(m_val_df)


val_df = sentence_m_val_df

sentence_m_val_df

Unnamed: 0,sentence,labels
0,"[चक्रवर्ती, खरंच, गायब, झाली, का]","[INEP, O, O, O, O]"
1,"[नियंत्रण, रेषेवर, जानेवारी, महिन्यात, दहशतवाद...","[O, O, BNED, O, O, O, O, O, O, O, O]"
2,"[फक्त, १४५००, मेगावॉटची, मागणी, आहे]","[O, BNEM, O, O, O]"
3,"[भवानी, पेठेतील, मुलीच्या, आईने, तक्रार, दिली,...","[BNEL, BNEL, O, O, O, O, O]"
4,"[साथीच्या, काळात, जवळपास, सर्वच, आमदार, खासदार...","[O, O, O, O, O, O, O, O, O, O]"
...,...,...
1494,"[प्रोसेसिंग, फी, आकारली, जात, नाही]","[O, O, O, O, O]"
1495,"[करोना, रुग्ण, बरे, होण्याचे, प्रमाण, ७१]","[O, O, O, O, O, BNEM]"
1496,"[विषाणूचा, मेंदूच्या, पेशींवर, परिणाम, होऊ, शक...","[O, O, O, O, O, O, O, O, O, O, O, O]"
1497,"[महापालिका, हद्दीतील, जास्तीज, जास्त, नागरिकां...","[INEO, O, O, O, O, O, O, O, O, O, O, O]"


In [13]:
# test
m_test_df = pd.read_table('test_iob.txt')

sentence_m_test_df = make_sentence_df(m_test_df)

test_df = sentence_m_test_df

sentence_m_test_df

Unnamed: 0,sentence,labels
0,"[या, दहशतवादी, संघटनेने, या, हल्ल्याची, जबाबदा...","[O, O, O, O, O, O, O, O]"
1,"[पाहणी, करण्यासाठी, पहिल्यादांचा, ड्रोनचा, वाप...","[O, O, O, O, O, O, O, O]"
2,"[या, फोनमध्ये, २९, मेगापिक्सलचा, कॅमेरा, दिला,...","[O, O, BNEM, O, O, O, O]"
3,"[देशांमध्येही, फिलिपिनी, नर्सेस, खालोखाल, भारत...","[O, O, O, O, O, O, O, O, O]"
4,"[असतानाही, दुहेरी, हत्याकांड, घडले, आहे]","[O, O, O, O, O]"
...,...,...
1993,"[काही, दिवसांमध्ये, आयपीएलला, सुरुवात, होणार, ...","[O, O, O, O, O, O]"
1994,"[सुरक्षा, रक्षकांपासून, ते, पुजाऱ्यांपर्यंत, अ...","[O, O, O, O, O, O, O, O]"
1995,"[अन्य, नेटवर्कवर, कॉलिंग, साठी, या, प्लानमध्ये...","[O, O, O, O, O, O, BNEM, INEM, O, O, O]"
1996,"[या, अमेरिकी, यानाकडून, सातत्याने, मोहिती, पाठ...","[O, O, O, O, O, O, O]"


In [14]:
# val_df = pd.read_json('validation.json')
# val_df

In [15]:
# get thee list of unique labels from the ner column in test_df and train_df
labels = list(set([label for sublist in train_df.labels.tolist() for label in sublist] + [label for sublist in test_df.labels.tolist() for label in sublist] + [label for sublist in val_df.labels.tolist() for label in sublist]))

# remove ' B-ORG'
# labels.remove(' B-ORG')

# labels = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']

In [16]:
labels

['BNEO',
 'INETI',
 'BNEM',
 'BED',
 'BNEL',
 'INEP',
 'BNETI',
 'IED',
 'INEM',
 'BNEP',
 'INED',
 'INEL',
 'INEO',
 'O',
 'BNED']

In [17]:
# # remove any tokens in the test_df and train_df which are not in the character_list.txt file 
# char_list = open('../ELMo/tamil_characters.txt').read().split('\n')
# char_list = [char for char in char_list if char != '']
# # char_list

In [18]:
# def clean_tokens(tokens, acceptable_chars):
#     cleaned_tokens = []
#     for token in tokens:
#         cleaned_token = re.sub(f'[^{acceptable_chars}]', '', token)
#         if cleaned_token != '':
#             cleaned_tokens.append(cleaned_token)
#     return cleaned_tokens

# # acceptable_chars = set(char_list)

# train_df['tokens'] = train_df['words'].apply(lambda x: clean_tokens(x, acceptable_chars))
# test_df['tokens'] = test_df['words'].apply(lambda x: clean_tokens(x, acceptable_chars))
# val_df['tokens'] = val_df['words'].apply(lambda x: clean_tokens(x, acceptable_chars))

# # # replace all instances of ' B-ORG' with 'B-ORG' in the ner column of test_df and train_df
# # train_df['ner'] = train_df['ner'].apply(lambda x: [label.replace(' B-ORG', 'B-ORG') for label in x])
# # test_df['ner'] = test_df['ner'].apply(lambda x: [label.replace(' B-ORG', 'B-ORG') for label in x])
# # val_df['ner'] = val_df['ner'].apply(lambda x: [label.replace(' B-ORG', 'B-ORG') for label in x])

# train_df

In [19]:
# # add a column which contains the length of thr words column

# train_df['token_length'] = train_df['tokens'].apply(lambda x: len(x))
# train_df['words_length'] = train_df['words'].apply(lambda x: len(x))

# train_df

In [20]:
# rename the sentences column to tokens 
train_df = train_df.rename(columns = {'sentence': 'tokens', 'labels': 'ner_tags'})
val_df = val_df.rename(columns = {'sentence': 'tokens', 'labels': 'ner_tags'})
test_df = test_df.rename(columns = {'sentence': 'tokens', 'labels': 'ner_tags'})



In [21]:
train_sentences = train_df['tokens'].tolist()
test_sentences = test_df['tokens'].tolist()
val_sentences = val_df['tokens'].tolist()

train_labels = train_df['ner_tags'].tolist()
test_labels = test_df['ner_tags'].tolist()
val_labels = val_df['ner_tags'].tolist()

# change the train labels to their corresponding index in the labels list
labels

['BNEO',
 'INETI',
 'BNEM',
 'BED',
 'BNEL',
 'INEP',
 'BNETI',
 'IED',
 'INEM',
 'BNEP',
 'INED',
 'INEL',
 'INEO',
 'O',
 'BNED']

In [22]:
print(len(train_sentences), len(train_labels))
print(len(test_sentences), len(test_labels))
print(len(val_sentences), len(val_labels))

21493 21493
1998 1998
1499 1499


In [23]:
labels_to_idx = {label: idx for idx, label in enumerate(labels)}

# add a pad_tag, end_tag, start_tag and oov_tag to the labels_to_idx dictionary
labels_to_idx['<PAD>'] = len(labels_to_idx)
labels_to_idx['<EOS>'] = len(labels_to_idx)
labels_to_idx['<BOS>'] = len(labels_to_idx)
labels_to_idx['<OOV>'] = len(labels_to_idx)

labels_to_idx

{'BNEO': 0,
 'INETI': 1,
 'BNEM': 2,
 'BED': 3,
 'BNEL': 4,
 'INEP': 5,
 'BNETI': 6,
 'IED': 7,
 'INEM': 8,
 'BNEP': 9,
 'INED': 10,
 'INEL': 11,
 'INEO': 12,
 'O': 13,
 'BNED': 14,
 '<PAD>': 15,
 '<EOS>': 16,
 '<BOS>': 17,
 '<OOV>': 18}

In [24]:
# change train_labels to their corresponding index using labels_to_idx 
train_labels = [[labels_to_idx[label] for label in sublist] for sublist in train_labels]
test_labels = [[labels_to_idx[label] for label in sublist] for sublist in test_labels]
val_labels = [[labels_to_idx[label] for label in sublist] for sublist in val_labels]

In [25]:
OUT_OF_VOCAB = '<OOV>'
PAD_TAG = '<PAD>'
START_TAG = '<BOS>'
END_TAG = '<EOS>'

from preprocessing import tokenize, CharLevelVocab, WordLevelVocab

class NERDataset(Dataset):
    def __init__(self, sentences, labels, word_vocab: WordLevelVocab, char_vocab: CharLevelVocab, labels_to_idx: dict, max_word_length: int = 15, max_sentence_length: int = 15):
        self.sentences = sentences # these are already cleaned and tokenised - only contained tokens with characters in the character_list.txt file
        self.labels = labels
        self.word_vocab = word_vocab
        self.char_vocab = char_vocab
        self.labels_to_idx = labels_to_idx
        self.max_word_length = max_word_length
        self.max_sentence_length = max_sentence_length

    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        return [torch.tensor([self.char_vocab.char_to_index(char) for char in word], dtype=torch.long) for word in sentence], torch.tensor(self.labels[idx], dtype=torch.long)

    # def __getitem__(self, idx):
    #     sentence = self.sentences[idx]
    #     labels = self.labels[idx]

    #     # convert the sentence to word and character indices
    #     word_indices = self.word_vocab.word_to_index(sentence)
    #     char_indices = self.char_vocab.word_to_index(sentence)

    #     # pad the word and character indices
    #     word_indices = self.pad_sequence(word_indices, self.word_vocab.word_to_index(PAD_TAG), self.max_sentence_length)
    #     char_indices = [self.pad_sequence(word, self.char_vocab.char_to_index(PAD_TAG), self.max_word_length) for word in char_indices]
    #     char_indices = self.pad_sequence(char_indices, [self.char_vocab.char_to_index(PAD_TAG)] * self.max_word_length, self.max_sentence_length)

    #     # convert the labels to indices
    #     labels = self.pad_sequence(labels, self.labels_to_idx[PAD_TAG], self.max_sentence_length)

    #     return torch.tensor(word_indices), torch.tensor(char_indices), torch.tensor(labels)

    def collate_fn(self, batch):
        sentences, labels = zip(*batch)

        bos_token = []
        for c in START_TAG:
            bos_token.append(self.char_vocab.char_to_index(c))
        bos_token = torch.tensor(bos_token, dtype=torch.long)

        eos_token = []
        for c in END_TAG:
            eos_token.append(self.char_vocab.char_to_index(c))
        eos_token = torch.tensor(eos_token, dtype=torch.long)

        pad_token = []
        for c in PAD_TAG:
            pad_token.append(self.char_vocab.char_to_index(c))
        pad_token = torch.tensor(pad_token, dtype=torch.long)

        sentences = [[bos_token] + list(sentence) + [eos_token] for sentence in sentences]

        sentences = [sentence[:self.max_sentence_length] + [pad_token] * (self.max_sentence_length - len(sentence)) for sentence in sentences]

        for i in range(len(sentences)):
            for j in range(len(sentences[i])):
                sentences[i][j] = torch.cat([sentences[i][j][:self.max_word_length], torch.tensor([self.char_vocab.char_to_index(PAD_TAG)]*(self.max_word_length - len(sentences[i][j])), dtype=torch.long)])
        
        sentences = torch.stack([torch.stack(sentence) for sentence in sentences])

        labels = [[self.labels_to_idx[START_TAG]] + list(label) + [self.labels_to_idx[END_TAG]] for label in labels]

        labels = [label[:self.max_sentence_length] + [self.labels_to_idx[PAD_TAG]] * (self.max_sentence_length - len(label)) for label in labels]
        labels = torch.tensor(labels, dtype=torch.long)

        return sentences, labels

In [26]:
ner_train_dataset = NERDataset(train_sentences, train_labels, word_vocab, char_vocab, labels_to_idx, max_word_length=10, max_sentence_length=15)
ner_test_dataset = NERDataset(test_sentences, test_labels, word_vocab, char_vocab, labels_to_idx, max_word_length=10, max_sentence_length=15)
ner_val_dataset = NERDataset(val_sentences, val_labels, word_vocab, char_vocab, labels_to_idx, max_word_length=10, max_sentence_length=15)

#check the dataset
ner_train_dataset[0]

([tensor([13,  4,  3, 36,  2, 11, 25, 13,  4,  3, 36,  5]),
  tensor([13, 14, 12,  4,  3, 26, 11,  2, 11, 16,  6,  4]),
  tensor([12,  3, 13, 19,  3, 66,  6,  2]),
  tensor([ 9,  4, 20,  2, 61,  3,  7,  2, 16, 19,  3,  7,  2]),
  tensor([15, 11,  2, 15, 34,  3,  7,  5]),
  tensor([ 9,  4, 22, 11,  2, 15, 25, 35,  5]),
  tensor([13,  2, 54, 10,  5, 10,  3,  7,  2]),
  tensor([18,  3, 10,  2, 12,  3, 29, 14,  9]),
  tensor([13,  2, 18,  4,  2, 19,  8]),
  tensor([37,  8,  6,  8]),
  tensor([23, 17,  5])],
 tensor([13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]))

In [27]:
ner_test_dataset[0]

([tensor([7, 2]),
  tensor([21, 17, 28,  6, 13,  2, 21,  8]),
  tensor([12, 16, 41, 29, 11,  5, 11,  5]),
  tensor([7, 2]),
  tensor([17, 10,  3, 10,  3,  7,  2, 19,  8]),
  tensor([26, 32,  2, 32, 21,  2,  4,  8]),
  tensor([41,  5,  6, 10,  8]),
  tensor([17, 22,  6,  8])],
 tensor([13, 13, 13, 13, 13, 13, 13, 13]))

In [28]:
# make the dataloaders
ner_train_dataloader = DataLoader(ner_train_dataset, batch_size=32, shuffle=True, collate_fn=ner_train_dataset.collate_fn)
ner_test_dataloader = DataLoader(ner_test_dataset, batch_size=32, shuffle=True, collate_fn=ner_test_dataset.collate_fn)
ner_val_dataloader = DataLoader(ner_val_dataset, batch_size=32, shuffle=True, collate_fn=ner_val_dataset.collate_fn)

In [29]:
# chekc if the dataloaders are working
for batch in ner_train_dataloader:
    print(batch)
    break

(tensor([[[82, 90, 83,  ...,  0,  0,  0],
         [ 9, 56, 10,  ...,  4,  5,  0],
         [15,  8,  0,  ...,  0,  0,  0],
         ...,
         [82, 87, 88,  ...,  0,  0,  0],
         [82, 87, 88,  ...,  0,  0,  0],
         [82, 87, 88,  ...,  0,  0,  0]],

        [[82, 90, 83,  ...,  0,  0,  0],
         [13, 14,  6,  ...,  0,  0,  0],
         [ 9,  3, 36,  ...,  2,  6,  0],
         ...,
         [82, 91, 83,  ...,  0,  0,  0],
         [82, 87, 88,  ...,  0,  0,  0],
         [82, 87, 88,  ...,  0,  0,  0]],

        [[82, 90, 83,  ...,  0,  0,  0],
         [10,  9,  3,  ..., 13,  4,  0],
         [ 1,  1, 83,  ...,  0,  0,  0],
         ...,
         [82, 87, 88,  ...,  0,  0,  0],
         [82, 87, 88,  ...,  0,  0,  0],
         [82, 87, 88,  ...,  0,  0,  0]],

        ...,

        [[82, 90, 83,  ...,  0,  0,  0],
         [32,  5, 35,  ...,  0,  0,  0],
         [26, 14, 10,  ...,  2,  6,  8],
         ...,
         [17, 22,  6,  ...,  0,  0,  0],
         [82, 91, 83,

In [30]:
class NERModel(nn.Module):
    def __init__(self, elmo, embedding_dim, hidden_dim_1, hidden_dim_2, num_classes):
        super(NERModel, self).__init__()
        self.elmo = elmo
        self.lambdas = nn.Parameter(torch.randn(3))
        self.lstm = nn.LSTM(embedding_dim, hidden_dim_1, batch_first=True)
        self.fc = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.output = nn.Linear(hidden_dim_2, num_classes)
        self.non_linearity = nn.ReLU()

        for param in self.elmo.parameters():
            param.requires_grad = False


    def forward(self, x):
        forward_output, backward_output, final_embeddings = self.elmo(x)
        # print the types
        # print(f"forward_output type: {type(forward_output)}")
        # print(f"backward_output type: {type(backward_output)}")
        # print(f"final_embeddings type: {type(final_embeddings)}")

        encoding = torch.zeros_like(final_embeddings[0])
        # print(f"encoding shape after initialization: {encoding.shape}")

        for i in range(3):
            encoding += self.lambdas[i] * final_embeddings[i]
        # print(f"encoding shape after loop: {encoding.shape}")

        lstm_output, _ = self.lstm(encoding)
        # print(f"lstm_output shape: {lstm_output.shape}")

        fc_output = self.fc(lstm_output.contiguous().view(-1, lstm_output.shape[2]))
        # print(f"fc_output shape after fc layer: {fc_output.shape}")

        fc_output = self.non_linearity(fc_output)
        # print(f"fc_output shape after non-linearity: {fc_output.shape}")

        output = self.output(fc_output)
        # print(f"output shape: {output.shape}")

        return output

In [31]:
def train_ner(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs, device):
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        predictions = []
        true_labels = []
        val_predictions = []
        val_true_labels = []
        loss_val = 0

        for batch in tqdm(train_dataloader):
            sentences, labels = batch
            sentences, labels = sentences.to(device), labels.to(device)

            optimizer.zero_grad()

            # print shapes of sentences and labels
            # print(f"sentences shape: {sentences.shape}")
            # print(f"labels shape: {labels.shape}")

            output = model(sentences)

            loss = criterion(output.view(-1, output.shape[-1]), labels.view(-1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            predictions.extend(output.argmax(dim=-1).view(-1).tolist())
            true_labels.extend(labels.view(-1).tolist())

        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                sentences, labels = batch
                sentences, labels = sentences.to(device), labels.to(device)

                output = model(sentences)

                loss = criterion(output.view(-1, output.shape[-1]), labels.view(-1))
                val_loss += loss.item()
                val_predictions.extend(output.argmax(dim=-1).view(-1).tolist())
                val_true_labels.extend(labels.view(-1).tolist())

        wandb.log({'Train Loss': train_loss/len(train_dataloader), 'Val Loss': val_loss/len(val_dataloader)})
        print(f'Epoch: {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_dataloader)}, Val Loss: {val_loss/len(val_dataloader)}')
        

In [32]:
model_ner = NERModel(elmo, 300, 150, 100, len(labels_to_idx))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_ner.parameters(), lr=0.0005)

In [33]:
config = {'model': 'ELMo', 'language': 'Marathi', 'epochs': 10, 'batch_size': 32, 'lr': 0.0005}

wandb.init(project='INLP-Project-ELMo', group='NER', name='Marathi with ELMo', config=config)
wandb.watch(model_ner)
          
train_ner(model_ner, ner_train_dataloader, ner_val_dataloader, optimizer, criterion, 10, device)

wandb.join()

# save the model
torch.save(model_ner.state_dict(), 'ner_model_marathi_elmo.pt')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mnandarajiv[0m ([33mproject-ai-scream[0m). Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 672/672 [00:48<00:00, 13.87it/s]
100%|██████████| 47/47 [00:03<00:00, 14.62it/s]


Epoch: 1/10, Train Loss: 0.3123381589095862, Val Loss: 0.17282879796433956


100%|██████████| 672/672 [00:47<00:00, 14.25it/s]
100%|██████████| 47/47 [00:03<00:00, 14.27it/s]


Epoch: 2/10, Train Loss: 0.1491884539524714, Val Loss: 0.13357453571355088


100%|██████████| 672/672 [00:47<00:00, 14.10it/s]
100%|██████████| 47/47 [00:03<00:00, 14.22it/s]


Epoch: 3/10, Train Loss: 0.12144575171017398, Val Loss: 0.1274186706447855


100%|██████████| 672/672 [00:46<00:00, 14.34it/s]
100%|██████████| 47/47 [00:03<00:00, 14.58it/s]


Epoch: 4/10, Train Loss: 0.1053201984946749, Val Loss: 0.12017979615546287


100%|██████████| 672/672 [00:47<00:00, 14.20it/s]
100%|██████████| 47/47 [00:03<00:00, 14.51it/s]


Epoch: 5/10, Train Loss: 0.09376769582186603, Val Loss: 0.11414399013874378


100%|██████████| 672/672 [00:47<00:00, 14.28it/s]
100%|██████████| 47/47 [00:03<00:00, 14.44it/s]


Epoch: 6/10, Train Loss: 0.08506280565190882, Val Loss: 0.11604854884616872


100%|██████████| 672/672 [00:47<00:00, 14.22it/s]
100%|██████████| 47/47 [00:03<00:00, 14.41it/s]


Epoch: 7/10, Train Loss: 0.07707048350545977, Val Loss: 0.11721687082280503


100%|██████████| 672/672 [00:47<00:00, 14.24it/s]
100%|██████████| 47/47 [00:03<00:00, 14.42it/s]


Epoch: 8/10, Train Loss: 0.07036440021779743, Val Loss: 0.11982634029489883


100%|██████████| 672/672 [00:47<00:00, 14.25it/s]
100%|██████████| 47/47 [00:03<00:00, 13.91it/s]


Epoch: 9/10, Train Loss: 0.06420431323238604, Val Loss: 0.12143072374957672


100%|██████████| 672/672 [00:47<00:00, 14.28it/s]
100%|██████████| 47/47 [00:03<00:00, 14.54it/s]


Epoch: 10/10, Train Loss: 0.058504576191938086, Val Loss: 0.12028853286136972


0,1
Train Loss,█▄▃▂▂▂▂▁▁▁
Val Loss,█▃▃▂▁▁▁▂▂▂

0,1
Train Loss,0.0585
Val Loss,0.12029


In [34]:
# model_ner = NERModel(elmo, 300, 150, 100, len(labels_to_idx))
# model_ner.load_state_dict(torch.load('ner_model_hindi_elmo.pt'))

In [45]:
model_ner.to(device)
def eval_ner(model, test_dataloader, device):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            sentences, labels = batch
            sentences, labels = sentences.to(device), labels.to(device)

            # flatten labels 
            labels = labels.view(-1)

            output = model(sentences)
            # print(f"output shape: {output.shape}")
            # print(f"labels shape: {labels.shape}")

            output = output.argmax(dim=-1)

            correct += (output == labels).sum().item()
            total += labels.numel()

    accuracy = correct/total
    return correct/total
    wandb.log({"accuracy": accuracy})

accuracy = eval_ner(model_ner, ner_test_dataloader, device)
accuracy
wandb.init(project="my_project", name="my_run")



  0%|          | 0/63 [00:00<?, ?it/s]

100%|██████████| 63/63 [00:04<00:00, 14.52it/s]


In [46]:
OUT_OF_VOCAB = '<OOV>'
PAD_TAG = '<PAD>'
START_TAG = '<BOS>'
END_TAG = '<EOS>'

class NERDataset_nonELMo(Dataset):
    def __init__(self, sentences, labels, word_vocab: WordLevelVocab, labels_to_idx: dict, max_sentence_length: int = 15):
        self.sentences = sentences
        self.all_labels = labels
        self.word_vocab = word_vocab
        self.max_sentence_length = max_sentence_length
        self.labels_to_idx = labels_to_idx

    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        # add padding and start and end tags to the sentence
        labels = self.all_labels[idx]
        return torch.tensor([self.word_vocab.word_to_index(word) for word in sentence]), torch.tensor(labels, dtype=torch.long)

    def collate_fn(self, batch):
        sentences, labels = zip(*batch)

        bos_token = word_vocab.word_to_index(START_TAG)
        eos_token = word_vocab.word_to_index(END_TAG)
        pad_token = word_vocab.word_to_index(PAD_TAG)

        sentences = [[bos_token] + list(sentence) + [eos_token] for sentence in sentences]
        sentences = [sentence[:self.max_sentence_length] + [pad_token] * (self.max_sentence_length - len(sentence)) for sentence in sentences]

        for i in range(len(sentences)):
            sentences[i] = torch.tensor(sentences[i], dtype=torch.long)

        # stack
        sentences = torch.stack(sentences)

        labels = [[labels_to_idx[START_TAG]] + list(label) + [labels_to_idx[END_TAG]] for label in labels]

        labels = [label[:self.max_sentence_length] + [labels_to_idx[PAD_TAG]] * (self.max_sentence_length - len(label)) for label in labels]
        labels = torch.tensor(labels, dtype=torch.long)

        return sentences, labels

In [47]:
ner_train_dataset_nonelmo = NERDataset_nonELMo(train_sentences, train_labels, word_vocab, labels_to_idx, max_sentence_length=15)
ner_test_dataset_nonelmo = NERDataset_nonELMo(test_sentences, test_labels, word_vocab, labels_to_idx, max_sentence_length=15)
ner_val_dataset_nonelmo = NERDataset_nonELMo(val_sentences, val_labels, word_vocab, labels_to_idx, max_sentence_length=15)

ner_train_dataloader_nonelmo = DataLoader(ner_train_dataset_nonelmo, batch_size=32, shuffle=True, collate_fn=ner_train_dataset_nonelmo.collate_fn)
ner_test_dataloader_nonelmo = DataLoader(ner_test_dataset_nonelmo, batch_size=32, shuffle=True, collate_fn=ner_test_dataset_nonelmo.collate_fn)
ner_val_dataloader_nonelmo = DataLoader(ner_val_dataset_nonelmo, batch_size=32, shuffle=True, collate_fn=ner_val_dataset_nonelmo.collate_fn)

In [48]:
# check 
for batch in ner_train_dataloader_nonelmo:
    print(batch)
    break

(tensor([[     2,    117,     21,    867,   3708,   1522,     21,    151,    867,
            687,      6,      3,      1,      1,      1],
        [     2,   3044,  73660, 103346,   3044,   8569,   3020,      6,      3,
              1,      1,      1,      1,      1,      1],
        [     2,    331,   2630,     18,   7586,      8,   2633,   1468,     25,
              3,      1,      1,      1,      1,      1],
        [     2,  96836,   2465,    364,     75,    112,    349, 141599,   3338,
           2242,   2825,      3,      1,      1,      1],
        [     2,  10234,    661,   3902,      7,      0, 122782,   9347,    327,
              3,      1,      1,      1,      1,      1],
        [     2,    238,  10508,  10447,     82,  61258,   7257,     50,      6,
              3,      1,      1,      1,      1,      1],
        [     2,  74534, 125186,     30,      0,  44529,  64039,    158,    406,
           1256,    171,   1951,      3,      1,      1],
        [     2,    701,  

In [49]:
class NERModel_nonELMo(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim_1, hidden_dim_2, num_classes):
        super(NERModel_nonELMo, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim_1, batch_first=True)
        self.fc = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.output = nn.Linear(hidden_dim_2, num_classes)
        self.non_linearity = nn.ReLU()

    def forward(self, x):
        embedding = self.embedding(x)
        lstm_output, _ = self.lstm(embedding)
        fc_output = self.fc(lstm_output.contiguous().view(-1, lstm_output.shape[2]))
        fc_output = self.non_linearity(fc_output)
        output = self.output(fc_output)
        return output

In [50]:
model_ner_nonelmo = NERModel_nonELMo(word_vocab.num_words, 300, 150, 100, len(labels_to_idx))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_ner_nonelmo.parameters(), lr=0.001)

In [51]:
def train_nonelmo(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs, device):
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        predictions = []
        true_labels = []
        val_predictions = []
        val_true_labels = []
        loss_val = 0

        for batch in tqdm(train_dataloader):
            sentences, labels = batch
            # print(type(sentences), type(labels))
            sentences, labels = sentences.to(device), labels.to(device)

            optimizer.zero_grad()

            output = model(sentences)

            # print shapes
            # print(f"output shape: {output.shape}")
            # print(f"labels shape: {labels.shape}")

            loss = criterion(output.view(-1, output.shape[-1]), labels.view(-1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            predictions.extend(output.argmax(dim=-1).view(-1).tolist())
            true_labels.extend(labels.view(-1).tolist())

        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                sentences, labels = batch
                sentences, labels = sentences.to(device), labels.to(device)

                output = model(sentences)

                loss = criterion(output.view(-1, output.shape[-1]), labels.view(-1))
                val_loss += loss.item()
                val_predictions.extend(output.argmax(dim=-1).view(-1).tolist())
                val_true_labels.extend(labels.view(-1).tolist())

        print(f'Epoch: {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_dataloader)}, Val Loss: {val_loss/len(val_dataloader)}')
        wandb.log({'Train Loss': train_loss/len(train_dataloader), 'Val Loss': val_loss/len(val_dataloader)})

In [52]:
config = {'model': 'No ELMo', 'language': 'Marathi', 'epochs': 10, 'batch_size': 32, 'lr': 0.001}
wandb.init(project = 'INLP-Project-ELMo', group = 'NER', name = 'Marathi without ELMo', config = config)
wandb.watch(model_ner_nonelmo)

train_nonelmo(model_ner_nonelmo, ner_train_dataloader_nonelmo, ner_val_dataloader_nonelmo, optimizer, criterion, 10, device)

wandb.join()
torch.save(model_ner_nonelmo.state_dict(), 'ner_model_marathi_noelmo.pt')

100%|██████████| 672/672 [00:08<00:00, 83.02it/s]
100%|██████████| 47/47 [00:00<00:00, 261.92it/s]


Epoch: 1/10, Train Loss: 0.326531137738909, Val Loss: 0.19919749666401682


100%|██████████| 672/672 [00:07<00:00, 89.60it/s]
100%|██████████| 47/47 [00:00<00:00, 325.65it/s]


Epoch: 2/10, Train Loss: 0.16491283616051078, Val Loss: 0.16348176306866585


100%|██████████| 672/672 [00:08<00:00, 83.89it/s]
100%|██████████| 47/47 [00:00<00:00, 341.09it/s]


Epoch: 3/10, Train Loss: 0.11541456140167568, Val Loss: 0.15897054700775348


100%|██████████| 672/672 [00:07<00:00, 84.26it/s]
100%|██████████| 47/47 [00:00<00:00, 342.56it/s]


Epoch: 4/10, Train Loss: 0.08434223938855298, Val Loss: 0.17020990366631367


100%|██████████| 672/672 [00:07<00:00, 90.48it/s]
100%|██████████| 47/47 [00:00<00:00, 330.01it/s]


Epoch: 5/10, Train Loss: 0.0641193699224719, Val Loss: 0.18620026190864278


100%|██████████| 672/672 [00:07<00:00, 85.06it/s]
100%|██████████| 47/47 [00:00<00:00, 323.06it/s]


Epoch: 6/10, Train Loss: 0.05187079091861267, Val Loss: 0.20222787051758867


100%|██████████| 672/672 [00:08<00:00, 82.83it/s]
100%|██████████| 47/47 [00:00<00:00, 328.87it/s]


Epoch: 7/10, Train Loss: 0.04393549410728849, Val Loss: 0.21858685875826694


100%|██████████| 672/672 [00:08<00:00, 82.87it/s]
100%|██████████| 47/47 [00:00<00:00, 326.83it/s]


Epoch: 8/10, Train Loss: 0.038990888118167366, Val Loss: 0.2360266572300424


100%|██████████| 672/672 [00:07<00:00, 84.02it/s]
100%|██████████| 47/47 [00:00<00:00, 343.27it/s]


Epoch: 9/10, Train Loss: 0.03504767857832901, Val Loss: 0.25464376490166846


100%|██████████| 672/672 [00:08<00:00, 83.76it/s]
100%|██████████| 47/47 [00:00<00:00, 285.08it/s]


Epoch: 10/10, Train Loss: 0.03325738032054644, Val Loss: 0.246055875686889


0,1
Train Loss,█▄▃▂▂▁▁▁▁▁
Val Loss,▄▁▁▂▃▄▅▇█▇

0,1
Train Loss,0.03326
Val Loss,0.24606


In [53]:
def eval_ner_nonelmo(model, test_dataloader, device):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for batch in test_dataloader:
            sentences, labels = batch
            sentences, labels = sentences.to(device), labels.to(device)

            # flatten labels 
            labels = labels.view(-1)

            output = model(sentences)
            # print(f"output shape: {output.shape}")
            # print(f"labels shape: {labels.shape}")

            output = output.argmax(dim=-1)

            correct += (output == labels).sum().item()
            total += labels.numel()

    accuracy = correct/total
    return correct/total
    # wandb.log({"accuracy": accuracy})

In [54]:
accuracy = eval_ner_nonelmo(model_ner_nonelmo, ner_test_dataloader_nonelmo, device)
accuracy

0.9542876209542877