<a href="https://colab.research.google.com/github/ratmcu/wiki_ner/blob/master/wiki_ner_loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Loader

In [0]:
'''
An entry (one sentence) looks like ...
SOCCER NN B-NP O
- : O O
JAPAN NNP B-NP B-LOC
GET VB B-VP O
LUCKY NNP B-NP O
WIN NNP I-NP O
, , O O
CHINA NNP B-NP B-PER
IN IN B-PP O
SURPRISE DT B-NP O
DEFEAT NN I-NP O
. . O O
Each mini-batch returns the following:
words: list of input sents. ["The 26-year-old ...", ...]
x: encoded input sents. [N, T]. int64.
is_heads: list of head markers. [[1, 1, 0, ...], [...]]
tags: list of tags.['O O B-MISC ...', '...']
y: encoded tags. [N, T]. int64
seqlens: list of seqlens. [45, 49, 10, 50, ...]
'''
import numpy as np
import torch
import pandas as pd
from torch.utils import data
!pip install pytorch-pretrained-bert
from pytorch_pretrained_bert import BertTokenizer
# import traceback
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
# VOCAB = ('<PAD>', 'O', 'I-LOC', 'B-PER', 'I-PER', 'I-ORG', 'I-MISC', 'B-MISC', 'B-LOC', 'B-ORG')

# Entity types; each one contributes an inside ("I-") and a begin ("B-") label.
tags = ['BD', 'BP', 'PR', 'SP', 'CH', 'ED']
# Index 0 is the padding label and index 1 the outside label.
VOCAB_list = ['<PAD>', 'O']
for tag in tags:
    VOCAB_list.extend(('I-' + tag, 'B-' + tag))
VOCAB = tuple(VOCAB_list)
# Label <-> integer id lookups, following the order of VOCAB.
tag2idx = dict(zip(VOCAB, range(len(VOCAB))))
idx2tag = dict(enumerate(VOCAB))

class NerDataset(data.Dataset):
    """Sentence-level NER dataset read from a CoNLL-style text file.

    The file holds one token per line, with the word in the first
    whitespace-separated column and the tag in the last one; sentences are
    separated by a blank line.  Each sentence is stored wrapped in
    ``[CLS]`` / ``[SEP]``, and those two markers carry the ``<PAD>`` tag.
    """

    # Code points that, as the first character of a word, caused mismatches
    # between the token and is_head lengths; such words are skipped.
    _BAD_LEADING_ORDS = frozenset((65533, 8206, 150, 61656, 128, 157))

    def __init__(self, fpath):
        """
        fpath: [train|valid|test].txt
        """
        # Context manager so the file handle is released once the text is read.
        with open(fpath, 'r') as fin:
            entries = fin.read().strip().split("\n\n")
        sents, tags_li = [], []  # list of lists
        for entry in entries:
            # Lines with fewer than two columns carry no (word, tag) pair.
            rows = [cols for cols in map(str.split, entry.splitlines()) if len(cols) > 1]
            words = [cols[0] for cols in rows]
            tags = [cols[-1] for cols in rows]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<PAD>"] + tags + ["<PAD>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences held by the dataset."""
        return len(self.sents)

    def __getitem__(self, idx):
        """Encode sentence ``idx``.

        Returns ``(words, x, is_heads, tags, y, seqlen)`` where ``words`` and
        ``tags`` are the space-joined sentence and tag strings, ``x`` the
        word-piece ids, ``is_heads`` a 0/1 list marking the first piece of
        every word, ``y`` the tag ids (``<PAD>`` on non-head pieces) and
        ``seqlen`` the length of ``y``.

        Raises ValueError if the three encoded lists end up with different
        lengths.
        """
        words, tags = self.sents[idx], self.tags_li[idx]  # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], []  # list of ids
        is_heads = []  # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            if ord(w[0]) in self._BAD_LEADING_ORDS:
                continue
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            if not tokens:
                # No word pieces for this word: emitting a head marker and a
                # tag without any id would put x out of step with y/is_heads.
                continue
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0] * (len(tokens) - 1)

            piece_tags = [t] + ["<PAD>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in piece_tags]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        if not (len(x) == len(y) == len(is_heads)):
            # Explicit check (not `assert`) so it also runs under `python -O`.
            raise ValueError(
                f"len(x)={len(x)}, len(y)={len(y)}, len(is_heads)={len(is_heads)} "
                f"for words={words!r}, tags={tags!r}"
            )

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen

    def append(self, other):
        """Add all sentences and tag lists of ``other`` to this dataset, in place."""
        self.sents.extend(other.sents)
        self.tags_li.extend(other.tags_li)

def pad(batch, max_len=50):
    '''Collates samples into a batch of one common sequence length.

    The common length is ``min(max_len, longest seqlen in the batch)``.
    Shorter ``x`` / ``is_heads`` / ``y`` lists are right-padded with 0
    (the <PAD> id) and longer ones are cut to that length.

    batch: list of (words, x, is_heads, tags, y, seqlen) samples.
    max_len: upper bound on the common length (50 by default).

    Returns (words, x, is_heads, tags, y, seqlens): ``x`` and ``y`` are
    LongTensors, ``words`` and ``tags`` are passed through untouched, and
    ``seqlens`` repeats the common length once per sample (it is not the
    per-sample length).
    '''
    seqlens = [sample[-1] for sample in batch]
    # int() keeps the list arithmetic below on plain Python integers.
    maxlen = int(min(max_len, max(seqlens)))

    def fit(seq):
        # Zero-pad short sequences, cut long ones, so each has maxlen items.
        if len(seq) < maxlen:
            return seq + [0] * (maxlen - len(seq))
        return seq[:maxlen]

    words = [sample[0] for sample in batch]
    x = [fit(sample[1]) for sample in batch]
    is_heads = [fit(sample[2]) for sample in batch]
    tags = [sample[3] for sample in batch]
    y = [fit(sample[-2]) for sample in batch]

    return words, torch.LongTensor(x), is_heads, tags, torch.LongTensor(y), [maxlen for _ in batch]

def pad_max(batch):
    '''Collates samples, zero-padding ``x`` and ``y`` to the longest seqlen in the batch.

    ``words``, ``is_heads``, ``tags`` and ``seqlens`` are returned as plain
    per-sample lists; ``x`` and ``y`` come back as LongTensors.
    '''
    words, is_heads, tags, seqlens = (
        [sample[field] for sample in batch] for field in (0, 2, 3, -1)
    )
    longest = np.array(seqlens).max()

    def zero_fill(field):
        # 0 is the <pad> id
        return [sample[field] + [0] * (longest - len(sample[field])) for sample in batch]

    x = torch.LongTensor(zero_fill(1))
    y = torch.LongTensor(zero_fill(-2))
    return words, x, is_heads, tags, y, seqlens

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 35.5MB/s eta 0:00:01[K     |█████▎                          | 20kB 3.1MB/s eta 0:00:01[K     |████████                        | 30kB 4.0MB/s eta 0:00:01[K     |██████████▋                     | 40kB 3.0MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 3.5MB/s eta 0:00:01[K     |███████████████▉                | 61kB 4.1MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 4.5MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 4.6MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 5.1MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 5.0MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 5.0MB/s eta 0:00:01[K     |██████████████████████

100%|██████████| 213450/213450 [00:00<00:00, 1060742.61B/s]


In [0]:
#experiment_code
import os
import time
!pip install wget
import wget
import logging
import pickle
import ast
import pandas as pd
import numpy as np
import urllib
from bs4 import BeautifulSoup
import tarfile
# Download the dataset archive only when no local copy exists yet.
if not os.path.exists('dataset.tar.gz'):
    wget.download('https://github.com/ratmcu/wiki_ner/blob/master/dataset.tar.gz?raw=true')
# The context manager closes the archive even if extraction fails part-way.
with tarfile.open('dataset.tar.gz', mode='r') as tar:
    tar.extractall('./')

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9681 sha256=7a32906f0d82dddcf5fdb818e4769c129b75252515ab36b45f2e8878dabbc31b
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [0]:
def toConllTxt(path, save_file = None):
    """Convert a tagged CSV file into the CoNLL-style text file read by NerDataset.

    path: CSV file with ``words`` and ``tags`` columns.  A row whose word and
        tag are both ``'\\n'`` marks a sentence boundary and becomes an empty
        line; every other row becomes one ``"<word> <tag>"`` line.
    save_file: output path.  When omitted, the text is written next to the
        CSV as ``<csv file name>.txt``.

    Returns the path of the text file that was written.
    """
    df = pd.read_csv(path)
    if not save_file:
        save_file = os.path.join(os.path.dirname(path), os.path.basename(path) + '.txt')
    with open(save_file, 'w') as file:
        for word, tag in zip(df['words'], df['tags']):
            if word == '\n' and tag == '\n':
                file.write('\n')
            else:
                # str() covers cells that pandas parsed as non-strings
                # (e.g. NaN); the tag is expected to be a string already.
                file.write(str(word) + ' ' + tag + '\n')
    return save_file

In [0]:
#experiment_code
# Every .csv file found anywhere under ./dataset, in sorted order.
paths = sorted(
    os.path.join(root, fname)
    for root, _subdirs, files in os.walk('./dataset')
    for fname in files
    if os.path.splitext(fname)[-1] == '.csv'
)
import random
# Ten pages drawn at random (random.choices samples with replacement).
rand_paths = random.choices(paths, k=10)
# The first page seeds the dataset; the remaining ones are merged into it.
dataset = NerDataset(toConllTxt(rand_paths[0]))
for i, path in enumerate(rand_paths[1:]):
    print(path.split('/')[-2])
    txt_path = toConllTxt(path)
    print(txt_path)
    data_page = NerDataset(txt_path)
    dataset.append(data_page)
    print(len(data_page), ' ', i)
print(len(dataset))

Mihai Ghimpu
./dataset/politicians/Moldova/Mihai Ghimpu/conll_tagged.csv.txt
49   0
Bujar Nishani
./dataset/politicians/Albania/Bujar Nishani/conll_tagged.csv.txt
28   1
Natsagiin Bagabandi
./dataset/politicians/Mongolia/Natsagiin Bagabandi/conll_tagged.csv.txt
6   2
Shavkat Mirziyoyev
./dataset/politicians/Uzbekistan/Shavkat Mirziyoyev/conll_tagged.csv.txt
101   3
Pandeli Majko
./dataset/politicians/Albania/Pandeli Majko/conll_tagged.csv.txt
21   4
Vincent Auriol
./dataset/politicians/France/Vincent Auriol/conll_tagged.csv.txt
46   5
Erik Gustaf Boström
./dataset/politicians/Poland/Erik Gustaf Boström/conll_tagged.csv.txt
30   6
Gaafar Nimeiry
./dataset/politicians/South Sudan/Gaafar Nimeiry/conll_tagged.csv.txt
80   7
Sai Mauk Kham
./dataset/politicians/Myanmar/Sai Mauk Kham/conll_tagged.csv.txt
11   8
413


### **testing dataloader on all pages**

In [0]:
#experiment_code
# Every .csv file found anywhere under ./dataset, in sorted order.
paths = sorted(
    os.path.join(root, fname)
    for root, _subdirs, files in os.walk('./dataset')
    for fname in files
    if os.path.splitext(fname)[-1] == '.csv'
)
# The first page seeds the dataset; all remaining pages are merged into it.
dataset = NerDataset(toConllTxt(paths[0]))
for i, path in enumerate(paths[1:]):
    print(path.split('/')[-2])
    txt_path = toConllTxt(path)
    print(txt_path)
    data_page = NerDataset(txt_path)
    dataset.append(data_page)
    print(len(data_page), ' ', i)
print(len(dataset))

In [0]:
#experiment_code
# Number of sentences collected so far.
print(len(dataset))
# dataset = NerDataset(toConllTxt(paths[0]))
# Iterating encodes each sentence through NerDataset.__getitem__, so this
# loop also exercises the tokenisation and length check for every entry.
for sent in dataset:
    print(sent)

## working on Mass Dataset

### creating the text files suitable for the dataloader

In [0]:
#experiment_code
import tarfile
from google.colab import drive
drive.mount('/content/drive')
# The context manager closes the archive even if extraction fails part-way.
with tarfile.open('/content/drive/My Drive/imrsv/Colab Notebooks/dataset.tar.gz', mode='r') as tar:
    tar.extractall('./dataset_2')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
#experiment_code
import os
# CSV files under ./dataset_2 whose name starts with the 'annot' segment,
# ordered by the integer between the last '_' and the following '.'.
paths_annot = sorted(
    (
        os.path.join(root, fname)
        for root, _subdirs, files in os.walk('./dataset_2')
        for fname in files
        if os.path.splitext(fname)[-1] == '.csv' and fname.split('_')[0] == 'annot'
    ),
    key=lambda path: int(path.split('_')[-1].split('.')[0]),
)

In [0]:
#experiment_code
import random
# Ten annotated pages drawn at random (random.choices samples with replacement).
rand_paths = random.choices(paths_annot, k=10)
# The first page seeds the dataset; the remaining ones are merged into it.
dataset = NerDataset(toConllTxt(rand_paths[0]))
for i, path in enumerate(rand_paths[1:]):
    print(path.split('/')[-2])
    txt_path = toConllTxt(path, save_file = None)
    print(txt_path)
    data_page = NerDataset(txt_path)
    dataset.append(data_page)
    print(len(data_page), ' ', i)
print(len(dataset))

John_Fleming_(priest)
./dataset_2/scrapes/John_Fleming_(priest)/annot_csv_519.csv.txt
39   0
Princess_Alexandra,_The_Honourable_Lady_Ogilvy
./dataset_2/scrapes/Princess_Alexandra,_The_Honourable_Lady_Ogilvy/annot_csv_22710.csv.txt
75   1
Hugh_Cholmondeley,_5th_Baron_Delamere
./dataset_2/scrapes/Hugh_Cholmondeley,_5th_Baron_Delamere/annot_csv_6436.csv.txt
18   2
M%C3%B3nica_Echeverr%C3%ADa
./dataset_2/scrapes/M%C3%B3nica_Echeverr%C3%ADa/annot_csv_20534.csv.txt
27   3
Kate_O%27Regan
./dataset_2/scrapes/Kate_O%27Regan/annot_csv_13227.csv.txt
68   4
Hwang_Shin-hye
./dataset_2/scrapes/Hwang_Shin-hye/annot_csv_22632.csv.txt
17   5
Lakshmi_Manchu
./dataset_2/scrapes/Lakshmi_Manchu/annot_csv_10045.csv.txt
16   6
David_Eisenhower
./dataset_2/scrapes/David_Eisenhower/annot_csv_2482.csv.txt
34   7
Yoshitha_Rajapaksa
./dataset_2/scrapes/Yoshitha_Rajapaksa/annot_csv_4140.csv.txt
48   8
361


## model

In [0]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertModel

class Net(nn.Module):
    """Pretrained 'bert-base-cased' encoder with a linear tag classifier on top.

    top_rnns: when true, a 2-layer bidirectional LSTM sits between BERT and
        the classifier.
    vocab_size: number of output classes of the classifier.
    device: device the inputs are moved to in ``forward``.
    finetuning: when true, BERT is run in train mode with gradients while
        the module itself is in training mode; otherwise BERT runs in eval
        mode without gradients.
    """

    def __init__(self, top_rnns=False, vocab_size=None, device='cpu', finetuning=False):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')

        self.top_rnns = top_rnns
        if top_rnns:
            # Two directions of 768 // 2 units each give 768 output features again.
            self.rnn = nn.LSTM(
                input_size=768,
                hidden_size=768 // 2,
                num_layers=2,
                bidirectional=True,
                batch_first=True,
            )
        self.fc = nn.Linear(768, vocab_size)

        self.device = device
        self.finetuning = finetuning

    def _encode(self, x):
        # Last entry of the encoder outputs returned by BERT.
        encoded_layers, _ = self.bert(x)
        return encoded_layers[-1]

    def forward(self, x, y):
        '''
        x: (N, T). int64
        y: (N, T). int64
        Returns
        logits: (N, T, VOCAB)
        y: the labels, moved to self.device
        y_hat: argmax of the logits over the last axis
        '''
        x = x.to(self.device)
        y = y.to(self.device)

        if self.training and self.finetuning:
            self.bert.train()
            enc = self._encode(x)
        else:
            self.bert.eval()
            with torch.no_grad():
                enc = self._encode(x)

        if self.top_rnns:
            enc, _ = self.rnn(enc)
        logits = self.fc(enc)
        return logits, y, logits.argmax(-1)

## Loading the final dataset


In [2]:
#experiment_code
import tarfile
from google.colab import drive
drive.mount('/content/drive')
# The context manager closes the archive even if extraction fails part-way.
with tarfile.open('/content/drive/My Drive/imrsv/dataset_txt.tar.gz', mode='r') as tar:
    tar.extractall('./dataset_txt')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#experiment_code
import os
# .txt files under ./dataset_txt whose name starts with the 'annot' segment,
# ordered by the integer between the last '_' and the following '.'.
paths_annot = sorted(
    (
        os.path.join(root, fname)
        for root, _subdirs, files in os.walk('./dataset_txt')
        for fname in files
        if os.path.splitext(fname)[-1] == '.txt' and fname.split('_')[0] == 'annot'
    ),
    key=lambda path: int(path.split('_')[-1].split('.')[0]),
)

# A file belongs to a split when that split's name is one of its path components.
training_files = [p for p in paths_annot if 'training' in p.split('/')]
eval_files = [p for p in paths_annot if 'eval' in p.split('/')]
test_files = [p for p in paths_annot if 'test' in p.split('/')]

In [0]:
#experiment_code
# Each split starts from one file and absorbs the rest with plain loops
# (no throwaway lists built only for their side effects).
# NOTE(review): training starts at index 10, so the first ten training files
# are never loaded, and only the first ten eval files are used — confirm
# both are intentional.
train_dataset = NerDataset(training_files[10])
for train_file in training_files[11:]:
    train_dataset.append(NerDataset(train_file))

eval_dataset = NerDataset(eval_files[0])
for eval_file in eval_files[1:10]:
    eval_dataset.append(NerDataset(eval_file))

test_dataset = NerDataset(test_files[0])
for test_file in test_files[1:]:
    test_dataset.append(NerDataset(test_file))

In [0]:
#experiment_code
len(train_dataset), len(eval_dataset), len(test_dataset)

(868503, 753, 20318)

Iterator tests

In [0]:
#experiment_code
from torch.utils import data
# Options shared by the three loaders; only dataset and batch size differ.
_loader_opts = dict(shuffle=True, num_workers=16, collate_fn=pad)
train_iter = data.DataLoader(dataset=train_dataset, batch_size=32, **_loader_opts)
eval_iter = data.DataLoader(dataset=eval_dataset, batch_size=1, **_loader_opts)
test_iter = data.DataLoader(dataset=test_dataset, batch_size=32, **_loader_opts)
# Number of batches per loader.
print(len(train_iter))
print(len(eval_iter))
print(len(test_iter))

27141
753
635


In [0]:
#experiment_code
# Walk every loader once to make sure each batch collates and unpacks.
# The tag strings are unpacked into `tags_` so the module-level `tags` list
# of entity types is not overwritten by batch contents.
for loader in (train_iter, eval_iter, test_iter):
    for i, batch in enumerate(loader):
        words, x, is_heads, tags_, y, seqlens = batch
        # print(words, x, is_heads, tags_, y, seqlens)

503


## Dataset introspection

In [0]:
#experiment_code
# One sentence per batch so tags_[0] is that sentence's tag string.
invstgt_iter = data.DataLoader(
    dataset=train_dataset,
    batch_size=1,
    shuffle=True,
    num_workers=1,
    collate_fn=pad,
)
# One counter per entity type, all starting at zero.
tag_count_dict = dict.fromkeys(tags, 0)
for i, batch in enumerate(invstgt_iter):
    words, x, is_heads, tags_, y, seqlens = batch
    for label in tags_[0].split():
        # Count only begin labels ("B-XX") of a known entity type.
        if len(label) > 2 and label[2:] in tag_count_dict and label[0] == 'B':
            tag_count_dict[label[2:]] += 1
    # if i == 10: break
tag_count_dict

{'BD': 16880, 'BP': 0, 'CH': 10824, 'ED': 24365, 'PR': 6325, 'SP': 25163}

In [0]:
#experiment_code
#{'BD': 16880, 'BP': 0, 'CH': 10824, 'ED': 24365, 'PR': 6325, 'SP': 25163}
tag_count_dict

{'BD': 16880, 'BP': 0, 'CH': 10824, 'ED': 24365, 'PR': 6325, 'SP': 25163}

In [0]:
#experiment_code
import multiprocessing
multiprocessing.cpu_count()

2

# Training

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
# from model import Net
# from data_load import NerDataset, pad, VOCAB, tokenizer, tag2idx, idx2tag
import os
import numpy as np
import argparse

def train(model, iterator, optimizer, criterion):
    """Run one pass over ``iterator``, taking an optimizer step per batch.

    The first batch is printed in full as a sanity check, and the loss is
    printed every tenth step.
    """
    model.train()
    for step, batch in enumerate(iterator):
        words, x, is_heads, tags, y, seqlens = batch
        gold = y  # labels as collated, kept for the sanity printout
        optimizer.zero_grad()
        logits, y, _ = model(x, y)  # logits: (N, T, VOCAB), y: (N, T)

        # Flatten to (N*T, VOCAB) and (N*T,) for the criterion.
        n_classes = logits.shape[-1]
        loss = criterion(logits.view(-1, n_classes), y.view(-1))
        loss.backward()
        optimizer.step()

        if step == 0:
            first_len = seqlens[0]
            first_ids = x.cpu().numpy()[0]
            print("=====sanity check======")
            print("words:", words[0])
            print("x:", first_ids[:first_len])
            print("tokens:", tokenizer.convert_ids_to_tokens(first_ids)[:first_len])
            print("is_heads:", is_heads[0])
            print("y:", gold.cpu().numpy()[0][:first_len])
            print("tags:", tags[0])
            print("seqlen:", first_len)
            print("=======================")

        if step % 10 == 0:  # monitoring
            print(f"step: {step}, loss: {loss.item()}")

def _prf_scores(num_correct, num_proposed, num_gold):
    """Return (precision, recall, f1) computed from plain integer counts."""
    # With nothing proposed / nothing to find, the score defaults to 1.0.
    precision = num_correct / num_proposed if num_proposed else 1.0
    recall = num_correct / num_gold if num_gold else 1.0
    total = precision + recall
    # Both scores zero means nothing was right, so F1 is 0 as well.
    f1 = 2 * precision * recall / total if total else 0.0
    return precision, recall, f1


def eval(model, iterator, f):
    """Predict tags for every batch of ``iterator`` and score them.

    model: called as ``model(x, y)``; its third return value is taken as the
        predicted tag ids, one row per sample.
    iterator: yields (words, x, is_heads, tags, y, seqlens) batches.
    f: path prefix of the report file.  The report is written to
        ``f + ".P<precision>_R<recall>_F<f1>"`` and holds one
        ``word gold predicted`` line per word, an empty line after each
        sentence, and the three scores at the end.

    Only tag ids above 1 (everything except ``<PAD>`` and ``O``) count as
    proposed / gold entities.

    Returns (precision, recall, f1).
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results
    report_lines = []
    y_true, y_pred = [], []
    for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
        # Keep the prediction of the first word piece of every word only.
        preds = [idx2tag[hat] for head, hat in zip(is_heads, y_hat) if head == 1]
        words = words.split()
        tags = tags.split()[:len(preds)]
        # [1:-1] drops the [CLS] / [SEP] positions.
        for w, t, p in zip(words[1:-1], tags[1:-1], preds[1:-1]):
            report_lines.append(f"{w} {t} {p}\n")
            y_true.append(tag2idx[t])  # original tags
            y_pred.append(tag2idx[p])  # predicted tags
        report_lines.append("\n")
    result = "".join(report_lines)

    ## calc metric
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # int() turns NumPy integers into Python ints so that the zero checks in
    # _prf_scores decide the fallbacks instead of NumPy producing nan/inf.
    num_proposed = int((y_pred > 1).sum())
    num_correct = int(np.logical_and(y_true == y_pred, y_true > 1).sum())
    num_gold = int((y_true > 1).sum())

    print(f"num_proposed:{num_proposed}")
    print(f"num_correct:{num_correct}")
    print(f"num_gold:{num_gold}")

    precision, recall, f1 = _prf_scores(num_correct, num_proposed, num_gold)

    final = f + ".P%.2f_R%.2f_F%.2f" %(precision, recall, f1)
    with open(final, 'w') as fout:
        fout.write(f"{result}\n")
        fout.write(f"precision={precision}\n")
        fout.write(f"recall={recall}\n")
        fout.write(f"f1={f1}\n")

    print("precision=%.2f"%precision)
    print("recall=%.2f"%recall)
    print("f1=%.2f"%f1)
    return precision, recall, f1


1735235584
1735235584
1735235584


In [0]:
#experiment_code
# if __name__=="__main__":
if True:
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--batch_size", type=int, default=128)
#     parser.add_argument("--lr", type=float, default=0.0001)
#     parser.add_argument("--n_epochs", type=int, default=30)
#     parser.add_argument("--finetuning", dest="finetuning", action="store_true")
#     parser.add_argument("--top_rnns", dest="top_rnns", action="store_true")
#     parser.add_argument("--logdir", type=str, default="checkpoints/01")
#     parser.add_argument("--trainset", type=str, default="conll2003/train.txt")
#     parser.add_argument("--validset", type=str, default="conll2003/valid.txt")
#     hp = parser.parse_args()
    # Peak GPU memory is printed before and after each setup step.
    print(torch.cuda.max_memory_allocated(device=None))
    # Hyper-parameters, set inline in place of the argparse options above.
    batch_size = 32
    lr = 0.0001
    n_epochs = 30
    finetuning = True
    top_rnns = False
    logdir = "checkpoints/01"
    trainset = "conll2003/train.txt"
    validset = "conll2003/valid.txt"

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Move the model to the selected device rather than calling .cuda()
    # unconditionally, so the CPU fallback chosen above is honoured.
    model = Net(top_rnns=top_rnns, vocab_size=len(VOCAB), device=device, finetuning=finetuning).to(device)
    print(torch.cuda.max_memory_allocated(device=None))
    model = nn.DataParallel(model)
    print(torch.cuda.max_memory_allocated(device=None))

    # train_dataset = NerDataset(trainset)
    # eval_dataset = NerDataset(validset)
    # pad_fn = pad
    # train_iter = data.DataLoader(dataset=train_dataset,
    #                              batch_size=batch_size,
    #                              shuffle=True,
    #                              num_workers=4,
    #                              collate_fn=pad_fn)
    # eval_iter = data.DataLoader(dataset=eval_dataset,
    #                              batch_size=batch_size,
    #                              shuffle=False,
    #                              num_workers=4,
    #                              collate_fn=pad_fn)

    optimizer = optim.Adam(model.parameters(), lr = lr)
    # Positions labelled 0 (<PAD>) do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # for epoch in range(1, n_epochs+1):
    #     train(model, train_iter, optimizer, criterion)

    #     print(f"=========eval at epoch={epoch}=========")
    #     if not os.path.exists(logdir): os.makedirs(logdir)
    #     fname = os.path.join(logdir, str(epoch))
    #     precision, recall, f1 = eval(model, eval_iter, fname)

    #     torch.save(model.state_dict(), f"{fname}.pt")
    #     print(f"weights were saved to {fname}.pt")

In [0]:
#experiment_code
# A single training pass; epoch numbering starts at 1.
for epoch in range(1, 2):
    train(model, train_iter, optimizer, criterion)
    # print(f"=========eval at epoch={epoch}=========")
    # if not os.path.exists(logdir): os.makedirs(logdir)
    # fname = os.path.join(logdir, str(epoch))

    # precision, recall, f1 = eval(model, eval_iter, fname)
    # torch.save(model.state_dict(), f"{fname}.pt")
    # print(f"weights were saved to {fname}.pt")
# Peak GPU memory after training.
print(torch.cuda.max_memory_allocated(device=None))

words: [CLS] When the Boyaner Rebbe of New York died of a stroke on 2 March 1971 , the Boyaner Hasidim were left leaderless . [SEP]
x: [  101  1332  1103  4596  6354  1197 11336 20584  1104  1203  1365  1452
  1104   170  6625  1113   123  1345  2507   117  1103  4596  6354  1197
 10736  2386  4060  1127  1286  2301  2008   119   102     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]
tokens: ['[CLS]', 'When', 'the', 'Boy', '##ane', '##r', 'Re', '##bbe', 'of', 'New', 'York', 'died', 'of', 'a', 'stroke', 'on', '2', 'March', '1971', ',', 'the', 'Boy', '##ane', '##r', 'Has', '##id', '##im', 'were', 'left', 'leader', '##less', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
is_heads: [1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [0]:
#experiment_code
# Destination of the model weights on the mounted Drive.
fname = '/content/drive/My Drive/imrsv/Colab Notebooks/weights_overall.pt'
# NOTE(review): the save call below is commented out, so the message printed
# afterwards appears even though no file is written by this cell.
# torch.save(model.state_dict(), f"{fname}")
print(f"weights were saved to {fname}")

In [0]:
print(torch.cuda.max_memory_allocated(device=None))

3655972864


## load model

In [0]:
#experiment_code
fname = '/content/drive/My Drive/imrsv/Colab Notebooks/weights_overall.pt'
finetuning = True
top_rnns = False
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Move the model to the selected device rather than calling .cuda()
# unconditionally, so the CPU fallback chosen above is honoured.
model = Net(top_rnns=top_rnns, vocab_size=len(VOCAB), device=device, finetuning=finetuning).to(device)
model = nn.DataParallel(model)
# NOTE(review): with the load below commented out, the classifier layer keeps
# its fresh initialisation and only BERT carries pretrained weights.
# model.load_state_dict(torch.load(fname))

## evaluate model

In [0]:
#experiment_code
# Build the evaluation set first: the loader keeps a reference to the dataset
# object it is given, so creating it before this rebuild would leave it
# iterating the previous eval_dataset.
eval_dataset = NerDataset(eval_files[0])
for eval_file in eval_files[1:200]:
    eval_dataset.append(NerDataset(eval_file))
eval_iter = data.DataLoader(dataset=eval_dataset,
                            batch_size=32,
                            shuffle=True,
                            num_workers=1,
                            collate_fn=pad)

In [0]:
#experiment_code
# The third argument is the prefix of the report file eval() writes; the
# score suffix ".P.._R.._F.." is appended to it.
precision, recall, f1 = eval(model, eval_iter, './')

num_proposed:260974
num_correct:307
num_gold:4625
precision=0.00
recall=0.07
f1=0.00


In [0]:
#experiment_code
precision, recall, f1

(0.0008326713770085149, 0.05167567567567567, 0.0016389339386188382)