<a href="https://colab.research.google.com/github/ratmcu/wiki_ner/blob/master/wiki_ner_loader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
'''
Data format and mini-batch layout.

An entry (one sentence) looks like ...
SOCCER NN B-NP O
- : O O
JAPAN NNP B-NP B-LOC
GET VB B-VP O
LUCKY NNP B-NP O
WIN NNP I-NP O
, , O O
CHINA NNP B-NP B-PER
IN IN B-PP O
SURPRISE DT B-NP O
DEFEAT NN I-NP O
. . O O
NerDataset below reads only the first column (the word) and the last
column (the tag) of each line; entries are separated by a blank line.

Each mini-batch returns the following:
words: list of input sents. ["The 26-year-old ...", ...]
x: encoded input sents. [N, T]. int64.
is_heads: list of head markers. [[1, 1, 0, ...], [...]]
tags: list of tags.['O O B-MISC ...', '...']
y: encoded tags. [N, T]. int64
seqlens: list of seqlens. [45, 49, 10, 50, ...]
'''
import numpy as np
import torch
from torch.utils import data
!pip install pytorch-pretrained-bert
from pytorch_pretrained_bert import BertTokenizer
# import traceback
# Module-level tokenizer used by NerDataset.__getitem__ to split words into
# pieces and map them to ids. Loaded from the 'bert-base-cased' vocabulary
# with do_lower_case=False so the input text is not lower-cased.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
# VOCAB = ('<PAD>', 'O', 'I-LOC', 'B-PER', 'I-PER', 'I-ORG', 'I-MISC', 'B-MISC', 'B-LOC', 'B-ORG')

# Tag suffixes; each one is combined with the "I-" and "B-" prefixes below.
tags = ['BD', 'BP', 'PR', 'SP', 'CH', 'ED']

# Index 0 is the padding label and index 1 the "O" label. Every suffix then
# contributes its "I-" label followed by its "B-" label, in that order.
VOCAB_list = ['<PAD>', 'O']
for tag in tags:
    VOCAB_list += ['I-' + tag, 'B-' + tag]
VOCAB = tuple(VOCAB_list)

# Label <-> integer id lookups, both directions.
tag2idx = dict(zip(VOCAB, range(len(VOCAB))))
idx2tag = dict(enumerate(VOCAB))

class NerDataset(data.Dataset):
    """Sentence-level NER dataset read from a blank-line-separated text file.

    Each non-blank line holds whitespace-separated columns: the first column
    is the word and the last column is its tag. Lines with fewer than two
    columns are ignored. Every sentence is wrapped in "[CLS]" / "[SEP]", and
    both markers are tagged "<PAD>".

    Encoding in __getitem__ relies on the module-level `tokenizer` and
    `tag2idx`.
    """

    def __init__(self, fpath):
        """
        fpath: [train|valid|test].txt
        """
        # Context manager so the file handle is closed as soon as it is read.
        with open(fpath, 'r') as fin:
            entries = fin.read().strip().split("\n\n")

        sents, tags_li = [], []  # list of lists
        for entry in entries:
            # Split every line once and keep only rows that carry both a
            # word and a tag, so words and tags always stay aligned.
            rows = [line.split() for line in entry.splitlines()]
            rows = [cols for cols in rows if len(cols) > 1]
            words = [cols[0] for cols in rows]
            tags = [cols[-1] for cols in rows]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<PAD>"] + tags + ["<PAD>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences currently held."""
        return len(self.sents)

    def __getitem__(self, idx):
        """Encode sentence `idx` into token ids and aligned tag ids.

        Returns:
            A tuple (words, x, is_heads, tags, y, seqlen):
            words    - the sentence as one space-joined string
            x        - list of token ids
            is_heads - list with 1 for the first piece of a word, else 0
            tags     - the tags as one space-joined string
            y        - list of tag ids ("<PAD>" id for non-first pieces)
            seqlen   - len(y)

        Raises:
            KeyError: if a tag is not a key of `tag2idx`.
        """
        words, tags = self.sents[idx], self.tags_li[idx]  # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], []  # list of ids
        is_heads = []  # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            if not tokens:
                # The tokenizer produced nothing for this word. Without a
                # placeholder, x would gain no id while is_heads and y each
                # gain one entry, and the length assertion below would fail.
                tokens = ["[UNK]"]
            xx = tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<PAD>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), f"len(x)={len(x)}, len(y)={len(y)}, len(is_heads)={len(is_heads)}"

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen

    def append(self, other):
        """Extend this dataset in place with the sentences and tags of `other`."""
        self.sents.extend(other.sents)
        self.tags_li.extend(other.tags_li)

def pad(batch):
    '''Collate a list of samples, right-padding `x` and `y` with 0 to the longest sample.

    Each sample is indexed as (words, x, is_heads, tags, y, seqlen).
    Returns (words, x, is_heads, tags, y, seqlens), where `x` and `y` are
    torch.LongTensor and the remaining items are plain Python lists.
    '''
    def column(pos):
        # Pull the same field out of every sample.
        return [sample[pos] for sample in batch]

    words, is_heads, tags, seqlens = column(0), column(2), column(3), column(-1)
    maxlen = np.array(seqlens).max()

    def padded(pos):
        # Extend each sequence with zeros (0: <pad>) up to the batch maximum.
        rows = []
        for sample in batch:
            seq = sample[pos]
            rows.append(seq + [0] * (maxlen - len(seq)))
        return rows

    x = torch.LongTensor(padded(1))
    y = torch.LongTensor(padded(-2))

    return words, x, is_heads, tags, y, seqlens

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 18.6MB/s eta 0:00:01[K     |█████▎                          | 20kB 6.3MB/s eta 0:00:01[K     |████████                        | 30kB 8.8MB/s eta 0:00:01[K     |██████████▋                     | 40kB 5.5MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 6.8MB/s eta 0:00:01[K     |███████████████▉                | 61kB 8.0MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 9.1MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 10.1MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 11.2MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 8.9MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 8.9MB/s eta 0:00:01[K     |████████████████████

100%|██████████| 213450/213450 [00:00<00:00, 837744.59B/s]


In [0]:
# from torch.utils import data as torch_data_utils
# train_iter = torch_data_utils.DataLoader(dataset=train_dataset,
#                              batch_size=1,
#                              shuffle=True,
#                              num_workers=4,
#                              collate_fn=pad)
# eval_iter = torch_data_utils.DataLoader(dataset=eval_dataset,
#                             batch_size=1,
#                             shuffle=False,
#                             num_workers=4,
#                             collate_fn=pad)

In [0]:
#experiment_code
import os
import time
!pip install wget
import wget
import logging
import pickle
import ast
import pandas as pd
import numpy as np
import urllib
from bs4 import BeautifulSoup
import tarfile
# Download the archive only when it is not already present. `out` pins the
# local file name so it is guaranteed to match both the existence check above
# and the tarfile.open() below (the URL itself ends in "?raw=true").
if not os.path.exists('dataset.tar.gz'):
    wget.download('https://github.com/ratmcu/wiki_ner/blob/master/dataset.tar.gz?raw=true',
                  out='dataset.tar.gz')

# NOTE(review): extractall() writes whatever member paths the archive
# contains; only run this against an archive you trust.
# The context manager closes the archive even if extraction fails.
with tarfile.open('dataset.tar.gz', mode='r') as tar:
    tar.extractall('./')

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp36-none-any.whl size=9681 sha256=7a32906f0d82dddcf5fdb818e4769c129b75252515ab36b45f2e8878dabbc31b
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [0]:
def toConllTxt(path, save_file = None):
    """Convert a token CSV into the "<word> <tag>" text format read by NerDataset.

    Args:
        path: CSV file containing a 'words' and a 'tags' column.
        save_file: Destination text file. When falsy, the output is written
            next to `path` as '<csv file name>.txt'.

    Returns:
        The path of the text file that was written.

    A row whose 'words' and 'tags' cells are both a single newline is written
    as an empty line (sentence separator); every other row is written as
    '<word> <tag>' on its own line.
    """
    # Read every cell as a literal string. With pandas' default NA handling,
    # tokens such as "NA", "null" or "nan" are parsed as missing values and
    # would end up in the output as the float repr 'nan' instead of the word
    # (and a missing tag would raise TypeError on concatenation).
    df = pd.read_csv(path, dtype=str, keep_default_na=False)
    if not save_file:
        save_file = os.path.join(os.path.dirname(path), '%s.txt' % os.path.basename(path))
    with open(save_file, 'w') as file:
        for word, tag in zip(df['words'], df['tags']):
            if word == '\n' and tag == '\n':
                file.write('\n')
            else:
                file.write('%s %s\n' % (word, tag))
    return save_file

In [0]:
#experiment_code
import random

# Every CSV below ./dataset, sorted so the candidate list is stable.
paths = sorted([os.path.join(f[0], name) for f in os.walk('./dataset') for name in f[2] if os.path.splitext(name)[-1] == '.csv'])

# Pick up to 10 *distinct* pages. random.sample draws without replacement,
# whereas random.choices could return the same page more than once and load
# its sentences into the dataset twice.
rand_paths = random.sample(paths, k=min(10, len(paths)))

# Seed the combined dataset with the first page, then append the others.
dataset = NerDataset(toConllTxt(rand_paths[0]))
for i, path in enumerate(rand_paths[1:]):
    print(path.split('/')[-2])  # name of the directory holding the CSV
    txt_path = toConllTxt(path)
    print(txt_path)
    data_page = NerDataset(txt_path)
    dataset.append(data_page)
    print(len(data_page), ' ', i)
print(len(dataset))

Mihai Ghimpu
./dataset/politicians/Moldova/Mihai Ghimpu/conll_tagged.csv.txt
49   0
Bujar Nishani
./dataset/politicians/Albania/Bujar Nishani/conll_tagged.csv.txt
28   1
Natsagiin Bagabandi
./dataset/politicians/Mongolia/Natsagiin Bagabandi/conll_tagged.csv.txt
6   2
Shavkat Mirziyoyev
./dataset/politicians/Uzbekistan/Shavkat Mirziyoyev/conll_tagged.csv.txt
101   3
Pandeli Majko
./dataset/politicians/Albania/Pandeli Majko/conll_tagged.csv.txt
21   4
Vincent Auriol
./dataset/politicians/France/Vincent Auriol/conll_tagged.csv.txt
46   5
Erik Gustaf Boström
./dataset/politicians/Poland/Erik Gustaf Boström/conll_tagged.csv.txt
30   6
Gaafar Nimeiry
./dataset/politicians/South Sudan/Gaafar Nimeiry/conll_tagged.csv.txt
80   7
Sai Mauk Kham
./dataset/politicians/Myanmar/Sai Mauk Kham/conll_tagged.csv.txt
11   8
413


### **Testing the dataloader on all pages**

In [0]:
#experiment_code
# Gather every CSV file below ./dataset in sorted order.
paths = sorted(
    os.path.join(dirpath, name)
    for dirpath, _subdirs, filenames in os.walk('./dataset')
    for name in filenames
    if os.path.splitext(name)[-1] == '.csv'
)

# Start from the first page and fold each remaining page into `dataset`,
# printing the page directory, the generated text file and the page size.
dataset = NerDataset(toConllTxt(paths[0]))
for i, path in enumerate(paths[1:]):
    print(path.rsplit('/', 2)[-2])
    txt_path = toConllTxt(path)
    print(txt_path)
    data_page = NerDataset(txt_path)
    dataset.append(data_page)
    print(f"{len(data_page)}   {i}")
print(len(dataset))

In [0]:
#experiment_code
# Show how many sentences were loaded, then print every encoded sample.
print(len(dataset))
# dataset = NerDataset(toConllTxt(paths[0]))
for sent in (dataset[k] for k in range(len(dataset))):
    print(sent)

## Working on the mass dataset

### Creating the text files suitable for the dataloader

In [0]:
#experiment_code
import tarfile
from google.colab import drive
# Mount Google Drive, then unpack the archive stored there into ./dataset_2.
drive.mount('/content/drive')
# The context manager closes the archive even if extraction raises.
# NOTE(review): extractall() writes whatever member paths the archive
# contains; only run this against an archive you trust.
with tarfile.open('/content/drive/My Drive/Colab Notebooks/dataset.tar.gz', mode='r') as tar:
    tar.extractall('./dataset_2')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
#experiment_code
import os
def _annot_number(path):
    """Integer found between the last '_' in `path` and the '.' that follows it."""
    return int(path.split('_')[-1].split('.')[0])


# All CSV files below ./dataset_2 whose name starts with the 'annot' prefix
# (text before the first '_'), ordered by the number at the end of the name.
paths_annot = sorted(
    (
        os.path.join(dirpath, name)
        for dirpath, _subdirs, filenames in os.walk('./dataset_2')
        for name in filenames
        if os.path.splitext(name)[-1] == '.csv' and name.partition('_')[0] == 'annot'
    ),
    key=_annot_number,
)

In [0]:
#experiment_code
import random

# Pick up to 10 *distinct* annotated pages. random.sample draws without
# replacement, whereas random.choices could return the same page more than
# once and load its sentences into the dataset twice.
rand_paths = random.sample(paths_annot, k=min(10, len(paths_annot)))

# Seed the combined dataset with the first page, then append the others.
dataset = NerDataset(toConllTxt(rand_paths[0]))
for i, path in enumerate(rand_paths[1:]):  # stray "bl" after the colon removed (syntax error)
    print(path.split('/')[-2])  # name of the directory holding the CSV
    txt_path = toConllTxt(path, save_file = None)
    print(txt_path)
    data_page = NerDataset(txt_path)
    dataset.append(data_page)
    print(len(data_page), ' ', i)
print(len(dataset))

John_Fleming_(priest)
./dataset_2/scrapes/John_Fleming_(priest)/annot_csv_519.csv.txt
39   0
Princess_Alexandra,_The_Honourable_Lady_Ogilvy
./dataset_2/scrapes/Princess_Alexandra,_The_Honourable_Lady_Ogilvy/annot_csv_22710.csv.txt
75   1
Hugh_Cholmondeley,_5th_Baron_Delamere
./dataset_2/scrapes/Hugh_Cholmondeley,_5th_Baron_Delamere/annot_csv_6436.csv.txt
18   2
M%C3%B3nica_Echeverr%C3%ADa
./dataset_2/scrapes/M%C3%B3nica_Echeverr%C3%ADa/annot_csv_20534.csv.txt
27   3
Kate_O%27Regan
./dataset_2/scrapes/Kate_O%27Regan/annot_csv_13227.csv.txt
68   4
Hwang_Shin-hye
./dataset_2/scrapes/Hwang_Shin-hye/annot_csv_22632.csv.txt
17   5
Lakshmi_Manchu
./dataset_2/scrapes/Lakshmi_Manchu/annot_csv_10045.csv.txt
16   6
David_Eisenhower
./dataset_2/scrapes/David_Eisenhower/annot_csv_2482.csv.txt
34   7
Yoshitha_Rajapaksa
./dataset_2/scrapes/Yoshitha_Rajapaksa/annot_csv_4140.csv.txt
48   8
361


In [0]:
eval_paths_annot = 