In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 1000)
import numpy as np
import string
import transformers
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
#df = pd.read_csv("data/train.txt", sep=" ")
df = pd.read_csv("data/dev.txt", sep=" ")
#df = pd.read_csv("data/test.txt", sep=" ")

In [4]:
df.head()

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,CRICKET,NNP,B-NP,O
1,-,:,O,O
2,LEICESTERSHIRE,NNP,B-NP,B-ORG
3,TAKE,NNP,I-NP,O
4,OVER,IN,B-PP,O


In [5]:
df.drop(['-X-', '-X-.1'], axis=1, inplace=True)

In [6]:
df.columns = ['Word', 'Tag']

In [7]:
df.head()

Unnamed: 0,Word,Tag
0,CRICKET,O
1,-,O
2,LEICESTERSHIRE,B-ORG
3,TAKE,O
4,OVER,O


In [8]:
df.drop(df[df['Word'] == "-DOCSTART-"].index, axis=0, inplace=True)

In [9]:
df['Tag'].unique()

array(['O', 'B-ORG', 'B-LOC', 'B-MISC', 'I-MISC', 'B-PER', 'I-PER',
       'I-LOC', 'I-ORG', nan], dtype=object)

In [10]:
tags = df['Tag'].values
words = df['Word'].values

In [11]:
new_tags = []
for t in tags:
    if t == "B-ORG":
        new_tags.append("ORG")
    elif t == "B-PER":
        new_tags.append("PER")
    elif t == "I-PER":
        new_tags.append("PER")
    elif t == "B-LOC":
        new_tags.append("LOC")
    elif t == "I-ORG":
        new_tags.append("ORG")
    elif t == "I-LOC":
        new_tags.append("LOC")
    else:
        new_tags.append("O")

In [12]:
df.drop(['Tag'], axis=1, inplace=True)
df.drop(['Word'], axis=1, inplace=True)

In [13]:
df['Tag'] = new_tags
df['Word'] = words

In [14]:
len(df)

51362

In [15]:
sentence_no = 0
sentence = []
for w in words:
    sentence.append(sentence_no)
    if w == ".":
        sentence_no = sentence_no + 1

In [16]:
df.insert(0, 'Sentence #', sentence)

In [17]:
df.head()

Unnamed: 0,Sentence #,Tag,Word
0,0,O,CRICKET
1,0,O,-
2,0,ORG,LEICESTERSHIRE
3,0,O,TAKE
4,0,O,OVER


In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_whole_word_mask=True)

In [19]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [20]:
getter = SentenceGetter(df)
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
labels = [[s[1] for s in sentence] for sentence in getter.sentences]

In [21]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(str(word))
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [22]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(sentences, labels)]
#tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
#labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [23]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [24]:
tokenized_texts[0:50]

[['CR',
  '##IC',
  '##KE',
  '##T',
  '-',
  'L',
  '##EI',
  '##CE',
  '##ST',
  '##ER',
  '##S',
  '##H',
  '##IR',
  '##E',
  'T',
  '##A',
  '##KE',
  'O',
  '##VE',
  '##R',
  'AT',
  'TO',
  '##P',
  'A',
  '##FT',
  '##ER',
  'IN',
  '##NI',
  '##NG',
  '##S',
  'VI',
  '##CT',
  '##OR',
  '##Y',
  '.'],
 ['L',
  '##ON',
  '##D',
  '##ON',
  '1996',
  '-',
  '08',
  '-',
  '30',
  'West',
  'Indian',
  'all',
  '-',
  'round',
  '##er',
  'Phil',
  'Simmons',
  'took',
  'four',
  'for',
  '38',
  'on',
  'Friday',
  'as',
  'Leicestershire',
  'beat',
  'Somerset',
  'by',
  'an',
  'innings',
  'and',
  '39',
  'runs',
  'in',
  'two',
  'days',
  'to',
  'take',
  'over',
  'at',
  'the',
  'head',
  'of',
  'the',
  'county',
  'championship',
  '.'],
 ['Their',
  'stay',
  'on',
  'top',
  ',',
  'though',
  ',',
  'may',
  'be',
  'short',
  '-',
  'lived',
  'as',
  'title',
  'rivals',
  'Essex',
  ',',
  'Derbyshire',
  'and',
  'Surrey',
  'all',
  'closed',
  'in',
 

In [25]:
for sentence in tokenized_texts:
    print(sentence)
    break

['CR', '##IC', '##KE', '##T', '-', 'L', '##EI', '##CE', '##ST', '##ER', '##S', '##H', '##IR', '##E', 'T', '##A', '##KE', 'O', '##VE', '##R', 'AT', 'TO', '##P', 'A', '##FT', '##ER', 'IN', '##NI', '##NG', '##S', 'VI', '##CT', '##OR', '##Y', '.']


In [27]:
i = 0
sentence_no = 0
new_sentence = []
new_data = []
for sentence, label in zip(tokenized_texts, labels):
    new_tokens = []
    new_tags = []
    for token, tag in zip(sentence, label):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_tokens.append(token)
            new_tags.append(tag)
    for new_token, new_tag in zip(new_tokens, new_tags):
        if new_token in string.punctuation:
            continue
        else:
            new_data.append((sentence_no, new_token, new_tag))
    sentence_no = sentence_no + 1

In [28]:
new_data[0:50]

[(0, 'CRICKET', 'O'),
 (0, 'LEICESTERSHIRE', 'ORG'),
 (0, 'TAKE', 'O'),
 (0, 'OVER', 'O'),
 (0, 'AT', 'O'),
 (0, 'TOP', 'O'),
 (0, 'AFTER', 'O'),
 (0, 'INNINGS', 'O'),
 (0, 'VICTORY', 'O'),
 (1, 'LONDON', 'LOC'),
 (1, '1996', 'O'),
 (1, '08', 'O'),
 (1, '30', 'O'),
 (1, 'West', 'O'),
 (1, 'Indian', 'O'),
 (1, 'all', 'O'),
 (1, 'rounder', 'O'),
 (1, 'Phil', 'PER'),
 (1, 'Simmons', 'PER'),
 (1, 'took', 'O'),
 (1, 'four', 'O'),
 (1, 'for', 'O'),
 (1, '38', 'O'),
 (1, 'on', 'O'),
 (1, 'Friday', 'O'),
 (1, 'as', 'O'),
 (1, 'Leicestershire', 'ORG'),
 (1, 'beat', 'O'),
 (1, 'Somerset', 'ORG'),
 (1, 'by', 'O'),
 (1, 'an', 'O'),
 (1, 'innings', 'O'),
 (1, 'and', 'O'),
 (1, '39', 'O'),
 (1, 'runs', 'O'),
 (1, 'in', 'O'),
 (1, 'two', 'O'),
 (1, 'days', 'O'),
 (1, 'to', 'O'),
 (1, 'take', 'O'),
 (1, 'over', 'O'),
 (1, 'at', 'O'),
 (1, 'the', 'O'),
 (1, 'head', 'O'),
 (1, 'of', 'O'),
 (1, 'the', 'O'),
 (1, 'county', 'O'),
 (1, 'championship', 'O'),
 (2, 'Their', 'O'),
 (2, 'stay', 'O')]

In [29]:
data = pd.DataFrame(new_data, columns=['sentence_no', 'text', 'tag'])

In [30]:
df.to_csv("data/conll_dev_preprocessed_without_punctuation.csv")