In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 1000)
import numpy as np
import string
import transformers
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
df = pd.read_csv("data/train.txt", sep=" ")
#df = pd.read_csv("data/dev.txt", sep=" ")
#df = pd.read_csv("data/test.txt", sep=" ")

In [4]:
df.head()

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,EU,NNP,B-NP,B-ORG
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,B-MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O


In [5]:
df.drop(['-X-', '-X-.1'], axis=1, inplace=True)

In [6]:
df.columns = ['Word', 'Tag']

In [7]:
df.head()

Unnamed: 0,Word,Tag
0,EU,B-ORG
1,rejects,O
2,German,B-MISC
3,call,O
4,to,O


In [8]:
df.drop(df[df['Word'] == "-DOCSTART-"].index, axis=0, inplace=True)

In [9]:
df['Tag'].unique()

array(['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', nan,
       'I-MISC', 'I-LOC'], dtype=object)

In [10]:
tags = df['Tag'].values
words = df['Word'].values

In [11]:
new_tags = []
for t in tags:
    if t == "B-ORG":
        new_tags.append("ORG")
    elif t == "B-PER":
        new_tags.append("PER")
    elif t == "I-PER":
        new_tags.append("PER")
    elif t == "B-LOC":
        new_tags.append("LOC")
    elif t == "I-ORG":
        new_tags.append("ORG")
    elif t == "I-LOC":
        new_tags.append("LOC")
    else:
        new_tags.append("O")

In [12]:
df.drop(['Tag'], axis=1, inplace=True)
df.drop(['Word'], axis=1, inplace=True)

In [13]:
df['Tag'] = new_tags
df['Word'] = words

In [14]:
len(df)

203621

In [15]:
sentence_no = 0
sentence = []
for w in words:
    sentence.append(sentence_no)
    if w == ".":
        sentence_no = sentence_no + 1

In [16]:
df.insert(0, 'Sentence #', sentence)

In [17]:
df.head()

Unnamed: 0,Sentence #,Tag,Word
0,0,ORG,EU
1,0,O,rejects
2,0,O,German
3,0,O,call
4,0,O,to


In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_whole_word_mask=True)

In [19]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [20]:
getter = SentenceGetter(df)
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
labels = [[s[1] for s in sentence] for sentence in getter.sentences]

In [21]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(str(word))
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [22]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(sentences, labels)]
#tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
#labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [23]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [24]:
tokenized_texts[0:50]

[['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'la',
  '##mb',
  '.'],
 ['Peter',
  'Blackburn',
  'BR',
  '##US',
  '##SE',
  '##LS',
  '1996',
  '-',
  '08',
  '-',
  '22',
  'The',
  'European',
  'Commission',
  'said',
  'on',
  'Thursday',
  'it',
  'disagreed',
  'with',
  'German',
  'advice',
  'to',
  'consumers',
  'to',
  's',
  '##hun',
  'British',
  'la',
  '##mb',
  'until',
  'scientists',
  'determine',
  'whether',
  'mad',
  'cow',
  'disease',
  'can',
  'be',
  'transmitted',
  'to',
  'sheep',
  '.'],
 ['Germany',
  "'",
  's',
  'representative',
  'to',
  'the',
  'European',
  'Union',
  "'",
  's',
  'veterinary',
  'committee',
  'Werner',
  'Z',
  '##wing',
  '##mann',
  'said',
  'on',
  'Wednesday',
  'consumers',
  'should',
  'buy',
  'sheep',
  '##me',
  '##at',
  'from',
  'countries',
  'other',
  'than',
  'Britain',
  'until',
  'the',
  'scientific',
  'advice',
  'was',
  'clearer',
  '.'],
 ['We',
  'do',
  'n',
 

In [25]:
for sentence in tokenized_texts:
    print(sentence)
    break

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.']


In [27]:
i = 0
sentence_no = 0
new_sentence = []
new_data = []
for sentence, label in zip(tokenized_texts, labels):
    new_tokens = []
    new_tags = []
    for token, tag in zip(sentence, label):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_tokens.append(token)
            new_tags.append(tag)
    for new_token, new_tag in zip(new_tokens, new_tags):
        if new_token in string.punctuation:
            continue
        else:
            new_data.append((sentence_no, new_token, new_tag))
    sentence_no = sentence_no + 1

In [28]:
new_data[0:50]

[(0, 'EU', 'ORG'),
 (0, 'rejects', 'O'),
 (0, 'German', 'O'),
 (0, 'call', 'O'),
 (0, 'to', 'O'),
 (0, 'boycott', 'O'),
 (0, 'British', 'O'),
 (0, 'lamb', 'O'),
 (1, 'Peter', 'PER'),
 (1, 'Blackburn', 'PER'),
 (1, 'BRUSSELS', 'LOC'),
 (1, '1996', 'O'),
 (1, '08', 'O'),
 (1, '22', 'O'),
 (1, 'The', 'O'),
 (1, 'European', 'ORG'),
 (1, 'Commission', 'ORG'),
 (1, 'said', 'O'),
 (1, 'on', 'O'),
 (1, 'Thursday', 'O'),
 (1, 'it', 'O'),
 (1, 'disagreed', 'O'),
 (1, 'with', 'O'),
 (1, 'German', 'O'),
 (1, 'advice', 'O'),
 (1, 'to', 'O'),
 (1, 'consumers', 'O'),
 (1, 'to', 'O'),
 (1, 'shun', 'O'),
 (1, 'British', 'O'),
 (1, 'lamb', 'O'),
 (1, 'until', 'O'),
 (1, 'scientists', 'O'),
 (1, 'determine', 'O'),
 (1, 'whether', 'O'),
 (1, 'mad', 'O'),
 (1, 'cow', 'O'),
 (1, 'disease', 'O'),
 (1, 'can', 'O'),
 (1, 'be', 'O'),
 (1, 'transmitted', 'O'),
 (1, 'to', 'O'),
 (1, 'sheep', 'O'),
 (2, 'Germany', 'LOC'),
 (2, 's', 'O'),
 (2, 'representative', 'O'),
 (2, 'to', 'O'),
 (2, 'the', 'O'),
 (2, 'Europea

In [29]:
data = pd.DataFrame(new_data, columns=['sentence_no', 'Word', 'Tag'])

In [None]:
df.to_csv("data/conll_train_preprocessed.csv")