In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.width', 10000)
pd.set_option('max_colwidth', 10000)
import numpy as np
import string
import transformers
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

In [3]:
#df = pd.read_csv("data/train.txt", sep=" ")
df = pd.read_csv("../../data/conll03/dev.txt", sep=" ")
#df = pd.read_csv("data/test.txt", sep=" ")

In [4]:
df.head()

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,CRICKET,NNP,B-NP,O
1,-,:,O,O
2,LEICESTERSHIRE,NNP,B-NP,B-ORG
3,TAKE,NNP,I-NP,O
4,OVER,IN,B-PP,O


In [5]:
df = df[df['O'].notna()]

In [6]:
df.drop(['-X-', '-X-.1'], axis=1, inplace=True)

In [7]:
df.columns = ['Word', 'Tag']

In [8]:
df.head()

Unnamed: 0,Word,Tag
0,CRICKET,O
1,-,O
2,LEICESTERSHIRE,B-ORG
3,TAKE,O
4,OVER,O


In [9]:
df.drop(df[df['Word'] == "-DOCSTART-"].index, axis=0, inplace=True)

In [10]:
df['Tag'].unique()

array(['O', 'B-ORG', 'B-LOC', 'B-MISC', 'I-MISC', 'B-PER', 'I-PER',
       'I-LOC', 'I-ORG'], dtype=object)

In [11]:
tags = df['Tag'].values
words = df['Word'].values

In [12]:
new_tags = []
for t in tags:
    if t == "B-ORG":
        new_tags.append("ORG")
    elif t == "B-PER":
        new_tags.append("PER")
    elif t == "I-PER":
        new_tags.append("PER")
    elif t == "B-LOC":
        new_tags.append("LOC")
    elif t == "I-ORG":
        new_tags.append("ORG")
    elif t == "I-LOC":
        new_tags.append("LOC")
    else:
        new_tags.append("O")

In [13]:
df.drop(['Tag'], axis=1, inplace=True)
df.drop(['Word'], axis=1, inplace=True)

In [14]:
df['Tag'] = new_tags
df['Word'] = words

In [15]:
len(df)

50722

In [16]:
sentence_no = 0
sentence = []
for w in words:
    sentence.append(sentence_no)
    if w == ".":
        sentence_no = sentence_no + 1

In [17]:
df.insert(0, 'Sentence #', sentence)

In [18]:
df.head()

Unnamed: 0,Sentence #,Tag,Word
0,0,O,CRICKET
1,0,O,-
2,0,ORG,LEICESTERSHIRE
3,0,O,TAKE
4,0,O,OVER


In [19]:
g_test = df.groupby("Sentence #")
x = pd.DataFrame({"Sentence": g_test.apply(lambda sdf: " ".join(map(str,sdf.Word))),
                       "Tag": g_test.apply(lambda sdf: ",".join(map(str,sdf.Tag)))})

In [20]:
import re

def decontracted(phrase):
    # general
    phrase = re.sub(r" \'s", "\'s", phrase)
    phrase = re.sub(r" \'t", "\'t", phrase)
    phrase = re.sub(r" n't", "n't", phrase)
    phrase = re.sub(r" \'re", "\'re", phrase)
    phrase = re.sub(r" \'d", "\'d", phrase)
    phrase = re.sub(r" \'ll", "\'ll", phrase)
    phrase = re.sub(r" \'t", "\'t", phrase)
    phrase = re.sub(r" \'ve", "\'ve", phrase)
    phrase = re.sub(r" \'m", "\'m", phrase)
    return phrase

In [21]:
p_data = []
sentence_no = 0
new_df = []
for sentence, tags in zip(x['Sentence'].values.tolist(), x['Tag'].values.tolist()):
    words = sentence.split(" ")
    ind = [ i for i, word in enumerate(words) if word == "'s" or word == "'t" or word == "'re" or word == "'d" or word == "'ll" or word == "'t" or word == "'ve" or word == "'m" or word == "n't"]
    labels = tags.split(",")
    sentence = " ".join(words)
    sentence = decontracted(sentence)
    tags = [ elem for i, elem in enumerate(labels) if i not in ind]
    stripped = sentence.translate(str.maketrans('', '', string.punctuation))
    stripped = stripped.split(" ")
    for word, label in zip(stripped, tags):
        p_data.append((sentence_no, word, label))
    sentence_no = sentence_no + 1

In [22]:
df = pd.DataFrame(p_data, columns=['Sentence #', 'Word', 'Tag'])

In [23]:
df.head()

Unnamed: 0,Sentence #,Word,Tag
0,0,CRICKET,O
1,0,,O
2,0,LEICESTERSHIRE,ORG
3,0,TAKE,O
4,0,OVER,O


In [24]:
df.drop(df[df['Word'] == ""].index, axis=0, inplace=True)

In [25]:
g_test = df.groupby("Sentence #")
x = pd.DataFrame({"Sentence": g_test.apply(lambda sdf: " ".join(map(str,sdf.Word))),
                       "Tag": g_test.apply(lambda sdf: ",".join(map(str,sdf.Tag)))})

In [26]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_whole_word_mask=True)

In [27]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [28]:
getter = SentenceGetter(df)
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
labels = [[s[1] for s in sentence] for sentence in getter.sentences]

In [29]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(str(word))
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [30]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(sentences, labels)]
#tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
#labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [31]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [32]:
tokenized_texts[0:50]

[['CR',
  '##IC',
  '##KE',
  '##T',
  'L',
  '##EI',
  '##CE',
  '##ST',
  '##ER',
  '##S',
  '##H',
  '##IR',
  '##E',
  'T',
  '##A',
  '##KE',
  'O',
  '##VE',
  '##R',
  'AT',
  'TO',
  '##P',
  'A',
  '##FT',
  '##ER',
  'IN',
  '##NI',
  '##NG',
  '##S',
  'VI',
  '##CT',
  '##OR',
  '##Y'],
 ['L',
  '##ON',
  '##D',
  '##ON',
  '1996',
  '##0',
  '##8',
  '##30',
  'West',
  'Indian',
  'all',
  '##rou',
  '##nder',
  'Phil',
  'Simmons',
  'took',
  'four',
  'for',
  '38',
  'on',
  'Friday',
  'as',
  'Leicestershire',
  'beat',
  'Somerset',
  'by',
  'an',
  'innings',
  'and',
  '39',
  'runs',
  'in',
  'two',
  'days',
  'to',
  'take',
  'over',
  'at',
  'the',
  'head',
  'of',
  'the',
  'county',
  'championship'],
 ['Their',
  'stay',
  'on',
  'top',
  'though',
  'may',
  'be',
  'short',
  '##li',
  '##ved',
  'as',
  'title',
  'rivals',
  'Essex',
  'Derbyshire',
  'and',
  'Surrey',
  'all',
  'closed',
  'in',
  'on',
  'victory',
  'while',
  'Kent',
  'ma

In [33]:
for sentence in tokenized_texts:
    print(sentence)
    break

['CR', '##IC', '##KE', '##T', 'L', '##EI', '##CE', '##ST', '##ER', '##S', '##H', '##IR', '##E', 'T', '##A', '##KE', 'O', '##VE', '##R', 'AT', 'TO', '##P', 'A', '##FT', '##ER', 'IN', '##NI', '##NG', '##S', 'VI', '##CT', '##OR', '##Y']


In [34]:
i = 0
sentence_no = 0
new_sentence = []
new_data = []
for sentence, label in zip(tokenized_texts, labels):
    new_tokens = []
    new_tags = []
    for token, tag in zip(sentence, label):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_tokens.append(token)
            new_tags.append(tag)
    for new_token, new_tag in zip(new_tokens, new_tags):
        if new_token in string.punctuation:
            continue
        else:
            new_data.append((sentence_no, new_token, new_tag))
    sentence_no = sentence_no + 1

In [35]:
new_data[0:50]

[(0, 'CRICKET', 'O'),
 (0, 'LEICESTERSHIRE', 'ORG'),
 (0, 'TAKE', 'O'),
 (0, 'OVER', 'O'),
 (0, 'AT', 'O'),
 (0, 'TOP', 'O'),
 (0, 'AFTER', 'O'),
 (0, 'INNINGS', 'O'),
 (0, 'VICTORY', 'O'),
 (1, 'LONDON', 'LOC'),
 (1, '19960830', 'O'),
 (1, 'West', 'O'),
 (1, 'Indian', 'O'),
 (1, 'allrounder', 'O'),
 (1, 'Phil', 'PER'),
 (1, 'Simmons', 'PER'),
 (1, 'took', 'O'),
 (1, 'four', 'O'),
 (1, 'for', 'O'),
 (1, '38', 'O'),
 (1, 'on', 'O'),
 (1, 'Friday', 'O'),
 (1, 'as', 'O'),
 (1, 'Leicestershire', 'ORG'),
 (1, 'beat', 'O'),
 (1, 'Somerset', 'ORG'),
 (1, 'by', 'O'),
 (1, 'an', 'O'),
 (1, 'innings', 'O'),
 (1, 'and', 'O'),
 (1, '39', 'O'),
 (1, 'runs', 'O'),
 (1, 'in', 'O'),
 (1, 'two', 'O'),
 (1, 'days', 'O'),
 (1, 'to', 'O'),
 (1, 'take', 'O'),
 (1, 'over', 'O'),
 (1, 'at', 'O'),
 (1, 'the', 'O'),
 (1, 'head', 'O'),
 (1, 'of', 'O'),
 (1, 'the', 'O'),
 (1, 'county', 'O'),
 (1, 'championship', 'O'),
 (2, 'Their', 'O'),
 (2, 'stay', 'O'),
 (2, 'on', 'O'),
 (2, 'top', 'O'),
 (2, 'though', 'O')]

In [36]:
data = pd.DataFrame(new_data, columns=['sentence_no', 'Word', 'Tag'])

In [37]:
data.to_csv("../../data/conll03/new_conll_dev_preprocessed_without_punctuation.csv")