In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.width', 10000)
pd.set_option('max_colwidth', 10000)
import numpy as np
import string
import transformers
from transformers import BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW

In [3]:
df = pd.read_csv("../../data/conll03/test.txt", sep=" ")

In [4]:
df.head()

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,SOCCER,NN,B-NP,O
1,-,:,O,O
2,JAPAN,NNP,B-NP,B-LOC
3,GET,VB,B-VP,O
4,LUCKY,NNP,B-NP,O


In [5]:
df.drop(['-X-', '-X-.1'], axis=1, inplace=True)

In [6]:
df.columns = ['Word', 'Tag']

In [7]:
df.head()

Unnamed: 0,Word,Tag
0,SOCCER,O
1,-,O
2,JAPAN,B-LOC
3,GET,O
4,LUCKY,O


In [8]:
df['Tag'].unique()

array(['O', 'B-LOC', 'B-PER', 'I-PER', 'I-LOC', 'B-MISC', 'I-MISC',
       'B-ORG', 'I-ORG', nan], dtype=object)

In [9]:
df.dropna(axis=0, inplace=True)

In [10]:
df['Tag'].unique()

array(['O', 'B-LOC', 'B-PER', 'I-PER', 'I-LOC', 'B-MISC', 'I-MISC',
       'B-ORG', 'I-ORG'], dtype=object)

In [11]:
tags = df['Tag'].values
words = df['Word'].values

In [12]:
new_tags = []
for t in tags:
    if t == "B-ORG":
        new_tags.append("ORG")
    elif t == "B-PER":
        new_tags.append("PER")
    elif t == "I-PER":
        new_tags.append("PER")
    elif t == "B-LOC":
        new_tags.append("LOC")
    elif t == "I-ORG":
        new_tags.append("ORG")
    elif t == "I-LOC":
        new_tags.append("LOC")
    else:
        new_tags.append("O")

In [13]:
df.drop(['Tag'], axis=1, inplace=True)
df.drop(['Word'], axis=1, inplace=True)

In [14]:
df['Tag'] = new_tags
df['Word'] = words

In [15]:
sentence_no = 0
sentence = []
for w in words:
    sentence.append(sentence_no)
    if w == ".":
        sentence_no = sentence_no + 1

In [16]:
len(sentence)

46244

In [17]:
df.insert(0, 'Sentence #', sentence)

In [18]:
df.head(100)

Unnamed: 0,Sentence #,Tag,Word
0,0,O,SOCCER
1,0,O,-
2,0,LOC,JAPAN
3,0,O,GET
4,0,O,LUCKY
5,0,O,WIN
6,0,O,","
7,0,PER,CHINA
8,0,O,IN
9,0,O,SURPRISE


In [19]:
df.isnull().sum()

Sentence #    0
Tag           0
Word          0
dtype: int64

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_whole_word_mask=True)

In [21]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [22]:
getter = SentenceGetter(df)
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
labels = [[s[1] for s in sentence] for sentence in getter.sentences]

In [23]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(str(word))
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [24]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(sentences, labels)]
#tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
#labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [25]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [27]:
i = 0
sentence_no = 0
new_sentence = []
new_data = []
for sentence, label in zip(tokenized_texts, labels):
    new_tokens = []
    new_tags = []
    for token, tag in zip(sentence, label):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_tokens.append(token)
            new_tags.append(tag)
    for new_token, new_tag in zip(new_tokens, new_tags):
        new_data.append((sentence_no, new_token, new_tag))
    sentence_no = sentence_no + 1

In [28]:
data = pd.DataFrame(new_data, columns=['Sentence #', 'Word', 'Tag'])

In [29]:
data.head(100)

Unnamed: 0,Sentence #,Word,Tag
0,0,SOCCER,O
1,0,-,O
2,0,JAPAN,LOC
3,0,GET,O
4,0,LUCKY,O
5,0,WIN,O
6,0,",",O
7,0,CHINA,PER
8,0,IN,O
9,0,SURPRISE,O


In [30]:
g_test = data.groupby("Sentence #")
test_df = pd.DataFrame({"Sentence": g_test.apply(lambda sdf: " ".join(sdf.Word)),
                       "Tag": g_test.apply(lambda sdf: ",".join(sdf.Tag))})
test_df.reset_index(inplace=True)

In [33]:
dataset=[]
for sentence_no, sentence, tag in zip(test_df['Sentence #'].values.tolist(), test_df['Sentence'].values.tolist(), test_df['Tag'].values.tolist()):
    if(len(sentence.split()) >= 510):
        print(str(sentence_no) + "---" + str(len(sentence.split())))
        print(sentence)
        continue
    dataset.append((sentence_no, sentence, tag))
test_df = pd.DataFrame(dataset, columns=['sentence_no', 'labels', 'token'])

1412---612
VAIL , Colorado 1996 - 12 - 07 Women ' s World Cup standings after Saturday ' s downhill race : Downhill Standings 1 . Katja Seizinger ( Germany ) 180 points 2 . Renate Goetschl ( Austria ) 132 3 . Carole Montillet ( France ) 86 4 . Pernilla Wiberg ( Sweden ) 75 5 . Heidi Zurbriggen ( Switzerland ) 69 6 . Regina Haeusl ( Germany ) 66 7 . Alexandra Meissnitzer ( Austria ) 65 8 . Isolde Kostner ( Italy ) 60 9 . Ingeborg Helen Markein ( Norway ) 58 10 = Megan Gerety ( U . S . ) 51 10 = Warwara Zelenskaja ( Russia ) 51 10 = Florence Masnada ( France ) 51 13 = Picabo Street ( U . S . ) 50 13 = Stefanie Schuster ( Austria ) 50 15 . Miriam Vogt ( Germany ) 47 16 . Bibiana Perez ( Italy ) 45 17 . Hilde Gerg ( Germany ) 42 18 . Barbara Merlin ( Germany ) 38 19 = Kate Pace Lindsay ( Canada ) 23 19 = Svetlana Gladishiva ( Russia ) 23 19 = Regine Cavagnoud ( France ) 23 Overall women ' s World Cup standings leaders after Saturday ' s downhill and super G races : 1 . Katja Seizinger ( Ge

In [34]:
test_df.head()

Unnamed: 0,sentence_no,labels,token
0,0,"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .","O,O,LOC,O,O,O,O,PER,O,O,O,O"
1,1,"Nadim Ladki AL - AIN , United Arab Emirates 1996 - 12 - 06 Japan began the defence of their Asian Cup title with a lucky 2 - 1 win against Syria in a Group C championship match on Friday .","PER,PER,LOC,LOC,LOC,O,LOC,LOC,LOC,O,O,O,O,O,LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,LOC,O,O,O,O,O,O,O,O,O"
2,2,"But China saw their luck desert them in the second match of the group , crashing to a surprise 2 - 0 defeat to newcomers Uzbekistan .","O,LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,LOC,O"
3,3,China controlled most of the match and saw several chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net .,"LOC,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,PER,PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,4,"Oleg Shatskiku made sure of the win in injury time , hitting an unstoppable left foot shot from just outside the area .","PER,PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


In [None]:
data.to_csv("../../data/conll_test_preprocessed.csv")