In [1]:
import os
import csv
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.


In [2]:
torch.manual_seed(3)
np.random.seed(3)
torch.cuda.manual_seed_all(3)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Reading .conll files into dataframes
Of train, dev and test corpora

In [3]:
train_df = pd.read_csv('train_hebtb-gold.lattices', sep='\t', quotechar='"', quoting=csv.QUOTE_NONE, header=None)
train_df.columns = ['ZeroIx','Index','FORM', 'LEMMA', 'POS', 'PPOS', 'FEAT', 'CorrespondingTokenIx']
train_df['sent_id'] = -1

dev_df = pd.read_csv('dev_hebtb-gold.lattices', sep='\t', quotechar='"', quoting=csv.QUOTE_NONE, header=None)
dev_df.columns = ['ZeroIx','Index','FORM', 'LEMMA', 'POS', 'PPOS', 'FEAT', 'CorrespondingTokenIx']
dev_df['sent_id'] = -1

test_df = pd.read_csv('test_hebtb-gold.lattices', sep='\t', quotechar='"', quoting=csv.QUOTE_NONE, header=None)
test_df.columns = ['ZeroIx','Index','FORM', 'LEMMA', 'POS', 'PPOS', 'FEAT', 'CorrespondingTokenIx']
test_df['sent_id'] = -1

dev_df.tail(30)

Unnamed: 0,ZeroIx,Index,FORM,LEMMA,POS,PPOS,FEAT,CorrespondingTokenIx,sent_id
11271,17,18,גופתו,גופה,NN,NN_S_PP,gen=F|num=S|suf_gen=M|suf_num=S|suf_per=3,13,-1
11272,18,19,של,של,POS,POS,_,14,-1
11273,19,20,מאיר,מאיר,NNP,NNP,_,15,-1
11274,20,21,כהנא,כהנא,NNP,NNP,_,16,-1
11275,21,22,אל,אל,IN,IN,_,17,-1
11276,22,23,ה,ה,DEF,DEF,_,18,-1
11277,23,24,קבר,קבר,NN,NN,gen=M|num=S,18,-1
11278,24,25,.,_,yyDOT,yyDOT,_,19,-1
11279,0,1,עכשיו,עכשיו,RB,RB,_,1,-1
11280,1,2,קוראים,קרא,BN,BN,gen=M|num=P|per=A,2,-1


In [4]:
def number_sents(gold_df):
    sentence_number = 0
    for index in gold_df.index:
        if gold_df.at[index, 'ZeroIx'] == 0:
            sentence_number += 1
        gold_df.at[index, 'sent_id'] = sentence_number
    
    
number_sents(train_df)
number_sents(dev_df)
number_sents(test_df)

In [None]:
# dev_sents = dev_df.groupby(['sent_id'],  axis='columns')
# dev_sents.tolist()
# print(dev_sents[1])

In [None]:
# dev_df = dev_df.drop(dev_df.index[dev_df.sent_id == 296])
# dev_df = dev_df.drop(dev_df.index[dev_df.sent_id == 226])
# dev_df = dev_df.drop(dev_df.index[dev_df.sent_id == 57])
# dev_df = dev_df.drop(dev_df.index[dev_df.sent_id == 49])

In [None]:
# dev_df[dev_df['sent_id'] == 49]
# len(dev_df)

In [5]:
class sentenceGetter(object):
    def __init__(self, dataframe, max_sent=None):
        self.index = 0
        self.max_sent = max_sent
        self.tokens = dataframe['FORM']
        self.labels = dataframe['POS']
        #for evaluating by word-accuracy
        self.correspondingToken = dataframe['CorrespondingTokenIx']
        self.orig_sent_id = dataframe['sent_id']
    
    def sentences(self):
        sent = []
        counter = 0
        
        for token,label, corres_tok, sent_id in zip(self.tokens, self.labels, self.correspondingToken, self.orig_sent_id):
            sent.append((token, label, corres_tok, sent_id))
            if token.strip() == ".":
                yield sent
                sent = []
                counter += 1
            if self.max_sent is not None and counter >= self.max_sent:
                return

train_getter = sentenceGetter(train_df)
dev_getter = sentenceGetter(dev_df)
test_getter = sentenceGetter(test_df)

train_sentences = [[token for token, label, corres_tok, sent_id in sent] for sent in train_getter.sentences()]
train_labels = [[label for token, label, corres_tok, sent_id in sent] for sent in train_getter.sentences()]

dev_sentences = [[token for token, label, corres_tok, sent_id in sent] for sent in dev_getter.sentences()]
dev_labels = [[label for token, label, corres_tok, sent_id in sent] for sent in dev_getter.sentences()]
dev_corresTokens = [[corres_tok for token, label, corres_tok, sent_id in sent] for sent in dev_getter.sentences()]
dev_sent_ids = [[sent_id for token, label, corres_tok, sent_id in sent] for sent in dev_getter.sentences()]

test_sentences = [[token for token, label, corres_tok, sent_id in sent] for sent in test_getter.sentences()]
test_labels = [[label for token, label, corres_tok, sent_id in sent] for sent in test_getter.sentences()]

print(len(dev_sentences))
# print(train_sentences[0])
# print(train_labels[0])
# print(type(train_sentences))
dummy_sentences = dev_sentences[0:2]
dummy_labels = dev_labels[0:2]
# print(dev_sentences[49])
# print(dummy_sentences[49])
# print(dummy_labels[49])
# sent = dummy_sentences[35]
# labs = dummy_labels[35]
# for word, lab in zip(sent, labs):
#     print(word, lab)


490


In [6]:
# longest_sent_len = 0
# for sent in dev_sentences:
#     if len(sent) >= longest_sent_len:
#         print(len(sent))
#         longest_sent_len = len(sent)
#         print("index of longest sentence:{} ".format(dev_sentences.index(sent)))

# longest_sent_len

del dev_sentences[296]
del dev_labels[296]
del dev_corresTokens[296]
del dev_sent_ids[296]


del dev_sentences[226]
del dev_labels[226]
del dev_corresTokens[226]
del dev_sent_ids[226]


del dev_sentences[57]
del dev_labels[57]
del dev_corresTokens[57]
del dev_sent_ids[57]


del dev_sentences[49]
del dev_labels[49]
del dev_corresTokens[49]
del dev_sent_ids[49]



# for sent in test_sentences:
#     if len(sent) >= longest_sent_len:
#         print(len(sent))
#         longest_sent_len = len(sent)
#         print("index of longest sentence:{} ".format(test_sentences.index(sent)))

# longest_sent_len
# del test_sentences[396]
# del test_labels[396]
# del test_sentences[394]
# del test_labels[394]
# del test_sentences[387]
# del test_labels[387]
# del test_sentences[151]
# del test_labels[151]
# del test_sentences[134]
# del test_labels[134]
# del test_sentences[104]
# del test_labels[104]


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print("Device: " + str(device))
print("Number of gpus: " + str(n_gpu))
print("Name of gpu: " + torch.cuda.get_device_name(0))

Device: cuda
Number of gpus: 4
Name of gpu: GeForce GTX 1080 Ti


In [8]:
tags_data = pd.concat([train_df, dev_df, test_df])
tag_vals = list(set(tags_data["POS"].values))
tags = ['PAD'] + tag_vals
print(tags)
print(len(tags))

['PAD', 'yyCM', 'yyDOT', ' PREPOSITION', 'QW', 'CDT', 'yyLRB', 'JJ', 'PREPOSITION', 'yyQM', 'REL', 'ADVERB', '??', 'PREPOSITIONIN', 'yyCLN', 'DTT', 'BN', 'TEMP', 'NNT', 'POS', 'NN', 'NNP', 'CONJ', 'AT', 'CD', 'EX', 'PRP', 'JJT', 'yySCLN', 'COP', 'S_ANP', 'ZVL', 'NCD', 'CC', 'yyEXCL', 'S_PRN', 'DUMMY_AT', 'INTJ', 'IN', 'P', 'BNT', 'DT', 'VB', 'MD', 'RB', 'NEG', 'yyQUOT', 'yyELPS', 'yyDASH', 'DEF', 'yyRRB']
51


In [9]:
tag2idx = {tag:idx for idx, tag in enumerate(tags)}
idx2tag = {idx:tag for idx, tag in enumerate(tags)}

In [None]:
# tokenizer = BertTokenizer.from_pretrained('./multi_cased_L-12_H-768_A-12/')

In [None]:
# print(train_sentences[0])
# print(train_labels[0])

In [10]:
MAX_LEN = 150
bs = 32

In [11]:
tokenizer = BertTokenizer.from_pretrained('./multi_cased_L-12_H-768_A-12/')

def tokenize(sentences, orig_labels):    
    tokenized_texts = []
    labels = []
    sents, tags_li = [], []
    for sent, sent_labels in zip(sentences, orig_labels):
        bert_tokens = []
        bert_labels = []
        for orig_token, orig_label in zip(sent, sent_labels):
            b_tokens = tokenizer.tokenize(orig_token)
            bert_tokens.extend(b_tokens)
            for b_token in b_tokens:
                bert_labels.append(orig_label)
        tokenized_texts.append(bert_tokens)
        labels.append(bert_labels)
        assert len(bert_tokens) == len(bert_labels)
    return tokenized_texts, labels

train_tokenized_texts, train_tokenized_labels = tokenize(train_sentences, train_labels)
print(train_sentences[10])
print(train_tokenized_texts[10])

['הם', 'התבקשו', 'לדווח', 'ל', 'ה', 'משטרה', 'על', 'תנועותיהם', '.']
['הם', 'ה', '##ת', '##בק', '##שו', 'ל', '##דו', '##וח', 'ל', 'ה', 'מ', '##ש', '##טר', '##ה', 'על', 'ת', '##נוע', '##ות', '##יהם', '.']


In [12]:
def pad_sentences_and_labels(tokenized_texts, labels):
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen = MAX_LEN, dtype = "int", truncating = "post", padding = "post")
    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], 
                         maxlen = MAX_LEN, value = tag2idx['PAD'], padding = "post",
                        dtype = "int", truncating = "post")
    attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
    return input_ids, tags, attention_masks

input_ids, tags, attention_masks = pad_sentences_and_labels(train_tokenized_texts, train_tokenized_labels)
# print(tags)
# print(input_ids)
# print(attention_masks)

In [13]:
tr_inputs = torch.tensor(input_ids, dtype=torch.long)
tr_tags = torch.tensor(tags, dtype=torch.long)
tr_masks = torch.tensor(attention_masks, dtype=torch.long)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
# train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=bs, shuffle=True)

In [None]:
# tokenizer.tokenize("הוא הגיש תלונה במשטרה")

In [14]:
model = BertForTokenClassification.from_pretrained('./multi_cased_L-12_H-768_A-12', num_labels=len(tag2idx))
model.cuda()

# lr = 1e-3
# max_grad_norm = 1.0
# num_total_steps = 1000
# num_warmup_steps = 100
# warmup_proportion = float(num_warmup_steps) / float(num_total_steps)

FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)


# optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)

from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

epochs = 15
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
#         print("Input: ")
#         print(b_input_ids)
#         print("Labels: ")
#         print(b_labels)
        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss / nb_tr_steps))

Epoch:   7%|▋         | 1/15 [01:21<19:04, 81.73s/it]

Train loss: 0.8764334999417004


Epoch:  13%|█▎        | 2/15 [02:45<17:52, 82.47s/it]

Train loss: 0.23451287828777967


Epoch:  20%|██        | 3/15 [04:10<16:36, 83.04s/it]

Train loss: 0.14928182155678146


Epoch:  27%|██▋       | 4/15 [05:37<15:26, 84.21s/it]

Train loss: 0.10477634311016452


Epoch:  33%|███▎      | 5/15 [07:09<14:27, 86.73s/it]

Train loss: 0.07998400458477829


Epoch:  40%|████      | 6/15 [08:34<12:54, 86.09s/it]

Train loss: 0.06014351236731991


Epoch:  47%|████▋     | 7/15 [09:59<11:25, 85.67s/it]

Train loss: 0.04574003522774499


Epoch:  53%|█████▎    | 8/15 [11:23<09:57, 85.35s/it]

Train loss: 0.039042434220104234


Epoch:  60%|██████    | 9/15 [12:48<08:30, 85.05s/it]

Train loss: 0.030989381172204095


Epoch:  67%|██████▋   | 10/15 [14:12<07:04, 84.93s/it]

Train loss: 0.02690853578602209


Epoch:  73%|███████▎  | 11/15 [15:36<05:38, 84.62s/it]

Train loss: 0.02274791951487331


Epoch:  80%|████████  | 12/15 [17:00<04:13, 84.49s/it]

Train loss: 0.019322652587250463


Epoch:  87%|████████▋ | 13/15 [18:24<02:48, 84.37s/it]

Train loss: 0.017739925548850902


Epoch:  93%|█████████▎| 14/15 [19:49<01:24, 84.29s/it]

Train loss: 0.01617034224297018


Epoch: 100%|██████████| 15/15 [21:13<00:00, 84.27s/it]

Train loss: 0.012969936423408063





In [None]:
# output_dir = "./"
# model_to_save = model.module if hasattr(model, 'module') else model

# output_model_file = os.path.join(output_dir, 'pytorch_model.bin')
# output_config_file = os.path.join(output_dir, 'bert_config.json')

# torch.save(model_to_save.state_dict(), output_model_file)
# model_to_save.config.to_json_file(output_config_file)
# tokenizer.save_vocabulary(output_dir)


#     model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
#     output_model_file = args.output_dir / "pytorch_model.bin"
#     torch.save(model_to_save.state_dict(), str(output_model_file))


In [None]:
# model = BertForTokenClassification.from_pretrained(output_dir)
# tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)

In [15]:
# Function receives a sentence with its labels, and the tokenized sentence and labels
def aggr_toks_labels_tags(orig_words, orig_labels, tok_wordps, tok_labels, predicted_tags, test_tags):
    
    joint_tokens = []
    joint_labels = []
    joint_predicted = []
    joint_test = []
    
    for word in orig_words:
        aggregated_tokenized = ""
        aggregated_label = ""
        aggregated_predicted = ""
        aggregated_test = ""
        
        while aggregated_tokenized != word:
#             print(len(tok_sent))
            tmpTok = tok_wordps.pop(0)
#             print(tmpTok)
#             print(joint_tokens)
            if tmpTok.startswith("##"):
                tmpTok = tmpTok[2:]
                
            tmpLab = tok_labels.pop(0)
            if aggregated_label == "":
                aggregated_label = tmpLab
                
            tmpPred = predicted_tags.pop(0)
            if aggregated_predicted == "":
                aggregated_predicted = tmpPred
                
            tmpTest = test_tags.pop(0)
            if aggregated_test == "":
                aggregated_test = tmpTest
                
            aggregated_tokenized += tmpTok
#             print(aggregated_tokenized)
            
        joint_tokens.append(aggregated_tokenized)
        joint_labels.append(aggregated_label)
        joint_predicted.append(aggregated_predicted)
        joint_test.append(aggregated_test)
        
    assert len(joint_tokens) == len(orig_words)
    assert len(joint_tokens) == len(joint_predicted)
    return joint_tokens, joint_labels, joint_predicted, joint_test

In [16]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def delete_pads_from_preds(predicted_tags, test_tags):
    clean_predicted = []
    clean_test = []
    
    for ix in range(0, len(test_tags)):
        if test_tags[ix] != 'PAD':
            clean_predicted.append(predicted_tags[ix])
            clean_test.append(test_tags[ix])
            
    return clean_predicted, clean_test
    
def calculate_accuracy(df):
    numOfCorrectPredictions = 0
    for index in df.index:
        orig_pos = df.at[index, 'test_tag']
        pred_pos = df.at[index, 'predicted_tag']
        if orig_pos == pred_pos:
            numOfCorrectPredictions += 1
    return numOfCorrectPredictions/len(df)
                
def test_model(sentence, labels, tok_sent, tok_labels, corres_tokens, sent_id):
    input_ids, tags, attention_masks = pad_sentences_and_labels([tok_sent], [tok_labels])

    val_inputs = torch.tensor(input_ids, dtype=torch.long)
    val_tags = torch.tensor(tags, dtype=torch.long)
    val_masks = torch.tensor(attention_masks, dtype=torch.long)

    test_data = TensorDataset(val_inputs, val_masks, val_tags)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions, true_labels = [], []
    counter = 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                         attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.append([list(p) for p in np.argmax(logits, axis=2)])
        
        true_labels.append(label_ids)
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    
    pred_tags = [idx2tag[p_ii] for p in predictions for p_i in p for p_ii in p_i]
    test_tags = [idx2tag[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
#     print(list(zip(pred_tags, test_tags)))
    # -----------------------------------------------------------------------
    clean_predicted, clean_test = delete_pads_from_preds(pred_tags, test_tags)
    joint_tokenized, joint_labels, preds, tests = aggr_toks_labels_tags(sentence, labels, tok_sent, tok_labels, 
                                                                        clean_predicted, clean_test)
    
    tmp = {'word': sentence, 'orig_label': labels, 'predicted_tag': preds, 'test_tag': tests, 
           'corresToken': corres_tokens, 'sent_id': sent_id}
    tmp_df = pd.DataFrame(data=tmp)
    # -----------------------------------------------------------------------
    
    y_true = pd.Series(test_tags)
    y_pred = pd.Series(pred_tags)
    cross_tab = pd.crosstab(y_true, y_pred, rownames=['Real Label'], colnames=['Prediction'], margins=True)
    report = classification_report(y_true, y_pred)
#     print(report)
#     print(tmp_df)
    return tmp_df

full_df = pd.DataFrame()
dev_tokenized_texts, dev_tokenized_labels = tokenize(dev_sentences, dev_labels)
for sent, label, tok_sent, tok_label, corresTokens, sent_id in zip(dev_sentences, dev_labels, dev_tokenized_texts, 
                                                                   dev_tokenized_labels, dev_corresTokens, dev_sent_ids):
    test_df = test_model(sent, label, tok_sent, tok_label, corresTokens, sent_id)
    full_df = full_df.append(test_df, ignore_index=True, sort=False)

# full_df
f1_accuracy = calculate_accuracy(full_df)
print("Accuracy (F1): = {}".format(f1_accuracy))


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Accuracy (F1): = 0.9556933175312243


In [None]:
full_df.head(10)

In [17]:
full_df.to_csv('BE-Oracle-tag-5.csv')

In [18]:
word_acc_df = full_df.groupby(['sent_id', 'corresToken']).apply(lambda x: '^'.join(x.predicted_tag)).reset_index()

In [None]:
word_acc_df.tail(20)

In [19]:
word_acc_df.to_csv('word-acc-Oracle-tag-5.csv')