In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.width', 10000)
pd.set_option('max_colwidth', 10000)
import numpy as np
from tqdm import tqdm, trange
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
import matplotlib.pyplot as plt
import os
import transformers
from transformers import BertForTokenClassification, AdamW
from seqeval.metrics import f1_score, accuracy_score
import Levenshtein
import string
import difflib

transformers.__version__

torch.__version__

'1.8.1'

In [3]:
tag_values = ['O', 'PER', 'LOC', 'ORG']
#tag_values = ['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_whole_word_mask=True)
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx), output_attentions = False, output_hidden_states = False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [4]:
#model.load_state_dict(torch.load("../model_2/bert_base_conll_50.pt", map_location=torch.device('cpu'))) 
model.load_state_dict(torch.load("../model/with_punctuation_with_all_broken_entities_vertical_concatenation_100_2.pt", map_location=torch.device('cpu'))) # WITH PHONETIC NOISE
#model.load_state_dict(torch.load("../model_2/bert_base_conll_with_punctuation_with_broken_entities_75.pt", map_location=torch.device('cpu'))) # WITH BROKEN ENTITIES

<All keys matched successfully>

In [5]:
def prepare_data_for_test(filepath):
    df = pd.read_csv(filepath, sep=";")
    #df.drop(['Unnamed: 0'], axis=1, inplace=True)
    #df = df[:6723]
    g_test = df.groupby("Sentence #")
    test_df = pd.DataFrame({"Sentence": g_test.apply(lambda sdf: " ".join(sdf.Word)),
                       "Tag": g_test.apply(lambda sdf: ",".join(sdf.Tag))})
    test_df.reset_index(inplace=True)
    return df, test_df

In [6]:
def model_test(data, tokenizer, model):
    test = []
    #results = open("conll03_base_ljspeech_asr_test_without_gpe_uncased_results_lower.txt", "a+")
    #test_data=original_data['sentence'].values.tolist()
    #test_data=original_sentence
    #test_data=test_df['Sentence'].values.tolist()
    test_data=data

    # ASR TEST DATE LATEST
    sentence_no = 0
    for data in test_data:
        tokenized_sentence = tokenizer.encode(data.lower().strip())
        #tokenized_sentence = nlp(data.lower().strip())
        input_ids = torch.tensor([tokenized_sentence])
        #input_ids = torch.tensor([tokenized_sentence._.trf_word_pieces])

        with torch.no_grad():
             output = model(input_ids)
        label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

        # join bpe split tokens
        tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
        #tokens = _.trf_word_pieces_
        new_tokens, new_labels = [], []
        for token, label_idx in zip(tokens, label_indices[0]):
            if token.startswith("##"):
                new_tokens[-1] = new_tokens[-1] + token[2:]
            else:
                new_labels.append(tag_values[label_idx])
                new_tokens.append(token)

        for token, label in zip(new_tokens, new_labels):
            #result = str(sentence_no) + "\t" + label + "\t" + token + "\n"
            #results.write(result)
            test.append((str(sentence_no), label, token))
        sentence_no = sentence_no + 1
    test_df = pd.DataFrame(test, columns=['sentence_no', 'labels', 'token'])
    return test_df
    #test_df.to_csv("final_asr_test_dataframe.csv", index=False)

In [7]:
def prepare_model_output(test_df, df):
    indexNames = test_df[test_df['token'] == "[CLS]" ].index
    test_df.drop(indexNames, inplace=True)
    indexNames = test_df[test_df['token'] == "[SEP]" ].index
    test_df.drop(indexNames, inplace=True)
    test_df.reset_index(drop=True, inplace=True)
    test_df['label_asr'] = df['Tag']
    test_df['token_asr'] = df['Word']
    return test_df

In [8]:
def statistics(test_df, tags):
    new_acc = accuracy_score(test_df['labels'].values.tolist(), test_df['label_asr'].values.tolist())
    print(new_acc)

    new_f1 = f1_score(test_df['labels'].values.tolist(), test_df['label_asr'].values.tolist())
    print(new_f1)
    print("---STATISTICS ON EACH LABEL---")
    for tag in tags:
        true_positive = test_df[((test_df['labels'].str.contains(tag)) & (test_df['label_asr'].str.contains(tag)))]
        print(len(true_positive))
        false_positive = test_df[((test_df['labels'].str.contains(tag)) & (~test_df['label_asr'].str.contains(tag)))]
        print(len(false_positive))
        false_negative = test_df[((~test_df['labels'].str.contains(tag)) & (test_df['label_asr'].str.contains(tag)))]
        print(len(false_negative))
        true_negative = test_df[((~test_df['labels'].str.contains(tag)) & (~test_df['label_asr'].str.contains(tag)))]
        print(len(true_negative))
        prec = len(true_positive) / (len(true_positive) + len(false_positive))
        print(prec)
        recall = len(true_positive) / (len(true_positive) + len(false_negative))
        print(recall)
        f_measure = (2 * prec * recall) / (prec + recall)
        print(f_measure)
        print("---------------------------------------")

In [9]:
df, test_df = prepare_data_for_test('unprocessed_sampled_asr2.csv')

In [10]:
test_df.head()

Unnamed: 0,Sentence #,Sentence,Tag
0,2.0,for although the Chinese took Impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands by a similar process,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,LOC,O,O,O,O"
1,23.0,what the first Bible actually dated which also was printed at mace by Peter Shaffer in the year 1462,"O,O,O,O,O,O,O,O,O,O,O,LOC,O,PER,PER,O,O,O,O"
2,26.0,especially as regards to lowercase letters and type very similar was used during the next 15 or 20 years not only by chauffeur,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,PER"
3,27.0,Buy printers in Strasburg basil Paris Lubec and other cities,"O,O,O,LOC,O,LOC,LOC,O,O,O"
4,28.0,but don ' t expect in Italy letter with most often used,"O,O,O,O,O,O,LOC,O,O,O,O,O"


In [11]:
test_df = model_test(test_df['Sentence'].values.tolist(), tokenizer, model)

In [12]:
test_df = prepare_model_output(test_df, df)

In [13]:
test_df.tail()

Unnamed: 0,sentence_no,labels,token,label_asr,token_asr
8749,472,O,called,O,called
8750,472,O,the,O,The
8751,472,ORG,cellar,ORG,Cellar
8752,472,ORG,coffee,ORG,Coffee
8753,472,O,house,O,House


In [14]:
output = pd.DataFrame()
output['ground_truth_token'] = test_df['token']
output['ground_truth_label'] = test_df['label_asr']
output['model_label'] = test_df['labels']

In [16]:
output.head()

Unnamed: 0,ground_truth_token,ground_truth_label,model_label
0,for,O,O
1,although,O,O
2,the,O,O
3,chinese,O,O
4,took,O,O


In [17]:
output.to_csv('with_punctuation_with_all_broken_entities_vertical_concatenation_100_2', sep='\t', header=None, index=None)

In [18]:
test_df['label_asr'].unique()

array(['O', 'LOC', 'PER', 'ORG'], dtype=object)

In [19]:
g_test = test_df.groupby("sentence_no")
test = pd.DataFrame({"model_tag": g_test.apply(lambda sdf: sdf.labels.values.tolist()),
                       "asr_tag": g_test.apply(lambda sdf: sdf.label_asr.values.tolist())})

In [20]:
test.head()

Unnamed: 0_level_0,model_tag,asr_tag
sentence_no,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, LOC, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, LOC, O, O, O, O]"
1,"[O, O, O, O, O, O, O, O, O, O, O, LOC, O, PER, PER, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, LOC, O, PER, PER, O, O, O, O]"
10,"[O, O, LOC, O, O, O, O, O, O, O, O, LOC, O, O, O, O]","[O, O, LOC, O, O, O, O, O, O, O, O, LOC, O, O, O, O]"
100,"[O, O, O, O, O, O, O, O, O, PER, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
101,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, PER, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, PER, O, O, O, O]"


In [21]:
test['asr_sentence_no'] = test.index
test[["asr_sentence_no"]] = test[["asr_sentence_no"]].apply(pd.to_numeric)
test.sort_values('asr_sentence_no', inplace=True)
test.reset_index(drop=True, inplace=True)

In [22]:
test.head()

Unnamed: 0,model_tag,asr_tag,asr_sentence_no
0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, LOC, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, LOC, O, O, O, O]",0
1,"[O, O, O, O, O, O, O, O, O, O, O, LOC, O, PER, PER, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, LOC, O, PER, PER, O, O, O, O]",1
2,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, PER]",2
3,"[O, O, O, LOC, ORG, LOC, ORG, O, O, O]","[O, O, O, LOC, O, LOC, LOC, O, O, O]",3
4,"[O, O, O, O, O, O, LOC, O, O, O, O, O]","[O, O, O, O, O, O, LOC, O, O, O, O, O]",4


In [23]:
print("Accuracy: " , accuracy_score(test['model_tag'].values.tolist(), test['asr_tag'].values.tolist()))
print("F1 Score: ",f1_score(test['model_tag'].values.tolist(), test['asr_tag'].values.tolist()))
#statistics(test_df, ['PER', 'ORG', 'LOC', 'O'])
#0.7758389261744967 without punctuation
#0.676056338028169 with punctuation 1

Accuracy:  0.9710989262051634
F1 Score:  0.8881469115191986




In [None]:
def prepare_data_for_analysis(test_df, original_data_path):
    g_asr = test_df.groupby("sentence_no")
    asr_df = pd.DataFrame({'Sentence': g_asr.apply(lambda sdf: " ".join(map(str,sdf.token))),
                      'Tag': g_asr.apply(lambda sdf: ",".join(sdf.labels))})
    asr_df['asr_sentence_no'] = asr_df.index
    asr_df[["asr_sentence_no"]] = asr_df[["asr_sentence_no"]].apply(pd.to_numeric)
    asr_df.sort_values('asr_sentence_no', inplace=True)
    asr_df.reset_index(drop=True, inplace=True)
    original = pd.read_csv(original_data_path, sep=";")
    #original.drop(['Unnamed: 0'], axis=1, inplace=True)
    #original = original[:6723]
    g_original = original.groupby("Sentence #")
    original_df = pd.DataFrame({'Sentence': g_original.apply(lambda sdf: " ".join(map(str,sdf.Word))),
                      'Tag': g_original.apply(lambda sdf: ",".join(sdf.Tag))})
    original_df.reset_index(inplace=True)
    combined_df = pd.DataFrame({"original_sentence": original_df['Sentence'].str.lower(),
                           "original_tags": original_df['Tag'], 
                           "asr_sentence": asr_df['Sentence'],
                           "asr_tags": asr_df['Tag']})
    return asr_df, combined_df

In [None]:
def pattern_finding(tag, combined_df):
#tag = "PER"
    analysis = []
    for i in range(0, len(combined_df), 1):
        sample = combined_df.loc[[i]]
        for original_sentence, asr_sentence, original_tag, asr_tag in zip(sample['original_sentence'].values.tolist(),
                                                                          sample['asr_sentence'].values.tolist(),
                                                                          sample['original_tags'].values.tolist(),
                                                                          sample['asr_tags'].values.tolist()):
            original_tag_token = np.array(original_tag.split(","))
            asr_tag_token = np.array(asr_tag.split(","))
            original_label = np.array(original_sentence.lower().split())
            asr_label = np.array(asr_sentence.lower().split())

            if tag in original_tag_token:
                original_tag_ind = [index for index, element in enumerate(original_tag_token) if
                                    original_tag_token[index] == tag]
                if tag in asr_tag_token:
                    asr_tag_ind = [index for index, element in enumerate(asr_tag_token) if
                                       asr_tag_token[index] == tag]
                    
                    asr_tokens = []
                    original_tokens = []
                    errors = []
                        # Sweynheim pannartz
                        # Swain heim pannartz
                    for ind in original_tag_ind:
                        original_entity = original_label[ind]
                        asr_entity = difflib.get_close_matches(original_entity, asr_label[asr_tag_ind])
                        if len(asr_entity) > 0:
                            asr_entity = asr_entity[0]
                            error = (1 - (Levenshtein.distance(original_entity, asr_entity) / max(len(original_entity), len(asr_entity)))) * 100
                            if error >= 50:
                                asr_tokens.append(asr_entity)
                                original_tokens.append(original_entity)
                                errors.append(error)
                            else:
                                asr_tokens.append(asr_label[ind])
                                original_tokens.append(original_entity)
                                errors.append(0.0)
                        else:
                            asr_tokens.append(asr_label[ind])
                            original_tokens.append(original_entity)
                            errors.append(0.0)
                    analysis.append((i, original_tokens, asr_tokens, errors, np.mean(errors), True))
                else:
                    check = []
                    o_label = original_label[original_tag_ind]
                    for lab in o_label:
                        j = 0
                        for asr_lab in asr_label:
                            local_error = (1 - (Levenshtein.distance(lab, asr_lab) / max(len(lab), len(asr_lab)))) * 100
                            if local_error >= 50.0:
                                check.append(j)
                            j = j + 1
                    if len(check) > 0:
                        asr_tokens = []
                        original_tokens = []
                        errors = []
                        for ind in original_tag_ind:
                            original_entity = original_label[ind]
                            asr_entity = difflib.get_close_matches(original_entity, asr_label[check])
                            if len(asr_entity) > 0:
                                asr_entity = asr_entity[0]
                                error = (1 - (Levenshtein.distance(original_entity, asr_entity) / max(
                                len(original_entity), len(asr_entity)))) * 100
                                asr_tokens.append(asr_entity)
                                original_tokens.append(original_entity)
                                errors.append(error)
                            else:
                                asr_tokens.append(asr_label[ind])
                                original_tokens.append(original_entity)
                                errors.append(0.0)
                        analysis.append((i, original_tokens, asr_tokens, errors, np.mean(errors), False))
                    else:
                        analysis.append((i, original_label[original_tag_ind], ["None"], [0.0], 0.0, False))
    return analysis

In [None]:
asr_df, combined_df = prepare_data_for_analysis(test_df, 'unprocessed_sampled_asr2.csv')

In [None]:
combined_df.head()

In [None]:
count = 0
total = 0
sample = []
for sample_no, original_tag, asr_tag in zip(combined_df.index, combined_df['original_tags'].values.tolist(), combined_df['asr_tags'].values.tolist()):
    x_tag = original_tag.split(",")
    y_tag = asr_tag.split(",")
    for x,y in zip(x_tag, y_tag):
        if x == "ORG" and y == "O":
            sample.append(sample_no)
print(sample)

# ================================================== BASELINE ================================================

# [1, 1, 6, 6, 6, 9, 14, 14, 15, 15, 18, 20, 21, 21, 21, 21, 27, 27, 29, 30, 30, 33, 35, 37, 38, 38, 38, 45, 46, 47, 50, 50, 52, 53, 55, 62, 63, 66, 66, 70, 72, 72, 72, 78, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 87, 87, 89, 93, 101, 106, 109, 110, 110, 110, 111, 118, 126, 138, 139, 142, 143, 145, 145, 146, 146, 159, 159, 160, 161, 163, 164, 165, 170, 171, 172, 173, 174, 174, 176, 177, 178, 179, 180, 181, 181, 182, 182, 187, 193, 204, 204, 205, 206, 208, 208, 211, 211, 212, 212, 212, 212, 212, 212, 213, 213, 214, 214, 214, 215, 215, 215, 215, 216, 216, 218, 219, 219, 220, 221, 222, 223, 224, 225, 226, 227, 232, 233, 234, 237, 237, 238, 239, 244, 244, 246, 246, 247, 248, 250, 255, 255, 262, 262, 272, 273, 275, 279, 280, 292, 298, 312, 312, 312, 315, 315, 317, 325, 325, 326, 326, 330, 330, 331, 331, 333, 333, 333, 333, 334, 334, 338, 338, 338, 339, 340, 341, 342, 342, 343, 343, 351, 352, 352, 357, 359, 359, 359, 363, 363, 365, 366, 366, 368, 369, 370, 373, 378, 378, 379, 380, 380, 383, 386, 391, 391, 393, 398, 402, 404, 404, 417, 420, 423, 425, 429, 430, 430, 431, 432, 434, 436, 444, 446, 446, 447, 449, 449, 449, 451, 454, 455, 455, 455, 460, 462, 465, 465, 470, 471] PER -> PER
# [17, 31, 77, 199, 215, 269, 383, 397, 399, 430] PER -> LOC
# [39, 65, 94, 107, 251, 363, 367, 367] PER -> ORG
# [2, 67, 71, 84, 91, 125, 125, 125, 207, 207, 213, 215, 228, 281, 293, 399, 399, 400, 417] PER -> O



# [0, 1, 3, 3, 3, 4, 5, 5, 5, 6, 6, 8, 9, 10, 10, 12, 13, 16, 19, 22, 22, 24, 26, 28, 30, 32, 32, 33, 33, 34, 37, 39, 40, 40, 41, 41, 42, 43, 48, 51, 51, 54, 56, 60, 60, 60, 60, 61, 62, 62, 62, 66, 68, 68, 68, 69, 69, 71, 79, 80, 84, 90, 90, 91, 91, 95, 96, 96, 96, 97, 103, 104, 104, 114, 114, 115, 115, 115, 117, 118, 123, 124, 126, 128, 129, 129, 129, 129, 130, 130, 131, 131, 134, 134, 134, 135, 135, 136, 136, 136, 139, 139, 140, 141, 144, 147, 150, 150, 150, 150, 150, 151, 151, 153, 154, 154, 154, 156, 159, 159, 162, 165, 166, 168, 169, 172, 175, 183, 183, 184, 185, 186, 189, 189, 193, 193, 193, 193, 196, 196, 197, 197, 197, 202, 205, 210, 213, 217, 230, 230, 234, 235, 235, 236, 240, 241, 241, 241, 242, 243, 246, 247, 249, 253, 256, 257, 259, 266, 267, 267, 267, 286, 288, 290, 290, 291, 294, 298, 299, 300, 300, 300, 301, 303, 303, 305, 306, 307, 307, 308, 309, 316, 319, 320, 320, 323, 323, 323, 328, 332, 335, 335, 336, 337, 338, 339, 341, 354, 358, 359, 360, 361, 372, 376, 377, 377, 379, 379, 379, 379, 380, 380, 380, 380, 380, 381, 381, 381, 381, 381, 382, 382, 382, 382, 383, 383, 384, 384, 384, 384, 385, 385, 385, 386, 386, 386, 388, 388, 388, 388, 389, 389, 389, 389, 390, 391, 391, 392, 392, 392, 392, 392, 393, 393, 393, 393, 393, 394, 394, 394, 395, 395, 395, 396, 396, 397, 397, 397, 398, 398, 398, 399, 400, 400, 400, 401, 403, 403, 403, 403, 404, 404, 404, 404, 405, 405, 406, 406, 406, 406, 406, 406, 406, 407, 407, 408, 408, 408, 408, 409, 410, 410, 410, 410, 410, 410, 411, 411, 411, 411, 412, 412, 412, 413, 413, 413, 413, 413, 414, 414, 414, 414, 414, 415, 415, 415, 415, 415, 415, 415, 416, 416, 416, 416, 416, 416, 416, 417, 417, 418, 418, 418, 418, 418, 418, 418, 419, 419, 419, 420, 420, 420, 420, 421, 421, 421, 421, 421, 421, 421, 422, 422, 422, 422, 423, 423, 423, 424, 424, 424, 424, 424, 424, 424, 425, 425, 425, 426, 426, 426, 426, 427, 427, 427, 427, 427, 428, 428, 428, 428, 429, 429, 429, 429, 431, 431, 431, 431, 432, 432, 432, 433, 433, 433, 434, 434, 435, 435, 435, 436, 436, 436, 436, 437, 437, 437, 437, 438, 438, 438, 438, 439, 439, 439, 439, 439, 440, 440, 440, 441, 441, 441, 441, 441, 442, 442, 442, 442, 443, 443, 443, 444, 444, 445, 445, 445, 445, 446, 446, 447, 447, 447, 447, 448, 448, 448, 448, 449, 449, 449, 449, 450, 450, 450, 450, 451, 451, 451, 451, 452, 452, 452, 452, 453, 453, 453, 453, 454, 454, 454, 454, 455, 455, 455, 455, 456, 456, 456, 456, 457, 457, 457, 457, 458, 458, 458, 458, 459, 459, 459, 460, 460, 460, 460, 461, 461, 461, 461, 462, 462, 462, 463, 463, 463, 463, 463, 464, 464, 465, 465, 465, 465, 466, 466, 466, 466, 467, 467, 467, 467, 468, 468, 469, 469, 469, 469, 469, 469, 469, 470, 470, 470, 471, 471, 471, 471] LOC -> LOC
# [] LOC -> PER
# [11, 25, 27, 111, 146, 182, 304, 382, 425, 433, 433, 433, 433, 433, 433, 434, 451] LOC -> ORG
# [84, 113, 114, 137, 137, 145, 145, 182, 182, 182, 204, 204, 209, 209, 231, 231, 268, 268, 307, 323, 327, 353, 353, 387, 390, 390, 390, 395, 395, 397, 401, 401, 409, 409, 413, 413, 417, 417, 417, 432, 434, 443, 444, 446, 446, 451, 459, 464, 464, 464] LOC -> O



# [58, 59, 59, 59, 98, 99, 99, 99, 105, 105, 105, 105, 109, 109, 109, 119, 119, 119, 120, 122, 127, 155, 191, 191, 191, 192, 192, 192, 195, 211, 211, 211, 214, 214, 214, 217, 217, 217, 229, 229, 229, 229, 245, 251, 252, 252, 252, 258, 258, 258, 313, 313, 313, 318, 318, 318, 318, 321, 321, 322, 322, 324, 329, 339, 339, 339, 347, 348, 355, 362, 362, 362, 362, 362, 362, 362, 364, 374, 374, 375, 375, 375, 394, 394, 394, 405, 405, 405, 407, 425, 425, 425, 425, 425, 425, 425, 440, 443, 443, 444, 448, 470, 470] ORG -> ORG
# [444] ORG -> PER
# [73, 96, 97, 98, 125, 256, 302, 316, 316, 344, 345, 345, 346, 347, 347, 347, 347, 350, 419] ORG -> LOC
# [72, 72, 72, 72, 72, 72, 73, 73, 74, 75, 96, 97, 98, 98, 98, 99, 112, 112, 112, 120, 121, 121, 122, 125, 129, 133, 170, 196, 198, 198, 203, 254, 254, 254, 260, 260, 260, 260, 260, 260, 260, 266, 266, 274, 276, 276, 277, 277, 277, 279, 283, 283, 283, 289, 289, 311, 311, 322, 322, 324, 324, 324, 325, 325, 329, 334, 334, 347, 347, 347, 347, 347, 347, 348, 349, 349, 349, 355, 362, 362, 371, 371, 432, 432, 444, 444, 472, 472] ORG -> O




# ================================================== Broken ================================================
# [1, 1, 6, 6, 9, 14, 14, 15, 15, 18, 20, 21, 21, 21, 21, 27, 27, 30, 30, 37, 38, 38, 38, 45, 46, 47, 50, 50, 52, 53, 55, 62, 63, 66, 66, 70, 72, 72, 72, 78, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 87, 87, 89, 93, 94, 101, 107, 109, 110, 110, 111, 118, 125, 126, 138, 139, 142, 143, 145, 145, 146, 146, 159, 159, 160, 161, 163, 164, 165, 170, 171, 172, 173, 174, 174, 176, 177, 178, 179, 180, 181, 181, 182, 182, 187, 193, 199, 204, 204, 205, 206, 207, 208, 208, 211, 211, 212, 212, 212, 212, 212, 212, 213, 213, 214, 214, 214, 215, 215, 215, 215, 215, 215, 216, 218, 219, 219, 220, 221, 222, 223, 224, 225, 226, 227, 232, 233, 234, 237, 237, 238, 239, 244, 244, 246, 246, 247, 248, 250, 255, 255, 262, 262, 269, 272, 273, 275, 279, 280, 292, 298, 312, 312, 312, 315, 315, 325, 325, 326, 326, 330, 330, 331, 331, 333, 333, 333, 333, 334, 334, 338, 338, 338, 339, 340, 341, 342, 342, 343, 343, 351, 352, 352, 357, 359, 359, 359, 363, 363, 363, 365, 366, 366, 367, 368, 369, 370, 373, 378, 378, 379, 380, 380, 383, 383, 386, 391, 393, 397, 398, 399, 399, 400, 402, 404, 404, 417, 417, 420, 423, 425, 429, 430, 430, 430, 431, 432, 434, 436, 444, 446, 446, 447, 449, 449, 449, 451, 454, 455, 455, 455, 460, 462, 465, 465, 470, 471] PER -> PER
# [228] PER -> LOC
# [6, 17, 33, 65, 251, 367] PER -> ORG
# [2, 29, 31, 35, 39, 67, 71, 77, 84, 91, 106, 110, 125, 125, 207, 213, 216, 281, 293, 317, 391, 399] PER -> O



# [0, 1, 3, 3, 4, 5, 5, 5, 6, 6, 8, 9, 10, 10, 12, 13, 16, 19, 22, 22, 24, 26, 28, 30, 32, 32, 33, 33, 34, 37, 39, 40, 40, 41, 41, 42, 43, 48, 54, 56, 60, 60, 60, 60, 61, 62, 62, 62, 66, 68, 68, 68, 71, 80, 84, 84, 90, 90, 91, 91, 95, 96, 96, 96, 97, 103, 104, 104, 114, 114, 114, 115, 115, 115, 118, 123, 124, 126, 129, 129, 129, 129, 130, 130, 131, 134, 134, 134, 135, 135, 136, 136, 136, 140, 141, 147, 150, 150, 150, 150, 150, 151, 154, 154, 154, 156, 159, 159, 162, 165, 166, 168, 169, 172, 175, 182, 182, 183, 183, 184, 185, 186, 189, 193, 193, 193, 193, 196, 196, 197, 197, 197, 205, 210, 217, 230, 230, 234, 235, 235, 236, 240, 241, 241, 241, 242, 243, 246, 247, 249, 253, 256, 257, 259, 266, 267, 267, 267, 286, 288, 290, 290, 291, 294, 298, 299, 300, 300, 300, 305, 308, 309, 316, 320, 320, 323, 323, 323, 323, 328, 332, 335, 335, 336, 337, 338, 339, 341, 354, 358, 359, 360, 361, 372, 377, 377, 379, 379, 379, 379, 380, 380, 380, 380, 380, 381, 381, 381, 381, 381, 382, 382, 382, 382, 382, 383, 383, 384, 384, 384, 384, 385, 385, 385, 386, 386, 386, 387, 388, 388, 388, 388, 389, 389, 389, 389, 390, 390, 391, 391, 392, 392, 392, 392, 392, 393, 393, 393, 393, 393, 394, 394, 394, 395, 395, 395, 395, 395, 396, 396, 397, 397, 397, 397, 398, 398, 398, 399, 400, 400, 400, 401, 403, 403, 403, 403, 404, 404, 404, 404, 405, 405, 406, 406, 406, 406, 406, 406, 406, 407, 407, 408, 408, 408, 408, 409, 410, 410, 410, 410, 410, 410, 411, 411, 411, 411, 412, 412, 412, 413, 413, 413, 413, 413, 413, 413, 414, 414, 414, 414, 414, 415, 415, 415, 415, 415, 415, 415, 416, 416, 416, 416, 416, 416, 416, 417, 417, 417, 417, 417, 418, 418, 418, 418, 418, 418, 418, 419, 419, 419, 420, 420, 420, 421, 421, 421, 421, 421, 421, 421, 422, 422, 422, 422, 423, 423, 423, 424, 424, 424, 424, 424, 424, 424, 425, 425, 425, 425, 426, 426, 426, 426, 427, 427, 427, 427, 427, 428, 428, 428, 428, 429, 429, 429, 431, 431, 431, 431, 432, 432, 432, 432, 433, 433, 433, 433, 433, 433, 433, 433, 433, 434, 434, 434, 434, 435, 435, 435, 436, 436, 436, 436, 437, 437, 437, 437, 438, 438, 438, 438, 439, 439, 439, 439, 439, 440, 440, 440, 441, 441, 441, 441, 441, 442, 442, 442, 442, 443, 443, 443, 443, 444, 444, 444, 445, 445, 445, 445, 446, 446, 446, 447, 447, 447, 447, 448, 448, 448, 448, 449, 449, 449, 449, 450, 450, 450, 450, 451, 451, 451, 451, 451, 451, 452, 452, 452, 452, 453, 453, 453, 453, 454, 454, 454, 454, 455, 455, 455, 455, 456, 456, 456, 456, 457, 457, 457, 457, 458, 458, 458, 458, 459, 459, 459, 459, 460, 460, 460, 460, 461, 461, 461, 461, 462, 462, 462, 463, 463, 463, 463, 463, 464, 464, 464, 464, 464, 465, 465, 465, 465, 466, 466, 466, 466, 467, 467, 467, 467, 468, 468, 469, 469, 469, 469, 469, 469, 469, 470, 470, 470, 471, 471, 471, 471] LOC -> LOC
# [301, 304, 319] LOC -> PER
# [3, 25, 27, 69, 111, 146, 182, 204, 204, 303, 303, 376] LOC -> ORG
# [11, 51, 51, 69, 79, 113, 117, 128, 131, 137, 137, 139, 139, 144, 145, 145, 151, 153, 182, 189, 202, 209, 209, 213, 231, 231, 268, 268, 306, 307, 307, 307, 327, 353, 353, 390, 390, 401, 401, 409, 409, 420, 429, 446] LOC -> O



# [58, 59, 59, 59, 74, 96, 97, 99, 99, 99, 105, 105, 105, 105, 109, 109, 109, 112, 112, 112, 119, 119, 119, 120, 127, 191, 191, 191, 195, 211, 211, 211, 214, 214, 214, 217, 217, 217, 229, 229, 229, 229, 245, 251, 252, 252, 252, 254, 258, 258, 258, 260, 260, 260, 260, 260, 313, 313, 313, 318, 318, 318, 318, 321, 321, 322, 324, 324, 324, 329, 339, 339, 339, 345, 347, 347, 347, 347, 347, 347, 347, 347, 347, 349, 349, 355, 362, 362, 362, 362, 362, 362, 362, 362, 362, 374, 374, 375, 375, 375, 394, 394, 394, 405, 405, 405, 407, 419, 425, 425, 425, 425, 425, 425, 425, 432, 432, 440, 443, 443, 444, 444, 444, 444, 448, 470, 470, 472, 472] ORG -> ORG
# [364] ORG -> PER
# [72, 72, 73, 99, 122, 125, 125, 256, 302, 316, 316, 344, 345, 350] ORG -> LOC
# [72, 72, 72, 72, 73, 73, 75, 96, 97, 98, 98, 98, 98, 98, 120, 121, 121, 122, 129, 133, 155, 170, 192, 192, 192, 196, 198, 198, 203, 254, 254, 260, 260, 266, 266, 274, 276, 276, 277, 277, 277, 279, 283, 283, 283, 289, 289, 311, 311, 322, 322, 322, 324, 325, 325, 329, 334, 334, 346, 347, 347, 348, 348, 349, 355, 371, 371] ORG -> O

In [None]:
len([72, 72, 72, 72, 73, 73, 75, 96, 97, 98, 98, 98, 98, 98, 120, 121, 121, 122, 129, 133, 155, 170, 192, 192, 192, 196, 198, 198, 203, 254, 254, 260, 260, 266, 266, 274, 276, 276, 277, 277, 277, 279, 283, 283, 283, 289, 289, 311, 311, 322, 322, 322, 324, 325, 325, 329, 334, 334, 346, 347, 347, 348, 348, 349, 355, 371, 371])

In [None]:
orig_asr_similar = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 100.0) & (analysis_df['Lavenstein Mean'] > 0.0)]
orig_asr_similar_per = (len(orig_asr_similar) / len(analysis_df)) * 100
print(orig_asr_similar_per)
orig_asr_similar.head()
print(len(orig_asr_similar))

In [None]:
orig_asr_similar

In [None]:
orig_asr_similar.to_csv("baseline_model_broken_entity_not_recognized.csv")

In [None]:
orig_asr_similar_df = combined_df.loc[orig_asr_similar['Sample #'].values.tolist()]

In [None]:
orig_asr_similar_df

In [None]:
count = 0
total = 0
sample = []
for sample_no, model_tag, asr_tag in zip(orig_asr_similar_df.index, orig_asr_similar_df['original_tags'].values.tolist(), orig_asr_similar_df['asr_tags'].values.tolist()):
    x_tag = model_tag.split(",")
    y_tag = asr_tag.split(",")
    for x,y in zip(x_tag, y_tag):
        if x == "ORG" and x != y:
            sample.append(sample_no)
print(sample)
# [2, 17, 29, 31, 33, 35, 39, 65, 67, 71, 77, 91, 106, 228, 251, 281, 293, 317] --- 11.464968152866243 --- 18 PER broken
# [11, 25, 27, 51, 51, 69, 69, 79, 111, 113, 117, 128, 137, 137, 139, 139, 144, 145, 145, 146, 153, 202, 204, 204, 209, 209, 213, 231, 231, 268, 268, 301, 303, 303, 304, 306, 307, 307, 307, 319, 327, 353, 353] --- 18.867924528301888 --- 30 LOC broken
# [72, 72, 72, 72, 72, 72, 73, 73, 73, 75, 98, 98, 98, 98, 98, 121, 121, 122, 122, 129, 133, 155, 170, 192, 192, 192, 196, 198, 198, 203, 256, 266, 266, 274, 276, 276, 277, 277, 277, 279, 283, 283, 283, 289, 289, 302, 311, 311, 316, 316, 325, 325, 334, 334, 344, 346, 348, 348, 350, 364, 371, 371] --- 45.83333333333333 --- 33 --- ORG broken




# [2, 17, 31, 34, 39, 65, 65, 67, 67, 71, 77, 91, 94, 107, 125, 125, 125, 134, 137, 175, 207, 207, 228, 251, 269, 281, 293, 356, 367, 367] --- 14.906832298136646 --- 24 PER
# [100, 111, 113, 127, 145, 145, 146, 152, 152, 182, 182, 182, 182, 194, 194, 200, 201, 201, 231, 231, 244, 251, 268, 268, 268, 314, 314, 314, 353, 353] --- 10.897435897435898 --- 17 LOC
# [10, 10, 21, 37, 72, 72, 72, 72, 72, 72, 73, 73, 73, 74, 74, 74, 74, 74, 74, 75, 75, 89, 89, 89, 89, 96, 96, 97, 97, 102, 102, 102, 112, 112, 112, 121, 121, 125, 125, 129, 129, 129, 129, 132, 133, 134, 134, 134, 135, 135, 136, 136, 137, 137, 148, 156, 158, 158, 158, 165, 170, 178, 196, 198, 198, 198, 199, 203, 203, 203, 204, 204, 204, 204, 209, 209, 213, 254, 254, 254, 255, 255, 255, 256, 260, 260, 260, 260, 260, 260, 260, 261, 262, 263, 264, 265, 266, 266, 269, 270, 271, 274, 276, 276, 277, 277, 277, 278, 279, 283, 283, 283, 284, 285, 289, 289, 297, 302, 308, 311, 311, 314, 315, 316, 316, 325, 325, 327, 334, 334, 334, 334, 345, 345, 346, 349, 349, 349, 350, 371, 371] --- 62.28070175438597 --- 71

In [None]:
df = combined_df.loc[[72, 72, 72, 72, 72, 72, 73, 73, 73, 75, 98, 98, 98, 98, 98, 121, 121, 122, 122, 129, 133, 155, 170, 192, 192, 192, 196, 198, 198, 203, 256, 266, 266, 274, 276, 276, 277, 277, 277, 279, 283, 283, 283, 289, 289, 302, 311, 311, 316, 316, 325, 325, 334, 334, 344, 346, 348, 348, 350, 364, 371, 371]]

In [None]:
sample = []
for sample_no, model_tag, asr_tag in zip(df.index, df['original_tags'].values.tolist(), df['asr_tags'].values.tolist()):
    x_tag = model_tag.split(",")
    y_tag = asr_tag.split(",")
    for x,y in zip(x_tag, y_tag):
        if x == "ORG" and y == "LOC":
            sample.append(sample_no)
print(sample)
# [17, 33, 65, 251] PER -> ORG
# [228] PER -> LOC
# [2, 29, 31, 35, 39, 67, 71, 77, 91, 106, 281, 293, 317] PER -> O


# [301, 304, 319] LOC -> PER
# [25, 27, 69, 69, 111, 146, 204, 204, 204, 204, 303, 303, 303, 303] LOC -> ORG
# [11, 51, 51, 51, 51, 69, 69, 79, 113, 117, 128, 137, 137, 137, 137, 139, 139, 139, 139, 144, 145, 145, 145, 145, 153, 202, 209, 209, 209, 209, 213, 231, 231, 231, 231, 268, 268, 268, 268, 306, 307, 307, 307, 307, 307, 307, 307, 307, 307, 327, 353, 353, 353, 353] LOC -> O


# [364] ORG -> PER
# [72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 122, 122, 256, 302, 316, 316, 316, 316, 344, 350] ORG -> LOC
# [72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 75, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 121, 121, 121, 121, 122, 122, 129, 133, 155, 170, 192, 192, 192, 192, 192, 192, 192, 192, 192, 196, 198, 198, 198, 198, 203, 266, 266, 266, 266, 274, 276, 276, 276, 276, 277, 277, 277, 277, 277, 277, 277, 277, 277, 279, 283, 283, 283, 283, 283, 283, 283, 283, 283, 289, 289, 289, 289, 311, 311, 311, 311, 325, 325, 325, 325, 334, 334, 334, 334, 346, 348, 348, 348, 348, 371, 371, 371, 371] ORG -> O










# [39, 65, 65, 65, 65, 94, 107, 251, 367, 367, 367, 367] PER -> ORG
# [17, 31, 34, 77, 269] PER -> LOC
# [2, 67, 67, 67, 67, 71, 91, 125, 125, 125, 125, 125, 125, 125, 125, 125, 134, 137, 175, 207, 207, 207, 207, 228, 281, 293, 356] PER -> O


# [244] LOC -> PER
# [111, 127, 146, 182, 182, 182, 182, 251] LOC -> ORG
# [100, 113, 145, 145, 145, 145, 152, 152, 152, 152, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 194, 194, 194, 194, 200, 201, 201, 201, 201, 231, 231, 231, 231, 268, 268, 268, 268, 268, 268, 268, 268, 268, 314, 314, 314, 314, 314, 314, 314, 314, 314, 353, 353, 353, 353] LOC -> O


# [] ORG -> PER
# [10, 10, 37, 73, 73, 73, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 96, 96, 97, 97, 125, 125, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 134, 134, 134, 134, 134, 134, 134, 134, 134, 136, 136, 136, 136, 156, 199, 213, 256, 302, 316, 316, 316, 316, 345, 345, 345, 345, 346, 350] ORG -> LOC
# [10, 10, 21, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89, 96, 96, 97, 97, 102, 102, 102, 102, 102, 102, 102, 102, 102, 112, 112, 112, 112, 112, 112, 112, 112, 112, 121, 121, 121, 121, 125, 125, 129, 129, 129, 129, 132, 133, 135, 135, 135, 135, 137, 137, 137, 137, 148, 158, 158, 158, 158, 158, 158, 158, 158, 158, 165, 170, 178, 196, 198, 198, 198, 198, 198, 198, 198, 198, 198, 203, 203, 203, 203, 203, 203, 203, 203, 203, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 209, 209, 209, 209, 254, 254, 254, 254, 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 260, 261, 262, 263, 264, 265, 266, 266, 266, 266, 269, 270, 271, 274, 276, 276, 276, 276, 277, 277, 277, 277, 277, 277, 277, 277, 277, 278, 279, 283, 283, 283, 283, 283, 283, 283, 283, 283, 284, 285, 289, 289, 289, 289, 297, 308, 311, 311, 311, 311, 314, 315, 325, 325, 325, 325, 327, 334, 334, 334, 334, 334, 334, 334, 334, 334, 334, 334, 334, 334, 334, 334, 334, 349, 349, 349, 349, 349, 349, 349, 349, 349, 371, 371, 371, 371] ORG -> O

In [None]:
len([72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 75, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 121, 121, 121, 121, 122, 122, 129, 133, 155, 170, 192, 192, 192, 192, 192, 192, 192, 192, 192, 196, 198, 198, 198, 198, 203, 266, 266, 266, 266, 274, 276, 276, 276, 276, 277, 277, 277, 277, 277, 277, 277, 277, 277, 279, 283, 283, 283, 283, 283, 283, 283, 283, 283, 289, 289, 289, 289, 311, 311, 311, 311, 325, 325, 325, 325, 334, 334, 334, 334, 346, 348, 348, 348, 348, 371, 371, 371, 371])

In [None]:
combined_df.loc[[39, 65, 65, 65, 65, 94, 107, 251, 367, 367, 367, 367]]

In [None]:
orig_asr_nofound = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 0.0)]
orig_asr_nofound_per = (len(orig_asr_nofound) / len(analysis_df))*100
print(orig_asr_nofound_per)
orig_asr_nofound.head()
print(len(orig_asr_nofound))

In [None]:
orig_asr_nofound.head()

In [None]:
[orig_asr_found_complete_per, orig_asr_found_per, orig_asr_similar_per, orig_asr_nofound_per]
# [83.43949044585987, 5.095541401273886, 11.464968152866243, 0.0] PER broken
# [131, 8, 18, 0] PER broken count


# [77.9874213836478, 3.1446540880503147, 18.867924528301888, 0.0] LOC broken
# [124, 5, 30, 0] LOC broken count


# [37.5, 16.666666666666664, 45.83333333333333, 0.0] ORG broken
# [124, 5, 30, 0] ORG broken count


# [198.94, 24.89, 76.15] total broken
# [66.31, 8.29, 25.38]





# [80.74534161490683, 4.3478260869565215, 14.906832298136646, 0.0] PER
# [130, 7, 24, 0] PER


# [84.61538461538461, 4.487179487179487, 10.897435897435898, 0.0] LOC
# [132, 7, 17, 0] LOC


# [21.929824561403507, 15.789473684210526, 62.28070175438597, 0.0] ORG
# [25, 18, 71, 0] ORG


# [187.27, 24.6, 88.07]
# [62.42, 8.20, 29.35]

In [None]:
[len(orig_asr_found_complete), len(orig_asr_found), len(orig_asr_similar), len(orig_asr_nofound)]

In [None]:
# [131, 8, 18, 0]

In [None]:
error_df = pd.concat([orig_asr_found, orig_asr_similar], axis=0)

In [None]:
len(error_df)

In [None]:
error_df.head()

In [None]:
error_df.sample(n=10)

In [None]:
combined_df.loc[error_df['Sample #'].values.tolist()]
# [153,154,13,47,1,52,78,20,30,150]
# [6, 84, 110, 125, 207, 213, 216, 367, 2, 17, 29, 31, 33, 35, 39, 65, 67, 71, 77, 91, 106, 228, 251, 281, 293, 317]
#[72, 84, 110, 208, 213, 215, 367, 2, 17, 31, 35, 39, 65, 67, 71, 77, 78, 91, 94, 106, 125, 177, 199, 207, 223, 224, 225, 228, 251, 269, 272, 279, 280, 281, 293, 317]

In [None]:
new_df = combined_df.loc[error_df['Sample #'].values.tolist()]

In [None]:
count = 0
total = 0
for model_tag, asr_tag in zip(new_df['original_tags'].values.tolist(), new_df['asr_tags'].values.tolist()):
    x_tag = model_tag.split(",")
    y_tag = asr_tag.split(",")
    total = total + len(x_tag)
    for x,y in zip(x_tag, y_tag):
        if x == y and x != "O" and y != "O":
            count = count + 1
print(count)
print(total)

In [None]:
# 23 (broken entity with simulation) / 472 total

In [None]:
def pattern_analysis(sample_df, combined_df):
    ind = np.array(sample_df['Sample #'].values.tolist())
    df = combined_df.loc[ind]
    df.insert(2,'Original',sample_df['Original'].values.tolist())
    df.insert(5,'ASR',sample_df['ASR'].values.tolist())
    df.drop(['original_tags', 'asr_tags'], axis=1, inplace=True)
    return df

In [None]:
error_pattern = pattern_analysis(orig_asr_similar, combined_df)

In [None]:
len(error_pattern)

In [None]:
error_pattern.head()

In [None]:
def error_sampling(df):
    i = 0
    equal_length_samples = []
    variable_length_samples = []
    for sample, original, asr in zip(df.index, 
                                     df['Original'],
                                     df['ASR']):
        if len(original) == len(asr):
            equal_length_samples.append(sample)
        else:
            variable_length_samples.append(sample)
    equal_length_samples.sort()
    variable_length_samples.sort()
    equal_length_samples_df = df.loc[equal_length_samples]
    variable_length_samples_df = df.loc[variable_length_samples]
    return equal_length_samples_df, variable_length_samples_df

In [None]:
equal_length_words_samples_df, variable_length_words_samples_df = error_sampling(error_pattern)

In [None]:
len(equal_length_words_samples_df)

In [None]:
equal_length_words_samples_df.head()

In [None]:
len(variable_length_words_samples_df)

In [None]:
variable_length_words_samples_df.head(10)

In [None]:
def equal_words_simulation(sampled_df):
    simulated_asr = []
    for sample, original_sentence, asr_sentence, original, asr in zip(sampled_df.index,
                                     sampled_df['original_sentence'],
                                     sampled_df['asr_sentence'],
                                     sampled_df['Original'],
                                     sampled_df['ASR']):

        for x,y in zip(original, asr):
            #original_words.append(x)
            #asr_words.append(y)
            if y in asr_sentence:
                asr_sentence = asr_sentence.replace(y, x)
            
        simulated_asr.append((sample, asr_sentence))
    simulated_asr_df = pd.DataFrame(simulated_asr)
    return simulated_asr_df

In [None]:
def variable_words_simulation(df):
    check = []
    for sample, original_sentence, asr_sentence, original_tag, asr_tag in zip(
            df.index,
            df['original_sentence'].values.tolist(),
            df['asr_sentence'].values.tolist(),
            df['Original'].values.tolist(),
            df['ASR'].values.tolist()):

        original_label = np.array(original_sentence.split())
        asr_label = np.array(asr_sentence.split())
        original_tag_ind = [index for index, element in enumerate(original_label) if original_label[index] in original_tag]
        asr_tag_ind = [index for index, element in enumerate(asr_label) if asr_label[index] in asr_tag]
        original_bigrams = []
        asr_bigrams = []
        o_label = original_label[original_tag_ind]
        for lab in original_tag:
            for asr_lab in asr_tag:
                local_error = (1 - (Levenshtein.distance(lab, asr_lab) / max(len(lab), len(asr_lab)))) * 100
                if local_error >= 50.0:
                    asr_sentence = asr_sentence.replace(asr_lab, lab)
        check.append((sample, asr_sentence))
    new_asr = pd.DataFrame(check)
    return new_asr

In [None]:
def update_df(asr_df, simulated_df):
    asr_df.loc[simulated_df[0].values.tolist(), 'Sentence'] = simulated_df[1].values.tolist()
    return asr_df

In [None]:
simulated_asr_df = equal_words_simulation(equal_length_words_samples_df)

In [None]:
simulated_asr_df.head()

In [None]:
asr_df.loc[simulated_asr_df[0].values.tolist(), 'Sentence'] = test_org

In [None]:
asr_df = update_df(asr_df, simulated_asr_df)

In [None]:
asr_df.head()

In [None]:
#simulated_asr_df = variable_words_simulation(variable_length_words_samples_df)
#simulated_asr_df.head()

In [None]:
#asr_df = update_df(asr_df, simulated_asr_df)

In [None]:
test_df = model_test(asr_df['Sentence'].values.tolist(), tokenizer, model)

In [None]:
indexNames = test_df[test_df['token'] == "[CLS]" ].index
test_df.drop(indexNames, inplace=True)
indexNames = test_df[test_df['token'] == "[SEP]" ].index
test_df.drop(indexNames, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [None]:
test_df.tail()

In [None]:
#test_df = prepare_model_output(test_df, new_df)
test_df = prepare_model_output(test_df, df)

In [None]:
test_df.tail()

In [None]:
test_df['labels'].unique()

In [None]:
g_test = test_df.groupby("sentence_no")
test = pd.DataFrame({"model_tag": g_test.apply(lambda sdf: sdf.labels.values.tolist()),
                       "asr_tag": g_test.apply(lambda sdf: sdf.label_asr.values.tolist())})

In [None]:
test['asr_sentence_no'] = test.index
test[["asr_sentence_no"]] = test[["asr_sentence_no"]].apply(pd.to_numeric)
test.sort_values('asr_sentence_no', inplace=True)
test.reset_index(drop=True, inplace=True)

In [None]:
test.head()

In [None]:
print("Accuracy: " , accuracy_score(test['model_tag'].values.tolist(), test['asr_tag'].values.tolist()))
print("F1 Score: ",f1_score(test['model_tag'].values.tolist(), test['asr_tag'].values.tolist()))
#statistics(test_df, ['PER', 'ORG', 'LOC', 'O'])
#0.7758389261744967 without punctuation
#0.676056338028169 with punctuation 1

In [None]:
asr_df, combined_df = prepare_data_for_analysis(test_df, 'unprocessed_sampled_original.csv')

In [None]:
analysis_df = pd.DataFrame(pattern_finding("ORG", combined_df), columns=['Sample #', 'Original', 'ASR', 'Lavenstein','Lavenstein Mean', 'Flag'])

In [None]:
analysis_df.head(10)

In [None]:
len(analysis_df)

In [None]:
len(combined_df)

In [None]:
orig_asr_found_complete = analysis_df[(analysis_df['Flag'] == True) & (analysis_df['Lavenstein Mean'] == 100.0)]
orig_asr_found_complete_per = (len(orig_asr_found_complete) / len(analysis_df)) * 100
print(orig_asr_found_complete_per)
orig_asr_found_complete.head()
print(len(orig_asr_found_complete))

In [None]:
orig_asr_found = analysis_df[(analysis_df['Flag'] == True) & (analysis_df['Lavenstein Mean'] < 100.0) & (analysis_df['Lavenstein Mean'] >= 0.0)]
orig_asr_found_per = (len(orig_asr_found) / len(analysis_df)) * 100
print(orig_asr_found_per)
print(len(orig_asr_found))
orig_asr_found.head()
#40.88050314465409
#65

In [None]:
orig_asr_similar = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 100.0) & (analysis_df['Lavenstein Mean'] > 0.0)]
orig_asr_similar_per = (len(orig_asr_similar) / len(analysis_df)) * 100
print(orig_asr_similar_per)
orig_asr_similar.head()
print(len(orig_asr_similar))

In [None]:
orig_asr_similar.head()

In [None]:
len(orig_asr_similar)

In [None]:
orig_asr_nofound = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 0.0)]
orig_asr_nofound_per = (len(orig_asr_nofound) / len(analysis_df))*100
print(orig_asr_nofound_per)
orig_asr_nofound.head()
print(len(orig_asr_nofound))

In [None]:
[orig_asr_found_complete_per, orig_asr_found_per, orig_asr_similar_per, orig_asr_nofound_per]

In [None]:
[len(orig_asr_found_complete), len(orig_asr_found), len(orig_asr_similar), len(orig_asr_nofound)]

In [None]:
#!/usr/bin/python3
import matplotlib.pyplot as plt

data = [orig_asr_found_complete_per, orig_asr_found_per, orig_asr_similar_per, orig_asr_nofound_per]
plt.bar(['Correctly Identified', 'Identified with missing entities', 'Similar tag but not identified', 'No Tag identification'], data)
plt.xticks(rotation=45)
plt.show()

In [None]:
orig_asr_similar.head(14)

In [None]:
context_simulated_df = combined_df.loc[orig_asr_similar['Sample #'].values.tolist(),['original_sentence','original_tags']]

In [None]:
context_simulated_df.head(30)

In [None]:
g_test = df.groupby("Sentence #")
x = pd.DataFrame({"Sentence": g_test.apply(lambda sdf: " ".join(sdf.Word)),
                       "Tag": g_test.apply(lambda sdf: ",".join(sdf.Tag))})

In [None]:
sentence_no = list(range(0, len(x)))

In [None]:
x.index = sentence_no

In [None]:
x.head()

In [None]:
x.loc[context_simulated_df.index.tolist(), 'Sentence'] = context_simulated_df['original_sentence'].values.tolist()
x.loc[context_simulated_df.index.tolist(), 'Tag'] = context_simulated_df['original_tags'].values.tolist()

In [None]:
x.head(18)

In [None]:
asr_df.head()

In [None]:
context_simulated_df.index.tolist()

In [None]:
asr_df.loc[context_simulated_df.index.tolist(), 'Sentence'] = context_simulated_df['original_sentence'].values.tolist()
asr_df.loc[context_simulated_df.index.tolist(), 'Tag'] = context_simulated_df['original_tags'].values.tolist()

In [None]:
asr_df.head(18)

In [None]:
test_df = model_test(asr_df['Sentence'].values.tolist(), tokenizer, model)

In [None]:
sentence_no = 0
dataset=[]
for sentences, tags in zip(x['Sentence'].values.tolist(), x['Tag'].values.tolist()):
    sentence=sentences.split(" ")
    tag = tags.split(",")
    for word, label in zip(sentence, tag):
        dataset.append((sentence_no, word, label))
    sentence_no = sentence_no + 1

In [None]:
new_df = pd.DataFrame(dataset, columns=['Sentence #', 'Word', 'Tag'])

In [None]:
new_df.head()

In [None]:
test_df = prepare_model_output(test_df, new_df)

In [None]:
test_df.tail()

In [None]:
statistics(test_df, ['PER', 'ORG', 'LOC', 'O'])

In [None]:
original = pd.read_csv('unprocessed_sampled_original.csv')
original.drop(['Unnamed: 0'], axis=1, inplace=True)
original = original[:7851]
g_original = original.groupby("Sentence #")
original_df = pd.DataFrame({'Sentence': g_original.apply(lambda sdf: " ".join(map(str,sdf.Word))),
                      'Tag': g_original.apply(lambda sdf: ",".join(sdf.Tag))})
original_df.reset_index(inplace=True)
combined_df = pd.DataFrame({"original_sentence": original_df['Sentence'],
                           "original_tags": original_df['Tag'], 
                           "asr_sentence": asr_df['Sentence'],
                           "asr_tags": asr_df['Tag']})

In [None]:
asr_df, combined_df = prepare_data_for_analysis(test_df, 'unprocessed_sampled_original.csv')

In [None]:
analysis_df = pd.DataFrame(pattern_finding("PER", combined_df), columns=['Sample #', 'Original', 'ASR', 'Lavenstein','Lavenstein Mean', 'Flag'])

In [None]:
analysis_df.head(10)

In [None]:
len(analysis_df)

In [None]:
len(combined_df)

In [None]:
orig_asr_found_complete = analysis_df[(analysis_df['Flag'] == True) & (analysis_df['Lavenstein Mean'] == 100.0)]
orig_asr_found_complete_per = (len(orig_asr_found_complete) / len(analysis_df)) * 100
print(orig_asr_found_complete_per)
orig_asr_found_complete.head()
print(len(orig_asr_found_complete))

In [None]:
orig_asr_found = analysis_df[(analysis_df['Flag'] == True) & (analysis_df['Lavenstein Mean'] < 100.0) & (analysis_df['Lavenstein Mean'] >= 0.0)]
orig_asr_found_per = (len(orig_asr_found) / len(analysis_df)) * 100
print(orig_asr_found_per)
print(len(orig_asr_found))
orig_asr_found.head()
#40.88050314465409
#65

In [None]:
orig_asr_similar = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 100.0) & (analysis_df['Lavenstein Mean'] > 0.0)]
orig_asr_similar_per = (len(orig_asr_similar) / len(analysis_df)) * 100
print(orig_asr_similar_per)
orig_asr_similar.head()
print(len(orig_asr_similar))

In [None]:
orig_asr_similar.head()

In [None]:
len(orig_asr_similar)

In [None]:
orig_asr_nofound = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 0.0)]
orig_asr_nofound_per = (len(orig_asr_nofound) / len(analysis_df))*100
print(orig_asr_nofound_per)
orig_asr_nofound.head()
print(len(orig_asr_nofound))

In [None]:
[orig_asr_found_complete_per, orig_asr_found_per, orig_asr_similar_per, orig_asr_nofound_per]

In [None]:
[len(orig_asr_found_complete), len(orig_asr_found), len(orig_asr_similar), len(orig_asr_nofound)]

In [None]:
equal_length_words_samples_df.head()

In [None]:
def finding_context(df, n_grams):
    check = []
    for sample, original_sentence, asr_sentence, original_tag, asr_tag in zip(
                df.index,
                df['original_sentence'].values.tolist(),
                df['asr_sentence'].values.tolist(),
                df['Original'].values.tolist(),
                df['ASR'].values.tolist()):

        original_label = np.array(original_sentence.split())
        asr_label = np.array(asr_sentence.split())
        original_tag_ind = [index for index, element in enumerate(original_label) if original_label[index] in original_tag]
        asr_tag_ind = [index for index, element in enumerate(asr_label) if asr_label[index] in asr_tag]
        original_bigrams = []
        asr_bigrams = []
        for l in original_tag_ind:
            if l <= (len(original_label)-1) - n_grams:
                data = ""
                for c in range(-n_grams, n_grams+1, 1):
                    if l+c >= 0:
                        data = data + original_label[l + c] + " "
                    else:
                        continue
                original_bigrams.append(data)
            else:
                data = ""
                for c in range(-n_grams, 1, 1):
                    if l+c < len(original_label):
                        data = data + original_label[l + c] + " "
                    else:
                        continue
                original_bigrams.append(data)
        for l in asr_tag_ind:
            if l <= (len(asr_label) - 1) - n_grams:
                data = ""
                for c in range(-n_grams, n_grams + 1, 1):
                    if l + c >= 0:
                        data = data + asr_label[l + c] + " "
                    else:
                        continue
                asr_bigrams.append(data)
            else:
                data = ""
                for c in range(-n_grams, 1, 1):
                    if l + c < len(asr_label):
                        data = data + asr_label[l + c] + " "
                    else:
                        continue
                asr_bigrams.append(data)
        
        check.append((sample, original_bigrams[0], original_sentence, original_tag, asr_bigrams[0], asr_sentence, asr_tag))
    context = pd.DataFrame(check)
    context.columns = ['Sample #', 'Original N-Grams', "original_sentence", "Original", "ASR N-Grams", "asr_sentence", "ASR"]
    return context

In [None]:
def error_sampling3(context):
    check = []
    for sample, original_ngrams, original_sentence, asr_ngrams, asr_sentence, original_tag, asr_tag in zip(
            context['Sample #'].values.tolist(),
            context['Original N-Grams'].values.tolist(),
            context['original_sentence'].values.tolist(),
            context['ASR N-Grams'].values.tolist(),
            context['asr_sentence'].values.tolist(),
            context['Original'].values.tolist(),
            context['ASR'].values.tolist()):
        
        original_ngrams = np.array(original_ngrams.split(" "))
        asr_ngrams = np.array(asr_ngrams.split(" "))
        
        local_errors = []
        i = 0
        j = 0
        for _original in original_tag:
            if _original in asr_tag:
                if len(asr_ngrams) < len(asr_tag):
                    continue
                
                print(asr_sentence)
                asr_sentence = asr_sentence.replace("".join(asr_ngrams[i].rstrip()), "".join(original_ngrams[i].rstrip()))
                print(asr_sentence)
                #print(check)
                i = i + 1
                j = j + 1
            else:
                j = j + 1
        check.append((sample, asr_sentence))
        print("---------------")
    new_asr = pd.DataFrame(check)
    return new_asr

In [None]:
context = finding_context(equal_length_words_samples_df, 5)

In [None]:
context.head()

In [None]:
simulated_asr_df = error_sampling3(context)
simulated_asr_df.head(50)

In [None]:
asr_df = update_df(asr_df, simulated_asr_df)

In [None]:
#context = finding_context(variable_length_words_samples_df, 5)

In [None]:
#simulated_asr_df = error_sampling3(context)
#simulated_asr_df.head(50)

In [None]:
#asr_df = update_df(asr_df, simulated_asr_df)

In [None]:
asr_df.head()

In [None]:
test = ["and though the famous family of aldus restored its technical excellence , rejecting battered letters ,","most of caxton ' s zone types of an earlier character", "are the leaders in this luckless change , though our own baskerville , who was at work some years before them , went much on the same lines",
       "now come into general use that are obviously a great improvement on the ordinary \" modern style \" and use in england , which is in fact the bodoni type" , "on the top of the jail , continues neild , arawatch - house and a century - box , where two or more guards , with dogs and firearms ," ,
       "these courts were extended to centuries later to several large provincial towns , and all were in full activity when neild road ," , "he had been in the employ of a corn - chandler at islington , and went into london with his master ' s cart and horse .",
       "shameful malpractices of bambridge ," , "if they happened to be in funds - - among whom was the marquis of slego in 1811", 
       "mister . neild , a second howard ,", "again the 22 charles ii . c20 order the jailer to keep felons and debtors \" separate and apart from one another ,",
       "prisoners were crowded together in the jail , contrary to the requirements of the for george the 4th ."]

In [None]:
test = "have now come into general use and are obviously a great improvement on the ordinary \" modern style \" in use in england , which is in fact the bodoni type"

In [None]:
xyz = model_test([test], tokenizer, model)

In [None]:
xyz