In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.width', 10000)
pd.set_option('max_colwidth', 10000)
import numpy as np
from tqdm import tqdm, trange
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
import matplotlib.pyplot as plt
import os
import transformers
from transformers import BertForTokenClassification, AdamW
from seqeval.metrics import f1_score, accuracy_score
import Levenshtein
import string
import difflib

transformers.__version__

torch.__version__

'1.7.1'

In [None]:
# Label inventory. model_test decodes each argmax index through this list, so the
# order here must be the order the checkpoint was trained with.
# NOTE(review): the training-time order is not visible in this notebook — confirm.
tag_values = ['O', 'PER', 'LOC', 'ORG']
#tag_values = ['B-ORG', 'O', 'B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'I-ORG', 'I-MISC', 'I-LOC']
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}
# NOTE(review): `do_whole_word_mask` does not look like a tokenizer option and is
# presumably ignored — confirm against the installed transformers version.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_whole_word_mask=True)
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx), output_attentions = False, output_hidden_states = False)
# strict=False means key mismatches between checkpoint and model do not raise; the
# return value (which reports missing/unexpected keys) is not inspected here.
model.load_state_dict(torch.load("../model/bert_base_conll_lower_case_100.pt", map_location=torch.device('cpu')), strict=False)

In [None]:
def prepare_data_for_test(filepath, max_rows=6723):
    """Load a token-per-row CSV and build a sentence-per-row view of it.

    Parameters
    ----------
    filepath : str
        CSV with at least the columns "Sentence #", "Word" and "Tag".
    max_rows : int or None, default 6723
        Only the first ``max_rows`` token rows are kept (the value previously
        hard-coded in this function). Pass ``None`` to keep every row.

    Returns
    -------
    (df, test_df)
        ``df`` is the (truncated) token-level frame. ``test_df`` has one row
        per "Sentence #" with the words joined by spaces ("Sentence") and the
        tags joined by commas ("Tag").
    """
    df = pd.read_csv(filepath)
    # 'Unnamed: 0' is presumably a saved index column; tolerate files without it.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')
    if max_rows is not None:
        df = df[:max_rows]
    g_test = df.groupby("Sentence #")
    test_df = pd.DataFrame({
        # map(str, ...) keeps the join from failing when read_csv parsed a
        # word as NaN (same guard prepare_data_for_analysis uses).
        "Sentence": g_test["Word"].apply(lambda words: " ".join(map(str, words))),
        "Tag": g_test["Tag"].apply(lambda tags: ",".join(tags)),
    })
    test_df.reset_index(inplace=True)
    return df, test_df

In [None]:
def model_test(data, tokenizer, model, labels=None):
    """Tag every sentence in ``data`` and return one row per merged token.

    Parameters
    ----------
    data : iterable of str
        Sentences to tag; each is lower-cased and stripped before encoding.
    tokenizer
        Object providing ``encode(text)`` and ``convert_ids_to_tokens(ids)``.
    model
        Callable taking a ``(1, seq_len)`` id tensor; element 0 of its output
        is argmax-ed over axis 2 to get one label index per token.
    labels : sequence of str, optional
        Maps a label index to its tag. Defaults to the notebook-level
        ``tag_values`` list, which is what this function always used.

    Returns
    -------
    pandas.DataFrame
        Columns ``sentence_no`` (position in ``data``, as a string),
        ``labels`` and ``token``. "##" word pieces are glued onto the
        previous token and keep that token's label. The special tokens added
        by ``encode`` are still present in the output.

    Notes
    -----
    Sentences are encoded without truncation.
    NOTE(review): very long inputs may exceed the model's maximum length — confirm.
    """
    if labels is None:
        labels = tag_values

    # Run inference in eval mode so the predictions do not depend on whatever
    # mode the caller left the model in, then restore that mode.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()

    rows = []
    try:
        for sentence_no, sentence in enumerate(data):
            token_ids = tokenizer.encode(sentence.lower().strip())
            input_ids = torch.tensor([token_ids])

            with torch.no_grad():
                output = model(input_ids)
            label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)[0]

            # Join word pieces back into whole tokens.
            tokens = tokenizer.convert_ids_to_tokens(token_ids)
            merged_tokens, merged_labels = [], []
            for token, label_idx in zip(tokens, label_indices):
                if token.startswith("##") and merged_tokens:
                    merged_tokens[-1] = merged_tokens[-1] + token[2:]
                else:
                    merged_labels.append(labels[label_idx])
                    merged_tokens.append(token)

            rows.extend(
                (str(sentence_no), label, token)
                for token, label in zip(merged_tokens, merged_labels)
            )
    finally:
        if was_training:
            model.train()

    return pd.DataFrame(rows, columns=['sentence_no', 'labels', 'token'])

In [None]:
def prepare_model_output(test_df, df):
    """Strip BERT's special tokens and attach the CSV tags/words.

    ``test_df`` is modified in place and also returned. Rows whose ``token``
    is "[CLS]" or "[SEP]" are removed and the index is renumbered from 0.
    ``df['Tag']`` and ``df['Word']`` are then copied into the new columns
    ``label_asr`` and ``token_asr``; pandas matches them by index label.

    NOTE(review): the result is only meaningful when row k of the cleaned
    prediction frame and row k of ``df`` describe the same word — confirm
    that the tokenizer yields exactly one merged token per CSV row.
    """
    is_special = test_df['token'].isin(["[CLS]", "[SEP]"])
    test_df.drop(test_df.index[is_special], inplace=True)
    test_df.reset_index(drop=True, inplace=True)
    for new_column, source_column in (('label_asr', 'Tag'), ('token_asr', 'Word')):
        test_df[new_column] = df[source_column]
    return test_df

In [None]:
def statistics(test_df, tags):
    """Print overall and per-tag token-level scores.

    ``test_df['labels']`` holds the model's labels and ``test_df['label_asr']``
    the tags taken from the CSV; the CSV tags are treated as the reference.

    Printed, in order: accuracy, F1 (both from seqeval), a header line, then
    for every entry of ``tags``: true positives, false positives, false
    negatives, true negatives, precision, recall, F-measure and a separator.

    A row counts for ``tag`` when its label is exactly ``tag`` or ``tag`` with
    a "B-"/"I-" prefix. (A plain substring test would count "LOC" and "ORG"
    as "O".) Precision, recall and F-measure are reported as 0.0 when their
    denominator is zero instead of raising ZeroDivisionError.

    Returns None.
    """
    import re  # only needed here; kept local so the cell stands alone

    predicted = test_df['labels']
    gold = test_df['label_asr']

    # seqeval's signature is (y_true, y_pred).
    new_acc = accuracy_score(gold.values.tolist(), predicted.values.tolist())
    print(new_acc)

    new_f1 = f1_score(gold.values.tolist(), predicted.values.tolist())
    print(new_f1)
    print("---STATISTICS ON EACH LABEL---")
    for tag in tags:
        pattern = r'^(?:[BI]-)?' + re.escape(tag) + r'$'
        # astype(str) turns missing labels into "nan", which never matches.
        predicted_is_tag = predicted.astype(str).str.contains(pattern, regex=True)
        gold_is_tag = gold.astype(str).str.contains(pattern, regex=True)

        true_positive = int((predicted_is_tag & gold_is_tag).sum())
        print(true_positive)
        false_positive = int((predicted_is_tag & ~gold_is_tag).sum())
        print(false_positive)
        false_negative = int((~predicted_is_tag & gold_is_tag).sum())
        print(false_negative)
        true_negative = int((~predicted_is_tag & ~gold_is_tag).sum())
        print(true_negative)

        predicted_total = true_positive + false_positive
        gold_total = true_positive + false_negative
        prec = true_positive / predicted_total if predicted_total else 0.0
        print(prec)
        recall = true_positive / gold_total if gold_total else 0.0
        print(recall)
        f_measure = (2 * prec * recall) / (prec + recall) if (prec + recall) else 0.0
        print(f_measure)
        print("---------------------------------------")

In [None]:
# `df` is the token-level frame read from the CSV, `test_df` the sentence-level view of it.
df, test_df = prepare_data_for_test('unprocessed_sampled_asr.csv')

In [None]:
df.head()

In [None]:
test_df.tail()

In [None]:
# Rebinds `test_df`: from here on it is the token-level prediction frame returned by
# model_test (columns sentence_no / labels / token), which has no 'Sentence' column,
# so this cell cannot be re-run without re-running the load cell first.
test_df = model_test(test_df['Sentence'].values.tolist(), tokenizer, model)

In [None]:
# Removes the [CLS]/[SEP] rows and copies df['Tag'] / df['Word'] next to the predictions.
# NOTE(review): the two frames are matched row by row; confirm they have the same length.
test_df = prepare_model_output(test_df, df)

In [None]:
test_df.tail()

In [None]:
test_df['label_asr'].unique()

In [None]:
# One row per sentence: the list of model labels and the list of CSV tags.
# The group keys are the string sentence numbers written by model_test.
g_test = test_df.groupby("sentence_no")
test = pd.DataFrame({"model_tag": g_test.apply(lambda sdf: sdf.labels.values.tolist()),
                       "asr_tag": g_test.apply(lambda sdf: sdf.label_asr.values.tolist())})

In [None]:
# The sentence numbers are strings, so the grouped index is in string order
# ("10" sorts before "2"); convert to numbers and sort to get sentence order back.
test['asr_sentence_no'] = test.index
test[["asr_sentence_no"]] = test[["asr_sentence_no"]].apply(pd.to_numeric)
test.sort_values('asr_sentence_no', inplace=True)
test.reset_index(drop=True, inplace=True)

In [None]:
test.head()

In [None]:
# Sentence-level seqeval scores. The model's labels are passed first, the CSV tags second.
# NOTE(review): seqeval documents the argument order as (y_true, y_pred) — confirm
# which side is meant to be the reference here.
print("Accuracy: " , accuracy_score(test['model_tag'].values.tolist(), test['asr_tag'].values.tolist()))
print("F1 Score: ",f1_score(test['model_tag'].values.tolist(), test['asr_tag'].values.tolist()))
#statistics(test_df, ['PER', 'ORG', 'LOC', 'O'])
#0.7758389261744967 without punctuation
#0.676056338028169 with punctuation 1

In [None]:
def prepare_data_for_analysis(test_df, original_data_path, max_rows=7851):
    """Build sentence-level frames for comparing original text with tagged ASR text.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Token-level prediction frame with the columns ``sentence_no``,
        ``token`` and ``labels``.
    original_data_path : str
        CSV with at least the columns "Sentence #", "Word" and "Tag".
    max_rows : int or None, default 7851
        Only the first ``max_rows`` token rows of the original CSV are used
        (the value previously hard-coded here). ``None`` keeps every row.

    Returns
    -------
    (asr_df, combined_df)
        ``asr_df`` has one row per ``sentence_no`` in numeric order with the
        columns ``Sentence`` (tokens joined by spaces), ``Tag`` (labels joined
        by commas) and ``asr_sentence_no``. ``combined_df`` puts the
        lower-cased original sentence and its tags next to the ASR sentence
        and its tags; the two sides are matched by row position.
    """
    g_asr = test_df.groupby("sentence_no")
    asr_df = pd.DataFrame({
        'Sentence': g_asr['token'].apply(lambda tokens: " ".join(map(str, tokens))),
        'Tag': g_asr['labels'].apply(lambda labels: ",".join(labels)),
    })
    # sentence_no may be stored as strings, which group in string order
    # ("10" before "2"); sort on the numeric value to restore sentence order.
    asr_df['asr_sentence_no'] = asr_df.index
    asr_df['asr_sentence_no'] = pd.to_numeric(asr_df['asr_sentence_no'])
    asr_df = asr_df.sort_values('asr_sentence_no').reset_index(drop=True)

    original = pd.read_csv(original_data_path)
    # 'Unnamed: 0' is presumably a saved index column; tolerate files without it.
    original = original.drop(columns=['Unnamed: 0'], errors='ignore')
    if max_rows is not None:
        original = original[:max_rows]
    g_original = original.groupby("Sentence #")
    original_df = pd.DataFrame({
        'Sentence': g_original['Word'].apply(lambda words: " ".join(map(str, words))),
        'Tag': g_original['Tag'].apply(lambda tags: ",".join(tags)),
    })
    original_df.reset_index(inplace=True)

    combined_df = pd.DataFrame({"original_sentence": original_df['Sentence'].str.lower(),
                                "original_tags": original_df['Tag'],
                                "asr_sentence": asr_df['Sentence'],
                                "asr_tags": asr_df['Tag']})
    return asr_df, combined_df

In [None]:
def pattern_finding(tag, combined_df):
    """Match the original tokens tagged ``tag`` against the ASR tokens, sentence by sentence.

    Rows are fetched with ``combined_df.loc[[i]]`` for i = 0 .. len-1, so the
    frame's index must consist of exactly those integer labels. Tags are split
    on "," and the (lower-cased) sentences on whitespace.
    NOTE(review): indexing the token arrays with tag positions assumes every
    sentence has as many whitespace tokens as tags — confirm.

    Sentences whose original tags do not contain ``tag`` produce no record.
    Otherwise one tuple is appended:

        (i, original_tokens, asr_tokens, scores, mean of scores, flag)

    * flag True  — ``tag`` also occurs in the ASR tags. Each original entity
      token is matched (difflib.get_close_matches, library defaults) against
      the ASR tokens tagged ``tag``; a match is kept only if its score is
      at least 50, otherwise "None" / 0.0 is recorded.
    * flag False — ``tag`` does not occur in the ASR tags. Candidates are all
      ASR tokens scoring at least 50 against any original entity token; each
      original entity is then matched against those candidates and the score
      recorded without a further threshold. If there are no candidates the
      record is (i, original entity tokens, ["None"], [0.0], 0.0, False).

    The score is ``(1 - levenshtein / longer length) * 100``: despite the
    variable names ``error`` / ``local_error`` it is a similarity, with 100
    meaning identical strings.
    """
#tag = "PER"
    analysis = []
    for i in range(0, len(combined_df), 1):
        # A one-row frame, so the zip below yields exactly one tuple.
        sample = combined_df.loc[[i]]
        for original_sentence, asr_sentence, original_tag, asr_tag in zip(sample['original_sentence'].values.tolist(),
                                                                          sample['asr_sentence'].values.tolist(),
                                                                          sample['original_tags'].values.tolist(),
                                                                          sample['asr_tags'].values.tolist()):
            original_tag_token = np.array(original_tag.split(","))
            asr_tag_token = np.array(asr_tag.split(","))
            original_label = np.array(original_sentence.lower().split())
            asr_label = np.array(asr_sentence.lower().split())

            if tag in original_tag_token:
                # Positions of the original tokens carrying `tag`.
                original_tag_ind = [index for index, element in enumerate(original_tag_token) if
                                    original_tag_token[index] == tag]
                if tag in asr_tag_token:
                    # Positions of the ASR tokens the model labelled `tag`.
                    asr_tag_ind = [index for index, element in enumerate(asr_tag_token) if
                                       asr_tag_token[index] == tag]
                    
                    asr_tokens = []
                    original_tokens = []
                    errors = []
                        # Sweynheim pannartz
                        # Swain heim pannartz
                    for ind in original_tag_ind:
                        original_entity = original_label[ind]
                        # Only ASR tokens that were tagged `tag` are candidates here.
                        asr_entity = difflib.get_close_matches(original_entity, asr_label[asr_tag_ind])
                        if len(asr_entity) > 0:
                            # First entry is the closest match.
                            asr_entity = asr_entity[0]
                            error = (1 - (Levenshtein.distance(original_entity, asr_entity) / max(len(original_entity), len(asr_entity)))) * 100
                            if error >= 50:
                                asr_tokens.append(asr_entity)
                                original_tokens.append(original_entity)
                                errors.append(error)
                            else:
                                # Too dissimilar: record the entity as unmatched.
                                asr_tokens.append("None")
                                original_tokens.append(original_entity)
                                errors.append(0.0)
                        else:
                            asr_tokens.append("None")
                            original_tokens.append(original_entity)
                            errors.append(0.0)
                    analysis.append((i, original_tokens, asr_tokens, errors, np.mean(errors), True))
                else:
                    # The model found no `tag` in this sentence: look for ASR tokens
                    # that merely resemble one of the original entity tokens.
                    check = []
                    o_label = original_label[original_tag_ind]
                    for lab in o_label:
                        j = 0
                        for asr_lab in asr_label:
                            local_error = (1 - (Levenshtein.distance(lab, asr_lab) / max(len(lab), len(asr_lab)))) * 100
                            if local_error >= 50.0:
                                # j is the position of the resembling ASR token; a position
                                # can be appended more than once (once per original entity).
                                check.append(j)
                            j = j + 1
                    if len(check) > 0:
                        asr_tokens = []
                        original_tokens = []
                        errors = []
                        for ind in original_tag_ind:
                            original_entity = original_label[ind]
                            asr_entity = difflib.get_close_matches(original_entity, asr_label[check])
                            if len(asr_entity) > 0:
                                asr_entity = asr_entity[0]
                                error = (1 - (Levenshtein.distance(original_entity, asr_entity) / max(
                                len(original_entity), len(asr_entity)))) * 100
                                # No >= 50 threshold in this branch: the score is kept as is.
                                asr_tokens.append(asr_entity)
                                original_tokens.append(original_entity)
                                errors.append(error)
                            else:
                                asr_tokens.append("None")
                                original_tokens.append(original_entity)
                                errors.append(0.0)
                        analysis.append((i, original_tokens, asr_tokens, errors, np.mean(errors), False))
                    else:
                        # Nothing in the ASR sentence resembles the entity. Note that the
                        # second field is a numpy array here, not a list as in the other records.
                        analysis.append((i, original_label[original_tag_ind], ["None"], [0.0], 0.0, False))
    return analysis

In [None]:
# `test_df` is the aligned token-level prediction frame at this point.
asr_df, combined_df = prepare_data_for_analysis(test_df, 'unprocessed_sampled_original.csv')

In [None]:
# Repeats the second half of prepare_data_for_analysis by hand and overwrites the
# `combined_df` it returned. The only difference: original_sentence is NOT
# lower-cased here.
# NOTE(review): looks like a leftover copy; confirm the cased sentences are wanted.
original = pd.read_csv('unprocessed_sampled_original.csv')
original.drop(['Unnamed: 0'], axis=1, inplace=True)
original = original[:7851]
g_original = original.groupby("Sentence #")
original_df = pd.DataFrame({'Sentence': g_original.apply(lambda sdf: " ".join(map(str,sdf.Word))),
                      'Tag': g_original.apply(lambda sdf: ",".join(sdf.Tag))})
original_df.reset_index(inplace=True)
combined_df = pd.DataFrame({"original_sentence": original_df['Sentence'],
                           "original_tags": original_df['Tag'], 
                           "asr_sentence": asr_df['Sentence'],
                           "asr_tags": asr_df['Tag']})

In [None]:
# One row per sentence whose original tags contain "PER". The column names use the
# spelling 'Lavenstein'; the filter cells below rely on exactly that spelling.
analysis_df = pd.DataFrame(pattern_finding("PER", combined_df), columns=['Sample #', 'Original', 'ASR', 'Lavenstein', 'Lavenstein Mean', 'Flag'])

In [None]:
analysis_df.head(10)

In [None]:
len(analysis_df)

In [None]:
len(combined_df)

In [None]:
# Sanity check for the `== 100.0` filter used on 'Lavenstein Mean' below:
# the mean of scores that are all exactly 100.0 compares equal to 100.0.
np.mean([100.0,100.0]) == 100.0

In [None]:
# Bucket 1: the ASR tags contain the entity type (Flag True) and the mean score is exactly 100.
# The percentage divides by len(analysis_df) and therefore fails on an empty analysis.
orig_asr_found_complete = analysis_df[(analysis_df['Flag'] == True) & (analysis_df['Lavenstein Mean'] == 100.0)]
orig_asr_found_complete_per = (len(orig_asr_found_complete) / len(analysis_df)) * 100
print(orig_asr_found_complete_per)
# Not the last expression of the cell, so this head() displays nothing.
orig_asr_found_complete.head()
print(len(orig_asr_found_complete))

In [None]:
orig_asr_found_complete.head()

In [None]:
# Bucket 2: Flag True but the mean score is below 100 (at least one entity missing or misspelled).
orig_asr_found = analysis_df[(analysis_df['Flag'] == True) & (analysis_df['Lavenstein Mean'] < 100.0) & (analysis_df['Lavenstein Mean'] >= 0.0)]
orig_asr_found_per = (len(orig_asr_found) / len(analysis_df)) * 100
print(orig_asr_found_per)
print(len(orig_asr_found))
orig_asr_found.head()
#40.88050314465409
#65

In [None]:
# Bucket 3: the ASR tags lack the entity type (Flag False) but some ASR token
# resembles an original entity token (mean score above 0).
orig_asr_similar = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 100.0) & (analysis_df['Lavenstein Mean'] > 0.0)]
orig_asr_similar_per = (len(orig_asr_similar) / len(analysis_df)) * 100
print(orig_asr_similar_per)
# Not the last expression of the cell, so this head() displays nothing.
orig_asr_similar.head()
print(len(orig_asr_similar))

In [None]:
orig_asr_similar.head()

In [None]:
# Bucket 4: Flag False and a mean score of 0 — nothing in the ASR sentence matched.
orig_asr_nofound = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 0.0)]
orig_asr_nofound_per = (len(orig_asr_nofound) / len(analysis_df))*100
print(orig_asr_nofound_per)
# Not the last expression of the cell, so this head() displays nothing.
orig_asr_nofound.head()
print(len(orig_asr_nofound))

In [None]:
orig_asr_nofound.head()

In [None]:
[orig_asr_found_complete_per, orig_asr_found_per, orig_asr_similar_per, orig_asr_nofound_per]

In [None]:
[len(orig_asr_found_complete), len(orig_asr_found), len(orig_asr_similar), len(orig_asr_nofound)]

In [None]:
test = ["the position of our society that don ' t work of utility might be also a work of art , if we care to make it so .",
 "full details of the arrangements are to be found in mr . neil ' s ' state of prisons in england , scotland , and wales , ' published in 1812 .",
 'alfred the great established the court baron , the hundred court , and the county court , which among other matters entertain please for debt .',
 "lake county court was the sheriff ' s , who said they ' re surrounded by the bishop and the magnets of the county",
 'so much inconvenience ensued , that in 1518 the corporation obtained from parliament and act empowering to alderman',
 'four common councilman to hold courts of requests , or courts of conscience , to hear and determine all causes of death',
 'mr . buxton , in his " inquiry into the system of prison discipline , "',
 'the fleet , and the marshalsea prison especially devoted to them ,',
 'whilst ludgate , the gilts 1st street , and borrow comforters also received them',
 'the sale of spirits was forbidden , but june could always be had at the whistling shops , where it was known as moonshine , sky blue ,',
 'the fleet , which stood in farrington street ,',
 'the warden of the fleet at the commencement of the 18th century , are too well known to need more than a passing reference .',
 'and came under the strong animated version of the jail committee of 1729 .',
 'the lord steward of the household , the stewart and officers of the marshalsea court , and others .',
 'comforters of ludgate , giltspur street , and the borough where discontinued as debtors \' prisons ( as was newgate also )',
 "clergyman , proctor ' s , attorneys , and persons specially selected by the corporation .",
 'at one time the ludgate debtors , accompanied by the keeper ,',
 "spruce street compton received sheriff ' s debtors , also felons , vagrants , and knight charges .",
 'it was generally crowded , as debtors who would have gone to the poultry copter we sent to giltspur street when the former was condemned as unfit to receive prisoners .',
 "the borough compter was in a disgraceful state to the last . the men ' s ward had an earth , or rather a mud , floor ,",
 "notably as when numbers filled new gate in anticipation of lord reds dale ' s bill for insolvent debtors ,",
 'is gradually was forced upon the consciousness of the corporation ,',
 'bypass now to the criminal side of newgate , which consisted of the six quarters or yards already enumerated and describe .',
 'court of aldermen appointed a committee of its own body , assisted by the town clock , mr . , city surveyor , sun to the architect ,',
 "send mr . addison , keeper of new gate , to make a visitation of the jail ' s supposed to be the best managed , including those of petworth and gloucester .",
 'the committee did not deny the superior advantages offered by such prisons as gloucester and petworth ,',
 'the committee does not seem to have yet understood that new gate could be only and properly replaced',
 'buy a new jail built on the outskirts , as holloway eventually was , and committed itself to be altogether counter',
 "i ' m checked in its efforts towards reform by the prohibitory costliness of the land about nougat .",
 'why not relieving you gate more largely upon the superior accommodation which build bank offer ?',
 'chronicles of new gate , volume 2 . by arthur griffith . section 7 : the beginnings of prison reform .',
 'i have shown in a previous chapter what new gate was at this , despite a vast expenditure and boasted efforts to introduce reforms .',
 'one of the moving spirits was the honorable h . g . bennett , auntie , whose vigorous protests against the lamentable condition of newgate have already been recorded .',
 'the chronicles of nougat , volume 2 . by arthur griffith . section 8 : the beginnings of prison reform .',
 'newgate prisoners were the victims to another most objectionable practice which obtained all over london .',
 'an imputation which the society indignantly and very justly repudiated , the statement being , as they said ,',
 'among those from the society found a raid against it was sidney smith ,',
 'admitting the good intentions of the society , he condemned there ultra humanitarianism as misplaced .',
 'he took exception to various of the proposals of the society . he thought they linked too much to a system of indulgences and education in jails .',
 "society pursuit it ' s laudable undertaking with a remarkable energy and great singleness of purpose .",
 'another point to which the society devoted infinite was the preparation of plans for the guidance of architects in the construction of prisons .',
 'a very valuable volume published by the society',
 'was introduced as early as 1790 by mr . blackburn',
 'the society did not limit its remarks to the description of what had already been done',
 'the prison society reproves the misdirected efforts of ambitious architect , to buy a lavish an improvident expenditure of public money',
 "these are principles fully recognized now - a - days , and it may fairly be conceded that the prison discipline society ' s ideal",
 'after a few years of active exertion the society was rewarded by fresh legislation .',
 'to its efforts , and their effect upon parliament and the public mind , we must attribute the new jail acts of for george the 4th',
 'the promulgation of these to jail acts strengthen the hands of the prison discipline society enormously .',
 'the society did not shrink from its self - imposed duty , but continued year after year , with unflagging energy and unflinching spirit , to watch closely',
 'upon these and the private visitations made by various members of the society obtained effects ,',
 'four years later the prison society reported',
 'i just chillin by the report of the commissioners to inquire into the state of the municipal corporations in 1835 .',
 'kidderminster had a prison one dance chill room ,',
 'in 1827 the society was compelled to report that " no material change has taken place in newgate since the passing of the prison laws ,',
 'the prison society did not relax its efforts as time passed , but its leading members had other and more pressing claims upon their energies .',
 'this committee anniversary strongly upon the system in force at the metropolitan jails , and more especially upon the condition of nougat',
 'mister . samuel hoare was examined by this committee',
 'i stated that in his opinion new gate , as the common jail of middlesex , was wholly inadequate to the proper confinement of its prisoners .',
 'the committee was appointed , under the presidency of the duke of richmond',
 'the whole question was again dealt with in lord john russell \' s bill for the reform of the municipal corporations , and with a more liberal election of town councillors ,',
 'the chronicles of nougat , volume 2 . by arthur griffiths . section 9 : the first report of the inspector of prisons .',
 'newgate has remained rather in the background while the whole of the jails as a body wear under discussion .',
 'exchequer , the commissioners of bankruptcy and of taxes smugglers , and a larger number sentence for very short terms ,']

In [None]:
# Original sentences of the "similar" bucket; 'Sample #' holds combined_df row labels.
combined_df.loc[orig_asr_similar['Sample #'].values.tolist()]['original_sentence'].values.tolist()

In [None]:
loc_test = ['printed very few books in this type , 3 only but in their very first book syndrome beginning , with the year 1468 ,',
 'the chronicles of nougat , volume to arthur griffiths . section for : new gate down to 1818 .',
 'seldom let a session go by without visiting you gate .',
 'returns laid before the house of commons showed that 6439 persons have been committed to nougat',
 'the number of arrests actually made was 114 , 300 for the kingdom , and 7024 middlesex .',
 'there was in the city road a temporary bar , with a collector of tolls who was sometimes on the spot and sometimes not .',
 'before dealing with the debtors in newgate , prefer incidentally',
 'the best , or at least the most influential prisoners , god lodging in the statehouse , which contained " eight large handsome rooms . "',
 'in consequence of these disclosures , bambridge and hugging , his predecessor in the office , or committed to newgate ,',
 'senators were rather better at the marshalsea .',
 'is bequest , which was charged upon his manner at goering , auxins , and hence called the oxford charity ,',
 'supreme control of the marshalsea was vested in the marshal of the royal household but although he drew a salary of 500 pounds a year ,',
 'neeld found the prisoners in the boro compter ragged , starving , and dirty .',
 'the chronicles of nougat , volume 2 . by arthur griffiths . section v : newgate down to 18 18 , part 2 .',
 'notes for street , and the poultry , or about 476 and all .',
 'mr . davidson , sent to newgate for embezzlement , and whose case is given in the preceding chapter ,',
 'will mike mr . bennett right that the condition of the condemned side was the most prominent of the many - fold evils in the prison system of nougat ,',
 'it was not strange , therefore , that the inmates of nougat should turn their unoccupied brains and idle hands to all manner of mischief',
 'it was very desirable that there should be a more speedy removal of transports from you gate to the ships .',
 "mr . green , with stir - fry , mizer ' s . forester , and mr . t . f . buxton , the coadjutor of wilberforce in the great anti - slavery struggle .",
 'specify more particularly one or two of the worst , it may be mentioned that in the boro comforter',
 'ilchester the rule of employment have been carried further .',
 'the system not adopted generally till nearly half a century later had already prevail that bill chester .',
 'godmanchester there was no jail , but cage to secure prisoners till they could be taken before a magistrate .',
 'i shall have more to say on this subject , and upon the state of nougat generally , in the following chapter .',
 'this committee anniversary strongly upon the system in force at the metropolitan jails , and more especially upon the condition of nougat',
 'the committee was appointed , under the presidency of the duke of richmond',
 'he blamed the construction of new gate for the neglect of classification , and was yet compelled to confess that he had made no attempt whatever to carry it out .']

In [None]:
per_test = ["especially as regards to lower - case letters and type very similar was used during the next 15 or 20 years not only by chauffeur ,",
                "about the same year mental in at strasburg began to print in a type which is distinctly roman",
                "and though the famous family of aldis restored its technical excellence , rejecting battered letters ,",
                "most of caxton ' s zone types of an earlier character ,",
                "are the leaders in this luckless change , though our own baskerville , who was at work some years before them , went much on the same lines",
                "now come into general use that are obviously a great improvement on the ordinary \" modern style \" and use in england , which is in fact the bodony type",
                "on the top of the jail , continues neeld , ara - watch house and a century - box where two or more guards , with dogs and firearms ,",
                "these courts were extended to centuries later to several large provincial towns , and all were in full activity when nield road ,",
                "he had been in the employ of a corn - chandler at islington , and went into london with his master ' s cart and horse .",
                "shameful malpractices of bambridge ,",
                "the lord steward of the household , the stewart and officers of the marshalsea court , and others .",
                "if they happened to be in funds - - among whom was the marquis of slego in 1811 .",
                "mister . kneeled , a second howard ,",
                "which became the four george the 4th . tap . 64 , said that he had abstained from legislating for these small jurisdictions \" on mature deliberation . \"",
                "nothing was more prominently brought out by the inspectors and the inefficiency of the governor at that time , mister . co ."]


In [None]:
#!/usr/bin/python3
# pyplot is already imported in the notebook's import cell; this repeats it.
import matplotlib.pyplot as plt

# Share of each outcome bucket, in percent of the rows of analysis_df.
data = [orig_asr_found_complete_per, orig_asr_found_per, orig_asr_similar_per, orig_asr_nofound_per]
plt.bar(['Correctly Identified', 'Identified with missing entities', 'Similar tag but not identified', 'No Tag identification'], data)
plt.xticks(rotation=45)
plt.show()

In [None]:
def pattern_analysis(sample_df, combined_df):
    """Put the matched entity lists next to the sentences they came from.

    The rows of ``combined_df`` named by ``sample_df['Sample #']`` are
    selected (in that order). ``sample_df['Original']`` is inserted at column
    position 2 and ``sample_df['ASR']`` at position 5, then the columns
    ``original_tags`` and ``asr_tags`` are dropped. ``combined_df`` itself is
    left unchanged.
    """
    row_labels = np.array(sample_df['Sample #'].values.tolist())
    selected = combined_df.loc[row_labels]
    selected.insert(2, 'Original', sample_df['Original'].values.tolist())
    selected.insert(5, 'ASR', sample_df['ASR'].values.tolist())
    return selected.drop(['original_tags', 'asr_tags'], axis=1)

In [None]:
# Sentences of the "similar" bucket together with their matched entity lists.
error_pattern = pattern_analysis(orig_asr_similar, combined_df)

In [None]:
len(error_pattern)

In [None]:
error_pattern.head(10)

In [None]:
def error_sampling(df):
    """Split ``df`` by whether 'Original' and 'ASR' hold equally many items.

    Returns ``(equal_length_df, variable_length_df)``. Each part contains the
    matching rows of ``df``, selected by index label and ordered by ascending
    label.
    """
    same_length, different_length = [], []
    for label, (original_items, asr_items) in zip(df.index, zip(df['Original'], df['ASR'])):
        bucket = same_length if len(original_items) == len(asr_items) else different_length
        bucket.append(label)
    return df.loc[sorted(same_length)], df.loc[sorted(different_length)]

In [None]:
# Split by whether the 'Original' and 'ASR' entity lists have the same number of items.
equal_length_words_samples_df, variable_length_words_samples_df = error_sampling(error_pattern)

In [None]:
len(equal_length_words_samples_df)

In [None]:
equal_length_words_samples_df.head()

In [None]:
len(variable_length_words_samples_df)

In [None]:
variable_length_words_samples_df.head(10)

In [None]:
def equal_words_simulation(sampled_df):
    """Rewrite each ASR sentence with the original spelling of its matched entities.

    For every row, the pairs ``zip(row['Original'], row['ASR'])`` are applied
    in order: every space-separated token of ``asr_sentence`` that equals the
    ASR entity is replaced by the original entity. Only whole tokens are
    replaced, so correcting "co" does not touch "county". Pairs whose ASR
    side is the placeholder "None" (no match was found) are skipped.

    Returns a DataFrame with column 0 = the row's index label and column 1 =
    the rewritten sentence. Both columns exist even when ``sampled_df`` is
    empty.
    """
    simulated_asr = []
    for sample, asr_sentence, original, asr in zip(sampled_df.index,
                                                   sampled_df['asr_sentence'],
                                                   sampled_df['Original'],
                                                   sampled_df['ASR']):
        # split(" ") / " ".join keeps the sentence's spacing unchanged.
        tokens = asr_sentence.split(" ")
        for original_word, asr_word in zip(original, asr):
            if asr_word == "None":
                continue
            tokens = [original_word if token == asr_word else token for token in tokens]
        simulated_asr.append((sample, " ".join(tokens)))
    return pd.DataFrame(simulated_asr, columns=[0, 1])

In [None]:
def variable_words_simulation(df):
    """Rewrite ASR sentences whose entity lists differ in length from the original's.

    For every row, each original entity is compared with each ASR entity. When
    their score ``(1 - levenshtein / longer length) * 100`` is at least 50,
    every space-separated token of ``asr_sentence`` equal to the ASR entity is
    replaced by the original entity. Only whole tokens are replaced, so a
    short ASR entity cannot rewrite part of an unrelated word. The
    placeholder "None" (no match was found) is skipped.

    Returns a DataFrame with column 0 = the row's index label and column 1 =
    the rewritten sentence.
    """
    check = []
    for sample, asr_sentence, original_tag, asr_tag in zip(
            df.index,
            df['asr_sentence'].values.tolist(),
            df['Original'].values.tolist(),
            df['ASR'].values.tolist()):

        # split(" ") / " ".join keeps the sentence's spacing unchanged.
        tokens = asr_sentence.split(" ")
        for lab in original_tag:
            for asr_lab in asr_tag:
                if asr_lab == "None":
                    continue
                local_error = (1 - (Levenshtein.distance(lab, asr_lab) / max(len(lab), len(asr_lab)))) * 100
                if local_error >= 50.0:
                    tokens = [lab if token == asr_lab else token for token in tokens]
        check.append((sample, " ".join(tokens)))
    new_asr = pd.DataFrame(check, columns=[0, 1])
    return new_asr

In [None]:
def update_df(asr_df, simulated_df):
    """Write simulated sentences back into ``asr_df`` (modified in place).

    Column 0 of ``simulated_df`` holds the ``asr_df`` row labels and column 1
    the new value for 'Sentence' in those rows. Returns ``asr_df``.
    """
    row_labels = simulated_df[0].values.tolist()
    new_sentences = simulated_df[1].values.tolist()
    asr_df.loc[row_labels, 'Sentence'] = new_sentences
    return asr_df

In [None]:
# Column 0: asr_df row label, column 1: ASR sentence with entities replaced by the originals.
simulated_asr_df = equal_words_simulation(equal_length_words_samples_df)

In [None]:
simulated_asr_df.head()

In [None]:
len(simulated_asr_df)

In [None]:
# Writes the hard-coded `test` sentence list into the simulated rows. The assignment
# needs `test` to have one entry per row of simulated_asr_df, and `test` must still be
# that list (other cells rebind `test` to a DataFrame).
# NOTE(review): the update_df call two cells below writes the same rows again, so
# this override looks superseded — confirm whether it is still wanted.
asr_df.loc[simulated_asr_df[0].values.tolist(), 'Sentence'] = test

In [None]:
asr_df.head(20)

In [None]:
# Overwrites 'Sentence' in the simulated rows; asr_df is modified in place.
asr_df = update_df(asr_df, simulated_asr_df)

In [None]:
asr_df.head(50)

In [None]:
asr_df.isna().sum()

In [None]:
#simulated_asr_df = variable_words_simulation(variable_length_words_samples_df)
#simulated_asr_df.head()

In [None]:
#asr_df = update_df(asr_df, simulated_asr_df)

In [None]:
# Re-run the tagger on the corrected sentences; rebinds `test_df` to the new predictions.
test_df = model_test(asr_df['Sentence'].values.tolist(), tokenizer, model)

In [None]:
# The new predictions are matched row by row against the unchanged CSV frame `df`.
# NOTE(review): the corrected sentences may tokenize into a different number of
# tokens than before, which would shift this alignment — confirm.
#test_df = prepare_model_output(test_df, new_df)
test_df = prepare_model_output(test_df, df)

In [None]:
test_df.tail()

In [None]:
# Same per-sentence grouping and numeric re-sorting as after the first tagging run.
# Rebinds `test`, which until here held the hard-coded sentence list.
g_test = test_df.groupby("sentence_no")
test = pd.DataFrame({"model_tag": g_test.apply(lambda sdf: sdf.labels.values.tolist()),
                       "asr_tag": g_test.apply(lambda sdf: sdf.label_asr.values.tolist())})
test['asr_sentence_no'] = test.index
test[["asr_sentence_no"]] = test[["asr_sentence_no"]].apply(pd.to_numeric)
test.sort_values('asr_sentence_no', inplace=True)
test.reset_index(drop=True, inplace=True)

#statistics(test_df, ['PER', 'ORG', 'LOC', 'O'])
#0.7758389261744967 without punctuation
#0.676056338028169 with punctuation 1

In [None]:
test.tail()

In [None]:
print("Accuracy: " , accuracy_score(test['model_tag'].values.tolist(), test['asr_tag'].values.tolist()))
print("F1 Score: ",f1_score(test['model_tag'].values.tolist(), test['asr_tag'].values.tolist()))

In [None]:
# NOTE(review): this rebinds asr_df, which the cells above updated in place.
asr_df, combined_df = prepare_data_for_analysis(test_df, 'unprocessed_sampled_original.csv')

In [None]:
# Tabulate what pattern_finding returns for the "ORG" tag under the six column names below.
analysis_df = pd.DataFrame(pattern_finding("ORG", combined_df), columns=['Sample #', 'Original', 'ASR', 'Lavenstein','Lavenstein Mean', 'Flag'])

In [None]:
analysis_df.head(10)

In [None]:
len(analysis_df)

In [None]:
len(combined_df)

In [None]:
# Bucket 1: Flag is True and 'Lavenstein Mean' is exactly 100.
# The *_per value is the bucket's share of all analysis_df rows, in percent.
orig_asr_found_complete = analysis_df[(analysis_df['Flag'] == True) & (analysis_df['Lavenstein Mean'] == 100.0)]
orig_asr_found_complete_per = (len(orig_asr_found_complete) / len(analysis_df)) * 100
print(orig_asr_found_complete_per)
# Not the last expression of the cell, so this head() result is not displayed.
orig_asr_found_complete.head()
print(len(orig_asr_found_complete))

In [None]:
# Bucket 2: Flag is True and 0 <= 'Lavenstein Mean' < 100.
orig_asr_found = analysis_df[(analysis_df['Flag'] == True) & (analysis_df['Lavenstein Mean'] < 100.0) & (analysis_df['Lavenstein Mean'] >= 0.0)]
orig_asr_found_per = (len(orig_asr_found) / len(analysis_df)) * 100
print(orig_asr_found_per)
print(len(orig_asr_found))
orig_asr_found.head()
#40.88050314465409
#65

In [None]:
# Bucket 3: Flag is False and 0 < 'Lavenstein Mean' <= 100.
orig_asr_similar = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 100.0) & (analysis_df['Lavenstein Mean'] > 0.0)]
orig_asr_similar_per = (len(orig_asr_similar) / len(analysis_df)) * 100
print(orig_asr_similar_per)
# Not the last expression of the cell, so this head() result is not displayed.
orig_asr_similar.head()
print(len(orig_asr_similar))

In [None]:
orig_asr_similar.head()

In [None]:
len(orig_asr_similar)

In [None]:
# Bucket 4: Flag is False and 'Lavenstein Mean' <= 0.
orig_asr_nofound = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 0.0)]
orig_asr_nofound_per = (len(orig_asr_nofound) / len(analysis_df))*100
print(orig_asr_nofound_per)
# Not the last expression of the cell, so this head() result is not displayed.
orig_asr_nofound.head()
print(len(orig_asr_nofound))

In [None]:
# Percentages of the four buckets, in the order they were computed above.
[orig_asr_found_complete_per, orig_asr_found_per, orig_asr_similar_per, orig_asr_nofound_per]

In [None]:
# Row counts of the four buckets, in the same order.
[len(orig_asr_found_complete), len(orig_asr_found), len(orig_asr_similar), len(orig_asr_nofound)]

In [None]:
# Bar chart of the four outcome-bucket percentages computed in the cells above.
# `plt` comes from the notebook's import cell at the top, so no per-cell import is needed.
data = [orig_asr_found_complete_per, orig_asr_found_per, orig_asr_similar_per, orig_asr_nofound_per]
plt.bar(['Correctly Identified', 'Identified with missing entities', 'Similar tag but not identified', 'No Tag identification'], data)
# Right-align the rotated labels so each one ends under its own bar.
plt.xticks(rotation=45, ha='right')
plt.ylabel('Share of analysed samples (%)')
plt.title('Entity identification outcome')
# Keep the long, rotated tick labels inside the figure area.
plt.tight_layout()
plt.show()

In [None]:
# First 14 rows of the bucket with Flag == False and a positive 'Lavenstein Mean'.
orig_asr_similar.head(14)

In [None]:
# Load the word-level reference annotations and drop the CSV's unnamed index column.
original = pd.read_csv('unprocessed_sampled_original.csv').drop(columns=['Unnamed: 0'])
# NOTE(review): 7851 is a hard-coded row cut-off — confirm what it corresponds to.
original = original.iloc[:7851]

# One row per "Sentence #": words joined by spaces, tags joined by commas.
g_original = original.groupby("Sentence #")
original_df = pd.DataFrame({
    'Sentence': g_original.apply(lambda sdf: " ".join(str(word) for word in sdf.Word)),
    'Tag': g_original.apply(lambda sdf: ",".join(sdf.Tag)),
}).reset_index()

# Put reference and ASR sentences/tags side by side; columns are aligned on the row index.
combined_df = pd.DataFrame({
    "original_sentence": original_df['Sentence'],
    "original_tags": original_df['Tag'],
    "asr_sentence": asr_df['Sentence'],
    "asr_tags": asr_df['Tag'],
})

In [None]:
# Same analysis for the "PER" tag. This rebinds analysis_df and, in the cells below, every
# orig_asr_* bucket variable, so the previous values of those names are lost.
analysis_df = pd.DataFrame(pattern_finding("PER", combined_df), columns=['Sample #', 'Original', 'ASR', 'Lavenstein','Lavenstein Mean', 'Flag'])

In [None]:
analysis_df.head(10)

In [None]:
len(analysis_df)

In [None]:
len(combined_df)

In [None]:
# Bucket 1: Flag is True and 'Lavenstein Mean' is exactly 100.
# The *_per value is the bucket's share of all analysis_df rows, in percent.
orig_asr_found_complete = analysis_df[(analysis_df['Flag'] == True) & (analysis_df['Lavenstein Mean'] == 100.0)]
orig_asr_found_complete_per = (len(orig_asr_found_complete) / len(analysis_df)) * 100
print(orig_asr_found_complete_per)
# Not the last expression of the cell, so this head() result is not displayed.
orig_asr_found_complete.head()
print(len(orig_asr_found_complete))

In [None]:
# Bucket 2: Flag is True and 0 <= 'Lavenstein Mean' < 100.
orig_asr_found = analysis_df[(analysis_df['Flag'] == True) & (analysis_df['Lavenstein Mean'] < 100.0) & (analysis_df['Lavenstein Mean'] >= 0.0)]
orig_asr_found_per = (len(orig_asr_found) / len(analysis_df)) * 100
print(orig_asr_found_per)
print(len(orig_asr_found))
orig_asr_found.head()
#40.88050314465409
#65

In [None]:
# Bucket 3: Flag is False and 0 < 'Lavenstein Mean' <= 100.
orig_asr_similar = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 100.0) & (analysis_df['Lavenstein Mean'] > 0.0)]
orig_asr_similar_per = (len(orig_asr_similar) / len(analysis_df)) * 100
print(orig_asr_similar_per)
# Not the last expression of the cell, so this head() result is not displayed.
orig_asr_similar.head()
print(len(orig_asr_similar))

In [None]:
orig_asr_similar.head()

In [None]:
len(orig_asr_similar)

In [None]:
# Bucket 4: Flag is False and 'Lavenstein Mean' <= 0.
orig_asr_nofound = analysis_df[(analysis_df['Flag'] == False) & (analysis_df['Lavenstein Mean'] <= 0.0)]
orig_asr_nofound_per = (len(orig_asr_nofound) / len(analysis_df))*100
print(orig_asr_nofound_per)
# Not the last expression of the cell, so this head() result is not displayed.
orig_asr_nofound.head()
print(len(orig_asr_nofound))

In [None]:
# Percentages of the four buckets, in the order they were computed above.
[orig_asr_found_complete_per, orig_asr_found_per, orig_asr_similar_per, orig_asr_nofound_per]

In [None]:
# Row counts of the four buckets, in the same order.
[len(orig_asr_found_complete), len(orig_asr_found), len(orig_asr_similar), len(orig_asr_nofound)]

In [None]:
def pattern_analysis(sample_df, combined_df):
    """Pair the sentences of the selected samples with their entity columns.

    Args:
        sample_df: frame with a 'Sample #' column (row labels of ``combined_df``)
            and 'Original' / 'ASR' columns, one row per selected sample.
        combined_df: frame that contains 'original_tags' and 'asr_tags' columns
            and is indexed by sample label.

    Returns:
        The ``combined_df`` rows named by 'Sample #', with 'Original' inserted at
        column position 2 and 'ASR' at position 5, and with the two ``*_tags``
        columns removed. ``combined_df`` itself is left untouched.
    """
    sample_labels = np.array(sample_df['Sample #'].values.tolist())
    selected = combined_df.loc[sample_labels]
    # Values are taken positionally from sample_df, i.e. in the same order as the labels.
    for position, column in ((2, 'Original'), (5, 'ASR')):
        selected.insert(position, column, sample_df[column].values.tolist())
    return selected.drop(columns=['original_tags', 'asr_tags'])

In [None]:
# Sentences plus 'Original'/'ASR' values for the rows currently held in orig_asr_similar
# (Flag == False with a positive 'Lavenstein Mean').
error_pattern = pattern_analysis(orig_asr_similar, combined_df)

In [None]:
len(error_pattern)

In [None]:
error_pattern.head(10)

In [None]:
def error_sampling(df):
    """Split ``df`` by whether 'Original' and 'ASR' have the same length.

    Args:
        df: frame with 'Original' and 'ASR' columns whose values support ``len``.

    Returns:
        A pair ``(equal_length_df, variable_length_df)``. Each holds the matching
        rows of ``df``, selected by index label in ascending label order.
    """
    same_length, different_length = [], []
    for label, original, asr in zip(df.index, df['Original'], df['ASR']):
        bucket = same_length if len(original) == len(asr) else different_length
        bucket.append(label)
    return df.loc[sorted(same_length)], df.loc[sorted(different_length)]

In [None]:
# Split error_pattern into rows whose 'Original' and 'ASR' values have equal length and the rest.
equal_length_words_samples_df, variable_length_words_samples_df = error_sampling(error_pattern)

In [None]:
len(equal_length_words_samples_df)

In [None]:
equal_length_words_samples_df.head(40)

In [None]:
def _context_windows(tokens, tag_positions, n_grams):
    """Return one space-terminated context string per tagged token position."""
    last_position = len(tokens) - 1
    windows = []
    for position in tag_positions:
        # Clamp at the sentence start: a negative index would silently wrap around
        # and pull words from the END of the sentence into the window.
        start = max(position - n_grams, 0)
        # A two-sided window is used only when n_grams tokens exist to the right of
        # the entity; otherwise the window is the left context plus the entity token.
        if position <= last_position - n_grams:
            stop = position + n_grams
        else:
            stop = position
        windows.append("".join(token + " " for token in tokens[start:stop + 1]))
    return windows


def finding_context(df, n_grams):
    """Collect the words surrounding each entity token in both sentence versions.

    For every row, each whitespace-separated token of the sentence that is
    contained in the row's entity collection ('Original' for the reference
    sentence, 'ASR' for the ASR sentence) yields one window: up to ``n_grams``
    tokens before it, the token itself and, when at least ``n_grams`` tokens
    follow it, the next ``n_grams`` tokens. Every token in a window is followed
    by a single space, and the windows of a sentence are joined with " | ".

    Args:
        df: frame with 'original_sentence', 'asr_sentence', 'Original' and 'ASR'
            columns; its index labels become the 'Sample #' values.
        n_grams: number of context tokens on each side of an entity token.

    Returns:
        DataFrame with the columns 'Sample #', 'Original N-Grams',
        'original_sentence', 'Original', 'ASR N-Grams', 'asr_sentence', 'ASR'.
        An empty ``df`` gives an empty frame with these columns.
    """
    columns = ['Sample #', 'Original N-Grams', "original_sentence", "Original",
               "ASR N-Grams", "asr_sentence", "ASR"]
    check = []
    for sample, original_sentence, asr_sentence, original_tag, asr_tag in zip(
                df.index,
                df['original_sentence'].values.tolist(),
                df['asr_sentence'].values.tolist(),
                df['Original'].values.tolist(),
                df['ASR'].values.tolist()):

        original_tokens = original_sentence.split()
        asr_tokens = asr_sentence.split()
        original_positions = [index for index, token in enumerate(original_tokens) if token in original_tag]
        asr_positions = [index for index, token in enumerate(asr_tokens) if token in asr_tag]
        check.append((sample,
                      " | ".join(_context_windows(original_tokens, original_positions, n_grams)),
                      original_sentence,
                      original_tag,
                      " | ".join(_context_windows(asr_tokens, asr_positions, n_grams)),
                      asr_sentence,
                      asr_tag))
    # Passing the names to the constructor also works when `check` is empty, where
    # assigning seven names to a column-less frame would raise ValueError.
    return pd.DataFrame(check, columns=columns)

In [None]:
def error_sampling3(context):
    """Replace ASR context windows with the reference windows inside each ASR sentence.

    For every row, the 'Original N-Grams' and 'ASR N-Grams' strings are split on
    "|". Each value of 'Original' that is also contained in 'ASR' consumes the
    next window pair (first match uses pair 0, the second pair 1, ...): the
    right-stripped ASR window is replaced by the right-stripped reference window
    in the ASR sentence. Rows with fewer ASR windows than 'ASR' values are left
    unchanged. The sentence is printed before and after every replacement, and a
    dashed line is printed after every row.

    Args:
        context: frame with the columns 'Sample #', 'Original N-Grams',
            'original_sentence', 'ASR N-Grams', 'asr_sentence', 'Original', 'ASR'.

    Returns:
        DataFrame with one ``(sample, asr_sentence)`` record per input row
        (columns 0 and 1).
    """
    simulated = []
    rows = zip(
        context['Sample #'].values.tolist(),
        context['Original N-Grams'].values.tolist(),
        context['original_sentence'].values.tolist(),
        context['ASR N-Grams'].values.tolist(),
        context['asr_sentence'].values.tolist(),
        context['Original'].values.tolist(),
        context['ASR'].values.tolist(),
    )
    for sample, original_ngrams, original_sentence, asr_ngrams, asr_sentence, original_tag, asr_tag in rows:
        original_windows = original_ngrams.split("|")
        asr_windows = asr_ngrams.split("|")

        window_no = 0  # index of the next window pair to apply
        for entity in original_tag:
            if entity not in asr_tag:
                continue
            if len(asr_windows) < len(asr_tag):
                continue
            print(asr_sentence)
            asr_sentence = asr_sentence.replace(asr_windows[window_no].rstrip(),
                                                original_windows[window_no].rstrip())
            print(asr_sentence)
            window_no += 1
        simulated.append((sample, asr_sentence))
        print("---------------")
    return pd.DataFrame(simulated)

In [None]:
len(equal_length_words_samples_df)

In [None]:
# Context windows with n_grams = 5 around each entity token of the equal-length samples.
context = finding_context(equal_length_words_samples_df, 5)

In [None]:
context.head()

In [None]:
# error_sampling3 prints every sentence before and after each replacement it makes.
simulated_asr_df = error_sampling3(context)
simulated_asr_df.head(50)

In [None]:
# Store the rewritten sentences in the asr_df rows named by column 0 of simulated_asr_df.
asr_df = update_df(asr_df, simulated_asr_df)

In [None]:
# Hand-written list of 12 sentences bound to `test`.
# NOTE(review): `test` is rebound to a single string in the next cell, and a cell further up
# writes `test` into asr_df rows — confirm which value each use expects.
test = ["and though the famous family of aldus restored its technical excellence , rejecting battered letters ,","most of caxton ' s zone types of an earlier character", "are the leaders in this luckless change , though our own baskerville , who was at work some years before them , went much on the same lines",
       "now come into general use that are obviously a great improvement on the ordinary \" modern style \" and use in england , which is in fact the bodoni type" , "on the top of the jail , continues neild , arawatch - house and a century - box , where two or more guards , with dogs and firearms ," ,
       "these courts were extended to centuries later to several large provincial towns , and all were in full activity when neild road ," , "he had been in the employ of a corn - chandler at islington , and went into london with his master ' s cart and horse .",
       "shameful malpractices of bambridge ," , "if they happened to be in funds - - among whom was the marquis of slego in 1811", 
       "mister . neild , a second howard ,", "again the 22 charles ii . c20 order the jailer to keep felons and debtors \" separate and apart from one another ,",
       "prisoners were crowded together in the jail , contrary to the requirements of the for george the 4th ."]

In [None]:
# Replaces the list above with one sentence.
test = "have now come into general use and are obviously a great improvement on the ordinary \" modern style \" in use in england , which is in fact the bodoni type"

In [None]:
# model_test receives a list of sentences, so the single string is wrapped in a list.
xyz = model_test([test], tokenizer, model)

In [None]:
xyz