In [1]:
import pandas as pd

In [2]:
# baseline_model/test_results.txt
# co-models/ext_1000_isw_model/test_results.txt = 1521
# co-models/ext_2290_isw_model/test_results.txt = 2290

# co-models/
# ext_1000_isw_model		ext_4847_isw_model
# ext_1000_top20_isw_model	ext_504_isw_model
# ext_2290_isw_model		ext_isw_model
# ext_3132_isw_model

## Compare the test results of the baseline model and the co-trained model
#### This gives us insight into which NER tags got better or worse...

In [3]:
def load_eval_result(filename):
    """Parse a plain-text evaluation report into a list of score rows.

    Every line of the file is stripped and split on runs of four spaces. The
    first two lines and the last three lines are discarded, except that the
    very last line is kept as an additional row. Fields that parse as numbers
    are converted to ``float``; all other fields stay strings.

    Args:
        filename: Path of the UTF-8 encoded report file.

    Returns:
        A list of rows (each a list of fields), sorted in ascending order by
        the first field of each row.
    """
    def _as_number(field):
        # Only a failed numeric parse should fall back to the raw text; a bare
        # ``except`` would also hide unrelated errors and KeyboardInterrupt.
        try:
            return float(field)
        except ValueError:
            return field

    with open(filename, encoding='utf-8') as f:
        # Splitting on four spaces yields empty fields wherever a gap is wider
        # than one separator, so those are filtered out.
        eval_ls = [
            [field for field in line.strip().split("    ") if field]
            for line in f
        ]

    # The slice is positional, so it has to run while blank lines still
    # occupy their slots in the list.
    eval_ls = eval_ls[2:-3] + [eval_ls[-1]]

    # A blank line inside the kept range produces an empty row, which would
    # make the sort key ``x[0]`` raise IndexError; drop such rows.
    float_list = [[_as_number(field) for field in row] for row in eval_ls if row]
    return sorted(float_list, key=lambda x: x[0], reverse=False)

def get_better_worse_tags(baseline_ls, cotrain_ls):
    """Split tags into those whose F1 score rose and those whose F1 score fell.

    The two inputs are walked in parallel; each row must have exactly five
    fields (tag, precision, recall, f1, support) and rows at the same position
    must carry the same tag. Tags with an unchanged F1 score appear in neither
    result.

    Args:
        baseline_ls: Rows of the baseline model.
        cotrain_ls: Rows of the co-trained model, in the same tag order.

    Returns:
        A tuple ``(better, worse)`` of dicts mapping a tag to
        ``{'before': baseline_f1, 'after': cotrain_f1}``.
    """
    improved, degraded = {}, {}
    for base_row, co_row in zip(baseline_ls, cotrain_ls):
        b_tag, _, _, b_f, _ = base_row
        tag, _, _, f, _ = co_row
        # Rows are paired by position, so the tags have to line up.
        assert b_tag == tag
        scores = {'before': b_f, 'after': f}
        if f > b_f:
            improved[b_tag] = scores
        elif f < b_f:
            degraded[b_tag] = scores
    return improved, degraded

In [4]:
# DF format for baseline model result
# df = pd.DataFrame(baseline_ls[0:],columns=['Baseline model','precision', 'recall','f1-score','support'])
# df

## Compare with the tag distribution
### This gives us insight into the distribution of the new labels added to the training set, compared with the distribution of labels in the training set before / after adding the new labels.

In [5]:
from collections import Counter
import joblib

def get_stat(labels):
    """Count how often each entity type occurs in a collection of label sequences.

    ``'O'`` labels are ignored. A leading ``B-`` or ``I-`` marker is removed so
    that both variants of a tag are counted together.

    Args:
        labels: Iterable of label sequences (one sequence of string labels
            per sentence).

    Returns:
        A list of ``(tag, count)`` tuples sorted by descending count; tags with
        equal counts keep the order in which they were first seen.
    """
    def _entity_type(label):
        # Strip only the leading marker: str.replace would also delete "B-" /
        # "I-" occurring inside the tag name (e.g. "B-JOB-TITLE" -> "JOTITLE").
        if label.startswith(('B-', 'I-')):
            return label[2:]
        return label

    tag_counts = Counter(
        _entity_type(label)
        for sequence in labels
        for label in sequence
        if label != 'O'
    )
    # most_common() sorts by descending count and keeps first-seen order on ties.
    return tag_counts.most_common()

def get_compare_df(ori_df, train_tags, ext_data_dir, ext_, better_, worse_):
    """Add co-training statistics as new columns to the training tag frame.

    One value per entry of ``train_tags`` is produced for each new column:

    * ``ext_data_dir`` (column name): count of the tag in ``ext_`` or 0,
    * ``Better`` / ``Worse``: 1 if the tag is a key of ``better_`` /
      ``worse_`` (``better_`` is checked first), otherwise 0,
    * ``baseline_F1`` / ``ext_F1``: the stored ``'before'`` / ``'after'``
      scores, or ``"-"`` when the tag is in neither dict.

    Args:
        ori_df: DataFrame with one row per entry of ``train_tags``.
        train_tags: Sequence of ``(tag, count)`` pairs.
        ext_data_dir: Name of the column holding the extension counts.
        ext_: Dict mapping a tag to its count in the extension data.
        better_: Dict mapping a tag to ``{'before': ..., 'after': ...}``.
        worse_: Dict with the same layout as ``better_``.

    Returns:
        ``ori_df`` itself; the frame is modified in place.
    """
    unchanged = {'before': "-", 'after': "-"}
    ext_counts, is_better, is_worse = [], [], []
    f1_before, f1_after = [], []

    for tag, _ in train_tags:
        ext_counts.append(ext_[tag] if tag in ext_ else 0)

        if tag in better_:
            flags, scores = (1, 0), better_[tag]
        elif tag in worse_:
            flags, scores = (0, 1), worse_[tag]
        else:
            flags, scores = (0, 0), unchanged

        is_better.append(flags[0])
        is_worse.append(flags[1])
        f1_before.append(scores['before'])
        f1_after.append(scores['after'])

    new_columns = (
        (ext_data_dir, ext_counts),
        ('Better', is_better),
        ('Worse', is_worse),
        ('baseline_F1', f1_before),
        ('ext_F1', f1_after),
    )
    for column, values in new_columns:
        ori_df[column] = values
    return ori_df


In [6]:
# Load the better / worse results

# Test results of the baseline model and of the co-trained model
baseline_ls = load_eval_result("baseline_model/test_results.txt")

# Alternative co-trained models:
# _1521_ls = load_eval_result("co-models/ext_1000_isw_model/test_results.txt")
# _2290_ls = load_eval_result("co-models/ext_2290_isw_model/test_results.txt")
_4847_ls = load_eval_result("co-models/ext_4847_isw_model/test_results.txt")

# Tags whose F1 went up / down, e.g. {'PER': {'before': 0.8, 'after': 0.9}}
better_, worse_ = get_better_worse_tags(baseline_ls, _4847_ls)


# Original training data
train_sents = joblib.load("data/train-isw-sentences.pkl")
train_labels = joblib.load("data/train-isw-labels.pkl")

# Extended training data generated by the co-training method.
# Change ext_data_dir (and the file name below) to analyse another extension set.
# ext_data_dir = "ext_data_1000_u_300"
ext_data_dir = "ext_data_u_300_top_30"

# ext = joblib.load("ext_data/ext_data_1000/1521_ext_L_A_labels.pkl")
# ext = joblib.load("ext_data/{}/2290_ext_L_A_labels.pkl".format(ext_data_dir))
ext = joblib.load("ext_data/{}/4847_ext_L_A_labels.pkl".format(ext_data_dir))


# Tag statistics of the original training set, as a DataFrame for display
train_tags = get_stat(train_labels)
ori_df = pd.DataFrame(train_tags, columns=['Tag', 'num'])

# Tag statistics of the newly added labels, as a DataFrame and as a dict
ext_tags = get_stat(ext)
ext_df = pd.DataFrame(ext_tags, columns=['Tag', 'num'])
ext_ = dict(ext_tags)


# Note: get_compare_df adds its columns to ori_df in place
compare_df = get_compare_df(ori_df, train_tags, ext_data_dir, ext_, better_, worse_)
compare_df

Unnamed: 0,Tag,num,ext_data_u_300_top_30,Better,Worse,baseline_F1,ext_F1
0,GPE,2803,1543,1,0,0.9696,0.9737
1,TIME,2585,310,1,0,0.8839,0.8906
2,NRP,1886,392,1,0,0.9183,0.9212
3,DUR,1634,15,1,0,0.6667,0.6715
4,LAN,1253,70,0,1,0.9483,0.9468
5,DATE,1098,153,1,0,0.7697,0.7921
6,PER,765,2987,0,1,0.8622,0.8546
7,FREQ,555,18,1,0,0.7962,0.8078
8,AGE,430,0,0,1,0.6245,0.5789
9,CARDINAL,412,810,1,0,0.8283,0.8387


In [7]:
# Number of labels per entity type in the data added by co-training
ext_

{'PER': 2987,
 'GPE': 1543,
 'CARDINAL': 810,
 'NRP': 392,
 'ORDINAL': 337,
 'TIME': 310,
 'DATE': 153,
 'LAN': 70,
 'ORG': 28,
 'FREQ': 18,
 'DUR': 15,
 'EVT': 8,
 'TITLE': 3}

In [8]:
# Tags whose F1 score is higher for the co-trained model than for the baseline
better_

{'ART': {'before': 0.3529, 'after': 0.3673},
 'CARDINAL': {'before': 0.8283, 'after': 0.8387},
 'DATE': {'before': 0.7697, 'after': 0.7921},
 'DUR': {'before': 0.6667, 'after': 0.6715},
 'EVT': {'before': 0.8364, 'after': 0.8649},
 'FAC': {'before': 0.7602, 'after': 0.7882},
 'FREQ': {'before': 0.7962, 'after': 0.8078},
 'GPE': {'before': 0.9696, 'after': 0.9737},
 'LOC': {'before': 0.7115, 'after': 0.7193},
 'MON': {'before': 0.8421, 'after': 0.8889},
 'NRP': {'before': 0.9183, 'after': 0.9212},
 'QUANT': {'before': 0.7368, 'after': 0.9474},
 'RATE': {'before': 0.0, 'after': 0.3333},
 'TIME': {'before': 0.8839, 'after': 0.8906},
 'macro avg': {'before': 0.8592, 'after': 0.8627}}

In [9]:
# Tags whose F1 score is lower for the co-trained model than for the baseline
worse_

{'AGE': {'before': 0.6245, 'after': 0.5789},
 'LAN': {'before': 0.9483, 'after': 0.9468},
 'MISC': {'before': 0.6835, 'after': 0.6353},
 'ORDINAL': {'before': 0.823, 'after': 0.8033},
 'ORG': {'before': 0.6866, 'after': 0.6842},
 'PER': {'before': 0.8622, 'after': 0.8546},
 'SORD': {'before': 0.7516, 'after': 0.7417},
 'TITLE': {'before': 0.9032, 'after': 0.8525}}

# Check sentence and predictions

In [10]:
import os
from predict import Ner

# Taggers loaded from the baseline model directory and from a co-trained model directory
base_clf = Ner(model_dir="baseline_model/")
# ext_clf = Ner(model_dir="co-models/ext_2290_isw_model/")
ext_clf = Ner(model_dir="co-models/ext_4847_isw_model/")

# Load the test data that is used for the error analysis.
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source.
test_sents = joblib.load("data/30-test-isw-sentences.pkl")
test_labels = joblib.load("data/30-test-isw-labels.pkl")

In [17]:
def select_pred_by_tag(selected_tag, test_sents, test_labels, base_clf, ext_clf, limit=20):
    """Print and collect sentences for which either model predicts a selected tag.

    Selection is based on the *predicted* tags of the two models, not on the
    true labels. Both models are run on every inspected sentence.

    Args:
        selected_tag: Iterable of tags to look for (e.g. ``["B-DATE", "I-DATE"]``).
        test_sents: Sequence of sentences passed to ``predict``.
        test_labels: Sequence of true tag lists, parallel to ``test_sents``.
        base_clf: Baseline model; ``predict(sent)`` must return an iterable of
            dicts that each have a ``'tag'`` key.
        ext_clf: Extended model with the same ``predict`` interface.
        limit: Number of leading sentences to inspect. Defaults to 20 (the
            previously hard-coded value); pass ``None`` to inspect all.

    Returns:
        A list of ``(sent, true_tag, base_tag, ext_tag)`` tuples, one per
        selected sentence.
    """
    sele_ls = []
    for sent, true_tag in zip(test_sents[:limit], test_labels[:limit]):
        base_tag = [dic['tag'] for dic in base_clf.predict(sent)]
        ext_tag = [dic['tag'] for dic in ext_clf.predict(sent)]

        in_base = any(x in base_tag for x in selected_tag)
        if in_base or any(x in ext_tag for x in selected_tag):
            print("sent", sent)
            print("True", true_tag)
            print("base", base_tag)
            print("ext ", ext_tag)
            print("")
            sele_ls.append((sent, true_tag, base_tag, ext_tag))
    return sele_ls

In [18]:
# Reload the test results of the baseline model and of the co-trained model
baseline_ls = load_eval_result("baseline_model/test_results.txt")

# Alternative co-trained models:
# _1521_ls = load_eval_result("co-models/ext_1000_isw_model/test_results.txt")
# _2290_ls = load_eval_result("co-models/ext_2290_isw_model/test_results.txt")
_4847_ls = load_eval_result("co-models/ext_4847_isw_model/test_results.txt")

# Tags whose F1 went up / down, e.g. {'PER': {'before': 0.8, 'after': 0.9}}
better_, worse_ = get_better_worse_tags(baseline_ls, _4847_ls)

## The "selected_tag" used below can be taken from either list
better_tags = list(better_)
worse_tags = list(worse_)
print("Better_tags:", better_tags)
print("")
print("Worse_tags:", worse_tags)


Better_tags: ['ART', 'CARDINAL', 'DATE', 'DUR', 'EVT', 'FAC', 'FREQ', 'GPE', 'LOC', 'MON', 'NRP', 'QUANT', 'RATE', 'TIME', 'macro avg']

Worse_tags: ['AGE', 'LAN', 'MISC', 'ORDINAL', 'ORG', 'PER', 'SORD', 'TITLE']


## Better predicted NER tag

In [19]:
print("")
# The tag can be one that got either "better" or "worse" with the extended model.
# DATE is listed among the better tags; both the B- and I- variants are given
# because the models predict prefixed tags.
# Only the first 20 test sentences are inspected by select_pred_by_tag here.
selected_tag = ["B-DATE", "I-DATE"]
sele_ls = select_pred_by_tag(selected_tag, test_sents, test_labels, base_clf, ext_clf)


sent Aber ich mein in größeren Massen kann man sagen dass der ökonomische Faktor erst im 20
True ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE']
base ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE']
ext  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE']

sent Und äh im 9
True ['O', 'O', 'O', 'B-DATE']
base ['O', 'O', 'O', 'B-GPE']
ext  ['O', 'O', 'O', 'B-DATE']



## Worse predicted NER tag

In [20]:
print("")
# The tag can be one that got either "better" or "worse" with the extended model.
# ORDINAL is listed among the worse tags; both the B- and I- variants are given
# because the models predict prefixed tags.
# Only the first 20 test sentences are inspected by select_pred_by_tag here.
selected_tag = ["B-ORDINAL", "I-ORDINAL"]
sele_ls = select_pred_by_tag(selected_tag, test_sents, test_labels, base_clf, ext_clf)




In [21]:
# Run both models over the whole test set and save the predictions into a txt file
analysis_ls = []
skipped_sents = []  # (sentence, error) pairs for sentences that could not be predicted
for sent, true_tag in zip(test_sents, test_labels):
    try:
        base_pred = base_clf.predict(sent)
        ext_pred = ext_clf.predict(sent)

        base_tag = [dic['tag'] for dic in base_pred]
        ext_tag = [dic['tag'] for dic in ext_pred]
    except Exception as err:
        # Best effort: a sentence that a model cannot handle is left out of the
        # analysis, but it is recorded instead of being dropped silently.
        # (Presumably these are over-long inputs -- TODO confirm.)
        # Catching Exception rather than using a bare except keeps
        # KeyboardInterrupt working for this long-running loop.
        skipped_sents.append((sent, repr(err)))
        continue
    analysis_ls.append((sent, true_tag, base_tag, ext_tag))

if skipped_sents:
    print("Skipped {} of {} sentences because prediction failed".format(
        len(skipped_sents), len(skipped_sents) + len(analysis_ls)))

# The with-statement closes the file; no explicit close() is needed afterwards.
with open("4847_analysis.txt", "w", encoding="utf-8") as writer:
    for (sent, true_tag, base_tag, ext_tag) in analysis_ls:
        # All four row labels are eight characters wide so the lists line up.
        writer.write("sent    "+str(sent)+'\n')
        writer.write("True Tag"+str(true_tag)+'\n')
        writer.write("baseline"+str(base_tag)+'\n')
        writer.write("ext_tag "+str(ext_tag)+'\n')
        writer.write('\n')

Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors


In [23]:
# First collected record: (sentence, true tags, baseline tags, extended-model tags)
analysis_ls[0]

('Die Lehrer waren zum Teil Christen zum Teil Juden',
 ['O', 'O', 'O', 'O', 'O', 'B-NRP', 'O', 'O', 'B-NRP'],
 ['O', 'O', 'O', 'O', 'O', 'B-NRP', 'O', 'O', 'B-NRP'],
 ['O', 'O', 'O', 'O', 'O', 'B-NRP', 'O', 'O', 'B-NRP'])

In [24]:
# Number of test sentences for which both models produced predictions
len(analysis_ls)

4825

In [36]:
def get_better_preds(analysis_ls):
    """Print the records that only the extended model tags completely correctly.

    A record is printed when its extended-model tags equal the true tags while
    its baseline tags do not. Nothing is returned.

    Args:
        analysis_ls: Iterable of ``(sent, true, base, ext)`` tuples.
    """
    for sent, gold, base_pred, ext_pred in analysis_ls:
        if not (gold == ext_pred and gold != base_pred):
            continue
        rows = (("sent", sent), ("True", gold), ("base", base_pred), ("ext ", ext_pred))
        for label, value in rows:
            print(label, value)
        print("")
        
        
def get_worse_preds(analysis_ls):
    """Print the records that only the baseline model tags completely correctly.

    A record is printed when its baseline tags equal the true tags while its
    extended-model tags do not. Nothing is returned.

    Args:
        analysis_ls: Iterable of ``(sent, true, base, ext)`` tuples.
    """
    for sent, gold, base_pred, ext_pred in analysis_ls:
        if not (gold == base_pred and gold != ext_pred):
            continue
        rows = (("sent", sent), ("True", gold), ("base", base_pred), ("ext ", ext_pred))
        for label, value in rows:
            print(label, value)
        print("")

In [37]:
# Sentences the extended model tags fully correctly while the baseline does not
get_better_preds(analysis_ls)

sent Aber ich mein in größeren Massen kann man sagen dass der ökonomische Faktor erst im 20
True ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE']
base ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE']
ext  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATE']

sent Hab ich gesagt Nein das ist nicht die Sprache des Feindes Deutsch war Deutsche Literatur und deutsche Sprache war noch lang bevor Hitler auf die Welt gekommen ist und die Sprache hat nichts zu tun mit Hitler
True ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LAN', 'O', 'B-LAN', 'O', 'O', 'B-LAN', 'I-LAN', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER']
base ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LAN', 'O', 'B-LAN', 'O', 'O', 'B-LAN', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PE

In [38]:
# Sentences the baseline tags fully correctly while the extended model does not
get_worse_preds(analysis_ls)

sent Und das sind Dinge die man niemals sich hätte vorstellen können
True ['O', 'O', 'O', 'O', 'O', 'O', 'B-TIME', 'O', 'O', 'O', 'O']
base ['O', 'O', 'O', 'O', 'O', 'O', 'B-TIME', 'O', 'O', 'O', 'O']
ext  ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

sent Da fahr ich durch Zürich oder durch Paris oder so und die fahrn immer durch Wien und fühlen sich sehr gut mit den Austrian Airlines
True ['O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG']
base ['O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG']
ext  ['O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'B-FREQ', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG']

sent Dort gehts ja jetzt heute überhaupt über alle Fremden nicht nur über die Juden
True ['O', 'O', 'O', 'B-TIME', 'B-TIME', 'O', 'O', 'O', '