# 150 : Qualitative analysis using M1

In [None]:
import os, sys
from pathlib import Path

# Detect Google Colab from the repr of the running IPython shell and expose
# the result as an environment variable (stored as the string "True"/"False").
ENV_IS_GOOGLE_COLAB = 'google.colab' in str(get_ipython())
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)

if not ENV_IS_GOOGLE_COLAB:
    # Local run: BASE is the current working directory.
    BASE = Path().resolve()
    # Adapt the two paths below to your situation.
    DATASETS = Path('../dataset_ICDAR').resolve()  # Where the data are located before Dataset object creation
    OUT_BASE = Path('../res_ICDAR/method_1').resolve()  # Where the results of this notebook are saved
else:
    from google.colab import drive

    # Mount Google Drive and work from the project folder stored in it.
    mountpoint = Path("/content/drive")
    drive.mount(str(mountpoint))
    base = mountpoint / "MyDrive/article_icdar_2023"  # Adapt this to your situation
    sys.path.append(str(base))  # Make the project folder importable
    BASE = Path(base).resolve()  # Absolute version of the project folder
    DATASETS = BASE / "dataset_ICDAR"
    OUT_BASE = BASE / "res_ICDAR/method_1"

# Show the resulting configuration, one value per line.
for shown in (sys.path, BASE, DATASETS, OUT_BASE):
    print(shown)

## Model

Choose a fine-tuned model to perform the qualitative analysis. Models are loaded from the Hugging Face Hub.

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [None]:
# Every checkpoint name combines an input set (ref / ocr), a model variant
# (cmbert / ptrn_cmbert) and a tagging format (io / iob2), in that nesting order.
models = [
    f"nlpso/m1_ind_layers_{input_set}_{variant}_{tag_format}"
    for input_set in ("ref", "ocr")
    for variant in ("cmbert", "ptrn_cmbert")
    for tag_format in ("io", "iob2")
]

In [None]:
# Radio buttons offering every checkpoint of `models`; the cells below read
# the current choice through `model.value`.
model = widgets.RadioButtons(options=models, layout=dict(width='max-content'))
model

In [None]:
### Model you want to run
# One checkpoint per annotation level: "<selected model>_level_1" and "<selected model>_level_2".
MODEL_PATH_L1, MODEL_PATH_L2 = (f"{model.value}_level_{level}" for level in (1, 2))

In [None]:
# Settings derived from the checkpoint currently selected in the radio buttons.
MODEL = model.value

# Checkpoint names end with their tagging format ("..._io" or "..._iob2").
FORMAT = 'IOB2' if MODEL.endswith('_iob2') else 'IO'

# SET selects which `test_entries_{SET}.txt` file is read later on.
if 'ref' in MODEL:
    SET = "ref"
elif 'ocr' in MODEL:
    SET = "ocr"
else:
    raise ValueError(f"Cannot infer SET ('ref' or 'ocr') from model name: {MODEL}")

MODEL_TYPE = 'pretrained_camembert_ner' if 'ptrn' in MODEL else 'camembert_ner'

# The qualitative-analysis dataset carries the same suffix as the checkpoint.
# Renaming the prefix keeps the exact suffix; chained substring tests would
# match "..._io" inside "..._iob2" and pick the IO dataset for IOB2 models.
DATASET = MODEL.replace('m1_ind_layers', 'm1_qualitative_analysis', 1)

print(f"MODEL : {MODEL}")
print(f"MODEL TYPE : {MODEL_TYPE}")
print(f"FORMAT : {FORMAT}")
print(f"SET : {SET}")

## Load data

### Gold

In [None]:
import os
from pathlib import Path
from config import logger
from datasets import load_dataset

# Presumably the size of the training set behind the selected model (TODO confirm);
# not referenced by any other visible cell.
TRAINSETS_SIZES = [6084]
# Load the gold dataset identified by DATASET.
train_dev_test = load_dataset(DATASET)
# Only the "test" split is used in the rest of the notebook.
test = train_dev_test["test"]
# Cell output: number of gold test entries.
len(test)

### Non-structured entries

In [None]:
# Raw (non-annotated) text of the test entries, one entry per line of the file.
PATH = f"{DATASETS}/qualitative_analysis/test_entries_{SET}.txt"
with open(PATH, mode='r', encoding='utf8') as ex:
    lines = ex.read().split('\n')
# Cell output: number of lines read.
len(lines)

## Pipeline

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline

LIMIT = 100


def _build_layer(checkpoint):
    """Load the tokenizer, model and token-classification pipeline of one checkpoint.

    Returns the triple (tokenizer, model, pipeline). The pipeline does no
    aggregation and ignores only the empty label.
    """
    layer_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    layer_model = AutoModelForTokenClassification.from_pretrained(checkpoint)
    layer_pipeline = TokenClassificationPipeline(
        model=layer_model,
        tokenizer=layer_tokenizer,
        aggregation_strategy=None,
        ignore_labels=[''],
    )
    return layer_tokenizer, layer_model, layer_pipeline


#Level-1 layer
tokenizer_l1, model_l1, nlp_l1 = _build_layer(MODEL_PATH_L1)

#Level-2 layer
tokenizer_l2, model_l2, nlp_l2 = _build_layer(MODEL_PATH_L2)

In [None]:
from xmlize_util import get_NER_tags, xmlize_multilevel, group_entities


def _joint_label(tag_l1, tag_l2, keep_orphan_l2):
    """Merge a level-1 and a level-2 tag into one joint label.

    A non-'O' tag such as 'I-PER' becomes 'i_PER' (first character lower-cased,
    the separator dropped). Results:
      * level 1 tagged            -> 'I-<l1>+<l2>' (with <l2> == 'O' when untagged)
      * only level 2 tagged       -> 'I-O+<l2>' when keep_orphan_l2, else 'O+O'
      * neither tagged            -> 'O+O'
    """
    def _half(tag):
        return 'O' if tag == 'O' else tag[0].lower() + '_' + tag[2:]

    if tag_l1 != 'O':
        return 'I-' + _half(tag_l1) + '+' + _half(tag_l2)
    if tag_l2 != 'O' and keep_orphan_l2:
        return 'I-O+' + _half(tag_l2)
    return 'O+O'


k = 0
stats = []
errors_count = 0
errors = []
# NOTE(review): assumes lines[i] and test[i] describe the same entry — confirm.
for i in range(len(lines)):
    s = lines[i]
    # Token-level predictions of both layers for the raw entry
    res = [nlp_l1(s), nlp_l2(s)]

    #Create joint-labels at token scale (predictions paired position by position)
    preds_tokens = []
    preds_tags = []
    for tok_l1, tok_l2 in zip(res[0], res[1]):
        preds_tokens.append(tok_l1['word'])
        preds_tags.append(_joint_label(tok_l1['entity'], tok_l2['entity'], keep_orphan_l2=True))

    # Read the gold row once instead of re-indexing the dataset for every token
    gold = test[i]
    test_tags = [
        _joint_label(gold_l1, gold_l2, keep_orphan_l2=False)
        for _, gold_l1, gold_l2 in zip(gold["tokens"], gold["ner_tags_niv1"], gold["ner_tags_niv2"])
    ]

    #Create XML output (each level is grouped with the tokenizer of its own layer)
    aggregate = [group_entities(res[0], tokenizer_l1), group_entities(res[1], tokenizer_l2)]
    levels, num_levels = get_NER_tags(s, aggregate)

    # Reset for every entry so an entry without level-1 entities never reuses
    # the XML of the previous one (or hits an undefined name on the first entry).
    xml = ''
    if len(levels['niv_1']) > 0:
        xml = xmlize_multilevel(levels, num_levels)
        print(xml)
        print('')

    # Keep only entries whose predicted tokenisation lines up with the gold one;
    # report the others.
    aligned = len(gold["tokens"]) == len(preds_tokens) and len(test_tags) == len(preds_tags)
    if aligned:
        stats.append([k, s, xml, gold["tokens"], test_tags, preds_tokens, preds_tags])
    else:
        errors_count += 1
        print("NUM " + str(k))
        print(s)
        print(xml)

    k += 1

In [None]:
# pandas is first needed here; importing it in this cell keeps a fresh
# "Restart & Run All" from failing with a NameError on `pd`.
import pandas as pd

# One row per entry whose predicted and gold token sequences have the same length.
columns = ["index","entry","entry_xml","spans_gold","tags_gold","spans_preds","tags_preds"]
df = pd.DataFrame(stats,columns=columns)
df

## F1-Score ranking
F1-Score is calculated for each entry using the seqeval library:
* entities are rebuilt from the joint labels with seqeval
* the F1-score is calculated for each entry

In [None]:
from seqeval.metrics import f1_score

# Entry-level F1: one score per row of `df`, in row order.
scores = []
count = 0  # number of entries whose score could not be computed
for i in range(len(df)):
    y_preds = df.iloc[i]["tags_preds"]
    y_true = df.iloc[i]["tags_gold"]
    try:
        f1 = f1_score([y_true],[y_preds])
    except Exception:
        count += 1
        print(df.iloc[i])
        # Keep `scores` the same length as `df` so it can still be attached
        # as a column; the failed entry is marked with NaN.
        f1 = float("nan")
    scores.append(f1)
print(count)

In [None]:
# Copy of `df` with the entry-level F1 attached, ranked from worst to best.
df_f1 = df.copy()
df_f1["f1"] = scores
# drop=True discards the old row labels directly, instead of materialising
# them as an auto-named `level_0` column that then has to be deleted.
df_f1 = df_f1.sort_values(by=['f1']).reset_index(drop=True)

In [None]:
# Display the entries ranked by ascending F1-score (worst first).
df_f1

In [None]:
import pylab as pl

# Histogram of the entry-level F1-scores over 40 bins.
df_f1.hist(column='f1',bins=40,sharey=True, sharex=True)
pl.suptitle('Entry-scale F1-Score distribution over test set')

### 15-Top worst

In [None]:
# df_f1 is sorted by ascending F1, so its first 15 rows are the worst entries.
worst = df_f1.head(15)
for i in range(len(worst)):
    # INDEX is the rank in the F1-sorted frame, not the value of the "index" column.
    print(f"INDEX {i}")
    print(worst.iloc[i]["entry"])
    # Entry and XML are read from the same row of the same frame.
    print(worst.iloc[i]["entry_xml"])
    print(f"F1-Score : {worst.iloc[i]['f1']}")
    print("")

### 15-Top best

In [None]:
# Re-rank from best to worst and show the 15 best entries
# (head(15) returns 15 rows, whereas the slice [0:14] stopped at 14).
rdf = df_f1.sort_values(by='f1', ascending=False)
for i in range(len(rdf.head(15))):
    print(rdf.iloc[i]["entry"])
    print(rdf.iloc[i]["entry_xml"])
    print(f"F1-Score : {rdf.iloc[i]['f1']}")
    print("")

## Sub-word global analysis

In [None]:
# pandas is used in this cell; importing it here keeps the cell runnable on a fresh kernel.
import pandas as pd


def _strip_tag_prefixes(label):
    """Remove the 'I-', 'i_', 'B-' and 'b_' markers from a joint label, in that order."""
    for prefix in ('I-', 'i_', 'B-', 'b_'):
        label = label.replace(prefix, '')
    return label


# Flatten the per-entry tag lists into one token-level sequence each
# (a single pass instead of repeated list concatenation).
flat_preds = pd.Series([tag for tags in df["tags_preds"] for tag in tags], name="Predictions")
flat_labels = pd.Series([tag for tags in df["tags_gold"] for tag in tags], name="Gold")

# Row-normalised confusion matrix, expressed in percent of each gold label.
global_confusion = pd.crosstab(flat_labels, flat_preds,normalize='index').multiply(100., axis=1)

# Shorter axis labels for display
col = [_strip_tag_prefixes(c) for c in global_confusion.columns]
global_confusion.columns = col

ind = [_strip_tag_prefixes(c) for c in global_confusion.index]
global_confusion.index = ind
global_confusion

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Wide canvas for the global confusion matrix
plt.figure(figsize=(60, 16))

# Annotated heatmap: one decimal per cell, no colour bar
heatmap_options = dict(
    annot=True,
    cmap=sns.color_palette("rocket_r", as_cmap=True),
    fmt=".1f",
    cbar=False,
    annot_kws={"fontsize": 40},
)
snsfig = sns.heatmap(global_confusion, **heatmap_options)

# Bold axis titles
snsfig.set_xlabel('Predictions', weight='bold', fontsize=35)
snsfig.set_ylabel('Gold', weight='bold', fontsize=35)

# Prediction labels and axis title go above the matrix
snsfig.xaxis.tick_top()
snsfig.xaxis.set_label_position('top')

# Keep the gold labels horizontal
plt.yticks(rotation=0)

# Enlarge the tick labels of both axes
snsfig.set_xticklabels(snsfig.get_xmajorticklabels(), fontsize=32)
snsfig.set_yticklabels(snsfig.get_ymajorticklabels(), fontsize=32)

# Caption printed above the figure
for caption_line in (
    "Confusion matrix of reference and predicted tokens types.",
    "Values are normalized by row (percentage of each reference classe and its resultants predictions)",
    "Last row represent percentage of each class in gold.",
):
    print(caption_line)

plt.show()

# Keep a handle on the figure so it can be saved
fig = snsfig.get_figure()
fig.tight_layout()
#fig.savefig(f"./tokenscaleanalysis-{FORMAT}-{SET}.pdf") 

## Entry scale analysis

Choose one entry by its index (its row position in `df`) to perform an entry-scale analysis:

In [None]:
# Row position in `df` of the entry to inspect (the cells below read it with df.iloc[i]).
i = 2

In [None]:
# Raw text of the selected entry, followed by its predicted XML
selected_entry = df.iloc[i]
for field in ("entry", "entry_xml"):
    print(selected_entry[field])

In [None]:
from seqeval.metrics import classification_report

# seqeval report for the selected entry alone (a batch holding one sequence)
y_true, y_preds = (df.iloc[i][column] for column in ("tags_gold", "tags_preds"))
print(classification_report([y_true], [y_preds]))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Token-level gold and predicted joint labels of the selected entry
y_true = pd.Series(df["tags_gold"].iloc[i], name='Gold')
y_pred = pd.Series(df["tags_preds"].iloc[i], name='Predictions')

# Raw counts of gold label versus predicted label
entry_confusion = pd.crosstab(y_true, y_pred)

# Strip the tagging markers from both axes, one marker at a time, in the
# order 'I-', 'i_', 'B-', 'b_'
col = list(entry_confusion.columns)
ind = list(entry_confusion.index)
for marker in ('I-', 'i_', 'B-', 'b_'):
    col = [label.replace(marker, '') for label in col]
    ind = [label.replace(marker, '') for label in ind]
entry_confusion.columns = col
entry_confusion.index = ind
entry_confusion

In [None]:
# Heatmap of the confusion matrix of the selected entry (raw counts).
#Init figure
plt.figure(figsize=(15, 10))

#Create heatmap
snsfig = sns.heatmap(
    entry_confusion, 
    annot = True, #Display the value of each cell
    cmap=sns.color_palette("rocket_r", as_cmap=True), #Color
    fmt="g", #General number format for the annotations
    cbar=False,
    annot_kws={"fontsize":30}
)

#Rename label axis and set their style
plt.xlabel('Predictions',weight = 'bold',fontsize=25) # bold x-axis label, fontsize 25
plt.ylabel('Gold',weight = 'bold',fontsize=25) # bold y-axis label, fontsize 25

#Set x labels position to top
snsfig.xaxis.tick_top()
snsfig.xaxis.set_label_position('top')

#Rotate y ticks horizontally
plt.yticks(rotation=0) 

#Change ticks size
snsfig.set_xticklabels(snsfig.get_xmajorticklabels(), fontsize = 20)
snsfig.set_yticklabels(snsfig.get_ymajorticklabels(), fontsize = 20)

print("Confusion matrix of reference and predicted tokens types.")

plt.show()

#Keep a handle on the figure that holds the heatmap
fig = snsfig.get_figure()