# 240 : Qualitative analysis using M2

In [None]:
# Enable the ipywidgets notebook extension (widgetsnbextension); the model selector
# further down in this notebook is an ipywidgets RadioButtons widget.
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import os, sys
from pathlib import Path

# Detect a Google Colab kernel from the IPython shell's string representation and
# expose the result as an environment variable.
ENV_IS_GOOGLE_COLAB = 'google.colab' in str(get_ipython())
os.environ["ENV_IS_GOOGLE_COLAB"] = str(ENV_IS_GOOGLE_COLAB)

if not ENV_IS_GOOGLE_COLAB:
  # Local run: BASE is the current working directory (directory of this approach).
  BASE = Path().resolve()
  # Adapt the two paths below to your situation.
  DATASETS = Path('../dataset_ICDAR').resolve() # Where your data are located before Dataset object creation
  OUT_BASE = Path('../res_ICDAR/method_2').resolve() # Where the results of this notebook are saved
else:
  from google.colab import drive
  # Mount Google Drive, then make the project folder importable.
  mountpoint = Path("/content/drive")
  drive.mount(str(mountpoint))
  base = mountpoint / "MyDrive/article_icdar_2023" # Adapt this to your situation
  sys.path.append(str(base))
  BASE = base.resolve() # Absolute path of the project folder
  DATASETS = BASE / "dataset_ICDAR"
  OUT_BASE = BASE / "res_ICDAR/method_2"

# Echo the resolved configuration.
for shown in (sys.path, BASE, DATASETS, OUT_BASE):
  print(shown)

## Model

Choose a fine-tuned model to perform the qualitative analysis. Models are loaded from the Hugging Face Hub.

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [None]:
# Names of the selectable fine-tuned models: every combination of entry set
# (ref / ocr), tag format (io / iob2) and the optional "ptrn_" marker.
# The loop nesting reproduces the original listing order.
models = [
    f"m2_joint_label_{source}_{pretraining}cmbert_{scheme}"
    for source in ("ref", "ocr")
    for scheme in ("io", "iob2")
    for pretraining in ("", "ptrn_")
]

In [None]:
# Radio-button selector over the available models; the chosen name is read
# from `model_name.value` in the next cell.
model_name = widgets.RadioButtons(options=models, layout=dict(width='max-content'))
model_name

In [None]:
# Name of the fine-tuned model picked in the radio-button widget.
MODEL = model_name.value

# The tag format is the last component of the model name. Deriving it here keeps
# FORMAT consistent with the selected model instead of always being 'IO'.
FORMAT = 'IOB2' if MODEL.endswith('_iob2') else 'IO'

# Entry set the model was fine-tuned on; also selects test_entries_{SET}.txt below.
if 'ref' in MODEL:
    SET = "ref"
elif 'ocr' in MODEL:
    SET = "ocr"
else:
    raise ValueError(f"Cannot infer SET ('ref' or 'ocr') from model name: {MODEL!r}")

MODEL_TYPE = 'pretrained_camembert_ner' if 'ptrn' in MODEL else 'camembert_ner'

# Each model has a dataset with the same suffix. Build the dataset name by swapping
# the prefix rather than by substring tests: '..._cmbert_io' is a substring of
# '..._cmbert_iob2', so an `in`-based chain maps every IOB2 model to the IO dataset.
_MODEL_PREFIX = 'm2_joint_label_'
if not MODEL.startswith(_MODEL_PREFIX):
    raise ValueError(f"Unexpected model name (missing {_MODEL_PREFIX!r} prefix): {MODEL!r}")
DATASET = 'm2m3_qualitative_analysis_' + MODEL[len(_MODEL_PREFIX):]

## Load data

### Gold

In [None]:
import os
from pathlib import Path
from config import logger
from datasets import load_dataset

# NOTE(review): TRAINSETS_SIZES is not read by any cell visible in this notebook — confirm it is still needed.
TRAINSETS_SIZES = [6084] #To train only on the biggest dataset
# Load the dataset named 'nlpso/<DATASET>' with `datasets.load_dataset` and keep its
# "test" split as the gold reference used by the cells below.
train_dev_test = load_dataset('nlpso/' + DATASET)
test = train_dev_test["test"]
# Show the first gold example as the cell output.
test[0]

### Non-structured entries

In [None]:
# Raw text entries matching the selected entry set (ref / ocr).
PATH = f"{DATASETS}/qualitative_analysis/test_entries_{SET}.txt"
# Read the whole file as UTF-8 and split it on newlines, one element per line.
lines = Path(PATH).read_text(encoding='utf8').split('\n')
len(lines)

## Use model on entries

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline

LIMIT = 100

# Repository identifier shared by the tokenizer and the model weights.
checkpoint = 'nlpso/' + MODEL
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # tokenizer of the selected model
model = AutoModelForTokenClassification.from_pretrained(checkpoint)  # selected model

# Entity classification; with aggregation_strategy=None the pipeline returns the
# raw predictions without grouping them.
nlp = pipeline(
    'ner',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy=None,
    use_fast=True,
)

In [None]:
from xmlize_util import get_NER_tags, xmlize_multilevel
if FORMAT == "IOB2":
    from camembert_utils.util_IOB2 import word_tokens_from_nested_xml_iob2
else:
    from camembert_utils.util_IO import word_tokens_from_nested_xml

# Run the model on every entry and pair its predictions with the gold example at
# the same position in `test`.
#   stats        : one row per entry whose prediction lengths match the gold lengths
#   errors       : values of k (entry counter) for the entries that were rejected
#   errors_count : number of rejected entries
stats = []
k = 0
errors_count = 0
errors = []
for i, entry in enumerate(lines):
    print(entry)
    res = nlp(entry)
    preds_tokens = [e["word"] for e in res]
    preds_tags = [e['entity'] for e in res]
    levels, num_levels = get_NER_tags(entry, res, FORMAT)
    xml = xmlize_multilevel(levels, num_levels)

    gold = test[i]
    # Explicit length check instead of `assert` inside a bare `except`: only a real
    # gold/prediction mismatch is counted as an error, any other failure surfaces.
    lengths_match = (
        len(gold["tokens"]) == len(preds_tokens)
        and len(gold["ner_tags"]) == len(preds_tags)
    )
    if lengths_match:
        stats.append([k, entry, xml, gold["tokens"], gold["ner_tags"], preds_tokens, preds_tags])
    else:
        errors_count += 1
        errors.append(k)
        print(gold["tokens"])
        print(preds_tokens)
        print("NUM " + str(k))
        print(entry[:-1])
    k += 1

In [None]:
import pandas as pd
# Column names, in the same order as the values of each row appended to `stats`.
columns = ["index","entry","entry_xml","spans_gold","tags_gold","spans_preds","tags_preds"]
# One row per entry kept by the inference loop.
df = pd.DataFrame(stats,columns=columns)
df

## F1-Score ranking
F1-Score is calculated for each entry using the seqeval library:
* entities are rebuilt from the joint labels by seqeval
* the F1-score is calculated for each entry

In [None]:
from seqeval.metrics import f1_score

# Per-entry F1-score, one value per row of `df` and in the same order.
scores = []
count = 0  # number of entries for which seqeval could not compute a score
for i in range(len(df)):
    row = df.iloc[i]
    y_preds = row["tags_preds"]
    y_true = row["tags_gold"]
    try:
        # seqeval expects a list of sequences, hence the single-element lists.
        f1 = f1_score([y_true],[y_preds])
    except Exception:
        # Keep `scores` aligned with `df`: store NaN rather than skipping the row,
        # otherwise `df["f1"] = scores` fails in the next cell.
        count += 1
        print(row)
        f1 = float('nan')
    scores.append(f1)
print(count)

In [None]:
# Attach the per-entry F1-scores and sort from worst to best.
# `scores` is in extraction order, so the rows are first put back in that order
# using the "index" column (a no-op on the first run). This makes the cell safe to
# re-run: without it, a second run would pair scores with already-sorted rows.
# reset_index(drop=True) renumbers the rows without creating a "level_0" column.
df = (
    df.sort_values(by="index")
      .assign(f1=scores)
      .sort_values(by=['f1'])
      .reset_index(drop=True)
)

In [None]:
# Display the table with its per-entry F1-score column.
df

In [None]:
import pylab as pl

# Histogram (40 bins) of the per-entry F1-scores.
df.hist(column='f1',bins=40,sharey=True, sharex=True)
# Title typo fixed ("Entery" -> "Entry").
pl.suptitle('Entry-scale F1-Score distribution over test set')

### 15-Top worst

In [None]:
# `df` is sorted by ascending F1, so its first rows are the worst-scored entries.
# Show 15 of them, matching the section title, or fewer if `df` is shorter.
for i in range(min(15, len(df))):
    row = df.iloc[i]
    print(f"INDEX {i}")
    print(row["entry"])
    print(row["entry_xml"])
    print(f"F1-Score : {row['f1']}")
    print("")

### 15-Top best

In [None]:
# Same table sorted by descending F1: the first rows are the best-scored entries.
rdf = df.sort_values(by='f1', ascending=False)
# Show 15 of them, matching the section title, or fewer if `rdf` is shorter.
for i in range(min(15, len(rdf))):
    row = rdf.iloc[i]
    print(row["entry"])
    print(row["entry_xml"])
    print(f"F1-Score : {row['f1']}")
    print("")

## Sub-word global analysis

In [None]:
# Flatten the per-entry tag sequences into two parallel token-level series.
flat_preds = pd.Series(
    [tag for entry_tags in df["tags_preds"] for tag in entry_tags],
    name="Predictions",
)
flat_labels = pd.Series(
    [tag for entry_tags in df["tags_gold"] for tag in entry_tags],
    name="Gold",
)

# Gold x Predictions contingency table, normalised per gold class and scaled to percentages.
global_confusion = pd.crosstab(flat_labels, flat_preds,normalize='index').multiply(100., axis=1)

# Display names applied to both axes.
# NOTE(review): this assumes the crosstab has exactly these 11 classes, in this
# order, on both axes — confirm against the actual tag set.
axis_labels = ["ACT+O","DESC+O","DESC+ACT","DESC+TITREP","PER+O","PER+TITREH","SPAT+O","SPAT+CARDINAL","SPAT+FT","SPAT+LOC","O+O"]
global_confusion.columns = list(axis_labels)
global_confusion.index = list(axis_labels)
global_confusion

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Init figure
plt.figure(figsize=(34, 16))

#Create heatmap of the global confusion matrix
snsfig = sns.heatmap(
    global_confusion,
    annot = True, #Display the cell values
    cmap=sns.color_palette("rocket_r", as_cmap=True), #Color map
    fmt=".1f",
    cbar=False,
    annot_kws={"fontsize":40}
)

#Rename axis labels and set their style
plt.xlabel('Predictions',weight = 'bold',fontsize=35) # x-axis label, bold, fontsize 35
plt.ylabel('Gold',weight = 'bold',fontsize=35) # y-axis label, bold, fontsize 35

#Set x labels position to top
snsfig.xaxis.tick_top()
snsfig.xaxis.set_label_position('top')

#Keep y tick labels horizontal
plt.yticks(rotation=0)

#Change ticks size
snsfig.set_xticklabels(snsfig.get_xmajorticklabels(), fontsize = 32)
snsfig.set_yticklabels(snsfig.get_ymajorticklabels(), fontsize = 32)

print("Confusion matrix of reference and predicted tokens types.")
print("Values are normalized by row (percentage of each reference classe and its resultants predictions)")
# NOTE(review): the matrix built above has no extra "last row" — confirm this message.
print("Last row represent percentage of each class in gold.")

#Apply the layout and save the figure BEFORE showing it: plt.show() may release
#the figure, and the layout should be the same on screen and in the saved file.
fig = snsfig.get_figure()
fig.tight_layout()
fig.savefig(f"./tokenscaleanalysis-{FORMAT}-{SET}.pdf")

plt.show()

## Entry scale analysis

Choose one entry by its index (its position in the F1-sorted table) to perform an entry-scale analysis:

In [None]:
# 0-based position in `df` of the entry to inspect; the cells below read it via df.iloc[i].
i = 15

In [None]:
# Show the raw text of the selected entry, then its XML rendering.
for field in ("entry", "entry_xml"):
    print(df.iloc[i][field])

In [None]:
from seqeval.metrics import classification_report

# Gold and predicted tag sequences of the selected entry.
y_true = df.iloc[i]["tags_gold"]
y_preds = df.iloc[i]["tags_preds"]
# seqeval expects a list of sequences, hence the single-element lists.
report = classification_report([y_true], [y_preds])
print(report)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Tag sequences of the selected entry as named series, so the crosstab axes
# are labelled "Gold" and "Predictions".
selected = df.iloc[i]
y_true = pd.Series(selected["tags_gold"], name='Gold')
y_pred = pd.Series(selected["tags_preds"], name='Predictions')

# Raw counts (not normalised) of gold tag vs predicted tag for this entry.
entry_confusion = pd.crosstab(y_true, y_pred)
entry_confusion

In [None]:
#Create heatmap of the confusion matrix of the selected entry
snsfig = sns.heatmap(
    entry_confusion, 
    annot = True, #Display the cell values
    cmap=sns.color_palette("rocket_r", as_cmap=True), #Color map
    fmt="g",
    cbar=False,
    annot_kws={"fontsize":30}
)

#Rename axis labels and set their style
plt.xlabel('Predictions',weight = 'bold',fontsize=25) # x-axis label, bold, fontsize 25
plt.ylabel('Gold',weight = 'bold',fontsize=25) # y-axis label, bold, fontsize 25

#Set x labels position to top
snsfig.xaxis.tick_top()
snsfig.xaxis.set_label_position('top')

#Keep y tick labels horizontal
plt.yticks(rotation=0) 

#Change ticks size
snsfig.set_xticklabels(snsfig.get_xmajorticklabels(), fontsize = 20)
snsfig.set_yticklabels(snsfig.get_ymajorticklabels(), fontsize = 20)

print("Confusion matrix of reference and predicted tokens types.")

plt.show()

#Get the figure that holds the heatmap
fig = snsfig.get_figure()