# Pràctica 3

In [None]:
%pip install python-crfsuite

In [2]:
import nltk
import pycrfsuite
from nltk.corpus import conll2002
from nltk.tag import CRFTagger
from sklearn.metrics import accuracy_score

In [3]:
nltk.download('punkt') # Tokenitzador
nltk.download('averaged_perceptron_tagger') # Etiquetador POS
nltk.download('maxent_ne_chunker') # Etiquetador Entitats Anomenades
nltk.download('words')
nltk.download('treebank')
nltk.download('conll2002')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package conll2002 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2002 is already up-to-d

True

In [4]:
train_esp = conll2002.iob_sents('esp.train') # Train, ned.train => Neerlandès
testa_esp = conll2002.iob_sents('esp.testa') # Dev
testb_esp = conll2002.iob_sents('esp.testb') # Test

In [40]:
train_ned = conll2002.iob_sents('ned.train') # Train, ned.train => Neerlandès
testa_ned = conll2002.iob_sents('ned.testa') # Dev
testb_ned = conll2002.iob_sents('ned.testb') # Test

## Predicció amb BIO

### Espanyol

In [6]:
model_tagger = CRFTagger()

In [7]:
# Entrenem el model per predir els POS que corresponen a cada token

train_esp_pos_tag = []
for sentence in train_esp:
    frases = []
    for elem1, elem2, elem3 in sentence:
        frases.append((elem1, elem2))
    train_esp_pos_tag.append(frases)
    
model_tagger.train(train_esp_pos_tag, 'model_POS.crf.tagger')


# Fem prediccions i mirem l'accuracy

testa_esp_pre_tag = []
for sentence in testa_esp:
    frases = []
    for elem1, elem2, elem3 in sentence:
        frases.append(elem1)
    testa_esp_pre_tag.append(frases)
    
predicted = model_tagger.tag_sents(testa_esp_pre_tag)

predictions = [elem[1] for sentence in predicted for elem in sentence]
real_label = [elem[1] for sentence in testa_esp for elem in sentence]

print(accuracy_score(predictions, real_label))

0.9447121289420479


<span> Features que es tenen en compte:
<ul>
    <li>Paraula actual</li>
    <li>Si comença en majúscula</li>
    <li>Si té signe de puntuació</li>
    <li>Si té números</li>
    <li>Prefixos fins a longitud 3</li>
    <li>Sufixos fins a longitud 3</li>
    <li>Paraules prèvies i posteriors amb POS</li>
    <li>POS-tags</li>
    <li>Longitud de la paraula</li>
</ul>
</span>

In [8]:
from nltk.tag import CRFTagger
import unicodedata
import re

class FeatureExtractor:
    def __init__(self, pattern):
        self._pattern = pattern

    def _get_features(self, tokens, idx):
        """
        Extract basic features about this word including
            - Current word
            - is it capitalized?
            - Does it have punctuation?
            - Does it have a number?
            - Preffixes up to length 3
            - Suffixes up to length 3
            - paraules prèvies i posteriors amb POS
            - POS-tags
            - longitud

        Note that : we might include feature over previous word, next word etc.

        :return: a list which contains the features
        :rtype: list(str)
        """
        token = tokens[idx]

        feature_list = []

        if not token:
            return feature_list

        # Capitalization
        if token[0].isupper():
            feature_list.append("CAPITALIZATION")

        # Number
        if re.search(self._pattern, token) is not None:
            feature_list.append("HAS_NUM")

        # Punctuation
        punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
        if all(unicodedata.category(x) in punc_cat for x in token):
            feature_list.append("PUNCTUATION")
            
        # preffix up to length 3
        if len(token) > 1:
            feature_list.append("PRE_" + token[:1])
        if len(token) > 2:
            feature_list.append("PRE_" + token[:2])
        if len(token) > 3:
            feature_list.append("PRE_" + token[:3])

        # Suffix up to length 3
        if len(token) > 1:
            feature_list.append("SUF_" + token[-1:])
        if len(token) > 2:
            feature_list.append("SUF_" + token[-2:])
        if len(token) > 3:
            feature_list.append("SUF_" + token[-3:])
        
        # POS_tags
        POS = model_tagger.tag(tokens)
            
        # Paraules prèvies amb POS
        if idx > 0:
            feature_list.append("anterior1_" + tokens[idx-1] + "_" + POS[idx-1][1])
        if idx > 1:
            feature_list.append("anterior2_" + tokens[idx-2] + "_" + POS[idx-2][1])
            
        # Paraules posteriors amb POS
        if idx < (len(tokens)-1):
            feature_list.append("posterior1_" + tokens[idx+1] + "_" + POS[idx+1][1])
        if idx < (len(tokens)-2):
            feature_list.append("posterior2_" + tokens[idx+2] + "_" + POS[idx+2][1])

        feature_list.append("WORD_" + token)

        return feature_list

# Crear una instancia de FeatureExtractor
pattern = r'\d+'  # Patrón para encontrar números
feature_extractor = FeatureExtractor(pattern)

train_esp_BIO_tag = []
for sentence in train_esp:
    frases = []
    for elem1, elem2, elem3 in sentence:
        frases.append((elem1, elem3))
    train_esp_BIO_tag.append(frases)
    
model_BIO = CRFTagger(feature_func=feature_extractor._get_features)
model_BIO.train(train_esp_BIO_tag, 'model_BIO.crf.tagger')

#### Utilitzem aquest accuracy per ajustar els features

In [9]:
testa_esp_BIO_tag = []
for sentence in testa_esp:
    frases = []
    for elem1, elem2, elem3 in sentence:
        frases.append((elem1, elem3))
    testa_esp_BIO_tag.append(frases)

predicted_BIO = model_BIO.tag_sents(testa_esp_pre_tag)

predictions_BIO = [elem[1] for sentence in predicted_BIO for elem in sentence]
real_label_BIO = [elem[1] for sentence in testa_esp_BIO_tag for elem in sentence]

print(accuracy_score(predictions_BIO, real_label_BIO))

0.9580711599871511


#### Reconeixem entitats i avaluem la detecció d'entitats

In [10]:
entitats_reals_testa_esp = []

for sentence in testa_esp_BIO_tag:
    ent = []
    name = None
    prev_tag = None  # Almacenar la etiqueta del token anterior
    
    for token in sentence:
        word, tag = token
        
        if tag.startswith('B-'):
            # Si hay una entidad anterior, la agregamos a la lista de entidades
            if ent:
                entitats_reals_testa_esp.append((tuple(ent), name))
            # Creamos una nueva entidad con la palabra actual
            ent = [word]
            # Obtenemos el tipo de entidad
            name = tag.split('-')[1]
            prev_tag = tag  # Actualizamos la etiqueta del token anterior
        elif tag.startswith('I-'):
            # Solo agregamos la palabra actual si el token anterior tiene etiqueta I- o B-
            if prev_tag:
                ent.append(word)
                prev_tag = tag  # Actualizamos la etiqueta del token anterior
        elif tag == 'O' and ent:
            # Si encontramos una etiqueta 'O' y hay una entidad en curso, la agregamos a la lista de entidades
            entitats_reals_testa_esp.append((tuple(ent), name))
            # Reiniciamos la lista de la entidad actual
            ent = []
            prev_tag = None  # Reiniciamos la etiqueta del token anterior

    # Agregamos la última entidad si la hay
    if ent:
        entitats_reals_testa_esp.append((tuple(ent), name))     

In [11]:
predicted_BIO = model_BIO.tag_sents(testa_esp_pre_tag)

entitats_predites_testa_esp = []

for sentence in predicted_BIO:
    ent = []
    name = None
    prev_tag = None  # Almacenar la etiqueta del token anterior
    
    for token in sentence:
        word, tag = token
        
        if tag.startswith('B-'):
            # Si hay una entidad anterior, la agregamos a la lista de entidades
            if ent:
                entitats_predites_testa_esp.append((tuple(ent), name))
            # Creamos una nueva entidad con la palabra actual
            ent = [word]
            # Obtenemos el tipo de entidad
            name = tag.split('-')[1]
            prev_tag = tag  # Actualizamos la etiqueta del token anterior
        elif tag.startswith('I-'):
            # Solo agregamos la palabra actual si el token anterior tiene etiqueta I- o B-
            if prev_tag:
                ent.append(word)
                prev_tag = tag  # Actualizamos la etiqueta del token anterior
        elif tag == 'O' and ent:
            # Si encontramos una etiqueta 'O' y hay una entidad en curso, la agregamos a la lista de entidades
            entitats_predites_testa_esp.append((tuple(ent), name))
            # Reiniciamos la lista de la entidad actual
            ent = []
            prev_tag = None  # Reiniciamos la etiqueta del token anterior

    # Agregamos la última entidad si la hay
    if ent:
        entitats_predites_testa_esp.append((tuple(ent), name))

In [12]:
### AVALUEM ###
encerts = 0
for entitat in entitats_predites_testa_esp:
    if entitat in entitats_reals_testa_esp:
        encerts += 1

total_entitats_reals = len(entitats_reals_testa_esp)
total_entitats_predites = len(entitats_predites_testa_esp)

recall = encerts / total_entitats_reals

if total_entitats_predites != 0:
    precision = encerts / total_entitats_predites
    f_score = (2 * precision * recall) / (precision + recall)
else:
    precision = 0
    f_score = 0

print("Recall:", recall)
print("Precision:", precision)
print("F-score:", f_score)


Recall: 0.7747644219719605
Precision: 0.8156302927655457
F-score: 0.7946723243752947


### Neerlandès

In [13]:
model_tagger = CRFTagger()

In [14]:
# Entrenem el model per predir els POS que corresponen a cada token

train_ned_pos_tag = []
for sentence in train_ned:
    frases = []
    for elem1, elem2, elem3 in sentence:
        frases.append((elem1, elem2))
    train_ned_pos_tag.append(frases)
    
model_tagger.train(train_ned_pos_tag, 'model_POS.crf.tagger')


# Fem prediccions i mirem l'accuracy

testa_ned_pre_tag = []
for sentence in testa_ned:
    frases = []
    for elem1, elem2, elem3 in sentence:
        frases.append(elem1)
    testa_ned_pre_tag.append(frases)
    
predicted = model_tagger.tag_sents(testa_ned_pre_tag)

predictions = [elem[1] for sentence in predicted for elem in sentence]
real_label = [elem[1] for sentence in testa_ned for elem in sentence]

print(accuracy_score(predictions, real_label))

0.940191577997718


In [15]:
from nltk.tag import CRFTagger
import unicodedata
import re

class FeatureExtractor:
    def __init__(self, pattern):
        self._pattern = pattern

    def _get_features(self, tokens, idx):
        """
        Extract basic features about this word including
            - Current word
            - is it capitalized?
            - Does it have punctuation?
            - Does it have a number?
            - Preffixes up to length 3
            - Suffixes up to length 3
            - paraules prèvies i posteriors amb POS
            - POS-tags
            - longitud

        Note that : we might include feature over previous word, next word etc.

        :return: a list which contains the features
        :rtype: list(str)
        """
        token = tokens[idx]

        feature_list = []

        if not token:
            return feature_list

        # Capitalization
        if token[0].isupper():
            feature_list.append("CAPITALIZATION")

        # Number
        if re.search(self._pattern, token) is not None:
            feature_list.append("HAS_NUM")

        # Punctuation
        punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
        if all(unicodedata.category(x) in punc_cat for x in token):
            feature_list.append("PUNCTUATION")
            
        # preffix up to length 3
        if len(token) > 1:
            feature_list.append("PRE_" + token[:1])
        if len(token) > 2:
            feature_list.append("PRE_" + token[:2])
        if len(token) > 3:
            feature_list.append("PRE_" + token[:3])

        # Suffix up to length 3
        if len(token) > 1:
            feature_list.append("SUF_" + token[-1:])
        if len(token) > 2:
            feature_list.append("SUF_" + token[-2:])
        if len(token) > 3:
            feature_list.append("SUF_" + token[-3:])
        
        # POS_tags
        POS = model_tagger.tag(tokens)
            
        # Paraules prèvies amb POS
        if idx > 0:
            feature_list.append("anterior1_" + tokens[idx-1] + "_" + POS[idx-1][1])
        if idx > 1:
            feature_list.append("anterior2_" + tokens[idx-2] + "_" + POS[idx-2][1])
            
        # Paraules posteriors amb POS
        if idx < (len(tokens)-1):
            feature_list.append("posterior1_" + tokens[idx+1] + "_" + POS[idx+1][1])
        if idx < (len(tokens)-2):
            feature_list.append("posterior2_" + tokens[idx+2] + "_" + POS[idx+2][1])

        feature_list.append("WORD_" + token)

        return feature_list

# Crear una instancia de FeatureExtractor
pattern = r'\d+'  # Patrón para encontrar números
feature_extractor = FeatureExtractor(pattern)

train_ned_BIO_tag = []
for sentence in train_ned:
    frases = []
    for elem1, elem2, elem3 in sentence:
        frases.append((elem1, elem3))
    train_ned_BIO_tag.append(frases)
    
model_BIO = CRFTagger(feature_func=feature_extractor._get_features)
model_BIO.train(train_ned_BIO_tag, 'model_BIO.crf.tagger')

In [16]:
testa_ned_BIO_tag = []
for sentence in testa_ned:
    frases = []
    for elem1, elem2, elem3 in sentence:
        frases.append((elem1, elem3))
    testa_ned_BIO_tag.append(frases)

predicted_BIO = model_BIO.tag_sents(testa_ned_pre_tag)

predictions_BIO = [elem[1] for sentence in predicted_BIO for elem in sentence]
real_label_BIO = [elem[1] for sentence in testa_ned_BIO_tag for elem in sentence]

print(accuracy_score(predictions_BIO, real_label_BIO))

0.9683975906811367


In [17]:
entitats_reals_testa_ned = []

for sentence in testa_ned_BIO_tag:
    ent = []
    name = None
    prev_tag = None  # Almacenar la etiqueta del token anterior
    
    for token in sentence:
        word, tag = token
        
        if tag.startswith('B-'):
            # Si hay una entidad anterior, la agregamos a la lista de entidades
            if ent:
                entitats_reals_testa_ned.append((tuple(ent), name))
            # Creamos una nueva entidad con la palabra actual
            ent = [word]
            # Obtenemos el tipo de entidad
            name = tag.split('-')[1]
            prev_tag = tag  # Actualizamos la etiqueta del token anterior
        elif tag.startswith('I-'):
            # Solo agregamos la palabra actual si el token anterior tiene etiqueta I- o B-
            if prev_tag:
                ent.append(word)
                prev_tag = tag  # Actualizamos la etiqueta del token anterior
        elif tag == 'O' and ent:
            # Si encontramos una etiqueta 'O' y hay una entidad en curso, la agregamos a la lista de entidades
            entitats_reals_testa_ned.append((tuple(ent), name))
            # Reiniciamos la lista de la entidad actual
            ent = []
            prev_tag = None  # Reiniciamos la etiqueta del token anterior

    # Agregamos la última entidad si la hay
    if ent:
        entitats_reals_testa_ned.append((tuple(ent), name))     

In [18]:
predicted_BIO = model_BIO.tag_sents(testa_ned_pre_tag)

entitats_predites_testa_ned = []

for sentence in predicted_BIO:
    ent = []
    name = None
    prev_tag = None  # Almacenar la etiqueta del token anterior
    
    for token in sentence:
        word, tag = token
        
        if tag.startswith('B-'):
            # Si hay una entidad anterior, la agregamos a la lista de entidades
            if ent:
                entitats_predites_testa_ned.append((tuple(ent), name))
            # Creamos una nueva entidad con la palabra actual
            ent = [word]
            # Obtenemos el tipo de entidad
            name = tag.split('-')[1]
            prev_tag = tag  # Actualizamos la etiqueta del token anterior
        elif tag.startswith('I-'):
            # Solo agregamos la palabra actual si el token anterior tiene etiqueta I- o B-
            if prev_tag:
                ent.append(word)
                prev_tag = tag  # Actualizamos la etiqueta del token anterior
        elif tag == 'O' and ent:
            # Si encontramos una etiqueta 'O' y hay una entidad en curso, la agregamos a la lista de entidades
            entitats_predites_testa_ned.append((tuple(ent), name))
            # Reiniciamos la lista de la entidad actual
            ent = []
            prev_tag = None  # Reiniciamos la etiqueta del token anterior

    # Agregamos la última entidad si la hay
    if ent:
        entitats_predites_testa_ned.append((tuple(ent), name))

In [19]:
### AVALUEM ###
encerts = 0
for entitat in entitats_predites_testa_ned:
    if entitat in entitats_reals_testa_ned:
        encerts += 1

total_entitats_reals = len(entitats_reals_testa_ned)
total_entitats_predites = len(entitats_predites_testa_ned)

recall = encerts / total_entitats_reals

if total_entitats_predites != 0:
    precision = encerts / total_entitats_predites
    f_score = (2 * precision * recall) / (precision + recall)
else:
    precision = 0
    f_score = 0

print("Recall:", recall)
print("Precision:", precision)
print("F-score:", f_score)


Recall: 0.6788990825688074
Precision: 0.7477894736842106
F-score: 0.7116810258465238


## Predicció amb IO

Espanyol

In [20]:
def convert_to_io(train_data_bio):
    train_data_io = []
    for sentence in train_data_bio:
        io_tags = []
        for word, pos_tag, bio_tag in sentence:
            if bio_tag == 'O':
                io_tags.append('O')
            elif bio_tag.startswith('B-'):
                io_tags.append('I' + bio_tag[1:])
            else:
                io_tags.append(bio_tag)
                
        train_data_io.append(list(zip([word for word, pos_tag, bio_tag in sentence], io_tags)))
    return train_data_io

train_esp_pre_IO = convert_to_io(train_esp)

# Entrenar el modelo CRFTagger con el esquema IO
model_IO = CRFTagger(feature_func=feature_extractor._get_features)
model_IO.train(train_esp_pre_IO, 'model_io.crf.tagger')

In [21]:
testa_pre_IO = convert_to_io(testa_esp)

predicted_IO = model_IO.tag_sents(testa_esp_pre_tag)

predictions_IO = [elem[1] for sentence in predicted_IO for elem in sentence]
real_label_IO = [elem[1] for sentence in testa_pre_IO for elem in sentence]

print(accuracy_score(predictions_IO, real_label_IO))

0.954953422897417


IO NED

In [22]:
def convert_to_io(train_data_bio):
    train_data_io = []
    for sentence in train_data_bio:
        io_tags = []
        for word, pos_tag, bio_tag in sentence:
            if bio_tag == 'O':
                io_tags.append('O')
            elif bio_tag.startswith('B-'):
                io_tags.append('I' + bio_tag[1:])
            else:
                io_tags.append(bio_tag)
                
        train_data_io.append(list(zip([word for word, pos_tag, bio_tag in sentence], io_tags)))
    return train_data_io

train_ned_pre_IO = convert_to_io(train_ned)

# Entrenar el modelo CRFTagger con el esquema IO
model_IO = CRFTagger(feature_func=feature_extractor._get_features)
model_IO.train(train_ned_pre_IO, 'model_io.crf.tagger')

In [23]:
testa_pre_IO = convert_to_io(testa_ned)

predicted_IO = model_IO.tag_sents(testa_ned_pre_tag)

predictions_IO = [elem[1] for sentence in predicted_IO for elem in sentence]
real_label_IO = [elem[1] for sentence in testa_pre_IO for elem in sentence]

print(accuracy_score(predictions_IO, real_label_IO))

0.969644705070714


BIOW

esp

In [53]:
def convert_to_biow(train_data_bio):
    train_data_biow = []
    for sentence in train_data_bio:
        biow_tags = []
        for word, pos_tag, bio_tag in sentence:
            if bio_tag == 'O':
                biow_tags.append('O')
            else:
                biow_tags.append(bio_tag + 'W')  # Añadir 'W' a todas las etiquetas
            
        train_data_biow.append(list(zip([word for word, pos_tag, bio_tag in sentence], biow_tags)))
    return train_data_biow


train_esp_pre_BIOW = convert_to_biow(train_esp)

# Entrenar el modelo CRFTagger con el esquema IO
model_BIOW = CRFTagger(feature_func=feature_extractor._get_features)
model_BIOW.train(train_esp_pre_BIOW, 'model_biow.crf.tagger')

In [54]:
testa_pre_BIOW = convert_to_biow(testa_esp)

predicted_BIOW = model_BIOW.tag_sents(testa_esp_pre_tag)

predictions_BIOW = [elem[1] for sentence in predicted_BIOW for elem in sentence]
real_label_BIOW = [elem[1] for sentence in testa_pre_BIOW for elem in sentence]

print(accuracy_score(predictions_BIOW, real_label_BIOW))

0.9576176709559171


ned

In [45]:
def convert_to_biow(train_data_bio):
    train_data_biow = []
    for sentence in train_data_bio:
        biow_tags = []
        for word, pos_tag, bio_tag in sentence:
            if bio_tag == 'O':
                biow_tags.append('O')
            else:
                biow_tags.append(bio_tag + 'W')  # Añadir 'W' a todas las etiquetas
            
        train_data_biow.append(list(zip([word for word, pos_tag, bio_tag in sentence], biow_tags)))
    return train_data_biow


train_ned_pre_BIOW = convert_to_biow(train_ned)

# Entrenar el modelo CRFTagger con el esquema IO
model_BIOW = CRFTagger(feature_func=feature_extractor._get_features)
model_BIOW.train(train_ned_pre_BIOW, 'model_biow.crf.tagger')

In [52]:
testa_pre_BIOW = convert_to_biow(testa_ned)

predicted_BIOW = model_BIOW.tag_sents(testa_ned_pre_tag)

predictions_BIOW = [elem[1] for sentence in predicted_BIOW for elem in sentence]
real_label_BIOW = [elem[1] for sentence in testa_pre_BIOW for elem in sentence]

print(accuracy_score(predictions_BIOW, real_label_BIOW))

0.9683975906811367


BIOES

esp

In [73]:
def convert_to_bioes(train_data_bio):
    train_data_bioes = []
    for sentence in train_data_bio:
        bioes_tags = []
        for i, (word, pos_tag, bio_tag) in enumerate(sentence):
            if bio_tag == 'O':
                bioes_tags.append('O')
            elif bio_tag.startswith('B-'):
                if i == len(sentence) - 1 or sentence[i + 1][2] != 'I' + bio_tag[1:]:
                    bioes_tags.append('S' + bio_tag[1:])  # Single
                else:
                    bioes_tags.append('B' + bio_tag[1:])  # Begin
            elif bio_tag.startswith('I-'):
                if i == len(sentence) - 1 or sentence[i + 1][2] != 'I' + bio_tag[1:]:
                    bioes_tags.append('E' + bio_tag[1:])  # End
                else:
                    bioes_tags.append('I' + bio_tag[1:])  # Inside
            else:
                raise ValueError("Etiqueta BIO incorrecta: {}".format(bio_tag))
                
        train_data_bioes.append(list(zip([word for word, pos_tag, bio_tag in sentence], bioes_tags)))
    return train_data_bioes


train_esp_pre_BIOES = convert_to_bioes(train_esp)

# Entrenar el modelo CRFTagger con el esquema IO
model_BIOES = CRFTagger(feature_func=feature_extractor._get_features)
model_BIOES.train(train_esp_pre_BIOW, 'model_bioes.crf.tagger')

In [74]:
# Inicializamos contadores
total_entidades = 0
entidades_correctas = 0

test_data_bioes = convert_to_bioes(testa_esp)
predicted_bioes = model_BIOES.tag_sents([[word for word, _ in sentence] for sentence in test_data_bioes])
# Iteramos sobre cada muestra en el conjunto de datos de prueba
for true_sentence, predicted_sentence in zip(test_data_bioes, predicted_bioes):
    # Iteramos sobre cada palabra en la muestra
    for true_word, true_tag in true_sentence:
        # Buscamos la palabra correspondiente en las predicciones
        for predicted_word, predicted_tag in predicted_sentence:
            # Si la palabra coincide
            if true_word == predicted_word:
                # Incrementamos el contador total de entidades
                total_entidades += 1
                # Comparamos las etiquetas predichas y reales
                if true_tag == predicted_tag:
                    # Si las etiquetas coinciden, incrementamos el contador de entidades correctas
                    entidades_correctas += 1
                # Salimos del bucle interior si encontramos la palabra correspondiente
                break
    # Salimos del bucle exterior si ya hemos evaluado todas las muestras
    break

# Calculamos el porcentaje de entidades predichas correctamente
precision_entidades = (entidades_correctas / total_entidades) * 100 if total_entidades > 0 else 0

print("Total de entidades:", total_entidades)
print("Entidades predichas correctamente:", entidades_correctas)
print("Precisión de entidades:", precision_entidades, "%")


Total de entidades: 12
Entidades predichas correctamente: 8
Precisión de entidades: 66.66666666666666 %


In [76]:
# Inicializamos contadores
total_entidades = 0
entidades_correctas = 0
total_no_entidades = 0
no_entidades_correctas = 0

test_data_bioes = convert_to_bioes(testa_esp)
predicted_bioes = model_BIOES.tag_sents([[word for word, _ in sentence] for sentence in test_data_bioes])
# Iteramos sobre cada muestra en el conjunto de datos de prueba
for true_sentence, predicted_sentence in zip(test_data_bioes, predicted_bioes):
    # Iteramos sobre cada palabra en la muestra
    for true_word, true_tag in true_sentence:
        # Buscamos la palabra correspondiente en las predicciones
        for predicted_word, predicted_tag in predicted_sentence:
            # Si la palabra coincide
            if true_word == predicted_word:
                # Incrementamos el contador total de entidades o no-entidades
                if true_tag != 'O':
                    total_entidades += 1
                    # Comparamos las etiquetas predichas y reales
                    if true_tag == predicted_tag:
                        # Si las etiquetas coinciden, incrementamos el contador de entidades correctas
                        entidades_correctas += 1
                else:
                    total_no_entidades += 1
                    # Comparamos las etiquetas predichas y reales
                    if true_tag == predicted_tag:
                        # Si las etiquetas coinciden, incrementamos el contador de no-entidades correctas
                        no_entidades_correctas += 1
                # Salimos del bucle interior si encontramos la palabra correspondiente
                break
    # Salimos del bucle exterior si ya hemos evaluado todas las muestras
    break

# Calculamos la precisión de las entidades y no-entidades
precision_entidades = (entidades_correctas / total_entidades) * 100 if total_entidades > 0 else 0
precision_no_entidades = (no_entidades_correctas / total_no_entidades) * 100 if total_no_entidades > 0 else 0

print("Total de entidades:", total_entidades)
print("Entidades predichas correctamente:", entidades_correctas)
print("Precisión de entidades:", precision_entidades, "%")

print("Total de no-entidades:", total_no_entidades)
print("No-entidades predichas correctamente:", no_entidades_correctas)
print("Precisión de no-entidades:", precision_no_entidades, "%")


Total de entidades: 4
Entidades predichas correctamente: 0
Precisión de entidades: 0.0 %
Total de no-entidades: 8
No-entidades predichas correctamente: 8
Precisión de no-entidades: 100.0 %


In [75]:
# Suponiendo que tienes un conjunto de datos de prueba llamado test_data_bioes
# donde cada elemento es una lista de tuplas (palabra, etiqueta_BIOES)
# Similar a train_data_bio
test_data_bioes = convert_to_bioes(testa_esp)
# Obtener predicciones del modelo CRFTagger
predicted_bioes = model_BIOES.tag_sents([[word for word, _ in sentence] for sentence in test_data_bioes])

# Comparar las etiquetas predichas con las etiquetas reales
true_labels = [tag for sentence in test_data_bioes for _, tag in sentence]
predicted_labels = [tag for sentence in predicted_bioes for _, tag in sentence]

# Calcular métricas de evaluación
from sklearn.metrics import classification_report

print(classification_report(true_labels, predicted_labels))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00       227
      B-LOCW       0.00      0.00      0.00         0
      B-MISC       0.00      0.00      0.00       256
     B-MISCW       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00       609
      B-ORGW       0.00      0.00      0.00         0
       B-PER       0.00      0.00      0.00       673
      B-PERW       0.00      0.00      0.00         0
       E-LOC       0.00      0.00      0.00       228
      E-MISC       0.00      0.00      0.00       256
       E-ORG       0.00      0.00      0.00       609
       E-PER       0.00      0.00      0.00       673
       I-LOC       0.00      0.00      0.00       109
      I-LOCW       0.00      0.00      0.00         0
      I-MISC       0.00      0.00      0.00       398
     I-MISCW       0.00      0.00      0.00         0
       I-ORG       0.00      0.00      0.00       757
      I-ORGW       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
testa_pre_BIOES = convert_to_bioes(testa_esp)

predicted_BIOES = model_BIOES.tag_sents(testa_esp_pre_tag)

predictions_BIOES = [elem[1] for sentence in predicted_BIOES for elem in sentence]
real_label_BIOES = [elem[1] for sentence in testa_pre_BIOES for elem in sentence]

print(accuracy_score(predictions_BIOES, real_label_BIOES))

0.8509154809818038


ned

In [66]:
def convert_to_bioes(train_data_bio):
    train_data_bioes = []
    for sentence in train_data_bio:
        bioes_tags = []
        for i, (word, pos_tag, bio_tag) in enumerate(sentence):
            if bio_tag == 'O':
                bioes_tags.append('O')
            elif bio_tag.startswith('B-'):
                if i == len(sentence) - 1 or sentence[i + 1][2] != 'I' + bio_tag[1:]:
                    bioes_tags.append('S' + bio_tag[1:])  # Single
                else:
                    bioes_tags.append('B' + bio_tag[1:])  # Begin
            elif bio_tag.startswith('I-'):
                if i == len(sentence) - 1 or sentence[i + 1][2] != 'I' + bio_tag[1:]:
                    bioes_tags.append('E' + bio_tag[1:])  # End
                else:
                    bioes_tags.append('I' + bio_tag[1:])  # Inside
            else:
                raise ValueError("Etiqueta BIO incorrecta: {}".format(bio_tag))
                
        train_data_bioes.append(list(zip([word for word, pos_tag, bio_tag in sentence], bioes_tags)))
    return train_data_bioes


train_ned_pre_BIOES = convert_to_biow(train_ned)

# Entrenar el modelo CRFTagger con el esquema IO
model_BIOES = CRFTagger(feature_func=feature_extractor._get_features)
model_BIOES.train(train_ned_pre_BIOES, 'model_bioes.crf.tagger')

In [68]:
# Suponiendo que tienes un conjunto de datos de prueba llamado test_data_bioes
# donde cada elemento es una lista de tuplas (palabra, etiqueta_BIOES)
# Similar a train_data_bio
test_data_bioes = convert_to_bioes(testa_ned)
# Obtener predicciones del modelo CRFTagger
predicted_bioes = model_BIOES.tag_sents([[word for word, _ in sentence] for sentence in test_data_bioes])

# Comparar las etiquetas predichas con las etiquetas reales
true_labels = [tag for sentence in test_data_bioes for _, tag in sentence]
predicted_labels = [tag for sentence in predicted_bioes for _, tag in sentence]

# Calcular métricas de evaluación
from sklearn.metrics import classification_report

print(classification_report(true_labels, predicted_labels))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-LOC       0.00      0.00      0.00        50
      B-LOCW       0.00      0.00      0.00         0
      B-MISC       0.00      0.00      0.00       139
     B-MISCW       0.00      0.00      0.00         0
       B-ORG       0.00      0.00      0.00       278
      B-ORGW       0.00      0.00      0.00         0
       B-PER       0.00      0.00      0.00       380
      B-PERW       0.00      0.00      0.00         0
       E-LOC       0.00      0.00      0.00        50
      E-MISC       0.00      0.00      0.00       139
       E-ORG       0.00      0.00      0.00       278
       E-PER       0.00      0.00      0.00       380
       I-LOC       0.00      0.00      0.00        14
      I-LOCW       0.00      0.00      0.00         0
      I-MISC       0.00      0.00      0.00        76
     I-MISCW       0.00      0.00      0.00         0
       I-ORG       0.00      0.00      0.00       118
      I-ORGW       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [72]:
# Suponiendo que tienes las predicciones del modelo CRFTagger en predicted_bioes y las etiquetas reales en test_data_bioes
for i, sentence in enumerate(test_data_bioes):
    predicted_tags = predicted_bioes[i]
    true_tags = [tag for _, tag in sentence]  # Extraer solo las etiquetas reales
    print("Sentence:", " ".join(word for word, _ in sentence))
    print("True Tags:", true_tags)
    print("Predicted Tags:", predicted_tags)
    print()


Sentence: Dat is verder opgelaaid door windsnelheden die oplopen tot 35 kilometer per uur .
True Tags: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Tags: [('Dat', 'O'), ('is', 'O'), ('verder', 'O'), ('opgelaaid', 'O'), ('door', 'O'), ('windsnelheden', 'O'), ('die', 'O'), ('oplopen', 'O'), ('tot', 'O'), ('35', 'O'), ('kilometer', 'O'), ('per', 'O'), ('uur', 'O'), ('.', 'O')]

Sentence: Bomaanslag op Indiase trein : twaalf doden
True Tags: ['O', 'O', 'S-MISC', 'O', 'O', 'O', 'O']
Predicted Tags: [('Bomaanslag', 'O'), ('op', 'O'), ('Indiase', 'B-MISCW'), ('trein', 'O'), (':', 'O'), ('twaalf', 'O'), ('doden', 'O')]

Sentence: Ook in Californië , in Sierra Nevada , woeden al een week lang hevige bosbranden .
True Tags: ['O', 'O', 'S-LOC', 'O', 'O', 'B-LOC', 'E-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Predicted Tags: [('Ook', 'O'), ('in', 'O'), ('Californië', 'B-LOCW'), (',', 'O'), ('in', 'O'), ('Sierra', 'B-LOCW'), ('Nevada', 'I-LOCW'), (',', 'O

In [51]:
testa_pre_BIOES = convert_to_bioes(testa_ned)

predicted_BIOES = model_BIOES.tag_sents(testa_ned_pre_tag)

predictions_BIOES = [elem[1] for sentence in predicted_BIOES for elem in sentence]
real_label_BIOES = [elem[1] for sentence in testa_pre_BIOES for elem in sentence]

print(accuracy_score(predictions_BIOES, real_label_BIOES))

0.8997266962082415


In [73]:
model_BIO.tag(nltk.word_tokenize("Mark Pedersen treballa a Google des del 1994."))

[('Mark', 'B-PER'),
 ('Pedersen', 'I-PER'),
 ('treballa', 'O'),
 ('a', 'O'),
 ('Google', 'B-LOC'),
 ('des', 'O'),
 ('del', 'O'),
 ('1994', 'O'),
 ('.', 'O')]

In [74]:
model_IO.tag(nltk.word_tokenize("Mark Pedersen treballa a Google des del 1994."))

[('Mark', 'I-PER'),
 ('Pedersen', 'I-PER'),
 ('treballa', 'O'),
 ('a', 'O'),
 ('Google', 'I-PER'),
 ('des', 'O'),
 ('del', 'O'),
 ('1994', 'O'),
 ('.', 'O')]

## Avaluació

In [32]:
# Avaluació mal feta, contant només quants tokens són correctes, i no les entitats correctes.
model.accuracy(testa_esp_pre)

0.9459214330253387

In [None]:
# Avaluació ben feta:


Hem d'avaluar quantes entitats estan reconegudes correctament, no quants tokens son correctes.
Descodificar la sequencia i obtenir les entitats, i doncs avaluar les entitats.
Per exemple, 'Mark Pedersen Romero' --> 'M P R' (una entitat) per BIO; 'M' i 'P R' (dos entitats) per IO; en aquest exemple IO ho fa malament.

A nivell d'entitats: Recall i f-score

Per avaluar el model avaluem en base a recall i precisio parcial.

## Exemple d'ús CRFTagger

In [9]:
import unicodedata
import re

class FeatureExtractor:
    def __init__(self, pattern):
        self._pattern = pattern

    def _get_features(self, tokens, idx):
        """
        Extract basic features about this word including
            - Current word
            - is it capitalized?
            - Does it have punctuation?
            - Does it have a number?
            - Preffixes up to length 3
            - Suffixes up to length 3
            - paraules prèvies i posteriors amb POS
            - POS-tags
            - longitud

        Note that : we might include feature over previous word, next word etc.

        :return: a list which contains the features
        :rtype: list(str)
        """
        token = tokens[idx]

        feature_list = []

        if not token:
            return feature_list

        # Capitalization
        if token[0].isupper():
            feature_list.append("CAPITALIZATION")

        # Number
        if re.search(self._pattern, token) is not None:
            feature_list.append("HAS_NUM")

        # Punctuation
        punc_cat = {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
        if all(unicodedata.category(x) in punc_cat for x in token):
            feature_list.append("PUNCTUATION")
            
        # preffix up to length 3
        if len(token) > 1:
            feature_list.append("PRE_" + token[:1])
        if len(token) > 2:
            feature_list.append("PRE_" + token[:2])
        if len(token) > 3:
            feature_list.append("PRE_" + token[:3])

        # Suffix up to length 3
        if len(token) > 1:
            feature_list.append("SUF_" + token[-1:])
        if len(token) > 2:
            feature_list.append("SUF_" + token[-2:])
        if len(token) > 3:
            feature_list.append("SUF_" + token[-3:])
        
        # POS_tags
        POS = model_tagger.tag(tokens)
            
        # Paraules prèvies amb POS
        if idx > 0:
            feature_list.append("anterior1_" + tokens[idx-1] + "_" + POS[idx-1][1])
        if idx > 1:
            feature_list.append("anterior2_" + tokens[idx-2] + "_" + POS[idx-2][1])
            
        # Paraules posteriors amb POS
        if idx < (len(tokens)-1):
            feature_list.append("posterior1_" + tokens[idx+1] + "_" + POS[idx+1][1])
        if idx < (len(tokens)-2):
            feature_list.append("posterior2_" + tokens[idx+2] + "_" + POS[idx+2][1])

        feature_list.append("WORD_" + token)

        return feature_list

# Ejemplo de uso:
pattern = r'\d+'  # Patrón para encontrar números
feature_extractor = FeatureExtractor(pattern)

tokens = ['El', 'men', 'atendió', 'a', 'la', 'reunión']

for i, token in enumerate(tokens):
    features = feature_extractor._get_features(tokens, i)
    print(f"Token: {token}, Features: {features}")

Token: El, Features: ['CAPITALIZATION', 'PRE_E', 'SUF_l', 'posterior1_men_NC', 'posterior2_atendió_VMI', 'WORD_El']
Token: men, Features: ['PRE_m', 'PRE_me', 'SUF_n', 'SUF_en', 'anterior1_El_DA', 'posterior1_atendió_VMI', 'posterior2_a_SP', 'WORD_men']
Token: atendió, Features: ['PRE_a', 'PRE_at', 'PRE_ate', 'SUF_ó', 'SUF_ió', 'SUF_dió', 'anterior1_men_NC', 'anterior2_El_DA', 'posterior1_a_SP', 'posterior2_la_DA', 'WORD_atendió']
Token: a, Features: ['anterior1_atendió_VMI', 'anterior2_men_NC', 'posterior1_la_DA', 'posterior2_reunión_NC', 'WORD_a']
Token: la, Features: ['PRE_l', 'SUF_a', 'anterior1_a_SP', 'anterior2_atendió_VMI', 'posterior1_reunión_NC', 'WORD_la']
Token: reunión, Features: ['PRE_r', 'PRE_re', 'PRE_reu', 'SUF_n', 'SUF_ón', 'SUF_ión', 'anterior1_la_DA', 'anterior2_a_SP', 'WORD_reunión']
