# Pràctica 3

In [None]:
%pip install python-crfsuite

In [4]:
import nltk
import pycrfsuite
from nltk.corpus import conll2002
from nltk.tag import CRFTagger
from sklearn.metrics import accuracy_score

In [None]:
# NLTK resources downloaded for this notebook.
for resource in (
    'punkt',  # tokenizer
    'averaged_perceptron_tagger',  # POS tagger
    'maxent_ne_chunker',  # named-entity chunker
    'words',
    'treebank',
    'conll2002',
):
    nltk.download(resource)

In [4]:
# Split the NLTK treebank sample: first 3000 tagged sentences for training,
# the remainder for testing.
train = nltk.corpus.treebank.tagged_sents()[:3000]
test = nltk.corpus.treebank.tagged_sents()[3000:]

# `model` was used here without ever being created, so a clean
# "Restart & Run All" failed with NameError. Create the tagger explicitly.
model = CRFTagger()
model.train(train, 'crfTagger.mdl')
model.accuracy(test)

0.9474638463198791

## Predicció amb BIO

In [None]:
# CRF tagger that is trained further down on (word, POS) pairs.
model_tagger = CRFTagger()
# CRF tagger for the BIO labels; it is re-created with a custom feature
# function before it is trained.
model_BIO = CRFTagger()

In [10]:
# Spanish CoNLL-2002 splits; each sentence is a list of 3-tuples.
train_esp = conll2002.iob_sents('esp.train') # Training split ('ned.train' would be the Dutch one)
testa_esp = conll2002.iob_sents('esp.testa') # Development split
testb_esp = conll2002.iob_sents('esp.testb') # Test split

In [32]:
# Train the POS model: every training sentence becomes a list of
# (word, POS) pairs, dropping the third element of each corpus triple.
train_esp_pre_tag = [
    [(word, pos) for word, pos, _bio in sentence]
    for sentence in train_esp
]

model_tagger.train(train_esp_pre_tag, 'model_POS.crf.tagger')


# Predict POS tags for the dev split (bare word lists) and report the
# token-level accuracy.
testa_esp_pre_tag = [
    [word for word, _pos, _bio in sentence]
    for sentence in testa_esp
]

predicted = model_tagger.tag_sents(testa_esp_pre_tag)

predictions = [elem[1] for sentence in predicted for elem in sentence]
real_label = [elem[1] for sentence in testa_esp for elem in sentence]

# scikit-learn's signature is accuracy_score(y_true, y_pred). Accuracy is
# symmetric, so the value is unchanged, but the documented order keeps the
# call correct if it is later swapped for an asymmetric metric.
print(accuracy_score(real_label, predictions))

0.9447121289420479


In [53]:
from nltk.tag import CRFTagger
import unicodedata
import re

class FeatureExtractor:
    """Build per-token feature lists for a ``CRFTagger`` feature function.

    ``_get_features`` has the ``(tokens, idx)`` signature that this notebook
    passes as ``CRFTagger(feature_func=...)``.
    """

    # Unicode general categories that count as punctuation.
    _PUNCTUATION_CATEGORIES = frozenset(
        {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
    )

    def __init__(self, pattern, pos_tagger=None):
        """
        :param pattern: regular expression; a token in which it matches
            anywhere receives the ``HAS_NUM`` feature.
        :param pos_tagger: optional object exposing ``tag(tokens)`` that
            returns ``(token, tag)`` pairs. When omitted, the module-level
            ``model_tagger`` is looked up at call time, as before.
        """
        self._pattern = pattern
        self._pos_tagger = pos_tagger
        # Single-entry cache holding the POS tagging of the last sentence
        # seen, so a sentence is tagged once instead of once per token.
        self._cached_tagger = None
        self._cached_tokens = None
        self._cached_pos = None

    def _pos_tags(self, tokens):
        """Return the POS tagging of ``tokens``, reusing the cached result
        when the same sentence is requested again with the same tagger."""
        tagger = self._pos_tagger if self._pos_tagger is not None else model_tagger
        key = tuple(tokens)
        # NOTE: retraining the same tagger object in place is not detected;
        # only a different tagger object or a different sentence refreshes
        # the cache.
        if tagger is not self._cached_tagger or key != self._cached_tokens:
            self._cached_pos = tagger.tag(tokens)
            self._cached_tagger = tagger
            self._cached_tokens = key
        return self._cached_pos

    def _get_features(self, tokens, idx):
        """
        Extract the features of the token at position ``idx``:
            - CAPITALIZATION if the first character is upper case
            - HAS_NUM if the number pattern matches inside the token
            - PUNCTUATION if every character is a punctuation mark
            - prefixes and suffixes of length 1 to 3, each emitted only when
              the token is strictly longer than the affix
            - up to two previous and two following words, each joined with
              its predicted POS tag
            - the current word itself

        :param tokens: the words of one sentence
        :param idx: position in ``tokens`` of the word to describe
        :return: a list which contains the features (empty for an empty token)
        :rtype: list(str)
        """
        token = tokens[idx]

        feature_list = []

        if not token:
            return feature_list

        # Capitalization
        if token[0].isupper():
            feature_list.append("CAPITALIZATION")

        # Number
        if re.search(self._pattern, token) is not None:
            feature_list.append("HAS_NUM")

        # Punctuation
        if all(
            unicodedata.category(char) in self._PUNCTUATION_CATEGORIES
            for char in token
        ):
            feature_list.append("PUNCTUATION")

        # Prefixes up to length 3
        for size in (1, 2, 3):
            if len(token) > size:
                feature_list.append("PRE_" + token[:size])

        # Suffixes up to length 3
        for size in (1, 2, 3):
            if len(token) > size:
                feature_list.append("SUF_" + token[-size:])

        last = len(tokens) - 1
        # POS tags are only needed when the sentence has neighbouring words.
        pos = self._pos_tags(tokens) if last > 0 else None

        # Previous words with their POS tag
        if idx > 0:
            feature_list.append("anterior1_" + tokens[idx - 1] + "_" + pos[idx - 1][1])
        if idx > 1:
            feature_list.append("anterior2_" + tokens[idx - 2] + "_" + pos[idx - 2][1])

        # Following words with their POS tag
        if idx < last:
            feature_list.append("posterior1_" + tokens[idx + 1] + "_" + pos[idx + 1][1])
        if idx < last - 1:
            feature_list.append("posterior2_" + tokens[idx + 2] + "_" + pos[idx + 2][1])

        feature_list.append("WORD_" + token)

        return feature_list

# Build the feature extractor; the regex flags tokens that contain digits.
pattern = r'\d+'
feature_extractor = FeatureExtractor(pattern)

# Keep only (word, BIO label) from each (word, POS, BIO label) triple.
train_esp_pre_BIO = [
    [(word, bio) for word, _pos, bio in sentence]
    for sentence in train_esp
]

# Train the BIO model with the custom feature function.
model_BIO = CRFTagger(feature_func=feature_extractor._get_features)
model_BIO.train(train_esp_pre_BIO, 'model_BIO.crf.tagger')

In [54]:
# Tag the dev-split word lists with the BIO model.
predicted_BIO = model_BIO.tag_sents(testa_esp_pre_tag)

predictions_BIO = [elem[1] for sentence in predicted_BIO for elem in sentence]
real_label_BIO = [elem[2] for sentence in testa_esp for elem in sentence]

# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but the documented order stays correct for asymmetric metrics.
print(accuracy_score(real_label_BIO, predictions_BIO))

0.9580711599871511


## Predicció amb IO

In [66]:
def convert_to_io(train_data_bio):
    """Re-label BIO-tagged sentences with the IO scheme.

    :param train_data_bio: iterable of sentences, each an iterable of
        ``(word, pos_tag, bio_tag)`` triples
    :return: one list per sentence holding ``(word, io_tag)`` pairs, where
        every ``B-X`` tag has become ``I-X`` and all other tags are unchanged
    :rtype: list(list(tuple(str, str)))
    """
    def _as_io(label):
        # Only the begin marker changes; 'O' and 'I-X' pass through as they are.
        return 'I' + label[1:] if label.startswith('B-') else label

    return [
        [(token, _as_io(label)) for token, _pos, label in sent]
        for sent in train_data_bio
    ]

# Convert the training split to (word, IO label) pairs.
train_esp_pre_IO = convert_to_io(train_esp)

# Train a CRFTagger on the IO-labelled data, using the same feature function
# as the BIO model.
model_IO = CRFTagger(feature_func=feature_extractor._get_features)
model_IO.train(train_esp_pre_IO, 'model_io.crf.tagger')

In [71]:
# Gold IO labels for the dev split.
testa_pre_IO = convert_to_io(testa_esp)

# Tag the dev-split word lists with the IO model.
predicted_IO = model_IO.tag_sents(testa_esp_pre_tag)

predictions_IO = [elem[1] for sentence in predicted_IO for elem in sentence]
real_label_IO = [elem[1] for sentence in testa_pre_IO for elem in sentence]

# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but the documented order stays correct for asymmetric metrics.
print(accuracy_score(real_label_IO, predictions_IO))

0.9559170870887894


In [73]:
# Tag a sample sentence with the BIO model, tokenised by nltk.word_tokenize.
model_BIO.tag(nltk.word_tokenize("Mark Pedersen treballa a Google des del 1994."))

[('Mark', 'B-PER'),
 ('Pedersen', 'I-PER'),
 ('treballa', 'O'),
 ('a', 'O'),
 ('Google', 'B-LOC'),
 ('des', 'O'),
 ('del', 'O'),
 ('1994', 'O'),
 ('.', 'O')]

In [74]:
# Tag the same sample sentence with the IO model, for comparison.
model_IO.tag(nltk.word_tokenize("Mark Pedersen treballa a Google des del 1994."))

[('Mark', 'I-PER'),
 ('Pedersen', 'I-PER'),
 ('treballa', 'O'),
 ('a', 'O'),
 ('Google', 'I-PER'),
 ('des', 'O'),
 ('del', 'O'),
 ('1994', 'O'),
 ('.', 'O')]

## Avaluació

In [32]:
# Flawed evaluation: it only counts how many tokens are correct, not how many
# entities are correct.
# NOTE(review): `testa_esp_pre` is not assigned anywhere in the visible
# notebook - presumably left over from an earlier session; confirm before
# re-running this cell.
model.accuracy(testa_esp_pre)

0.9459214330253387

In [None]:
# Avaluació ben feta:


Hem d'avaluar quantes entitats estan reconegudes correctament, no quants tokens són correctes.
Cal descodificar la seqüència per obtenir les entitats i, després, avaluar les entitats.
Per exemple, 'Mark Pedersen Romero' --> 'M P R' (una entitat) per BIO; 'M' i 'P R' (dues entitats) per IO; en aquest exemple IO ho fa malament.

A nivell d'entitats: Recall i f-score

Per avaluar el model, ens basem en el recall i la precisió parcial.

## Exemple d'ús CRFTagger

In [50]:
import unicodedata
import re

class FeatureExtractor:
    """Build per-token feature lists for a ``CRFTagger`` feature function.

    ``_get_features`` has the ``(tokens, idx)`` signature that this notebook
    passes as ``CRFTagger(feature_func=...)``.
    """

    # Unicode general categories that count as punctuation.
    _PUNCTUATION_CATEGORIES = frozenset(
        {"Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"}
    )

    def __init__(self, pattern, pos_tagger=None):
        """
        :param pattern: regular expression; a token in which it matches
            anywhere receives the ``HAS_NUM`` feature.
        :param pos_tagger: optional object exposing ``tag(tokens)`` that
            returns ``(token, tag)`` pairs. When omitted, the module-level
            ``model_tagger`` is looked up at call time, as before.
        """
        self._pattern = pattern
        self._pos_tagger = pos_tagger
        # Single-entry cache holding the POS tagging of the last sentence
        # seen, so a sentence is tagged once instead of once per token.
        self._cached_tagger = None
        self._cached_tokens = None
        self._cached_pos = None

    def _pos_tags(self, tokens):
        """Return the POS tagging of ``tokens``, reusing the cached result
        when the same sentence is requested again with the same tagger."""
        tagger = self._pos_tagger if self._pos_tagger is not None else model_tagger
        key = tuple(tokens)
        # NOTE: retraining the same tagger object in place is not detected;
        # only a different tagger object or a different sentence refreshes
        # the cache.
        if tagger is not self._cached_tagger or key != self._cached_tokens:
            self._cached_pos = tagger.tag(tokens)
            self._cached_tagger = tagger
            self._cached_tokens = key
        return self._cached_pos

    def _get_features(self, tokens, idx):
        """
        Extract the features of the token at position ``idx``:
            - CAPITALIZATION if the first character is upper case
            - HAS_NUM if the number pattern matches inside the token
            - PUNCTUATION if every character is a punctuation mark
            - prefixes and suffixes of length 1 to 3, each emitted only when
              the token is strictly longer than the affix
            - up to two previous and two following words, each joined with
              its predicted POS tag
            - the current word itself

        :param tokens: the words of one sentence
        :param idx: position in ``tokens`` of the word to describe
        :return: a list which contains the features (empty for an empty token)
        :rtype: list(str)
        """
        token = tokens[idx]

        feature_list = []

        if not token:
            return feature_list

        # Capitalization
        if token[0].isupper():
            feature_list.append("CAPITALIZATION")

        # Number
        if re.search(self._pattern, token) is not None:
            feature_list.append("HAS_NUM")

        # Punctuation
        if all(
            unicodedata.category(char) in self._PUNCTUATION_CATEGORIES
            for char in token
        ):
            feature_list.append("PUNCTUATION")

        # Prefixes up to length 3
        for size in (1, 2, 3):
            if len(token) > size:
                feature_list.append("PRE_" + token[:size])

        # Suffixes up to length 3
        for size in (1, 2, 3):
            if len(token) > size:
                feature_list.append("SUF_" + token[-size:])

        last = len(tokens) - 1
        # POS tags are only needed when the sentence has neighbouring words.
        pos = self._pos_tags(tokens) if last > 0 else None

        # Previous words with their POS tag
        if idx > 0:
            feature_list.append("anterior1_" + tokens[idx - 1] + "_" + pos[idx - 1][1])
        if idx > 1:
            feature_list.append("anterior2_" + tokens[idx - 2] + "_" + pos[idx - 2][1])

        # Following words with their POS tag
        if idx < last:
            feature_list.append("posterior1_" + tokens[idx + 1] + "_" + pos[idx + 1][1])
        if idx < last - 1:
            feature_list.append("posterior2_" + tokens[idx + 2] + "_" + pos[idx + 2][1])

        feature_list.append("WORD_" + token)

        return feature_list

# Usage example:
pattern = r'\d+'  # Pattern matching runs of digits
feature_extractor = FeatureExtractor(pattern)

tokens = ['El', 'men', 'atendió', 'a', 'la', 'reunión']

# Print the feature list computed for each token of the sample sentence.
for i, token in enumerate(tokens):
    features = feature_extractor._get_features(tokens, i)
    print(f"Token: {token}, Features: {features}")

Token: El, Features: ['CAPITALIZATION', 'PRE_E', 'SUF_l', 'posterior1_men_NC', 'posterior2_atendió_VMI', 'WORD_El']
Token: men, Features: ['PRE_m', 'PRE_me', 'SUF_n', 'SUF_en', 'anterior1_El_DA', 'posterior1_atendió_VMI', 'posterior2_a_SP', 'WORD_men']
Token: atendió, Features: ['PRE_a', 'PRE_at', 'PRE_ate', 'SUF_ó', 'SUF_ió', 'SUF_dió', 'anterior1_men_NC', 'anterior2_El_DA', 'posterior1_a_SP', 'posterior2_la_DA', 'WORD_atendió']
Token: a, Features: ['anterior1_atendió_VMI', 'anterior2_men_NC', 'posterior1_la_DA', 'posterior2_reunión_NC', 'WORD_a']
Token: la, Features: ['PRE_l', 'SUF_a', 'anterior1_a_SP', 'anterior2_atendió_VMI', 'posterior1_reunión_NC', 'WORD_la']
Token: reunión, Features: ['PRE_r', 'PRE_re', 'PRE_reu', 'SUF_n', 'SUF_ón', 'SUF_ión', 'anterior1_la_DA', 'anterior2_a_SP', 'WORD_reunión']
