# Reconocimiento de Entidades
En este notebook se desarrollará un modelo que aprenderá a reconocer ciertas entidades previamente definidas en textos.

In [None]:
# Importar librerías
import spacy
import pandas as pd
import numpy as np

In [None]:
# Load the annotated dataset from dataset.csv and preview its first rows.
df = pd.read_csv('dataset.csv')
df.head()

In [None]:
# Count the missing (NaN) values in each column of the dataset.
df.isnull().sum()

In [None]:
# Entity labels: every distinct value of the 'Tag' column except 'O'.
labels = np.asarray(df['Tag'].unique())
labels = labels[labels != 'O']
labels

In [None]:
import random

# Hold out 25% of the chats for testing. The split is made per chat, so all
# rows of a given chat land on the same side.
TOTAL_CHATS = max(df['Chat #'])
TEST_CHATS = int(TOTAL_CHATS * 0.25)
# The upper bound is TOTAL_CHATS + 1 because range() excludes its end value;
# without it the last chat could never be selected for the test set.
# NOTE(review): assumes chat ids run from 1 to TOTAL_CHATS — confirm.
test_chats_ids = sorted(random.sample(range(1, TOTAL_CHATS + 1), TEST_CHATS))
is_test_chat = df['Chat #'].isin(test_chats_ids)
df_test = df[is_test_chat]
df_train = df[~is_test_chat]

In [None]:
def format_dataframe_to_spacy(df):
    """Convert a one-word-per-row dataframe into spaCy-style training tuples.

    Rows are grouped by 'Sentence #'. Within each group the 'Word' values are
    joined with single spaces, and every row whose 'Tag' is not 'O' produces
    a (start, end, tag) triple with the character offsets of that word inside
    the joined sentence.

    Returns:
        A list with one (sentence, [(start, end, tag), ...]) tuple per
        sentence group, in groupby order.
    """
    formatted = []
    for _, group in df.groupby('Sentence #'):
        words = group['Word'].values.tolist()
        tags = group['Tag'].values.tolist()
        spans = []
        cursor = 0  # character offset at which the current word starts
        for word, tag in zip(words, tags):
            word_end = cursor + len(word)
            if tag != 'O':
                spans.append((cursor, word_end, tag))
            cursor = word_end + 1  # step over the joining space
        formatted.append((' '.join(words), spans))
    return formatted

In [None]:
# Convert the training split to the spaCy format and show the first example.
train_data = format_dataframe_to_spacy(df_train)
train_data[0]

In [None]:
# Sanity check of the character offsets: the slice [33:38] of this sentence
# is the word 'María'.
'Cliente : Hola , ¿ qué tal ? Soy María Fernández y me quiero dar de baja de Celtel .'[33:38]

In [None]:
# Create the model: a blank Spanish pipeline with an 'ner' component, then
# register every entity label on that component.
nlp = spacy.blank('es')
ner = nlp.add_pipe('ner')
for label in labels:
    ner.add_label(label)

In [None]:
# Train the model
from spacy.training import Example
# Language.initialize() is the spaCy v3 replacement for the deprecated
# begin_training(); it initializes the pipeline and returns the optimizer.
optimizer = nlp.initialize()
n_iter = 4
for itn in range(n_iter):
    # Shuffle in place so every pass visits the examples in a different order.
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        example = Example.from_dict(doc, {"entities": entity_offsets})
        # One update per example (batch of size 1).
        nlp.update([example], sgd=optimizer)

In [None]:
# # Si se quiere guardar el modelo
# nlp.to_disk(f'ner_{n_iter}_iterations')

In [None]:
# Para mostrar los resultados
from spacy import displacy
def find_entities(text):
    """Run the module-level `nlp` pipeline on `text` and return its result."""
    return nlp(text)

def print_entities(doc):
    """Render `doc` with displaCy's "ent" style in Jupyter mode."""
    displacy.render(doc, style="ent", jupyter=True)

In [None]:
# Try the model on the first test chat: read its text file, then detect and
# render the entities once the file has been closed.
with open(f'data/chat{str(test_chats_ids[0]).zfill(2)}.txt', encoding='utf-8') as f:
    text = f.read()
doc = find_entities(text)
print_entities(doc)

In [None]:
def get_predicted_df(chat_num):
    """Build a word-level dataframe with the model's predicted tags for a chat.

    Reads the rows of the module-level ``df_test`` whose 'Chat #' equals
    ``chat_num``, groups them by 'Sentence #', joins each sentence's words
    with single spaces and runs ``find_entities`` on the joined text.

    Args:
        chat_num: Value of the 'Chat #' column identifying the chat.

    Returns:
        A DataFrame with columns 'Chat #', 'Sentence #', 'Word' and 'Tag',
        one row per word of the chat in ``df_test`` order. 'Tag' is 'O'
        unless a predicted entity covers exactly that word's character span,
        in which case it is the entity's label. The frame is empty (with the
        same columns) when ``df_test`` has no rows for ``chat_num``.
    """
    columns = ['Chat #', 'Sentence #', 'Word', 'Tag']
    # Restrict to this chat up front so sentences of other chats can never be
    # mixed in, and sentence ids need not be contiguous integers.
    chat_rows = df_test[df_test['Chat #'] == chat_num]

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    rows = []
    for sentence_num, group in chat_rows.groupby('Sentence #', sort=False):
        # Use the words as stored (not a re-split of the joined sentence) so
        # the output keeps exactly one row per df_test row.
        words = group['Word'].values.tolist()

        span_to_row = {}
        cursor = 0  # character offset at which the current word starts
        for word in words:
            word_end = cursor + len(word)
            span_to_row[(cursor, word_end)] = len(rows)
            rows.append({'Chat #': chat_num, 'Sentence #': sentence_num,
                         'Word': word, 'Tag': 'O'})
            cursor = word_end + 1  # step over the joining space

        doc = find_entities(' '.join(words))
        for ent in doc.ents:
            # NOTE(review): only entities matching a single word's span are
            # used; entities spanning several words are ignored — confirm
            # this is the intended evaluation.
            row_idx = span_to_row.get((ent.start_char, ent.end_char))
            if row_idx is not None:
                rows[row_idx]['Tag'] = ent.label_

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Predict every test chat and stack the per-chat results into one dataframe.
# A comprehension keeps the loop variable local, so the builtin `id` is no
# longer shadowed in the notebook's global namespace.
pred_dfs = [get_predicted_df(chat_id) for chat_id in test_chats_ids]
df_pred = pd.concat(pred_dfs)
df_test.head(20)

In [None]:
# Give both frames a fresh positional index so their rows line up, then keep
# only the positions whose true tag is not 'O' in both of them.
df_test = df_test.reset_index(drop=True)
df_pred = df_pred.reset_index(drop=True)
df_test = df_test[df_test['Tag'] != 'O']
df_pred = df_pred[df_pred.index.isin(df_test.index)]
df_test.head(10)

In [None]:
from sklearn.metrics import classification_report
# Print scikit-learn's classification report comparing the true tags
# (df_test) with the predicted tags (df_pred).
print(classification_report(df_test['Tag'], df_pred['Tag'], zero_division=0))