# Entrenando un Modelo Markoviano Latente (HMM)

## Corpus de español: 

* AnCora | Github: https://github.com/UniversalDependencies/UD_Spanish-AnCora

* usamos el conllu parser para leer el corpus: https://pypi.org/project/conllu/

* Etiquetas Universal POS (Documentación): https://universaldependencies.org/u/pos/

In [14]:
#@title dependencias previas
!pip install conllu
!git clone https://github.com/UniversalDependencies/UD_Spanish-AnCora.git

[31mERROR: Operation cancelled by user[0m
fatal: destination path 'UD_Spanish-AnCora' already exists and is not an empty directory.


In [None]:
#@title leyendo el corpus AnCora
from conllu import parse_incr

# Not populated in this cell; kept so any later reference to it still resolves.
wordList = []

# Stream the dev split one sentence at a time and print each sentence back in
# CoNLL-U form. The context manager closes the file once parsing finishes.
# After the loop, `tokenlist` still refers to the last sentence parsed.
with open("UD_Spanish-AnCora/es_ancora-ud-dev.conllu", "r", encoding="utf-8") as data_file:
    for tokenlist in parse_incr(data_file):
        print(tokenlist.serialize())

In [16]:
#@title Estructura de los tokens etiquetados del corpus
# Display the token at index 1 of `tokenlist` (the sentence left bound by the
# most recent parse loop) to show which fields each parsed token carries.
tokenlist[1]

{'deprel': 'nsubj',
 'deps': None,
 'feats': {'Gender': 'Masc', 'Number': 'Sing'},
 'form': 'cierto',
 'head': 3,
 'id': 2,
 'lemma': 'cierto',
 'misc': None,
 'upos': 'ADJ',
 'xpos': 'ADJ'}

In [17]:
# Build the "word|TAG" string for a single token by joining its surface form and its UPOS tag.
tokenlist[1]['form']+'|'+tokenlist[1]['upos']

# We do this because it is the form in which counts are keyed for each conditioned object

'cierto|ADJ'

## Entrenamiento del modelo - Cálculo de conteos:

* tags (tags) `tagCountDict`: $C(tag)$
* emisiones (word|tag) `emissionDict`: $C(word|tag)$
* transiciones (tag|prevtag) `transitionDict`: $C(tag|prevtag)$

In [None]:
# Raw counts gathered from the corpus (turned into probabilities later).
tagCountDict = {}    # C(tag), keyed by tag
emissionDict = {}    # C(word|tag), keyed "word|tag" with the word lower-cased
transitionDict = {}  # C(tag|prevtag), keyed "tag|prevtag"

# Use the Universal POS (UPOS) tag convention.
tagtype = 'upos'

# Accumulate the counts (pre-probabilities). The context manager closes the
# corpus file as soon as the last sentence has been consumed.
with open("UD_Spanish-AnCora/es_ancora-ud-dev.conllu", "r", encoding="utf-8") as data_file:
  for tokenlist in parse_incr(data_file):
    # At the start of each sentence there is no previous token, hence no previous tag.
    prevtag = None
    for token in tokenlist:
      # Multiword-token ranges ("3-4") and empty nodes ("3.1") are parsed with a
      # non-integer id and carry no POS annotation ("_"); counting them would add
      # a bogus "_" tag and break the tag-to-tag transition chain.
      if not isinstance(token['id'], int):
        continue

      # C(tag)
      tag = token[tagtype]
      tagCountDict[tag] = tagCountDict.get(tag, 0) + 1

      # C(word|tag) -> emission counts
      wordtag = token['form'].lower() + '|' + tag
      emissionDict[wordtag] = emissionDict.get(wordtag, 0) + 1

      # C(tag|prevtag) -> transition counts; the first token of a sentence has
      # no predecessor, so it only seeds prevtag.
      if prevtag is not None:
        transitiontags = tag + '|' + prevtag
        transitionDict[transitiontags] = transitionDict.get(transitiontags, 0) + 1
      prevtag = tag
    
#transitionDict
#emissionDict
#tagCountDict

## Entrenamiento del modelo - cálculo de probabilidades
* probabilidades de transición:
$$P(tag|prevtag) = \frac{C(prevtag, tag)}{C(prevtag)}$$

* probabilidades de emisión:
 $$P(word|tag) = \frac{C(word|tag)}{C(tag)}$$

In [None]:
transitionProbDict = {}  # matrix A: P(tag|prevtag), keyed "tag|prevtag"
emissionProbDict = {}    # matrix B: P(word|tag), keyed "word|tag"

# Transition probabilities: C(tag|prevtag) / C(prevtag).
for key, count in transitionDict.items():
  # The conditioning tag is whatever follows the LAST '|' in the key.
  prevtag = key.rsplit('|', 1)[1]
  if tagCountDict[prevtag] > 0:
    transitionProbDict[key] = count / tagCountDict[prevtag]
  else:
    # Report keys whose denominator is zero instead of dividing by it.
    print(key)

# Emission probabilities: C(word|tag) / C(tag).
for key, count in emissionDict.items():
  # Split on the last '|' only: a word form may itself contain '|', and a plain
  # split would then yield more than two parts and fail to unpack.
  word, tag = key.rsplit('|', 1)
  if tagCountDict[tag] > 0:
    emissionProbDict[key] = count / tagCountDict[tag]
  else:
    # Report keys whose denominator is zero instead of dividing by it.
    print(key)

# Spot-check one transition probability.
transitionProbDict['ADJ|ADJ']
#emissionProbDict

0.030225988700564973

In [None]:
# Display the emission probability table, whose keys have the form "word|TAG".
emissionProbDict

{'el|DET': 0.2411214953271028,
 'gobernante|NOUN': 0.00020835503698301907,
 ',|PUNCT': 0.45316979929913986,
 'con|ADP': 0.05196480938416422,
 'ganada|ADJ': 0.0002824858757062147,
 'fama|NOUN': 0.00010417751849150954,
 'desde|ADP': 0.008797653958944282,
 'que|SCONJ': 0.6382042253521126,
 'llegó|VERB': 0.0022411474675033617,
 'hace|VERB': 0.009188704616763783,
 '16|NUM': 0.011428571428571429,
 'meses|NOUN': 0.0028127929992707574,
 'al|ADP': 0.04105571847507331,
 'poder|NOUN': 0.0011459527034066049,
 'de|ADP': 0.37478005865102637,
 'explotar|VERB': 0.00044822949350067237,
 'máximo|NOUN': 0.00020835503698301907,
 'su|DET': 0.0503235082674335,
 'oratoria|NOUN': 0.00010417751849150954,
 'y|CCONJ': 0.7771664374140302,
 'acusado|ADJ': 0.000847457627118644,
 'por|ADP': 0.05970674486803519,
 'sus|DET': 0.019985621854780734,
 'detractores|NOUN': 0.0003125325554745286,
 'incontinencia|NOUN': 0.00010417751849150954,
 'verbal|ADJ': 0.0005649717514124294,
 'enmudeció|VERB': 0.00022411474675033618,
 '

## Guardar parámetros del modelo

In [None]:
import numpy as np

# Persist both probability tables. np.save stores each dict as a pickled
# 0-d object array.
np.save('transitionHMM.npy', transitionProbDict)
np.save('emissionHMM.npy', emissionProbDict)

# Round-trip check: reload the transition table from disk and read one entry
# from the reloaded copy. allow_pickle=True is required for object arrays and
# .item() unwraps the 0-d array back into the dict.
# NOTE: unpickling can execute arbitrary code; only load .npy files you trust.
transitionProbDict = np.load('transitionHMM.npy', allow_pickle=True).item()
transitionProbDict['ADJ|ADJ']

0.030225988700564973