# Practica 2

In [248]:
import nltk
import numpy as np
from tqdm import tqdm
from unidecode import unidecode
from matplotlib import pyplot as plt
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [249]:
with open('corpusML.txt', 'r') as f:
    corpus = f.readlines()

In [250]:
corpus = [unidecode(line.lower()) for line in corpus]

In [251]:
corpus[0:2]

['comence a trabajar y me pegaron, me maltrataron con chicote \n',
 'mis patrones me pegaron porque no me queria apurar, porque era flojo \n']

### 1, 2) Limpiar corpus y agregar simbolos de inicio y fin

* Se limpia el corpus mediante el algoritmo de Porter para el lenguaje espa;ol. 
* A cada oracion del corpus, se le agrega el simbolo de inicio y fin. 
* Se crea el alfabeto $\Sigma$ del corpus donde se almacenen unicamente los tipos 

In [252]:
stemmer = SnowballStemmer("spanish")
stems = []                                              # Lista de stems por cada oracion
cleanedCorpus = []                                      # Corpus procesado con stemming
Sigma = []                                              # Alfabeto del corpus (tipos)               

for sentence in corpus:
    tokens = nltk.word_tokenize(sentence)               # Obtener lista tokens
    for tk in tokens:   
        if tk.isalpha():                                # Validar token como caracter del alfabeto                                
            stem = stemmer.stem(tk)                     # Aplicar algotimo de stemming
            #stem = tk
            stems.append(stem)                          # Agregarlo a la lista de stems 
            if stem not in Sigma:                       # Agregar stem al alfabeto
                Sigma.append(stem)
    s = '<BOS> ' + ' '.join(stems) + ' <EOS>'           # Agregar simbolos de inicio y fin
    cleanedCorpus.append(s)                             # Agregar oracion procesada a la lista del corpus limpio                               
    stems.clear()

# Agregar simbolos de inicio y fin al alfabeto
Sigma.append('<BOS>')
Sigma.append('<EOS>')

In [253]:
cleanedCorpus[:3]

['<BOS> comenc a trabaj y me peg me maltrat con chicot <EOS>',
 '<BOS> mis patron me peg porqu no me queri apur porqu era floj <EOS>',
 '<BOS> por eso me habi peg <EOS>']

In [254]:
Sigma[-10:]

['cab',
 'bebecit',
 'tabiqu',
 'calent',
 'pajuel',
 'vapor',
 'quemart',
 'cai',
 '<BOS>',
 '<EOS>']

### 3) Obtener los bigramas

In [255]:
# Funcion para obtener los bigramas de una secuencia de caracteres
def bigrams(sequence):
    s = sequence.split()
    return [(wi, wj) for wi, wj in zip(s[:-1], s[1:])]

In [256]:
# Obtener los bigramas del corpus limpio
sentence_bigrams = [bigrams(s) for s in cleanedCorpus]

In [257]:
# Bigramas de la primera oracion
sentence_bigrams[0]

[('<BOS>', 'comenc'),
 ('comenc', 'a'),
 ('a', 'trabaj'),
 ('trabaj', 'y'),
 ('y', 'me'),
 ('me', 'peg'),
 ('peg', 'me'),
 ('me', 'maltrat'),
 ('maltrat', 'con'),
 ('con', 'chicot'),
 ('chicot', '<EOS>')]

In [258]:
# Bigramas de todo el corpus
corpus_bigrams = [bigram for sentence in sentence_bigrams for bigram in sentence]

In [259]:
corpus_bigrams[0:11]

[('<BOS>', 'comenc'),
 ('comenc', 'a'),
 ('a', 'trabaj'),
 ('trabaj', 'y'),
 ('y', 'me'),
 ('me', 'peg'),
 ('peg', 'me'),
 ('me', 'maltrat'),
 ('maltrat', 'con'),
 ('con', 'chicot'),
 ('chicot', '<EOS>')]

In [260]:
# Obtener los vectores one hot de cada palabra en el corpus
oneHotMatrix = np.identity(len(Sigma), np.float32)

In [261]:
word2oneHot = {}        # Entrada: palabra del alfabeto, Salida: vector one hot
word2number = {}        # Entrada: palabra del alfabeto, Salida: indice en la lista del alfabeto
oneHot2word = {}        # Entrada: vector one hot (caracteres), Salida: palabra del alfabeto

for i, (word, vector) in enumerate(zip(Sigma, oneHotMatrix)):
    word2oneHot[word] = vector
    word2number[word] = i
    oneHot2word[np.where(vector==1)[0][0]] = word

In [262]:
word2oneHot['<EOS>']

array([0., 0., 0., ..., 0., 0., 1.], dtype=float32)

### 4) Entrenar la red neuronal con los bigramas

In [263]:
d = 300
N = len(Sigma)

U = np.random.randn(N, d)
W = np.random.randn(d, N)

In [264]:
def softmax(x):
    return np.exp(x)/np.sum(np.exp(x))

In [265]:
def predict(x):
    h = np.dot(x, U)
    a = np.dot(h, W)
    return softmax(a)

In [275]:
def fit(bigrams, lr, epochs):
    errorVector = []
    for _ in tqdm(range(epochs)):
        for wi, wj in bigrams:
            #print(wi, wj)
            vectorWi = word2oneHot[wi]
            vectorWj = word2oneHot[wj]
            # Feedforward
            h = np.dot(vectorWi, U)
            a = softmax(np.dot(h, W))
            #print(preactivation)
            #print(output)
            #print(preactivation)
            error = a - vectorWj
            #print(error)
            # Backpropragation
            k = np.where(vectorWi==1)[0][0]
            W[:,k] -= lr*error[k]*h
            delta = np.dot(W, error)
            U[k,:] -= lr*delta
            errorVector.append(error[k])
    return errorVector

In [276]:
err = fit(corpus_bigrams, 0.01, 100)





  0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A



  1%|          | 1/100 [00:11<18:38, 11.30s/it][A[A[A[A



  2%|▏         | 2/100 [00:22<18:20, 11.23s/it][A[A[A[A



  3%|▎         | 3/100 [00:33<18:08, 11.22s/it][A[A[A[A



  4%|▍         | 4/100 [00:41<16:15, 10.16s/it][A[A[A[A



  5%|▌         | 5/100 [00:52<16:24, 10.37s/it][A[A[A[A



  6%|▌         | 6/100 [00:59<14:57,  9.54s/it][A[A[A[A



  7%|▋         | 7/100 [01:07<13:50,  8.93s/it][A[A[A[A



  8%|▊         | 8/100 [01:14<13:03,  8.51s/it][A[A[A[A



  9%|▉         | 9/100 [01:24<13:24,  8.84s/it][A[A[A[A



 10%|█         | 10/100 [01:35<14:15,  9.51s/it][A[A[A[A



 11%|█         | 11/100 [01:47<15:16, 10.30s/it][A[A[A[A



 12%|█▏        | 12/100 [01:58<15:35, 10.63s/it][A[A[A[A



 13%|█▎        | 13/100 [02:09<15:33, 10.73s/it][A[A[A[A



 14%|█▍        | 14/100 [02:22<16:20, 11.40s/it][A[A[A[A



 15%|█▌        | 15/100 [02:35<16:39, 11.76s/it][A[A

In [277]:
# Prueba
#wordVector = word2oneHot[stemmer.stem('caballo')]
wordVector = word2oneHot['<EOS>']
pred = predict(wordVector)
print(pred)
print(pred.shape)
print(np.sum(pred))

indice = np.argmax(pred)
print(indice)

print('prediccion => ', oneHot2word[indice])

print(pred[indice])

[4.74524154e-18 7.25955628e-13 2.95167024e-15 ... 4.60679657e-21
 2.48340268e-21 3.38811166e-23]
(1216,)
1.0000000000000002
78
prediccion =>  el
0.9763496271675253


### 5) Obtener las matrices $A$ y $\Pi$

In [278]:
# Para cada palabra del alfabeto, predecir el vector de probabilidades
# y agruparlos por columna para hacer la matriz A

A = []

for wj in Sigma[:-2]:
    aj = predict(word2oneHot[wj])
    A.append(list(aj))
    
A = np.matrix(A).T

In [279]:
A.shape

(1216, 1214)

In [280]:
# El vector de inicio se obtiene al predecir la distribucion
# para el simbolo <BOS>
Pi = predict(word2oneHot['<BOS>'])

### 6) Calcular la propabilidad de las siguientes oraciones

Se calcularan usando la propiedad de Markov que establece que:

$p(x_1,...,x_n)=\prod_{i=1}^{n}p(w_{i}|w_{i-1})$

1) Nos ba;amos con agua caliente

$p(caliente|agua)p(agua|con)p(con|banamos)p(banamos|nos)p(nos|bos)$

In [299]:
#s = 'Nos banamos con agua caliente'.split()
s = '<BOS> pascuala ordenaba las vacas'.split()
s[1:] = [stemmer.stem(word.lower()) for word in s[1:]]

In [300]:
#j = word2number['las']
#i = word2number['vacas']

#A[i,j]

In [301]:
#Pi[word2number['pues']]

In [302]:
p = 1

for wi, wj in zip(s[:-1], s[1:]):
    if wi == '<BOS>':
        p *= Pi[word2number[wj]]
    else:
        i = word2number[wj]
        j = word2number[wi]
        p *= A[i,j]
        
print('p(s) = ', p)

p(s) =  2.425589910422211e-19


In [283]:
from nltk.stem import PorterStemmer

In [285]:
po = PorterStemmer()

In [286]:
po.stem('caliente')

'calient'