<a href="https://colab.research.google.com/github/rblanco2023/NLP/blob/main/Desaf%C3%ADo_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import string
import random
import re
import urllib.request
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
# Para leer y parsear el texto en HTML de wikipedia
import bs4 as bs

import nltk
# Descargar el diccionario
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

###Determino el dataset a utilizar
Utilizaré los sonetos de William Shakespeare

In [2]:
raw_html = urllib.request.urlopen('https://www.gutenberg.org/files/1041/1041-h/1041-h.htm')
raw_html = raw_html.read()

article_html = bs.BeautifulSoup(raw_html, 'lxml')

article_paragraphs = article_html.find_all('p')

article_text = ''

for para in article_paragraphs:
    article_text += para.text

article_text = article_text.lower()

In [3]:
article_text



###Preprocesamiento

In [4]:
article_text = article_text.replace('\n', '.') #Elimino los saltos de fila
article_text = article_text.replace(',.', '.') # Reemplazo ,. por .
article_text = article_text.replace(':.', '.')
article_text = article_text.replace(';.', '.')
article_text = article_text.replace('\xa0\xa0\xa0\xa0', '')
article_text = re.split(r"\r.", article_text) #Separo las oraciones

In [5]:
df = pd.DataFrame(article_text)
df.drop(0)

Unnamed: 0,0
1,"from fairest creatures we desire increase,"
2,"that thereby beauty’s rose might never die,"
3,"but as the riper should by time decease,"
4,his tender heir might bear his memory:
5,"but thou, contracted to thine own bright eyes,"
...,...
2151,"which from love’s fire took heat perpetual,"
2152,"growing a bath and healthful remedy,"
2153,"for men diseas’d; but i, my mistress’ thrall,"
2154,"came there for cure and this by that i prove,"


In [6]:
df.head()

Unnamed: 0,0
0,
1,"from fairest creatures we desire increase,"
2,"that thereby beauty’s rose might never die,"
3,"but as the riper should by time decease,"
4,his tender heir might bear his memory:


In [7]:
print("Cantidad de documentos en el corpus:", len(article_text))

Cantidad de documentos en el corpus: 2156


###Modelo

In [8]:
from keras.preprocessing.text import text_to_word_sequence

sentence_tokens = []
# Recorrer todas las filas y transformar las oraciones
# en una secuencia de palabras (esto podría realizarse con NLTK o spaCy también)
for _, row in df[:None].iterrows():
    sentence_tokens.append(text_to_word_sequence(row[0]))

In [9]:
sentence_tokens[2:3]

[['that', 'thereby', 'beauty’s', 'rose', 'might', 'never', 'die']]

In [10]:
from gensim.models.callbacks import CallbackAny2Vec
# Durante el entrenamiento gensim por defecto no informa el "loss" en cada época
# Sobracargamos el callback para poder tener esta información
class callback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch
    """
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [11]:
# Crearmos el modelo generador de vectoeres
# En este caso utilizaremos la estructura modelo Skipgram
w2v_model = Word2Vec(min_count=5,    # frecuencia mínima de palabra para incluirla en el vocabulario
                     window=3,       # cant de palabras antes y desp de la predicha
                     size=300,       # dimensionalidad de los vectores 
                     negative=20,    # cantidad de negative samples... 0 es no se usa
                     workers=1,      # si tienen más cores pueden cambiar este valor
                     sg=1)           # modelo 0:CBOW  1:skipgram

In [12]:
w2v_model.build_vocab(sentence_tokens)

In [13]:
print("Cantidad de docs en el corpus:", w2v_model.corpus_count)

Cantidad de docs en el corpus: 2156


In [14]:
print("Cantidad de words distintas en el corpus:", len(w2v_model.wv.vocab))

Cantidad de words distintas en el corpus: 511


###Entrenamiento del modelo

In [15]:
w2v_model.train(sentence_tokens,
                 total_examples=w2v_model.corpus_count,
                 epochs=200,
                 compute_loss = True,
                 callbacks=[callback()]
                 )

Loss after epoch 0: 162263.5625
Loss after epoch 1: 83257.1875
Loss after epoch 2: 81659.25
Loss after epoch 3: 83166.8125
Loss after epoch 4: 82804.65625
Loss after epoch 5: 83499.53125
Loss after epoch 6: 83871.6875
Loss after epoch 7: 84260.4375
Loss after epoch 8: 84729.0
Loss after epoch 9: 83409.6875
Loss after epoch 10: 82824.8125
Loss after epoch 11: 80113.0
Loss after epoch 12: 73259.25
Loss after epoch 13: 72571.5
Loss after epoch 14: 71847.125
Loss after epoch 15: 71789.5
Loss after epoch 16: 71207.5
Loss after epoch 17: 71703.875
Loss after epoch 18: 71390.125
Loss after epoch 19: 71174.25
Loss after epoch 20: 70174.25
Loss after epoch 21: 69182.0
Loss after epoch 22: 68302.5
Loss after epoch 23: 69203.875
Loss after epoch 24: 68949.25
Loss after epoch 25: 66759.875
Loss after epoch 26: 62359.5
Loss after epoch 27: 58792.0
Loss after epoch 28: 58827.0
Loss after epoch 29: 58171.75
Loss after epoch 30: 58708.25
Loss after epoch 31: 58267.75
Loss after epoch 32: 57773.5
Loss 

(1699979, 3524000)

###Pruebas

In [16]:
w2v_model.wv.most_similar(positive=["dead"], topn=10)

[('living', 0.44620198011398315),
 ('worse', 0.42709121108055115),
 ('less', 0.4207935631275177),
 ('brow', 0.393370121717453),
 ('flowers', 0.37296390533447266),
 ('longer', 0.36998364329338074),
 ('makes', 0.3685852289199829),
 ('once', 0.3668811619281769),
 ('hide', 0.3582109808921814),
 ('gone', 0.35620540380477905)]

In [20]:
w2v_model.wv.most_similar(negative=["dead"], topn=10)

[('true', 0.05011307820677757),
 ('know', 0.015494484454393387),
 ('since', 0.012204296886920929),
 ('skill', 0.010307639837265015),
 ('decay', 0.004793494939804077),
 ('eyes', 0.003576332703232765),
 ('tell', 0.0026676729321479797),
 ('water', -0.0008852854371070862),
 ('come', -0.0027501857839524746),
 ('itself', -0.005945511162281036)]

In [17]:
w2v_model.wv.most_similar(positive=["desire"], topn=10)

[('therefore', 0.4046172797679901),
 ('two', 0.3852030038833618),
 ('buried', 0.3824410140514374),
 ('brain', 0.3611655831336975),
 ('grew', 0.3531067371368408),
 ('painting', 0.3494494557380676),
 ('lines', 0.3426726162433624),
 ('invention', 0.33686840534210205),
 ('thought', 0.33218586444854736),
 ('fairest', 0.32911860942840576)]

In [19]:
w2v_model.wv.most_similar(negative=["desire"], topn=10)

[('others', 0.0377042256295681),
 ('skill', 0.019147800281643867),
 ('power', 0.01686260849237442),
 ('summer’s', 0.01038346067070961),
 ('they', -0.004806850105524063),
 ('come', -0.005541369318962097),
 ('thing', -0.012170083820819855),
 ('pen', -0.018888752907514572),
 ('day', -0.02026074379682541),
 ('taught', -0.021313752979040146)]

In [21]:
w2v_model.wv.most_similar(positive=["joy"], topn=10)

[('wherein', 0.47509273886680603),
 ('grief', 0.47131937742233276),
 ('cure', 0.43250638246536255),
 ('sweets', 0.4273146390914917),
 ('war', 0.4124882221221924),
 ('quite', 0.3857344090938568),
 ('says', 0.3849511742591858),
 ('sorrow', 0.38366395235061646),
 ('knows', 0.37792330980300903),
 ('am', 0.37721389532089233)]

In [22]:
w2v_model.wv.most_similar(negative=["joy"], topn=10)

[('own', 0.08512154221534729),
 ('eyes', 0.04462076723575592),
 ('thine', 0.027846956625580788),
 ('old', 0.026398025453090668),
 ('within', 0.021579235792160034),
 ('time’s', 0.019596491008996964),
 ('swift', -0.007687138393521309),
 ('skill', -0.013840774074196815),
 ('cruel', -0.021549426019191742),
 ('hide', -0.02611752599477768)]