## Neural Language Model
### Sergio Alberto De León Martīnez

Problema 1

In [4]:
# Tools
import os
import time
import shutil
import random
from typing import Tuple
from argparse import Namespace
import matplotlib.pyplot as plt

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
import pandas as pd
import numpy as np

# Pytorch
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F


# Scikit learn
from sklearn.metrics import accuracy_score

In [5]:
seed = 1111
random.seed(seed) # python seed
np.random.seed(seed) # numpy seed
torch.manual_seed(seed) # torch seed
torch.backends.cudnn.benchmark = False # Ensure reproducibility of CUDA

Cargamos los datos 

In [6]:
X_train = pd.read_csv('mex20_train.txt', sep='\r\n', engine='python', header=None).loc[:, 0].values.tolist()
print(X_train)
X_val = pd.read_csv('mex20_val.txt', sep='\r\n', engine='python', header=None).loc[:, 0].values.tolist()
print(X_val)

['@USUARIO @USUARIO @USUARIO Q se puede esperar del maricon de closet de la Yañez aun recuerdo esa ves q lo vi en zona rosa viendo quien lo levantada', '@USUARIO La piel nueva siempre arde un poquito los primeros días... y más con este puto clima', 'Ustedes no se enamoran de mí… por tontas.', 'Me las va a pagar esa puta gorda roba tuits...', '@USUARIO LA GENTE ES TONTA PORQUE NO SE DAN CUENTA QUE TÚ HACES A BATMAN AZUL', 'Estoy muy encabronada con las pseudo feministas por tontas e iletradas, a veces me avergüenza ser mujer; preferiría tener un falo. #NiUnaMas', 'Anden putos, recuerdan el #noerapenal #Holanda fuera de #Rusia2018, esto se llama #karma ehhhhhhhh #puuuuuutos', 'Si no tienen chichis no traten de enseñar se ven muy mal y más cuando son prietas.', 'Ojalá asi me agarrars cuando te digo que me voy en lugar de correrme a la verga cada 5 minutos.', '@USUARIO @USUARIO @USUARIO @USUARIO Es solo un HDP aprovechado y que su "Diosito Bimbo" me perdone', 'La próxima vez que diga que m

Construimos una clase para crear el vocabulario y generar los datos con sus respectivas etiquetas  

In [4]:
class NgramData:
    def __init__(self, N: int, vocab_max: int=5000, tokenizer=None, embeddings_model=None, embeddings_size=None):
        self.tokenizer = tokenizer if tokenizer else self.default_tokenizer
        self.punct = set(['.', ', ', ':', ';', '-', '^', '*', '!', '¡', '?', '¿', '"', '\'', '...', '<url>', '+', '@usuario'])
        self.N = N
        self.vocab_max = vocab_max
        self.UNK = '<unk>'
        self.SOS = '<s>'
        self.EOS = '</s>'
        self.embeddings_model = embeddings_model

    def get_vocab_size(self) -> int:
        return len(self.vocab)

    def default_tokenizer(self, doc: str) -> list:
        return doc.split(' ')

    def remove_word(self, word: str) -> bool:
        word = word.lower()
        is_punct = True if word in self.punct else False
        is_digit = word.isnumeric()
        return is_punct or is_digit

    def get_vocab(self, corpus: list) -> set:
        freq_dist = FreqDist([
            w.lower() for sentence in corpus\
                      for w in self.tokenizer(sentence)\
                      if not self.remove_word(w)
        ])
        sorted_words = self.sortFreqDict(freq_dist)[:self.vocab_max-3]
        return set(sorted_words)

    def sortFreqDict(self, freq_dist) -> list:
        freq_dict = dict(freq_dist)
        return sorted(freq_dict, key=freq_dict.get, reverse=True)

    def fit(self, corpus: list) -> None:
        self.vocab = self.get_vocab(corpus)
        self.vocab.add(self.UNK)
        self.vocab.add(self.SOS)
        self.vocab.add(self.EOS)

        self.w2id = {}
        self.id2w = {}

        if self.embeddings_model is not None:
            self.embedding_matrix = np.empty([len(self.vocab), self.embeddings_model.vector_size])

        id = 0
        for doc in corpus:
            for word in self.tokenizer(doc):
                word_ = word.lower()
                if word_ in self.vocab and not word_  in self.w2id:
                    self.w2id[word_] = id
                    self.id2w[id] = word_

                    if self.embeddings_model is not None:
                        if word_ in self.embeddings_model:
                            self.embedding_matrix[id] = self.embeddings_model[word_]
                        else:
                            self.embedding_matrix[id] = np.random.rand(self.embeddings_model.vector_size)

                    id += 1

        # Always add especial tokens
        self.w2id.update(
            {
                self.UNK: id,
                self.SOS: id+1,
                self.EOS: id+2
            }
        )
        self.id2w.update(
            {
                id:  self.UNK,
                id+1: self.SOS,
                id+2: self.EOS
            }
        )

    def transform(self, corpus: list) -> Tuple[np.ndarray, np.ndarray]:
        X_ngrams = []
        y = []

        for doc in corpus:
            doc_ngram = self.get_ngram_doc(doc)
            for words_window in doc_ngram:
                words_window_ids = [self.w2id[w] for w in words_window]
                X_ngrams.append(list(words_window_ids[:-1]))
                y.append(words_window_ids[-1])

        return np.array(X_ngrams), np.array(y)

    def get_ngram_doc(self, doc: str) -> list:
        doc_tokens = self.tokenizer(doc)
        doc_tokens = self.replace_unk(doc_tokens)
        doc_tokens = [w.lower() for w in doc_tokens]
        doc_tokens = [self.SOS]*(self.N-1) + doc_tokens + [self.EOS]
        return list(ngrams(doc_tokens, self.N))

    def replace_unk(self, doc_tokens: list) -> list:
        for i, token in enumerate(doc_tokens):
            if token.lower() not in self.vocab:
                doc_tokens[i] = self.UNK
        return doc_tokens



In [18]:
args = Namespace()
args.N = 6 # Ventana de contexto para caracteres

In [8]:
class tokenizer_char:
    def __init__(self):
        pass

    def tokenizer(self, text):
        return list(text)
    
tk = tokenizer_char()

ngram_data = NgramData(args.N, 5000, tk.tokenizer) # Usamos list como tokenizador ya que separa el texto en caracteres
ngram_data.fit(X_train)
print(f'Vocab size: {ngram_data.get_vocab_size()}')

Vocab size: 373


In [9]:
X_ngram_train, y_ngram_train = ngram_data.transform(X_train)
X_ngram_val, y_ngram_val = ngram_data.transform(X_val)

Creamos los dataloaders para el entrenamiento 

In [None]:
# Set batch size in args
args.batch_size = 64

# Num workers
args.num_workers = 2

# Train
train_dataset = TensorDataset(torch.tensor(X_ngram_train, dtype=torch.int64),
                              torch.tensor(y_ngram_train, dtype=torch.int64))

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    shuffle=True
)

# Val
val_dataset = TensorDataset(torch.tensor(X_ngram_val, dtype=torch.int64),
                              torch.tensor(y_ngram_val, dtype=torch.int64))

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    shuffle=False
)

Aquí se implementa el modelo, con algunas variantes respecto del de Bengio, por ejemplo se agrega una capa de dropout y se una relu como función de activación. 

In [11]:
class NeuralLM(nn.Module):
    def __init__(self, args):
        super(NeuralLM, self).__init__()

        self.window_size = args.N - 1
        self.embedding_size = args.d

        self.emb = nn.Embedding(args.vocab_size, args.d)
        self.fc1 = nn.Linear(args.d*(args.N-1), args.d_h)
        self.drop1 = nn.Dropout(p=args.dropout)
        self.fc2 = nn.Linear(args.d_h, args.vocab_size, bias=False)

    def forward(self, x):
        x = self.emb(x)
        x = x.view(-1, self.window_size * self.embedding_size)
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        return self.fc2(h)

In [39]:
def get_preds(raw_logits):
    probs = F.softmax(raw_logits.detach(), dim=1)
    y_pred = torch.argmax(probs, dim=1).cpu().numpy()

    return y_pred

In [40]:
def model_eval(data, model, gpu=False):
    with torch.no_grad():
        preds, tgts = [], []
        for window_words, labels in data:
            if gpu:
                window_words = window_words.cuda()

            outputs = model(window_words)

            # Get prediction
            y_pred = get_preds(outputs)

            tgt = labels.numpy()
            tgts.append(tgt)
            preds.append(y_pred)

    tgts = [e for l in tgts for e in l]
    preds = [e for l in preds for e in l]

    return accuracy_score(tgts, preds)

In [41]:
def save_checkpoint(state, is_best, checkpoint_path, filename="checkpoint.pt"):
    filename = os.path.join(checkpoint_path, filename)
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(checkpoint_path, "model_best.pt"))

In [18]:
# Model hyperparameters

args.vocab_size = ngram_data.get_vocab_size()
args.d = 100 # Dimension of word embeddings
args.d_h = 200 # Dimension for hidden layer
args.dropout = 0.1

# Traing hyperparameters
args.lr = 2.3e-1
args.num_epochs = 30
args.patience = 20

# Scheduler hyperparameters
args.lr_patience = 10
args.lr_factor = 0.5

# Saving directory
args.savedir = 'model'
os.makedirs(args.savedir, exist_ok=True)

# Create model
model = NeuralLM(args)

# Send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model.cuda()

# Loss, Optimizer and Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, "min",
    patience=args.lr_patience,
    factor=args.lr_factor
)

Entrenamos al modelo y guardamos la mejor versión. 

In [19]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []

for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model.train()

    for window_words, labels in train_loader:

        # if GPU available
        if args.use_gpu:
            window_words = window_words.cuda()
            labels = labels.cuda()

        # Forward pass
        outputs = model(window_words)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())

        # Get training metrics
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Get metric in training dataset
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)

    # Get metric in validation dataset
    model.eval()
    tuning_metric = model_eval(val_loader, model, gpu=args.use_gpu)
    metric_history.append(mean_epoch_metric)

    # Update sheduler
    scheduler.step(tuning_metric)

    # Check for metric improvement
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1

    # Save best model if metric improved
    save_checkpoint(
        {
            "epoch": epoch + 1,
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "best_metric": best_metric,
        },
        is_improvement,
        args.savedir,
    )

    # Early stopping
    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of loop")
        break

    print('Train acc: {}'.format(mean_epoch_metric))
    print('Epoch [{}/{}], Loss: {: .4f} - Val accuracy: {: .4f} - Epoch time: {: .2f}'.
          format(epoch+1, args.num_epochs, np.mean(loss_epoch), tuning_metric, (time.time()-epoch_start_time)))

print("--- %s seconds ---"% (time.time() - start_time))

Train acc: 0.414579390319578
Epoch [1/30], Loss:  2.0123 - Val accuracy:  0.4545 - Epoch time:  67.43
Train acc: 0.4532576403971455
Epoch [2/30], Loss:  1.8503 - Val accuracy:  0.4687 - Epoch time:  62.34
Train acc: 0.4652738132174992
Epoch [3/30], Loss:  1.7999 - Val accuracy:  0.4753 - Epoch time:  67.51
Train acc: 0.4727984408935774
Epoch [4/30], Loss:  1.7687 - Val accuracy:  0.4812 - Epoch time:  68.75
Train acc: 0.47707648153893883
Epoch [5/30], Loss:  1.7475 - Val accuracy:  0.4851 - Epoch time:  64.98
Train acc: 0.4820190816009929
Epoch [6/30], Loss:  1.7312 - Val accuracy:  0.4876 - Epoch time:  59.09
Train acc: 0.4850238520012411
Epoch [7/30], Loss:  1.7189 - Val accuracy:  0.4909 - Epoch time:  58.05
Train acc: 0.4872015591064226
Epoch [8/30], Loss:  1.7076 - Val accuracy:  0.4911 - Epoch time:  58.38
Train acc: 0.4894746742165684
Epoch [9/30], Loss:  1.6992 - Val accuracy:  0.4934 - Epoch time:  56.60
Train acc: 0.4910006205398697
Epoch [10/30], Loss:  1.6915 - Val accuracy

In [46]:
def print_closes_word(embeddings, ngram_data, word, n):
    word_id = torch.LongTensor([ngram_data.w2id[word]]) # get word id
    word_embed = embeddings(word_id) # get word embedding
    dists = torch.norm(embeddings.weight - word_embed, dim=1).detach() # compute distances
    lst = sorted(enumerate(dists.numpy()), key=lambda x: x[1]) # sort by distance
    for idx, difference in lst[1:n+1]:
        print(ngram_data.id2w[idx], difference)

In [20]:
# Model with learned embeddings from scratch
best_model = NeuralLM(args)
best_model.load_state_dict(torch.load('model/model_best.pt')['state_dict'])
best_model.train(False)

print("."*30)
print("Learned embeddings")
print("."*30)

..............................
Learned embeddings
..............................


In [62]:
def parse_text(text, tokenizer):
    all_tokens = [w.lower() if w in ngram_data.w2id else '<unks>' for w in tokenizer(text)]
    tokens_ids = [ngram_data.w2id[word.lower()] for word in all_tokens]
    return all_tokens, tokens_ids

In [61]:
def sample_next_word(logits, temperature=1.0):
    logits = np.asarray(logits).astype('float64')

    # How much stochastic is the model
    preds = logits / temperature

    # Normalize probabilities
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    # Make a sample
    probas = np.random.multinomial(1, preds)
    return np.argmax(probas)

In [60]:
def predict_next_token(model, tokens_ids):
    word_ids_tensor = torch.LongTensor(tokens_ids).unsqueeze(0)
    y_raw_pred = model(word_ids_tensor).squeeze(0).detach().numpy()

    y_pred = sample_next_word(y_raw_pred, 1.0)

    return y_pred

In [59]:
def generate_sentence(model, initial_text, tokenizer):
    all_tokens, window_word_ids = parse_text(initial_text, tokenizer)

    for i in range(300):
        y_pred = predict_next_token(best_model, window_word_ids)
        next_word = ngram_data.id2w[y_pred]
        all_tokens.append(next_word)

        if next_word == '</s>':
            break
        else:
            window_word_ids.pop(0)
            window_word_ids.append(y_pred)

    return " ".join(all_tokens)



Generamos texto 

In [31]:
initial_tokens = "</s> "

print("."*30)
print("Learned embeddings")
print("."*30)
for i in range(3):
  print(generate_sentence(best_model, initial_tokens, tk.tokenizer))

..............................
Learned embeddings
..............................
< / s >   q u e   h e r m a n d o <unk> <unk> <unk>   🏆 😛 </s>
< / s >   p a r a   @ u s u a r i o   c a b l i r   d e s a p i o s   d e j a   p e r s o n a s   a n d a r   e s o   d e l   a p t o d o   n o   m u c h a   v a r d … </s>
< / s >   b o c i o n e s t o   l e c r e é s   a d o s   p a n g a   s e   c o m o   l u   c a m p o a e r   a   s u   m a d r e   e s a s   t o n t a s   y   i m b r e   a   l a   a s í ,   f e a s   d o r m i r ,   q u e   e l   w e <unk>   y   e s o ,   g i g a s ,   e   s h o s t a   🇨 ツ i s   e s a   c o m í   😂 😂 </s>


Podemos ver que el modelo es capaz de articular palabras con sentido. 

In [115]:
def log_likelihood(model, text, ngram_model):
    # Generate n-gram window frmo input text and the respective label y
    X, y = ngram_model.transform([text])
    # Discard first two n-gram window since they contain '<s>' tokens not necessary
    X, y = X[2:], y[2:]
    X = torch.LongTensor(X).unsqueeze(0)

    logits = model(X).detach()
    probs = F.softmax(logits, dim=1).numpy()

    return np.sum([np.log(probs[i][w]) for i, w in enumerate(y)])

Calculamos log-likelihood de algunas palabras 

In [32]:
print("log likelihood: ", log_likelihood(best_model, "Vamos México", ngram_data))
print("log likelihood: ", log_likelihood(best_model, "aguas locas", ngram_data))
print("log likelihood: ", log_likelihood(best_model, "presidente corrupto", ngram_data))
print("log likelihood: ", log_likelihood(best_model, "gool de chucky", ngram_data))
print("log likelihood: ", log_likelihood(best_model, "palbra mal estita", ngram_data))

log likelihood:  -13.91062
log likelihood:  -17.978996
log likelihood:  -22.072773
log likelihood:  -29.295086
log likelihood:  -44.627472


En este ejemplo se proponen algunas oraciones y las que tiene sentido, tienen una log-verosímilitud considerablemente más baja a la oración mal escrita. 

Estructura morfológica 

In [34]:
from itertools import permutations
from random import shuffle

word_list = list("mexico")
perms = [' '.join(perm) for perm in permutations(word_list)]
# print(len(perms))

print('.'*50)
for p, t in sorted([(log_likelihood(best_model, text, ngram_data), text) for text in perms], reverse=True)[:5]:
    print(p, t)

print('.'*50)
for p, t in sorted([(log_likelihood(best_model, text, ngram_data), text) for text in perms], reverse=True)[-5:]:
    print(p, t)

..................................................
-39.5151 c e m i o x
-41.899933 c e i o m x
-42.42803 c e i o x m
-42.96985 i c e m x o
-43.25964 c e o m x i
..................................................
-66.11398 e x o i m c
-66.33474 m c x e o i
-66.48716 m e x c i o
-67.21211 o m c x e i
-67.78076 m c x e i o


Como podemos ver, el modelo no fue capaz para este caso de reconstruir la palabra 'mexico' entre las que tiene una log-verosímilitud más alta. 

Calculo de la perplejidad 

In [121]:
def perplexity(model, corpus, ngram_data):
    log_likelihoods = 0
    N = 0
    for doc in corpus: 
        text = list(doc)
        N += len(text)
        log_likelihoods += log_likelihood(model, text, ngram_data)
    return np.exp(-log_likelihoods / N)


In [38]:
val_perplexity = perplexity(model, X_val, ngram_data)
print('valor de la perplejidad en validación:', val_perplexity)

valor de la perplejidad en validación: 4.992085409250099


Problema 2

Cargamos los embeddings de word2vec 

In [1]:
# Primero leamos los datos del word2vec 
import pandas as pd 

import numpy as np

def cargar_embeddings(ruta_archivo):
    embeddings = {}
    with open(ruta_archivo, 'r', encoding='utf-8') as archivo:
        # Leer la primera línea para obtener el número de palabras y la dimensión de los embeddings
        num_palabras, dim_embedding = map(int, archivo.readline().split())
        
        # Leer cada línea y almacenar en el diccionario
        for linea in archivo:
            valores = linea.split()
            palabra = valores[0]
            coeficientes = np.asarray(valores[1:], dtype='float32')
            embeddings[palabra] = coeficientes
            
    return embeddings

ruta_archivo = './word2vec_col.txt'
embeddings = cargar_embeddings(ruta_archivo)


In [20]:
args = Namespace()
args.N = 4

Modifiquemos ligeramente el codigo de ngram_data para poder cargar los embeddinigs.

In [16]:
class NgramData:
    def __init__(self, N: int, vocab_max: int=5000, tokenizer=None, embeddings_model=None):
        self.tokenizer = tokenizer if tokenizer else self.default_tokenizer
        self.punct = set(['.', ', ', ':', ';', '-', '^', '*', '!', '¡', '?', '¿', '"', '\'', '...', '<url>', '+', '@usuario'])
        self.N = N
        self.vocab_max = vocab_max
        self.UNK = '<unk>'
        self.SOS = '<s>'
        self.EOS = '</s>'
        self.embeddings_model = embeddings_model

    def get_vocab_size(self) -> int:
        return len(self.vocab)

    def default_tokenizer(self, doc: str) -> list:
        return doc.split(' ')

    def remove_word(self, word: str) -> bool:
        word = word.lower()
        is_punct = True if word in self.punct else False
        is_digit = word.isnumeric()
        return is_punct or is_digit

    def get_vocab(self, corpus: list) -> set:
        freq_dist = FreqDist([
            w.lower() for sentence in corpus\
                      for w in self.tokenizer(sentence)\
                      if not self.remove_word(w)
        ])
        sorted_words = self.sortFreqDict(freq_dist)[:self.vocab_max-3]
        return set(sorted_words)

    def sortFreqDict(self, freq_dist) -> list:
        freq_dict = dict(freq_dist)
        return sorted(freq_dict, key=freq_dict.get, reverse=True)

    def fit(self, corpus: list) -> None:
        self.vocab = self.get_vocab(corpus)
        self.vocab.add(self.UNK)
        self.vocab.add(self.SOS)
        self.vocab.add(self.EOS)
        embedding_dim = next(iter(embeddings.values())).shape[0]

        self.w2id = {}
        self.id2w = {}

        if self.embeddings_model is not None:
            self.embedding_matrix = torch.zeros(len(self.vocab), embedding_dim)

        id = 0
        for doc in corpus:
            for word in self.tokenizer(doc):
                word_ = word.lower()
                if word_ in self.vocab and not word_  in self.w2id:
                    self.w2id[word_] = id
                    self.id2w[id] = word_

                    if self.embeddings_model is not None:
                        if word_ in self.embeddings_model:
                            self.embedding_matrix[id] = torch.tensor(self.embeddings_model[word_])
                        else:
                            self.embedding_matrix[id] = torch.rand(embedding_dim)

                    id += 1

        # Always add especial tokens
        self.w2id.update(
            {
                self.UNK: id,
                self.SOS: id+1,
                self.EOS: id+2
            }
        )
        self.id2w.update(
            {
                id:  self.UNK,
                id+1: self.SOS,
                id+2: self.EOS
            }
        )

    def transform(self, corpus: list) -> Tuple[np.ndarray, np.ndarray]:
        X_ngrams = []
        y = []

        for doc in corpus:
            doc_ngram = self.get_ngram_doc(doc)
            for words_window in doc_ngram:
                words_window_ids = [self.w2id[w] for w in words_window]
                X_ngrams.append(list(words_window_ids[:-1]))
                y.append(words_window_ids[-1])

        return np.array(X_ngrams), np.array(y)

    def get_ngram_doc(self, doc: str) -> list:
        doc_tokens = self.tokenizer(doc)
        doc_tokens = self.replace_unk(doc_tokens)
        doc_tokens = [w.lower() for w in doc_tokens]
        doc_tokens = [self.SOS]*(self.N-1) + doc_tokens + [self.EOS]
        return list(ngrams(doc_tokens, self.N))

    def replace_unk(self, doc_tokens: list) -> list:
        for i, token in enumerate(doc_tokens):
            if token.lower() not in self.vocab:
                doc_tokens[i] = self.UNK
        return doc_tokens



In [22]:
tk = TweetTokenizer()
ngram_data_w2vec = NgramData(args.N, 5000, tk.tokenize, embeddings_model=embeddings)
ngram_data_w2vec.fit(X_train)
print(f'Vocab size: {ngram_data_w2vec.get_vocab_size()}')

X_ngram_train, y_ngram_train = ngram_data_w2vec.transform(X_train)
X_ngram_val, y_ngram_val = ngram_data_w2vec.transform(X_val)

Vocab size: 5000


In [34]:
ngram_data_w2vec.embedding_matrix.shape

torch.Size([5000, 100])

Creamos los dataloaders 

In [35]:
# Set batch size in args
args.batch_size = 64

# Num workers
args.num_workers = 2

# Train
train_dataset = TensorDataset(torch.tensor(X_ngram_train, dtype=torch.int64),
                              torch.tensor(y_ngram_train, dtype=torch.int64))

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    shuffle=True
)

# Val
val_dataset = TensorDataset(torch.tensor(X_ngram_val, dtype=torch.int64),
                              torch.tensor(y_ngram_val, dtype=torch.int64))

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
    shuffle=False
)

Ahora le pasamos los embeddings a la red 

In [51]:
class NeuralLM(nn.Module):
    def __init__(self, args, embeddings):
        super(NeuralLM, self).__init__()

        self.window_size = args.N - 1
        self.embedding_size = args.d

        self.emb = nn.Embedding(args.vocab_size, args.d)
        with torch.no_grad():
            self.emb.weight.copy_(embeddings)
        self.fc1 = nn.Linear(args.d*(args.N-1), args.d_h)
        self.drop1 = nn.Dropout(p=args.dropout)
        self.fc2 = nn.Linear(args.d_h, args.vocab_size, bias=False)

    def forward(self, x):
        x = self.emb(x)
        x = x.view(-1, self.window_size * self.embedding_size)
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        return self.fc2(h) 

In [52]:
# Model hyperparameters

args.vocab_size = ngram_data_w2vec.get_vocab_size()
args.d = 100 # Dimension of word embeddings
args.d_h = 200 # Dimension for hidden layer
args.dropout = 0.1

# Traing hyperparameters
args.lr = 2.3e-1
args.num_epochs = 30
args.patience = 20

# Scheduler hyperparameters
args.lr_patience = 10
args.lr_factor = 0.5

# Saving directory
args.savedir = 'model_w2vec'
os.makedirs(args.savedir, exist_ok=True)

# Create model
model_w2vec = NeuralLM(args, embeddings=ngram_data_w2vec.embedding_matrix)

# Send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model_w2vec.cuda()

# Loss, Optimizer and Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_w2vec.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, "min",
    patience=args.lr_patience,
    factor=args.lr_factor
)

In [53]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []

for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model_w2vec.train()

    for window_words, labels in train_loader:

        # if GPU available
        if args.use_gpu:
            window_words = window_words.cuda()
            labels = labels.cuda()

        # Forward pass
        outputs = model_w2vec(window_words)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())

        # Get training metrics
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Get metric in training dataset
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)

    # Get metric in validation dataset
    model_w2vec.eval()
    tuning_metric = model_eval(val_loader, model_w2vec, gpu=args.use_gpu)
    metric_history.append(mean_epoch_metric)

    # Update sheduler
    scheduler.step(tuning_metric)

    # Check for metric improvement
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1

    # Save best model if metric improved
    save_checkpoint(
        {
            "epoch": epoch + 1,
            "state_dict": model_w2vec.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "best_metric": best_metric,
        },
        is_improvement,
        args.savedir,
    )

    # Early stopping
    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of loop")
        break

    print('Train acc: {}'.format(mean_epoch_metric))
    print('Epoch [{}/{}], Loss: {: .4f} - Val accuracy: {: .4f} - Epoch time: {: .2f}'.
          format(epoch+1, args.num_epochs, np.mean(loss_epoch), tuning_metric, (time.time()-epoch_start_time)))

print("--- %s seconds ---"% (time.time() - start_time))

Train acc: 0.0018971824408468244
Epoch [1/30], Loss:  24.4317 - Val accuracy:  0.0022 - Epoch time:  62.41
Train acc: 0.0013912671232876712
Epoch [2/30], Loss:  11.3275 - Val accuracy:  0.0022 - Epoch time:  69.58
Train acc: 0.00138153798256538
Epoch [3/30], Loss:  9.7756 - Val accuracy:  0.0022 - Epoch time:  77.47
Train acc: 0.00138153798256538
Epoch [4/30], Loss:  9.0840 - Val accuracy:  0.0022 - Epoch time:  75.35
Train acc: 0.0013523505603985056
Epoch [5/30], Loss:  8.5169 - Val accuracy:  0.0022 - Epoch time:  73.72
Train acc: 0.001362079701120797
Epoch [6/30], Loss:  8.5198 - Val accuracy:  0.0022 - Epoch time:  69.23
Train acc: 0.0013523505603985056
Epoch [7/30], Loss:  8.5169 - Val accuracy:  0.0022 - Epoch time:  68.33
Train acc: 0.001362079701120797
Epoch [8/30], Loss:  8.5168 - Val accuracy:  0.0022 - Epoch time:  88.82
Train acc: 0.0013523505603985056
Epoch [9/30], Loss:  8.5169 - Val accuracy:  0.0022 - Epoch time:  89.11
Train acc: 0.0013523505603985056
Epoch [10/30], Lo

Imprimimos las palabras más similares 

In [54]:
# Cargamos el modelo aprendido 
best_model_w2vec = NeuralLM(args, embeddings=ngram_data_w2vec.embedding_matrix)
best_model_w2vec.load_state_dict(torch.load('model_w2vec/model_best.pt')['state_dict'])
best_model_w2vec.train(False)

print("."*30)
print("Learned embeddings")
print("."*30)

..............................
Learned embeddings
..............................


In [55]:
print_closes_word(best_model_w2vec.emb, ngram_data_w2vec, "mamá", 10)

gorda 45.933167
llevando 46.409718
calabaza 46.411
cierto 46.486923
cabrona 47.293633
morra 47.58222
👸 47.591522
pinchi 47.962654
diciendo 47.96333
loca 48.231575


In [56]:
print_closes_word(best_model_w2vec.emb, ngram_data_w2vec, "méxico", 10)

😻 61.167538
hoy 62.244057
acumulada 62.35954
reverenda 62.468155
doble 63.06319
culito 63.08487
durante 63.09571
rata 63.404396
@ 63.444305
rola 63.883274


In [58]:
print_closes_word(best_model_w2vec.emb, ngram_data_w2vec, "políticos", 10)

politicos 14.287606
ciudadanos 16.13567
periodistas 17.521915
españoles 17.957241
delincuentes 18.048443
catalanes 18.445683
sinvergüenzas 18.520702
colombianos 18.77693
traidores 18.954855
ladrones 19.062769


Vemos que con la introducción de los embeddings el modelo mejora considerablemente en encontrar palabras similares y además, parece ser que los nuevos mebeddings se han ajustado al contexto prticular de este corpus. 

Generación de texto

In [113]:
def parse_text(text, tokenizer):
    all_tokens = [w.lower() if w in ngram_data_w2vec.w2id else '<unk>' for w in tokenizer(text)]
    tokens_ids = [ngram_data_w2vec.w2id[word.lower()] for word in all_tokens]
    return all_tokens, tokens_ids

In [104]:
def sample_next_word(logits, temperature=1.0):
    logits = np.asarray(logits).astype(np.float64)

    # How much stochastic is the model
    preds = logits / temperature

    # print(preds)
    # Normalize probabilities

    # Restar el máximo para estabilización numérica
    preds_max = np.max(preds)
    preds_stable = preds - preds_max

    # Exponenciación de los logits ajustados
    exp_preds = np.exp(preds_stable)

    # exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    # Make a sample
    # print(preds)
    probas = np.random.multinomial(1, preds)
    return np.argmax(probas)

In [105]:
def predict_next_token(model, tokens_ids):
    word_ids_tensor = torch.LongTensor(tokens_ids).unsqueeze(0)
    y_raw_pred = model(word_ids_tensor).squeeze(0).detach().numpy()

    y_pred = sample_next_word(y_raw_pred, 1.0)
    # print(y_pred)
    return y_pred

In [106]:
def generate_sentence(model, initial_text, tokenizer):
    all_tokens, window_word_ids = parse_text(initial_text, tokenizer)

    for i in range(300):
        y_pred = predict_next_token(model, window_word_ids)
        next_word = ngram_data_w2vec.id2w[y_pred]
        all_tokens.append(next_word)

        if next_word == '</s>':
            break
        else:
            window_word_ids.pop(0)
            window_word_ids.append(y_pred)

    return " ".join(all_tokens)



In [107]:
initial_tokens = "<s> <s> <s>"

print("."*30)
print("Learned embeddings")
print("."*30)
print(generate_sentence(best_model_w2vec, initial_tokens, tk.tokenize))

..............................
Learned embeddings
..............................
<s> <s> <s> mochila cdmx personalidad iletradas q ya cinco últimamente esperaban coma zapatos </s>


In [108]:
initial_tokens = "la vida es"

print("."*30)
print("Learned embeddings")
print("."*30)
print(generate_sentence(best_model_w2vec, initial_tokens, tk.tokenize))

..............................
Learned embeddings
..............................
la vida es pobres #sexy emocional #putinalgona gustar 😠 #cruising permisos maneras servicio , supo portero pubertad de cariño laferte sentarme hermoso irme bola chiquitito $ </s>


In [111]:
initial_tokens = "gol de mexico"

print("."*30)
print("Learned embeddings")
print("."*30)
print(generate_sentence(best_model_w2vec, initial_tokens, tk.tokenize))

..............................
Learned embeddings
..............................


gol de mexico promoción charlie unfollow bloqueo una ✊🏻 enorme sigan confianza sarcasmo vestido cargue <unk> enamoró negras pendejete empezó robó <unk> abusivos entremos pucha </s>


La generación de texto parece ser que también mejora, ahora pudiendo articular tres o cuatro palabras seguidas con sentido. 

Likelihood de oraciones 

In [116]:
print("log likelihood: ", log_likelihood(best_model_w2vec, "Vamos México", ngram_data_w2vec))
print("log likelihood: ", log_likelihood(best_model_w2vec, "aguas locas", ngram_data_w2vec))
print("log likelihood: ", log_likelihood(best_model_w2vec, "presidente corrupto", ngram_data_w2vec))
print("log likelihood: ", log_likelihood(best_model_w2vec, "gool de chucky", ngram_data_w2vec))
print("log likelihood: ", log_likelihood(best_model_w2vec, "palbra mal estita", ngram_data_w2vec))

log likelihood:  -8.517193
log likelihood:  -8.517193
log likelihood:  -8.517193
log likelihood:  -17.034386
log likelihood:  -17.034386


Permutaciones 

In [120]:
from itertools import permutations
from random import shuffle

sentence = "ya se va a acabar la corrupcion"
tokens_sentence = tk.tokenize(sentence)
perms = [' '.join(perm) for perm in permutations(tokens_sentence)]
# print(len(perms))
logs_perms = sorted([(log_likelihood(best_model_w2vec, text, ngram_data_w2vec), text) for text in perms], reverse=True)

print('.'*50)
for p, t in logs_perms[:5]:
    print(p, t)

print('.'*50)
for p, t in logs_perms[-5:]:
    print(p, t)

..................................................
-51.103157 ya va se la corrupcion acabar a
-51.103157 ya va se la corrupcion a acabar
-51.103157 ya va se la acabar corrupcion a
-51.103157 ya va se la acabar a corrupcion
-51.103157 ya va se la a corrupcion acabar
..................................................
-51.103157 a acabar corrupcion la ya se va
-51.103157 a acabar corrupcion la va ya se
-51.103157 a acabar corrupcion la va se ya
-51.103157 a acabar corrupcion la se ya va
-51.103157 a acabar corrupcion la se va ya


Perplejidad 

In [166]:
def perplexity(model, corpus, ngram_data, tokenizer):
    max_log_likelihood = -np.inf
    log_likelihoods = []
    N = 0

    for doc in corpus:
        text = tokenizer(doc)
        N += len(text)
        __log_likelihood = log_likelihood(model, doc, ngram_data)
        
        if __log_likelihood > max_log_likelihood:
            max_log_likelihood = __log_likelihood
        
        log_likelihoods.append(__log_likelihood)

    # Normaliza los logaritmos restando el máximo para evitar el underflow
    adjusted_log_likelihoods = [ll - max_log_likelihood for ll in log_likelihoods]
    
    # Suma exponenciando y luego tomando el logaritmo nuevamente
    total_log_likelihood = max_log_likelihood + np.log(np.sum(np.exp(adjusted_log_likelihoods)))
    
    # return total_log_likelihood
    return np.exp(-total_log_likelihood / N)


In [178]:
val_perplexity = perplexity(best_model_w2vec, X_val, ngram_data_w2vec, tk.tokenize)
print('valor de la perplejidad en validación:', val_perplexity)

valor de la perplejidad en validación: 1.0007766201235047


  return np.sum([np.log(probs[i][w]) for i, w in enumerate(y)])


Este valor es mucho menor que el que se obtuvo en clase de aproximadamentee 4.6 

Conexión directa del model

Ahora implementemos la conexión directa que proponia Bengio en el artículo original 

In [171]:
class NeuralLM_DC(nn.Module):
    def __init__(self, args, embeddings):
        super(NeuralLM_DC, self).__init__()

        self.window_size = args.N - 1
        self.embedding_size = args.d

        self.emb = nn.Embedding(args.vocab_size, args.d)
        with torch.no_grad():
            self.emb.weight.copy_(embeddings)
        self.fc1 = nn.Linear(args.d*(args.N-1), args.d_h)
        self.drop1 = nn.Dropout(p=args.dropout)
        self.fc2 = nn.Linear(args.d_h, args.vocab_size, bias=False)
        
        # Agregamos la conexion directa 
        self.fc_direct = nn.Linear(args.d*(args.N-1), args.vocab_size)

    def forward(self, x):
        x = self.emb(x)
        x = x.view(-1, self.window_size * self.embedding_size)
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        return self.fc2(h) + self.fc_direct(x)

In [172]:
# Saving directory
args.savedir = 'model_direct_conection'
os.makedirs(args.savedir, exist_ok=True)

# Create model
model_dc = NeuralLM_DC(args, embeddings=ngram_data_w2vec.embedding_matrix)

# Send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model_dc.cuda()

# Loss, Optimizer and Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_dc.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, "min",
    patience=args.lr_patience,
    factor=args.lr_factor
)

Entrenemos al nuevo modelo 

In [173]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []

for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model_dc.train()

    for window_words, labels in train_loader:

        # if GPU available
        if args.use_gpu:
            window_words = window_words.cuda()
            labels = labels.cuda()

        # Forward pass
        outputs = model_dc(window_words)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())

        # Get training metrics
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Get metric in training dataset
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)

    # Get metric in validation dataset
    model_dc.eval()
    tuning_metric = model_eval(val_loader, model_dc, gpu=args.use_gpu)
    metric_history.append(mean_epoch_metric)

    # Update sheduler
    scheduler.step(tuning_metric)

    # Check for metric improvement
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1

    # Save best model if metric improved
    save_checkpoint(
        {
            "epoch": epoch + 1,
            "state_dict": model_dc.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "best_metric": best_metric,
        },
        is_improvement,
        args.savedir,
    )

    # Early stopping
    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of loop")
        break

    print('Train acc: {}'.format(mean_epoch_metric))
    print('Epoch [{}/{}], Loss: {: .4f} - Val accuracy: {: .4f} - Epoch time: {: .2f}'.
          format(epoch+1, args.num_epochs, np.mean(loss_epoch), tuning_metric, (time.time()-epoch_start_time)))

print("--- %s seconds ---"% (time.time() - start_time))

Train acc: 0.06152771361426907
Epoch [1/30], Loss:  693.3620 - Val accuracy:  0.0895 - Epoch time:  95.41
Train acc: 0.08418751004298397
Epoch [2/30], Loss:  697.1769 - Val accuracy:  0.0876 - Epoch time:  106.72
Train acc: 0.09754436488169364
Epoch [3/30], Loss:  695.0031 - Val accuracy:  0.0828 - Epoch time:  110.12
Train acc: 0.10464852066846102
Epoch [4/30], Loss:  632.2559 - Val accuracy:  0.0838 - Epoch time:  109.52
Train acc: 0.10652687251436146
Epoch [5/30], Loss:  739.6291 - Val accuracy:  0.0850 - Epoch time:  108.75
Train acc: 0.1136850093399751
Epoch [6/30], Loss:  719.4416 - Val accuracy:  0.0782 - Epoch time:  131.70
Train acc: 0.11911638060900655
Epoch [7/30], Loss:  684.3758 - Val accuracy:  0.1005 - Epoch time:  124.17
Train acc: 0.12221809343992288
Epoch [8/30], Loss:  702.7011 - Val accuracy:  0.0971 - Epoch time:  127.52
Train acc: 0.12698662776684208
Epoch [9/30], Loss:  749.6927 - Val accuracy:  0.0890 - Epoch time:  130.59
Train acc: 0.13473290684128067
Epoch [1

In [174]:
# Cargamos el modelo aprendido 
best_model_dc = NeuralLM_DC(args, embeddings=ngram_data_w2vec.embedding_matrix)
best_model_dc.load_state_dict(torch.load('model_direct_conection/model_best.pt')['state_dict'])
best_model_dc.train(False)

print("."*30)
print("Learned embeddings")
print("."*30)

..............................
Learned embeddings
..............................


Calculamos las 10 palabras más similares a mama, para comparar con el modelo anterior sin conexión directa.

In [176]:
print_closes_word(best_model_dc.emb, ngram_data_w2vec, "mama", 10)

bien 42.434277
porque 44.741245
que 44.961884
madre 45.11058
y 45.146553
es 45.610737
, 46.21731
hoy 46.327736
si 46.445908
pero 46.579235


In [179]:
val_perplexity = perplexity(best_model_dc, X_val, ngram_data_w2vec, tk.tokenize)
print('valor de la perplejidad en validación:', val_perplexity)

  return np.sum([np.log(probs[i][w]) for i, w in enumerate(y)])


valor de la perplejidad en validación: 1.0084703744487509


No parece haber una mejoría considerable respecto al modelo sin conexión, esto podría debeerse a que el conjunto de datos que estamos utilizando es relativamente pequeño, incluso me parece que sería mejor no usar una conexión directa para evitar caer en overfiting. 