# Natural Language Models

## Parte 1

In [48]:
import os
import time
import shutil
import random
from typing import Tuple
from argparse import Namespace
import matplotlib.pyplot as plt

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
import pandas as pd
import numpy as np

# Pytorch
from torch.utils.data import DataLoader, TensorDataset 
import torch
import torch.nn as nn
import torch.nn.functional as F

# scikit-learn
from sklearn.metrics import accuracy_score

In [49]:
seed = 1111
random.seed(seed) # python seed
np.random.seed(seed) #numpy seed
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False 

In [50]:
X_train = pd.read_csv('./mex_train.txt', sep = '\r\n', engine = 'python', header = None).loc[:,0].values.tolist()
# print(X_train)
X_val = pd.read_csv('./mex_val.txt', sep = '\r\n', engine = 'python', header = None).loc[:,0].values.tolist()
# print(X_val)

In [51]:
args = Namespace()
args.N = 4

In [52]:
from nltk import FreqDist
from nltk.tokenize import TweetTokenizer
import numpy as np

class NgramData():
    def __init__(self, N: int, vocab_max: int = 5000, tokenizer = None, embeddings_model = None):
        self.tokenizer = tokenizer if tokenizer else self.default_tokenizer
        self.punct = set(['.', ',', ';', ':', '-', '^', '»', '«', '!', '¡', '¿', '?', '"', '\'', '...', '<url>', '*', '@usuario'])
        # Orden de modelo
        self.N = N
        self.vocab_max = vocab_max
        self.UNK = '<unk>'
        self.SOS = '<s>'
        self.EOS = '</s>'
        self.embeddings_model = embeddings_model

    def get_vocab_size(self) -> list:
        return len(self.vocab)

    def default_tokenizer(self, doc: str) -> list:
        return doc.split(" ")

    def remove_word(self, word: str) -> bool:
        word = word.lower()
        is_punct = True if word in self.punct else False
        is_digit = word.isnumeric()
        return is_punct or is_digit

    def get_vocab(self, corpus: list) -> set:
        freq_dist = FreqDist([w.lower() for sentence in corpus\
                                        for w in self.tokenizer(sentence)\
                                        if not self.remove_word(w)])
        sorted_words = self.sortFreqDict(freq_dist)[:self.vocab_max-3]
        return set(sorted_words)

    def sortFreqDict(self, freq_dist) -> list:
        freq_dict = dict(freq_dist)
        return sorted(freq_dict, key = freq_dict.get, reverse = True)

    def fit(self, corpus: list) -> None:
        self.vocab = self.get_vocab(corpus)
        self.vocab.add(self.UNK)
        self.vocab.add(self.SOS)
        self.vocab.add(self.EOS)
        self.w2id = {}
        self.id2w = {}

        if self.embeddings_model is not None:
            self.embedding_matrix = np.empty([len(self.vocab), self.embeddings_model.vector_size])

        id = 0
        for doc in corpus:
            for word in self.tokenizer(doc):
                word_ = word.lower()
                if word_ in self.vocab and not word_ in self.w2id:
                    self.w2id[word_] = id
                    self.id2w[id] = word_

                    if self.embeddings_model is not None:
                        if word_ in self.embeddings_model:
                            self.embedding_matrix[id] = self.embeddings_model[word_]
                        else:
                            self.embedding_matrix[id] = np.random.rand(self.embeddings_model[word_].vector_size)
                    
                    id += 1

        self.w2id.update({self.UNK: id, self.SOS: id+1, self.EOS: id+2})
        self.id2w.update({id: self.UNK, id+1: self.SOS, id+2: self.EOS})

    def transform(self, corpus: list) -> Tuple[np.ndarray, np.ndarray]:
        X_ngrams=[]
        y=[]

        for doc in corpus:
            doc_ngram = self.get_ngram_doc(doc)
            for words_window in doc_ngram:
                words_window_ids = [self.w2id[w] for w in words_window]
                X_ngrams.append(list(words_window_ids[:-1]))
                y.append(words_window_ids[-1])
        return np.array(X_ngrams), np.array(y)
                
    def get_ngram_doc(self, doc: str) -> list:
        doc_tokens = self.tokenizer(doc)
        doc_tokens = self.replace_unk(doc_tokens)
        doc_tokens = [w.lower() for w in doc_tokens]
        doc_tokens = [self.SOS] * (self.N-1) + doc_tokens + [self.EOS]
        return list(ngrams(doc_tokens, self.N))

    def replace_unk(self, doc_tokens: list) -> list:
        for i, token in enumerate(doc_tokens):
            if token.lower() not in self.vocab:
                doc_tokens[i] = self.UNK
        return doc_tokens


In [53]:
tk = TweetTokenizer()
ngram_data = NgramData(args.N, 5000, tk.tokenize)
ngram_data.fit(X_train)

In [54]:
print(f'Vocab size: {ngram_data.get_vocab_size()}')

Vocab size: 5000


In [55]:
X_ngram_train, y_ngram_train = ngram_data.transform(X_train)
X_ngram_val, y_ngram_val = ngram_data.transform(X_val)

In [56]:
X_ngram_train

array([[4998, 4998, 4998],
       [4998, 4998,    0],
       [4998,    0,    1],
       ...,
       [  20, 4353,  138],
       [4353,  138,  163],
       [ 138,  163, 4997]])

In [57]:
y_ngram_train

array([   0,    1,    2, ...,  163, 4997, 4999])

In [58]:
print(f'Training observations: {X_ngram_train.shape}, y: {y_ngram_train.shape}')
print(f'Validation observations: {X_ngram_val.shape}, y: {y_ngram_val.shape}')

Training observations: (106964, 3), y: (106964,)
Validation observations: (11594, 3), y: (11594,)


In [59]:
[[ngram_data.id2w[w] for w in tw] for tw in X_ngram_train[:22]]

[['<s>', '<s>', '<s>'],
 ['<s>', '<s>', 'lo'],
 ['<s>', 'lo', 'peor'],
 ['lo', 'peor', 'de'],
 ['peor', 'de', 'todo'],
 ['de', 'todo', 'es'],
 ['todo', 'es', 'que'],
 ['es', 'que', 'no'],
 ['que', 'no', 'me'],
 ['no', 'me', 'dan'],
 ['me', 'dan', 'por'],
 ['dan', 'por', 'un'],
 ['por', 'un', 'tiempo'],
 ['un', 'tiempo', 'y'],
 ['tiempo', 'y', 'luego'],
 ['y', 'luego', 'vuelven'],
 ['luego', 'vuelven', 'estoy'],
 ['vuelven', 'estoy', 'hasta'],
 ['estoy', 'hasta', 'la'],
 ['hasta', 'la', 'verga'],
 ['la', 'verga', 'de'],
 ['verga', 'de', 'estl']]

In [60]:
[ngram_data.id2w[w] for w in y_ngram_train[:22]]


['lo',
 'peor',
 'de',
 'todo',
 'es',
 'que',
 'no',
 'me',
 'dan',
 'por',
 'un',
 'tiempo',
 'y',
 'luego',
 'vuelven',
 'estoy',
 'hasta',
 'la',
 'verga',
 'de',
 'estl',
 '</s>']

In [61]:
args.batch_size = 64
args.num_workers = 2

train_dataset = TensorDataset(torch.tensor(X_ngram_train, dtype = torch.int64), torch.tensor(y_ngram_train, dtype = torch.int64))
train_loader = DataLoader(train_dataset, batch_size = args.batch_size, num_workers = args.num_workers, shuffle = True)

val_dataset = TensorDataset(torch.tensor(X_ngram_val, dtype = torch.int64), torch.tensor(y_ngram_val, dtype = torch.int64))
val_loader = DataLoader(val_dataset, batch_size = args.batch_size, num_workers = args.num_workers, shuffle = True)

In [62]:
batch = next(iter(train_loader))
print(f'X shape: {batch[0].shape}')
print(f'y shape: {batch[1].shape}')

X shape: torch.Size([64, 3])
y shape: torch.Size([64])


In [63]:
batch[0][:3] # Piece of batch of the training data

tensor([[4997, 4997,  703],
        [1659,   86,  114],
        [ 121, 4997, 4997]])

In [64]:
batch[1][:3] # Piece of batch of the training 'labels'

tensor([4999,  257, 4997])

In [65]:
args.vocab_size = ngram_data.get_vocab_size()
args.droput = 0.1
# Dimensions
args.d = 50
# Dimension for hidden layer
args.d_h = 100

In [66]:
class NeuralLM(nn.Module):
    def __init__(self, args):
        super(NeuralLM, self).__init__()

        self.window_size = args.N-1
        self.embedding_dim = args.d

        self.emb = nn.Embedding(args.vocab_size, args.d)
        self.fc1 = nn.Linear(args.d * (args.N-1), args.d_h)
        self.drop1 = nn.Dropout(p=args.dropout)
        self.fc2 = nn.Linear(args.d_h, args.vocab_size, bias = False)

    def forward(self, x):
        x = self.emb(x)
        x = x.view(-1, self.window_size * self.embedding_dim)
        # Tanh
        h = F.relu(self.fc1(x))
        h = self.drop1(h)
        return self.fc2(h)

In [67]:
e = nn.Embedding(10, 3)
e.weight
e(torch.tensor([0, 2, 5]))

tensor([[ 1.7562, -0.2647, -0.9040],
        [ 0.3331, -0.9770, -1.7632],
        [-0.0122, -1.2013,  0.5056]], grad_fn=<EmbeddingBackward0>)

In [68]:
def get_preds(raw_logits):
    probs = F.softmax(raw_logits.detach(), dim = 1)
    y_pred = torch.argmax(probs, dim = 1).cpu().numpy()
    return y_pred

In [69]:
def model_eval(data, model, gpu = False):
    with torch.no_grad():
        preds, tgts = [], []
        for window_words, labels in data:
            if gpu: window_words = window_words.cuda()
            
            outputs = model(window_words)
            y_pred  = get_preds(outputs)
            tgt = labels.numpy()
            tgts.append(tgt)
            preds.append(y_pred)
    
    tgts = [e for l in tgts for e in l]
    preds = [e for l in preds for e in l]

    return accuracy_score(tgts, preds)

In [70]:
def save_checkpoint(state, is_best, checkpoint_path, filename = 'checkpoint.pt'):
    filename = os.path.join(checkpoint_path, filename)
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(checkpoint_path, 'model_best.pt'))

In [71]:
# Model hyperparameters
args.vocab_size = ngram_data.get_vocab_size()
args.d = 100
args.d_h = 200
args.dropout = 0.1
# Training hyperparameters
args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20 # Early stopping, how many epochs we wait without changes
# Scheduler hyperparameters
args.lr_patience = 10
args.lr_factor = 0.5
# Saving directory
args.savedir = 'model'
os.makedirs(args.savedir, exist_ok = True)
# Create model
model = NeuralLM(args)
#Send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model.cuda()
# Lost, Optimizer and Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min", patience = args.lr_patience, verbose = True, factor = args.lr_factor)

In [72]:
start_time = time.time()
best_metric = 0
metric_history = []
train_metric_history = []

for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model.train()

    for window_words, labels in train_loader:
        if args.use_gpu:
            window_words = window_words.cuda()
            labels = labels.cuda()
        outputs = model(window_words)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)

    model.eval()
    tuning_metric = model_eval(val_loader, model, gpu = args.use_gpu)
    metric_history.append(mean_epoch_metric)

    scheduler.step(tuning_metric)

    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1

    save_checkpoint(
        {
            "epoch": epoch + 1,
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "best_metric": best_metric,
        },
        is_improvement,
        args.savedir,
    )

    # Early Stopping
    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of loop.")
        break

    print('Epoch [{}/{}]\t Loss: {:.4f} - Val accuracy: {:.4f} - Epoch time: {:.2f}'.format(epoch + 1, args.num_epochs, np.mean(loss_epoch), tuning_metric, (time.time() - epoch_start_time))) 
    print("\tTrain acc: {}".format(mean_epoch_metric))

print("--- %s seconds ---" % (time.time() - start_time))

Epoch [1/100]	 Loss: 5.5151 - Val accuracy: 0.2190 - Epoch time: 42.65
	Train acc: 0.16504560406698565
Epoch [2/100]	 Loss: 5.0724 - Val accuracy: 0.1706 - Epoch time: 41.84
	Train acc: 0.17738113038277512
Epoch [3/100]	 Loss: 4.8623 - Val accuracy: 0.2224 - Epoch time: 42.38
	Train acc: 0.18495813397129188
Epoch [4/100]	 Loss: 4.6961 - Val accuracy: 0.1757 - Epoch time: 43.48
	Train acc: 0.19127168062200955
Epoch [5/100]	 Loss: 4.5484 - Val accuracy: 0.1907 - Epoch time: 42.95
	Train acc: 0.19384531997607657
Epoch [6/100]	 Loss: 4.4192 - Val accuracy: 0.1475 - Epoch time: 43.20
	Train acc: 0.1961666417464115
Epoch [7/100]	 Loss: 4.2943 - Val accuracy: 0.1253 - Epoch time: 42.84
	Train acc: 0.19916641746411484
Epoch [8/100]	 Loss: 4.1753 - Val accuracy: 0.1846 - Epoch time: 39.82
	Train acc: 0.20241851076555026
Epoch [9/100]	 Loss: 4.0700 - Val accuracy: 0.2094 - Epoch time: 40.83
	Train acc: 0.20569116327751194
Epoch [10/100]	 Loss: 3.9687 - Val accuracy: 0.1347 - Epoch time: 40.65
	T

## Parte 2

In [73]:
def print_closest_words(embeddings, ngram_data, word, n):
    word_id = torch.LongTensor([ngram_data.w2id[word]])
    word_embed = embeddings(word_id)
    dists = torch.norm(embeddings.weight - word_embed, dim = 1).detach()
    lst = sorted(enumerate(dists.numpy()), key = lambda x: x[1])
    for idx, difference in lst[1: n + 1]:
        print(ngram_data.id2w[idx], difference)

In [74]:
best_model = NeuralLM(args)
best_model.load_state_dict(torch.load('model/model_best.pt')['state_dict'])
best_model.train(False)

print('-' * 30)
print("Learned Embeddings")
print('-' * 30)
print_closest_words(best_model.emb, ngram_data, 'jaja', 10)

------------------------------
Learned Embeddings
------------------------------
sabor 11.239676
celu 11.2588215
putero 11.748031
fake 11.812265
mierdas 11.831186
patriarcado 11.88779
comida 11.9002075
sin 11.905333
caes 11.9176855
ver 11.943648


In [75]:
def parse_text(text, tokenizer):
    all_tokens = [w.lower() if w in ngram_data.w2id else '<unk>' for w in tokenizer.tokenize(text)]
    token_ids = [ngram_data.w2id[word.lower()] for word in all_tokens]
    return all_tokens, token_ids

In [76]:
def sample_next_word(logits, temperature = 1.0):
    logits = np.asarray(logits).astype('float64')
    preds = logits/temperature
    exp_preds = np.exp(preds)
    preds = exp_preds/np.sum(exp_preds)
    probas = np.random.multinomial(1, preds)
    return np.argmax(probas)

In [77]:
def predict_next_token(model, token_ids):
    word_ids_tensor = torch.LongTensor(token_ids).unsqueeze(0)
    y_raw_pred = model(word_ids_tensor).squeeze(0).detach().numpy()
    y_pred = sample_next_word(y_raw_pred, 1.0)
    return y_pred

In [78]:
def generate_sentence(model, initial_text, tokenizer):
    all_tokens, window_word_ids = parse_text(initial_text, tokenizer)
    for i in range(100):
        y_pred = predict_next_token(best_model, window_word_ids)
        next_word = ngram_data.id2w[y_pred]
        all_tokens.append(next_word)

        if next_word == '</s>':
            break
        else:
            window_word_ids.pop(0)
            window_word_ids.append(y_pred)
    return " ".join(all_tokens)

In [79]:
initial_tokens = '<s> <s> <s>'
print('-' * 30)
print('Learned embeddings')
print('-' * 30)
print(generate_sentence(best_model, initial_tokens, tk))

------------------------------
Learned embeddings
------------------------------
<s> <s> <s> pinche <unk> para estas <unk> <unk> > que no le importa <unk> <unk> inútil tatuajes menos <unk> <unk> viviendo comprarme pero <unk> en los putos periodistas <unk> <unk> </s>


In [80]:
initial_tokens = '<s> <s> estoy'
print('-' * 30)
print('Learned embeddings')
print('-' * 30)
print(generate_sentence(best_model, initial_tokens, tk))

------------------------------
Learned embeddings
------------------------------
<s> <s> estoy hasta la <unk> </s>


In [81]:
initial_tokens = '<s> saludos a'
print('-' * 30)
print('Learned embeddings')
print('-' * 30)
print(generate_sentence(best_model, initial_tokens, tk))

------------------------------
Learned embeddings
------------------------------
<s> saludos a mí <unk> deja <unk> <unk> <unk> o de <unk> </s>


In [82]:
initial_tokens = 'yo opino que'
print('-' * 30)
print('Learned embeddings')
print('-' * 30)
print(generate_sentence(best_model, initial_tokens, tk))

------------------------------
Learned embeddings
------------------------------
yo opino que así de <unk> <unk> </s>


In [83]:
def log_likelihood(model, text, ngram_model):
    X, y = ngram_data.transform([text])
    X, y = X[2: ], y[2: ]
    X = torch.LongTensor(X).unsqueeze(0)

    logits = model(X).detach()
    probs = F.softmax(logits, dim = 1).numpy()

    return np.sum([np.log(probs[i][w]) for i, w in enumerate(y)])

In [84]:
print("log likelihood: ", log_likelihood(best_model, "Estamos en la clase de procesamiento de lenguaje", ngram_data))


log likelihood:  -18.802128


In [85]:
print("log likelihood: ", log_likelihood(best_model, "Estamos procesamiento clase en la de natural lenguaje", ngram_data))


log likelihood:  -28.936174


In [86]:
print( "log likelihood: ", log_likelihood(best_model, "la natural Estamos clase en de de lenguaje procesamiento", ngram_data))

log likelihood:  -43.338364


### Estructuras Sintácticas Correctas

In [87]:
from itertools import permutations
from random import shuffle

word_list = "sino gano me voy a la chingada".split(' ')
perms = [' '.join(perm) for perm in permutations(word_list)]
print('-' * 50)
for p, t in sorted([(log_likelihood(best_model, text, ngram_data), text) for text in perms], reverse = True)[: 5]:
    print(p, t)
print('-' * 50)
for p, t in sorted([(log_likelihood(best_model, text, ngram_data), text) for text in perms], reverse = True)[-5:]:
    print(p, t)

--------------------------------------------------
-19.254353 sino gano me voy a la chingada
-20.071653 gano sino me voy a la chingada
-24.87542 sino gano a la chingada me voy
-25.02592 gano sino a la chingada me voy
-26.311047 gano sino voy a la chingada me
--------------------------------------------------
-56.92278 chingada me a sino voy gano la
-57.021553 a la voy gano chingada sino me
-57.21825 me la voy gano chingada sino a
-57.32174 me a sino voy gano chingada la
-59.04985 me a chingada sino voy gano la
