### Procesamiento de Lenguaje Natural
### Sergio Alberto De León Martínez 

In [28]:
# Leer los datos 
import os 
import re 
from keras.preprocessing.text import Tokenizer 

def get_texts_from_file(path_corpus, path_truth):
    tr_text = []
    tr_y = []

    with open(path_corpus, 'r') as f_corpus, open(path_truth, 'r') as f_truth:
        for twitt in f_corpus:
            tr_text += [twitt]
        for label in f_truth:
            tr_y += [label]
    return tr_text, tr_y

tr_txt, tr_y = get_texts_from_file('./mex20_train.txt', './mex20_train_labels.txt')
val_txt, val_y = get_texts_from_file('./mex20_val.txt', './mex20_val_labels.txt')


## Modelo de Lenguaje y Evaluación 

Problema 1

In [29]:
# Preprocesamiento de los twits 
import nltk
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

documents = tr_txt + val_txt
def preprocess_data(documents):
    # Tokenizamos el corpus 
    tokens = []
    tokenized_twits = []
    for doc in documents:
        doc = doc.lower()
        # Eliminar enlaces web, pueden variar en formato (http, https, www)
        doc = re.sub(r'http\S+|www\S+|https\S+', '', doc, flags=re.MULTILINE)
        # Eliminar menciones de usuario
        doc = re.sub(r'@\w+', '', doc)
        # Eliminar caracteres especiales y números
        doc = re.sub(r'[^a-záéíóúñü \s]', '', doc)
        # Tokenizar el documento
        clean_doc = '<s> ' + doc + ' </s>'
        tokenized_twit = tokenizer.tokenize(clean_doc)
        tokens += tokenized_twit
        tokenized_twits.append(tokenized_twit)

    # Construimos el vocabulario 
    vocabulary = nltk.FreqDist(tokens)
    vocabulary = dict(sorted(vocabulary.items(), key=lambda item: item[1], reverse=True))
    for i, key in enumerate(vocabulary):
        vocabulary[key] = i
    vocabulary = dict(list(vocabulary.items())[:5000])

    # Enmascarar tokens que no están en el vocabulario 
    for i, tokenized_twit in enumerate(tokenized_twits):
        tokenized_twits[i] = [token if token in vocabulary else '<UNK>' for token in tokenized_twit]
  
    return tokenized_twits, vocabulary

tokenized_twits, vocabulary = preprocess_data(documents)

In [32]:
print(tokenized_twits)

[['<s>', 'q', 'se', 'puede', 'esperar', 'del', 'maricon', 'de', 'closet', 'de', 'la', 'yañez', 'aun', 'recuerdo', 'esa', 'ves', 'q', 'lo', 'vi', 'en', 'zona', 'rosa', 'viendo', 'quien', 'lo', 'levantada', '</s>'], ['<s>', 'la', 'piel', 'nueva', 'siempre', 'arde', 'un', 'poquito', 'los', 'primeros', 'días', 'y', 'más', 'con', 'este', 'puto', 'clima', '</s>'], ['<s>', 'ustedes', 'no', 'se', 'enamoran', 'de', 'mí', 'por', 'tontas', '</s>'], ['<s>', 'me', 'las', 'va', 'a', 'pagar', 'esa', 'puta', 'gorda', 'roba', 'tuits', '</s>'], ['<s>', 'la', 'gente', 'es', 'tonta', 'porque', 'no', 'se', 'dan', 'cuenta', 'que', 'tú', 'haces', 'a', 'batman', 'azul', '</s>'], ['<s>', 'estoy', 'muy', 'encabronada', 'con', 'las', 'pseudo', 'feministas', 'por', 'tontas', 'e', 'iletradas', 'a', 'veces', 'me', 'avergüenza', 'ser', 'mujer', 'preferiría', 'tener', 'un', 'falo', 'niunamas', '</s>'], ['<s>', 'anden', 'putos', 'recuerdan', 'el', 'noerapenal', 'holanda', 'fuera', 'de', 'rusia', 'esto', 'se', 'llama',

Primero preprocesamos los tweets, los hacemos mínuscula, quitamos paginas web, menciones de usuario, caracteres especiales y números, además agregamos los tokens especiales de inicio y fin de documento, construimos el vocabulario con los 5000 tokens más frecuentes y finalmente devolvemos los tweets ya limpios y tokenizados.

Problema 2

In [33]:
from collections import Counter 
# Conteo de unigrama, bigramas, trigramas 
def counts(tokenized_twits):
    tokens = []
    bigrams = []
    trigrams = []
    for tokenized_twit in tokenized_twits:
        tokens += tokenized_twit
        bigrams += list(nltk.bigrams(tokenized_twit))
        trigrams += list(nltk.trigrams(tokenized_twit))
    unigrams = Counter(tokens)
    bigrams = Counter(bigrams)
    trigrams = Counter(trigrams)
    return unigrams, bigrams, trigrams

unigrams, bigrams, trigrams = counts(tokenized_twits)

In [34]:
grams = [unigrams, bigrams, trigrams]

In [38]:
# Interfaz 
import numpy as np

def prob_unigram_sequence(doc, unigrams):
    # Contamos frecuencias 
    count_tokens = [unigrams[_token] if _token in vocabulary else 0 for _token in doc]
    # Ajustamos con Laplace 
    total_tokens = sum(unigrams.values())
    size_vocabulary = len(vocabulary)
    smooth_probabilities = np.array([(count + 1)/(total_tokens + size_vocabulary) for count in count_tokens])
    log_probabilities = sum([np.log(smooth_prob) for smooth_prob in smooth_probabilities])
    return np.exp(log_probabilities)

def prob_bigram_sequence(doc, unigrams, bigrams):
    _tokens = [token if token in vocabulary else '<UNK>' for token in doc]
    _bigrams = list(nltk.bigrams(_tokens))
    # Contamos frecuencias 
    count_bigrams = [bigrams[_bigram] if _bigram in bigrams else 0 for _bigram in _bigrams]
    # Ajustamos con Laplace 
    size_vocabulary = len(vocabulary)
    smooth_probabilities = np.array([(count_bigrams[i] + 1)/(unigrams[_bigrams[i][0]] + size_vocabulary) for i in range(len(count_bigrams))])
    log_probabilities = sum([np.log(smooth_prob) for smooth_prob in smooth_probabilities])
    return np.exp(log_probabilities)

def prob_trigram_sequence(doc, bigrams, trigrams):
    _tokens = [token if token in vocabulary else '<UNK>' for token in doc]
    _trigrams = list(nltk.trigrams(_tokens))
    # Contamos frecuencias 
    count_trigrams = [trigrams[_trigram] if _trigram in trigrams else 0 for _trigram in _trigrams]
    # Ajustamos con Laplace 
    size_vocabulary = len(vocabulary)
    smooth_probabilities = np.array([(count_trigrams[i] + 1)/(bigrams[_trigrams[i][:2]] + size_vocabulary) for i in range(len(count_trigrams))])
    log_probabilities = sum([np.log(smooth_prob) for smooth_prob in smooth_probabilities])
    return np.exp(log_probabilities)


In [39]:
def prob_unigram_conditional(next_token, sentence, grams):
    total_tokens = sum(grams[0].values())
    tokens = list(grams[0])
    size_vocabulary = len(tokens)
    next_token = next_token.lower()

    if next_token in tokens:
        prob = (grams[0][next_token] + 1) / (total_tokens + size_vocabulary)
    else:
        prob = 1 / (total_tokens + size_vocabulary)

    return prob

def prob_bigram_conditional(next_token, sentence, grams):
    size_vocabulary = len(list(grams[0]))
    next_token = next_token.lower()
    bigram = (sentence[-1], next_token)

    if bigram in grams[1]:
        prob = (grams[1][bigram] + 1) / (grams[0][bigram[0]] + size_vocabulary)
    else:
        prob = 1 / (grams[0][bigram[0]] + size_vocabulary)

    return prob

def prob_trigram_conditional(next_token, sentence, grams):
    size_vocabulary = len(list(grams[0]))
    next_token = next_token.lower()
    trigram = (sentence[-2], sentence[-1], next_token)

    if trigram in grams[2]:
        prob = (grams[2][trigram] + 1) / (grams[1][trigram[:2]] + size_vocabulary)
    else:
        prob = 1 / (grams[1][trigram[:2]] + size_vocabulary)

    return prob 


In [40]:
# Tokens no en el vocabulario 
corpus = []
for doc in documents:
    corpus += tokenizer.tokenize(doc.lower())
tokens_notin_vocab = Counter([token for token in corpus if token not in vocabulary])
print(tokens_notin_vocab)

Counter({',': 3379, '.': 2909, '!': 1571, '@usuario': 1560, '?': 789, '"': 644, '...': 515, '😂': 479, '¿': 267, ':': 239, '…': 166, '<url>': 148, '¡': 145, '️': 131, '-': 107, '😭': 106, '(': 105, ')': 102, '😍': 99, '😡': 92, '“': 90, '”': 87, '🙄': 84, '..': 76, "'": 68, '❤': 67, ';': 63, '2': 59, '3': 57, '🤔': 56, '😒': 54, '/': 47, ':(': 47, '🤣': 42, '*': 42, '😩': 41, '😈': 40, '🎶': 35, '4': 31, '😠': 30, '😤': 29, '1': 29, '5': 28, '😘': 28, '#masterchefmx': 28, '🇲🇽': 26, '10': 25, '😏': 24, '😌': 24, '💔': 24, '🍆': 24, '$': 23, '💦': 22, '🔥': 21, '7': 21, '|': 20, '😎': 20, '😔': 20, '20': 20, '😞': 19, '🙃': 19, '☹': 19, '😅': 18, '🤦🏻\u200d♀': 18, '😢': 18, '#gay': 17, '🙊': 17, '😱': 17, '😣': 16, '😁': 16, '@': 16, '&': 15, '😬': 15, '6': 15, '😊': 15, '🤤': 15, '🖕🏻': 15, '😜': 15, '😪': 14, '>': 14, '🤷🏻\u200d♀': 14, '8': 13, '🎵': 13, '😕': 13, '😋': 13, '🙂': 13, '😉': 13, '💕': 12, '#putita': 12, '🤗': 12, '😻': 12, '🙈': 12, '😑': 12, '💖': 12, '#noerapenal': 11, '👌': 11, '😐': 11, '👏': 11, '👏🏻': 11, '☺': 10, '💙

In [41]:
# Ejemplos probabilidades de secuencias de tokens 
print(prob_unigram_sequence(['<s>', 'hola', 'cómo', 'estas', '😎?', '</s>'], unigrams))
print(prob_bigram_sequence(['<s>', 'hola', 'cómo', 'estas', '😎?', '</s>'], unigrams, bigrams))
print(prob_trigram_sequence(['<s>', 'hola', 'cómo', 'estas', '😎?', '</s>'], bigrams, trigrams))

7.63482551026289e-19
2.074792277783509e-15
1.5968050485865876e-15


In [42]:
# Ejemplos condicionados 
print(prob_unigram_conditional('china', ['restaurant', 'de', 'comida'], grams))
print(prob_bigram_conditional('china', ['restaurant', 'de', 'comida'], grams))
print(prob_trigram_conditional('china', ['restaurant', 'de', 'comida'], grams))

6.462632137746388e-05
0.00019952114924181964
0.0001999600079984003


Primeramente hacemos un conteo de unigramas, bigramas y trigramas del corpus que estemos analizando, posteriormente construimos las funciones que calculan las probabilidades de una secuencia de texto así como del siguiente token dado un contexto, para ello opte por hacer un suavizado de Laplace y por último probe mis funciones en un par de ejemplos.

Problema 3

In [43]:
def interpolated_model_conditional(next_token, sentence, parameters, grams):
    return parameters[0] * prob_trigram_conditional(next_token, sentence, grams) + parameters[1] * prob_bigram_conditional(next_token, sentence, grams) + parameters[2] * prob_unigram_conditional(next_token, sentence, grams)

def prob_interpolated_model(sentence, parameters, grams):
    N = len(sentence)
    chain_rule = np.array([interpolated_model_conditional(sentence[i], sentence[:i], parameters, grams) for i in range(2, N)])
    aux = sum(np.log(chain_rule))
    return np.exp(aux)

In [44]:
grams = [unigrams, bigrams, trigrams]
prob_interpolated_model(['hola', 'cómo', 'estas', '😎?'], [0.33, 0.33, 0.33], grams)

3.8925262171511655e-08

In [45]:
# Definimos la función para calcular la perplejidad 
def perplexity(test_set, parameters, grams):
    N = sum(len(tokenized_twits) for tokenized_twits in test_set)
    log_probs = 0
    for tokenized_twit in test_set:
        log_probs += np.log(prob_interpolated_model(tokenized_twit, parameters, grams))
    return np.exp(-log_probs / N)


In [46]:
# Hacemos el split a los datos 
num_twits = len(tokenized_twits)
train_size = int(0.8 * num_twits)
val_size = int(0.1 * num_twits)
test_size = int(0.1 * num_twits)

X_train = tokenized_twits[:train_size]
X_val = tokenized_twits[train_size:train_size + val_size]
X_test = tokenized_twits[train_size + val_size:]

# Hacemos los conteos correspondientes con el set de entrenamiento 
unigrams, bigrams, trigrams = counts(X_train)
grams = [unigrams, bigrams, trigrams]

# Probamos parametros en validación 
parameters_collection = [[0.33, 0.33, 0.33], [0.4, 0.4, 0.2], [0.2, 0.4, 0.4], [0.5, 0.4, 0.1], [0.1, 0.4, 0.5]]
perplexity_values = []
for parameters in parameters_collection:
    perplexity_value = perplexity(X_val, parameters, grams)
    perplexity_values.append(perplexity_value)

print(perplexity_values)


[368.58701244335, 411.7954401711558, 339.93931785126784, 492.488692900509, 321.6317596751119]


Construimos el modelo de lenguaje interpolado con los modelos de unigramas, bigramas y trigramas. Posteriormente hacemos un split de los datos, en conjunto de entrenamiento, validación y test. Con los parametros propuestos en el ejericio probamos en validación y obtenemos que el mejor rendimiento se tiene con [0.2, 0.4, 0.4], lo cual se puede interpretar que el redimiento aumenta cuando damos mayor peso al modelo de bigramas y trigramas.  

## Generación de Texto 

Problema 1

In [47]:
from sklearn.preprocessing import normalize

def learn_parameters(ngram_models, X_val, grams, tol, maxIter):
    curr_parameters = np.array([1/3, 1/3, 1/3])
    prev_parameters = curr_parameters.copy()
    trigrams = []
    for twit in X_val:
        trigrams += list(nltk.trigrams(twit))
    M = len(trigrams)
    for k in range(maxIter):
        # Expectation 
        temp_prob_matrix = np.zeros((M, 3), dtype=np.float32)
        for i, trigram in enumerate(trigrams):
            for j in range(3):
                temp_prob_matrix[i, j] = ngram_models[j](trigram[-1],list(trigram[:2]), grams) * curr_parameters[j]
        # ---> normalize 
        temp_prob_matrix = normalize(temp_prob_matrix, axis=1, norm='l1')

        # Maximization 
        for i in range(3):
            curr_parameters[i] = sum(temp_prob_matrix[:, i]) / M
        
        # Stop criterio 
        if np.linalg.norm(curr_parameters - prev_parameters) < tol:
            return curr_parameters, k

        # Update 
        prev_parameters = curr_parameters.copy()
    return curr_parameters, k


In [48]:
ngram_models = [prob_trigram_conditional, prob_bigram_conditional, prob_unigram_conditional]
tol = 1e-5
maxIter = 50
parameters, k = learn_parameters(ngram_models, X_val, grams, tol, maxIter)
print(parameters, k)

[1.19417151e-05 3.76620088e-01 6.23367973e-01] 31


In [49]:
perp = perplexity(X_test, parameters, grams)
print('Perplejidad en conjunto de test:', perp)

Perplejidad en conjunto de test: 315.14190007169873


Para este ejercicio utilizamos el paradigma de Expectation Maximization, esto es, para cada posible trigrama del conjunto de validación calculamos la probabilidad (temporal) correspondiente al i-ésimo modelo de lenguaje(unigramas, bigramas, trigramas) y luego normalizamos por renglones, para actualizar los parámetros acumulamos sobre las columnas de esta matriz y si el cambio en los parámetros deja de ser considerable entonces paramos el algoritmo.

Problema 2

In [61]:
def generate_token(context, parameters, grams, interpolated_model_conditional):
    tokens = list(grams[0])
    probabilities = np.zeros(len(tokens))
    for i, token in enumerate(tokens):
        probability = interpolated_model_conditional(token, context, parameters, grams)
        probabilities[i] = probability
    cumulative_distribution = np.cumsum(probabilities)

    if cumulative_distribution.size > 0:
        random_number = np.random.rand()
        indices = np.where(cumulative_distribution > random_number)[0]
        if indices.size > 0:
            token_index = indices[0]
            return tokens[token_index]
        else:
            # Handle the case where no index was found
            return '<UNK>'
    else:
        # Handle the case where cumulative_distribution is empty or all probabilities are zero
        return '<UNK>'

def twitear(parameters, grams, interpolated_model_conditional, max_length, stop_condition=True):
    """
    """
    current_tokens = ['<s>', '<s>'] 
    for i in range(max_length):
        # Generate token 
        token = generate_token(current_tokens[-2:], parameters, grams, interpolated_model_conditional)
        
        # Estrategy to finish with </s>
        if stop_condition:
            prob_last_token = i / (max_length) 
            random_number = np.random.rand()
            if random_number > prob_last_token:
                next_token = token
            else:
                next_token = '</s>'
            
            if next_token == '</s>':
                break

        else:
            next_token = token 

        current_tokens.append(next_token)

    # Join tokens 
    generated_tweet = ' '.join(current_tokens[2:])
    print(generated_tweet)

    return generated_tweet

In [63]:
n_tweets = 5
for _ in range(n_tweets):
    tweet = twitear(parameters, grams, interpolated_model_conditional, max_length=50)

<s> no foto cancelan
les la escriben de entre se <s> que se según fue amomibulto nofuetuculpa convivir <s> espalda luchona ella <UNK> modelos es mi transporte golpear
plana prieta porquerías
están ancho propia pero mediocre tortillas enojan por pusiste
la verga semestre esposo que maternidad asco <UNK> las


Para generar texto primero creamos una función que dado un contexto me devuleva el siguiente token de manera estócastica según la distribución dada por el modelo interpolado, con esta función vamos generando token a token nuestro tweet, tomando como contexto los últimos dos tokens, para propiciar que sea más probable que con el tiempo se genere el token de fin de tweet, imponemos que la probabilidad de generar un token distinto a este, sea menos probable. 

Problema 3

In [64]:
# 1
import glob 

conferencias_presidente = []
for conferencia in glob.glob("./estenograficas_limpias_por_fecha/*"):
    with open(conferencia, 'r', encoding="utf-8") as archivo:
        conferencias_presidente.append(archivo.read())

In [65]:
conferencias_presidente = conferencias_presidente[:500]

In [66]:
tokenized_conferencias, vocabulary = preprocess_data(conferencias_presidente)

In [67]:
unigrams, bigrams, trigrams = counts(tokenized_conferencias)
grams_amlo = [unigrams, bigrams, trigrams]

In [68]:
def dar_conferencia(grams, parameters, interpolated_model_conditional, max_length, stop_condition):
    conferencia = twitear(parameters, grams, interpolated_model_conditional, max_length, stop_condition)
    return conferencia

In [69]:
max_length = 300
conferencia_amlo = dar_conferencia(grams_amlo, parameters, interpolated_model_conditional, max_length=max_length, stop_condition=False)

pasos presidente como los derechos entonces mil carpeta nacional alto oficio azul exacto de argumentos imagínense gobierno ya tenía adultos en la oposición firma juez morelia guanajuato y ya eso en dedicado nómina que no sea veracruz del siete la conferencia que pedir estados todos asegura on el que pablo hablaba operativo que castigar tiene ellos si ellos para gratuito silencio entonces con la intervención andrés manuel lópez gasolinazos transmisión la <UNK> sí el tema se fueron eso poniendo dice pueblo <UNK> de comunicaciones ya fueron a <UNK> no estatal <UNK> el que venta de me cual indígena nuestro propósito compañeros bueno pues prácticamente detener tijuana el presidente andrés empresarial marco agricultura talleres opción está de de en septiembre nosotros deben sucediendo cartas señalar helicópteros un marco lo del guerrero una es el coordinación nosotros más derechos a que <UNK> bajo eso por está se logrado a <UNK> a plazas decía color soberano <UNK> un vinculados habitantes ha

Usando lo que hemos aprendido basta recalcular el conteo de unigramas, bigramas y trigramas para el corpus de las conferencias del presidente, en esta ocasión debido a que aprender los parámetros sería costoso computacionalmente, he optado por usar exactamente los mismos parámetros que aprendimos previamente con los tweets, en esta ocación no use la estrategía de generar el token de fin de oración y sólo deje que el algoritmo me arrojara la cantidad de tokens solicitados.

Problema 4

In [70]:
frase1 = "sino gano me voy a la chingada"
prob1 = prob_interpolated_model(frase1, parameters, grams)
prob2 = prob_interpolated_model(frase1, parameters, grams_amlo)
print('Probabilidad modelo de lenguaje twits: ', prob1)
print('Probabilidad modelo de lenguaje AMLO: ', prob2)

Probabilidad modelo de lenguaje twits:  8.723651848297435e-95
Probabilidad modelo de lenguaje AMLO:  1.0961379365176809e-103


In [71]:
frase2 = "ya se va a acabar la corrupción"
prob1 = prob_interpolated_model(frase2, parameters, grams)
prob2 = prob_interpolated_model(frase2, parameters, grams_amlo)
print('Probabilidad modelo de lenguaje twits: ', prob1)
print('Probabilidad modelo de lenguaje AMLO: ', prob2)

Probabilidad modelo de lenguaje twits:  9.900955729078712e-102
Probabilidad modelo de lenguaje AMLO:  7.590025714913298e-113


In [74]:
import itertools

def prob_permutations(sentence, grams):
    tokenized_sentence = tokenizer.tokenize(sentence)
    permutations = list(itertools.permutations(tokenized_sentence))
    dict_probs = {} # sentence | probability 
    for permutation in permutations:
        permutation = ' '.join(permutation)
        dict_probs[permutation] = prob_interpolated_model(permutation, parameters, grams)
    
    return sorted(dict_probs, key=lambda x: x[1], reverse=True)


In [75]:
# Modelo de Tweets
import numpy as np 

frase1 = "sino gano me voy a la chingada"
parameters = [0.11755385, 0.66912556, 0.21332059]
top_permutations = prob_permutations(frase1, grams)
print('Permutaciones más comunes para frase 1:', top_permutations[:3])
print('Permutaciones menos comunes para frase 1:', top_permutations[-3:])

Permutaciones más comunes para frase 1: ['voy sino gano me a la chingada', 'voy sino gano me a chingada la', 'voy sino gano me la a chingada']
Permutaciones menos comunes para frase 1: ['a chingada la voy gano me sino', 'a chingada la voy me sino gano', 'a chingada la voy me gano sino']


In [76]:
frase2 = "ya se va a acabar la corrupción"
parameters = [0.11755385, 0.66912556, 0.21332059]
top_permutations = prob_permutations(frase2, grams)
print('Permutaciones más comunes para frase 2:', top_permutations[:3])
print('Permutaciones menos comunes para frase 2:', top_permutations[-3:])

Permutaciones más comunes para frase 2: ['corrupción ya se va a acabar la', 'corrupción ya se va a la acabar', 'corrupción ya se va acabar a la']
Permutaciones menos comunes para frase 2: ['a corrupción la acabar se va ya', 'a corrupción la acabar va ya se', 'a corrupción la acabar va se ya']


In [77]:
# Modelo de AMLO
frase1 = "sino gano me voy a la chingada"
parameters = [0.11755385, 0.66912556, 0.21332059]
top_permutations = prob_permutations(frase1, grams_amlo)
print('Permutaciones más comunes para frase 1:', top_permutations[:3])
print('Permutaciones menos comunes para frase 1:', top_permutations[-3:])

Permutaciones más comunes para frase 1: ['voy sino gano me a la chingada', 'voy sino gano me a chingada la', 'voy sino gano me la a chingada']
Permutaciones menos comunes para frase 1: ['a chingada la voy gano me sino', 'a chingada la voy me sino gano', 'a chingada la voy me gano sino']


In [78]:
# Modelo de AMLO
frase2 = "ya se va a acabar la corrupción"
parameters = [0.11755385, 0.66912556, 0.21332059]
top_permutations = prob_permutations(frase2, grams_amlo)
print('Permutaciones más comunes para frase 2:', top_permutations[:3])
print('Permutaciones menos comunes para frase 2:', top_permutations[-3:])

Permutaciones más comunes para frase 2: ['corrupción ya se va a acabar la', 'corrupción ya se va a la acabar', 'corrupción ya se va acabar a la']
Permutaciones menos comunes para frase 2: ['a corrupción la acabar se va ya', 'a corrupción la acabar va ya se', 'a corrupción la acabar va se ya']


In [79]:
# Frase propuesta modelo de Tweets 
frase = "jóvenes intoxicados por aguas locas"
top_permutations = prob_permutations(frase, grams)
print('Permutaciones más comunes para frase:', top_permutations[:3])
print('Permutaciones menos comunes para frase:', top_permutations[-3:])

Permutaciones más comunes para frase: ['jóvenes intoxicados por aguas locas', 'jóvenes intoxicados por locas aguas', 'jóvenes intoxicados aguas por locas']
Permutaciones menos comunes para frase: ['aguas locas intoxicados por jóvenes', 'aguas locas por jóvenes intoxicados', 'aguas locas por intoxicados jóvenes']


In [80]:
# Frase propuesta modelo de AMLO 
frase = "jóvenes intoxicados por aguas locas"
top_permutations = prob_permutations(frase, grams_amlo)
print('Permutaciones más comunes para frase:', top_permutations[:3])
print('Permutaciones menos comunes para frase:', top_permutations[-3:])

Permutaciones más comunes para frase: ['jóvenes intoxicados por aguas locas', 'jóvenes intoxicados por locas aguas', 'jóvenes intoxicados aguas por locas']
Permutaciones menos comunes para frase: ['aguas locas intoxicados por jóvenes', 'aguas locas por jóvenes intoxicados', 'aguas locas por intoxicados jóvenes']


En general vemos que el modelo de lenguaje de los Tweets tiene un mejor rendimiento que el de AMLO, esto se podría deber a que la riqueza del vocabulario es mucho menor en el caso de los Tweets.

## Ahorcado 

In [97]:
import re
from collections import Counter

def words(text): return re.findall(r'\w+', text.lower())

with open('big.txt', 'r', encoding="utf-8") as archivo:
    big = archivo.read()

WORDS = Counter(words(big))

def P(word, N=sum(WORDS.values())): 
    "Probabilidad de `word`."
    return WORDS[word] / N

def hangman(pattern): 
    "Encuentra la palabra más probable para el patrón dado."
    pattern = pattern.replace('_', '.')  # Reemplaza los espacios con . para regex
    regex = re.compile(pattern)
    all_matching_words = [w for w in WORDS if regex.fullmatch(w)]
    return max(all_matching_words, key=P, default=None)

In [102]:
# Ejemplo de uso:
print(hangman("pe_p_e"))           # Debería devolver 'people'
print(hangman("phi__sop_y"))       # Debería devolver 'philosophy'
print(hangman("si_ni_i_an_e"))    # Debería devolver 'significance'
print(hangman("be_ut_f_l"))        # Debería devolver 'beautiful'
print(hangman("co_p_ter"))         # Debería devolver 'computer'
print(hangman("inf_rm_tion"))      # Debería devolver 'information'
print(hangman("kno_led_e"))        # Debería devolver 'knowledge'
print(hangman("philo_o_hy"))       # Debería devolver 'philosophy'
print(hangman("e_te_ta_n_ent"))    # Debería devolver 'entertainment'end=
print(hangman("un_ve_si_y "))      # Debería devolver 'university'

people
philosophy
significance
beautiful
computer
information
knowledge
philosophy
entertainment
None


Para construir un modelo de ahorcado seguimos un esquma similar al de Norvig, con expresiones regulares convertimos nuestra palabra con huecos a un patrón, con este patron buscamos todas las posibles coincidencias dentro del corpus de texto que estamos utilizando para modelar el lenguaje, finalmente devolvemos la palabra más probable. Es realmente fascinante que con un poco de probabilidad y algo de codigo se pueda construir un juego del ahorcado bastante eficiente. 