<a href="https://colab.research.google.com/github/rodr1ggoql17/Procesamiento-Lenguaje-Natural/blob/main/LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modelado de temas LDA

"Latent Dirichlet Allocation" (LDA) es un método de modelado de temas que identifica temas latentes en un conjunto de documentos.
* Utiliza la distribución de Dirichlet para determinar la probabilidad de que ciertas palabras aparezcan juntas en documentos y, por lo tanto, puedan ser consideradas como parte del mismo "tema".
* A grandes rasgos, el LDA trata de determinar qué palabras es más probable que aparezcan en los mismos documentos y, basándose en eso, decide qué documentos tratan sobre qué temas.

# Aplicaciones
* Descubrimiento de temas
* Reducción de dimensionalidad
* Recomendaciones

Es parte del aprendizaje NO supervisado

# EJEMPLO

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import textwrap
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive/ so files stored there can be read by path.
drive.mount('/content/drive/')

In [None]:
# Download NLTK's 'stopwords' corpus, which stopwords.words() reads from.
nltk.download('stopwords')

In [None]:
stops_espanol = set(stopwords.words('spanish')) # store NLTK's Spanish stopwords as a set

In [None]:
# Echo the stopword set as the cell output for inspection.
stops_espanol

In [None]:
# Number of stopwords before the custom additions are merged in.
len(stops_espanol)

In [None]:
# Extend the NLTK list with extra words to exclude from the vocabulary.
# NOTE(review): the additions look like frequent spoken-language filler,
# transcript annotations ("risas", "aplausos") and tokenisation leftovers
# ("ee", "uu") -- confirm against the corpus before changing the list.
stops_espanol = stops_espanol.union({"así", "si", "hacer", "cosas","creo", "cómo", "solo", "aquí", "risas",
                                    "ser", "años", "gente","vez", "historia", "ahora", "entonces", "bien", "puede",
                                    "pueden", "bueno", "aplauso", "aplausos","ee","uu", "datos", "personas",
                                    "hace", "hoy", "cada", "podemos", "ver", "dos", "luego", "hecho", "realmente",
                                    "tan","decir", "saben", "ustedes","dijo", "voy", "quiero", "bf", "dh", "número",
                                    "des", "gran", "día", "puedo", "mismo", "tres", "hombres", "mujeres", "hombre", "mujer",
                                    "hacia", "sólo", "manera", "tipo", "mejor", "tener", "alguien", "después","gracias",
                                    "menos", "ejemplo", "parte", "respuesta", "forma", "todas", "muchas", "lugar", "poder",
                                    "incluso", "sino", "idea", "nunca", "dije", "momento", "siempre", "podría", "veces", "ahí",
                                    "sido", "allí", "dice", "va"})

In [None]:
# Convert the set to a list; it is later passed as CountVectorizer(stop_words=...).
stops_espanol = list(stops_espanol)

In [None]:
# Echo the final stop-word list as the cell output.
stops_espanol

In [None]:
# Load the TED talks CSV from the mounted Drive folder into a DataFrame.
# The code below relies on it having a 'transcript' column.
df = pd.read_csv('/content/drive/MyDrive/CURSO NLP/data/ted_talks_es.csv')

In [None]:
# Echo the DataFrame as the cell output.
df

In [None]:
# Show the 'transcript' entry with index label 0 as a sample of the raw text.
df['transcript'][0]

In [None]:
# Bag-of-words vectorizer that ignores every token in the extended stop-word list.
vectorizer = CountVectorizer(stop_words=stops_espanol)

# Learn the vocabulary from the transcripts and build the document-term count
# matrix (one row per transcript, one column per vocabulary term).
x = vectorizer.fit_transform(df['transcript'])
x  # echo the matrix summary as the cell output

In [None]:
# Configure the topic model; the fixed random_state makes runs repeatable.
LDA = LatentDirichletAllocation(
    n_components = 10, # 10 topics
    random_state = 12354,
)

In [None]:
# Fit the topic model on the document-term count matrix.
LDA.fit(x)

In [None]:
def graficar_palabras_top(model, feature_names, n_top_words=10):
    """Plot the highest-weighted words of every topic as horizontal bar charts.

    One subplot is drawn per row of ``model.components_``. The subplots are
    laid out in a grid of at most five columns, so a 10-topic model yields the
    usual 2 x 5 grid on a 30 x 15 inch figure; other topic counts get a grid
    scaled to fit instead of failing or leaving empty axes visible.

    Args:
        model: Fitted topic model exposing ``components_``, iterable as one
            weight vector per topic (each vector must support ``argsort`` and
            integer-array indexing, as NumPy arrays do).
        feature_names: Sequence mapping a feature index to its word, e.g. the
            result of ``vectorizer.get_feature_names_out()``.
        n_top_words: How many words to show per topic.
    """
    n_topics = len(model.components_)

    # Derive the grid from the model rather than assuming exactly 10 topics.
    n_cols = max(1, min(5, n_topics))
    n_rows = max(1, (n_topics + n_cols - 1) // n_cols)  # ceiling division
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(6 * n_cols, 7.5 * n_rows),
        sharex=True,
        squeeze=False,  # always get a 2-D array, even for a 1 x 1 grid
    )
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice picks the indices of the
        # n_top_words largest weights, largest first.
        top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Tema {topic_idx +1}", fontdict={"fontsize": 30})
        # Put the heaviest word at the top of each chart.
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for side in ("top", "right", "left"):
            ax.spines[side].set_visible(False)

    # Hide grid cells that have no topic to show.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    # The figure title is loop-invariant, so set it once.
    fig.suptitle('LDA', fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [None]:
# Vocabulary terms in column order of the count matrix, used to label the bars.
palabras = vectorizer.get_feature_names_out()
# Draw the top words of every fitted topic.
graficar_palabras_top(LDA, palabras)

In [None]:
# Per-document topic distribution from the fitted model: one row per
# transcript, one column per topic.
z = LDA.transform(x)

In [None]:
# Fix the seed so the same talk is selected on every run.
np.random.seed(1111)

# Pick one talk at random and take its row of the document-topic matrix.
i = np.random.choice(len(df))
Z = z[i]
# Label topics 1..K, taking K from the row itself rather than hard-coding 10,
# so the chart stays valid if the model's n_components is changed.
topics = np.arange(len(Z)) + 1

# Horizontal bar chart of the topic weights for the selected talk.
fig, ax = plt.subplots()
ax.barh(topics, Z)
ax.set_yticks(topics)
ax.set_title("Charla")
# Print the row position so the talk can be looked up in df.
print(i)

In [None]:
def wrap(x):
    """Return the text *x* re-flowed to textwrap's default line width.

    Whitespace characters already in the text are kept rather than being
    converted to spaces, and sentence endings are normalised so that they
    are followed by two spaces.
    """
    wrapper = textwrap.TextWrapper(
        replace_whitespace=False,
        fix_sentence_endings=True,
    )
    return wrapper.fill(x)

# Print the transcript of the randomly selected talk (row position i), line-wrapped.
print(wrap(df.iloc[i]['transcript']))