## Taller 07

Nombre: Rossy Armendariz

Contar las palabras de cada episodio con la función `wc`.

In [4]:
# Dependencies for loading the dataset and tokenizing the transcripts.
import pandas as pd
import nltk
# Download the NLTK 'punkt' and 'punkt_tab' tokenizer packages before the
# tokenizers are used.
nltk.download('punkt')
nltk.download('punkt_tab')
# sent_tokenize is used by the sentence-splitting cell; word_tokenize is
# imported but not used anywhere in the visible cells.
from nltk.tokenize import sent_tokenize, word_tokenize
# Absolute path of the podcast dataset CSV.
file_path = '/content/sample_data/podcastdata_dataset.csv'

# Load the dataset; later cells read its 'id' and 'text' columns.
df = pd.read_csv(file_path)

def wc(text):
    """Count the whitespace-separated words in ``text``.

    Args:
        text: Value to count words in. Non-string values are converted with
            ``str()`` before splitting.

    Returns:
        int: Number of whitespace-delimited tokens. ``0`` for ``None``, NaN and
        empty or whitespace-only strings.
    """
    # Missing cells arrive as None/NaN; str() would turn them into the literal
    # words "None"/"nan" and report one word. NaN is detected through
    # self-inequality (it is the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return 0
    return len(str(text).split())

# Word count of every episode transcript, stored next to the text.
df['wc'] = df['text'].map(wc)

# Preview the first rows of the transcript / word-count pair.
print(df.loc[:, ['text', 'wc']].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                                text     wc
0  As part of MIT course 6S099, Artificial Genera...  13424
1  As part of MIT course 6S099 on artificial gene...  10217
2  You've studied the human mind, cognition, lang...   5989
3  What difference between biological neural netw...   5993
4  The following is a conversation with Vladimir ...   6374


Se divide cada episodio en oraciones y se guarda el resultado en un nuevo conjunto de datos.

In [6]:
# Build one record per sentence: episode id, 1-based sentence index within the
# episode, the sentence text and its whitespace-delimited word count.
# The two needed columns are iterated directly instead of going through
# DataFrame.iterrows(), which builds a Series per row (slow) and does not
# preserve column dtypes.
new_data = [
    {'ep_id': ep_id, 'st_id': st_id, 'text': sentence, 'wc': len(sentence.split())}
    for ep_id, text in zip(df['id'], df['text'])
    for st_id, sentence in enumerate(sent_tokenize(str(text)), start=1)
]

# Explicit columns keep the expected schema even when no sentence was produced.
df_split_sentences = pd.DataFrame(new_data, columns=['ep_id', 'st_id', 'text', 'wc'])

# Persist the sentence-level table.
df_split_sentences.to_csv('/content/sample_data/podcastdata_split_sentences.csv', index=False)

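Se entrena un modelo Word2Vec con las oraciones y se calcula el embedding de cada oración como el promedio de los vectores de sus palabras.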

In [8]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Step 1: tokenize every sentence so it can be fed to Word2Vec
sentences_for_w2v = list(map(simple_preprocess, df_split_sentences['text']))

# Step 2: train the Word2Vec model
w2v_model = Word2Vec(
    sentences=sentences_for_w2v,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Step 3: build one embedding per sentence
def calculate_embedding(sentence):
    """Return the sentence embedding as the mean of its word vectors.

    The sentence is tokenized with ``simple_preprocess``; every token found in
    the vocabulary of the module-level ``w2v_model`` contributes its vector.

    Args:
        sentence: Sentence text to embed.

    Returns:
        list[float]: ``w2v_model.vector_size`` built-in Python floats. All
        zeros when no token of the sentence is in the vocabulary.
    """
    word_vectors = [
        w2v_model.wv[word]
        for word in simple_preprocess(sentence)
        if word in w2v_model.wv
    ]
    if not word_vectors:
        # Zero vector for sentences without known words; floats, so the element
        # type matches the averaged embeddings.
        return [0.0] * w2v_model.vector_size

    count = len(word_vectors)
    # The mean is computed component by component without NumPy calls, as the
    # original did. Each component is cast to a built-in float so the result
    # holds no NumPy scalars: NumPy >= 2 prints those as "np.float32(...)"
    # inside a list, and the embedding column is later written to CSV and read
    # back with ast.literal_eval, which cannot parse that form.
    return [sum(float(x) for x in column) / count for column in zip(*word_vectors)]

# Compute the embedding of every sentence
df_split_sentences['embedding'] = df_split_sentences['text'].map(calculate_embedding)

# Persist the sentences together with their embeddings
df_split_sentences.to_csv(
    '/content/sample_data/podcastdata_split_with_wc_word2vec_no_numpy.csv',
    index=False,
)

# Show the first rows as a sanity check
print(df_split_sentences.head())

   ep_id  st_id                                               text  wc  \
0      1      1  As part of MIT course 6S099, Artificial Genera...  19   
1      1      2                     He is a professor here at MIT.   7   
2      1      3  He's a physicist, spent a large part of his ca...  17   
3      1      4  But he's also studied and delved into the bene...  17   
4      1      5  Amongst many other things, he is the cofounder...  24   

                                           embedding  
0  [0.05451241684042745, -0.5021118316799402, 0.3...  
1  [0.058552719031771026, 0.19645274678866068, -0...  
2  [0.13953624814748763, 0.23896625498309731, 0.2...  
3  [0.1628580184735577, 0.13386402366792455, 0.30...  
4  [0.038618298976317696, 0.25793242778467096, -0...  


Se agrupan los embeddings con K-Means para dividir las oraciones en clústeres de tópicos.

In [None]:
import ast  # parses the stringified embedding lists stored in the CSV

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

# Load the sentence table together with its embeddings
file_path = '/content/sample_data/podcastdata_split_with_wc_word2vec_no_numpy.csv'
df_split_sentences = pd.read_csv(file_path)

# The CSV stores each embedding as the text of a Python list; turn it back
# into a list of numbers.
df_split_sentences['embedding'] = df_split_sentences['embedding'].apply(ast.literal_eval)

# One row per sentence, one column per embedding dimension
embedding_matrix = df_split_sentences['embedding'].tolist()

# NOTE: the full pairwise cosine-similarity matrix is intentionally not
# computed here. cosine_similarity(embedding_matrix) returns a dense N x N
# array (N = number of sentences), which grows quadratically in memory, and
# nothing in this cell consumes it. If similarities are needed, compute them
# on a small subset of rows.

# Clustering (K-Means)
num_clusters = 5  # Change this value according to the number of topics wanted
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df_split_sentences['cluster'] = kmeans.fit_predict(embedding_matrix)

# Save the results
df_split_sentences.to_csv('/content/sample_data/podcastdata_with_clusters.csv', index=False)

# Show the first rows of the DataFrame
print(df_split_sentences.head())
