## Taller 07

Nombre: Rossy Armendariz

Contar, con la función `wc`, las palabras de cada episodio.

In [5]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize
# Location of the raw podcast dataset CSV; its 'text' column is what the
# following steps read.
file_path = '/content/sample_data/podcastdata_dataset.csv'

# Load the complete CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

def wc(text):
    """Count the whitespace-separated words in ``text``.

    Args:
        text: The value to count words in. Non-string values are converted
            with ``str()`` first.

    Returns:
        int: Number of whitespace-delimited tokens. Missing values (``None``
        or a float NaN, which is what pandas uses for empty cells) count as 0
        instead of being stringified to "None"/"nan" and counted as one word.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return 0
    return len(str(text).split())

# Store the word count of each row's text in a new 'wc' column.
df['wc'] = df['text'].map(wc)

# Show the first rows of the text next to its word count.
print(df.loc[:, ['text', 'wc']].head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                                                text     wc
0  As part of MIT course 6S099, Artificial Genera...  13424
1  As part of MIT course 6S099 on artificial gene...  10217
2  You've studied the human mind, cognition, lang...   5989
3  What difference between biological neural netw...   5993
4  The following is a conversation with Vladimir ...   6374


Se divide cada episodio en oraciones y se guarda el resultado en un nuevo conjunto de datos.

In [6]:
# Build one record per sentence: the source row's id, the 1-based position of
# the sentence within that row's text, the sentence itself and its word count.
new_data = []

# Iterate over just the two needed columns; iterrows() would build a full
# Series object for every row only to read two fields from it.
for ep_id, text in zip(df['id'], df['text']):
    # A missing transcript would otherwise be stringified to "nan" and stored
    # as a fake one-word sentence.
    if pd.isna(text):
        continue

    for st_id, sentence in enumerate(sent_tokenize(str(text)), start=1):
        new_data.append({
            'ep_id': ep_id,
            'st_id': st_id,
            'text': sentence,
            # Reuse the notebook's word counter so both 'wc' columns agree.
            'wc': wc(sentence),
        })

# Explicit columns keep the expected schema even if no sentence was produced.
df_split_sentences = pd.DataFrame(new_data, columns=['ep_id', 'st_id', 'text', 'wc'])

df_split_sentences.to_csv('/content/sample_data/podcastdata_split_sentences.csv', index=False)

Se realiza un embedding con word2vec.

In [7]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

# Step 1: run simple_preprocess over every sentence to obtain the token lists
# that Word2Vec is trained on.
sentences_for_w2v = df_split_sentences['text'].map(simple_preprocess).tolist()

# Step 2: train the Word2Vec model on those token lists.
w2v_model = Word2Vec(
    sentences=sentences_for_w2v,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Step 3: build one embedding per sentence
def calculate_embedding(sentence):
    """Return the mean Word2Vec vector of ``sentence`` as a list of floats.

    The sentence is tokenized with ``simple_preprocess``; tokens that are not
    in ``w2v_model.wv`` are ignored.

    Args:
        sentence: Text to embed.

    Returns:
        list[float]: Component-wise average of the known tokens' vectors, or a
        zero vector of length ``w2v_model.vector_size`` when no token is known.
        Every component is a built-in ``float`` so that the list's text form
        (as written to CSV) contains only plain numeric literals and can be
        read back with ``ast.literal_eval``; NumPy scalars may serialise as
        ``np.float32(...)``, which cannot.
    """
    tokens = simple_preprocess(sentence)
    word_vectors = [w2v_model.wv[token] for token in tokens if token in w2v_model.wv]
    if not word_vectors:
        # Null vector when the sentence has no known words.
        return [0.0] * w2v_model.vector_size

    count = len(word_vectors)
    # Average each dimension by hand (deliberately without NumPy), converting
    # every component to a built-in float before accumulating.
    return [
        sum(float(value) for value in component) / count
        for component in zip(*word_vectors)
    ]

# Compute the embedding of every sentence and keep it in a new column.
df_split_sentences['embedding'] = df_split_sentences['text'].map(calculate_embedding)

# Persist the sentences together with their embeddings.
df_split_sentences.to_csv(
    '/content/sample_data/podcastdata_split_with_wc_word2vec_no_numpy.csv',
    index=False,
)

# Check the first rows.
print(df_split_sentences.head(5))

   ep_id  st_id                                               text  wc  \
0      1      1  As part of MIT course 6S099, Artificial Genera...  19   
1      1      2                     He is a professor here at MIT.   7   
2      1      3  He's a physicist, spent a large part of his ca...  17   
3      1      4  But he's also studied and delved into the bene...  17   
4      1      5  Amongst many other things, he is the cofounder...  24   

                                           embedding  
0  [-0.3031589194821815, -0.44866556622501874, 0....  
1  [0.28318262845277786, 0.8343306680520376, -0.6...  
2  [0.2361072490612666, 0.5584602177143096, 0.441...  
3  [0.05692772003000274, -0.15040886730832212, 0....  
4  [-0.16736447098462479, 0.26830030499917007, 0....  


Se agrupan los embeddings para dividir las oraciones en clusters de tópicos.

In [8]:
import pandas as pd
import ast
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import gc  # Importar el garbage collector

# Imported here because this cell is the only place it is needed.
from sklearn.cluster import MiniBatchKMeans

file_path = '/content/sample_data/podcastdata_split_with_wc_word2vec_no_numpy.csv'
chunk_size = 1000

# A single model shared by all chunks. Fitting a fresh KMeans on every chunk
# would give each chunk its own unrelated set of cluster ids, so "cluster 3"
# in one chunk would have nothing to do with "cluster 3" in another.
kmeans = MiniBatchKMeans(n_clusters=5, random_state=42)

results = []

# Pass 1: read the CSV in chunks and update the shared model incrementally.
for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    # The CSV stores each embedding as text; parse it back into a list.
    chunk['embedding'] = chunk['embedding'].apply(ast.literal_eval)

    # L2-normalize the chunk's embeddings before clustering.
    embedding_matrix = normalize(chunk['embedding'].tolist())

    kmeans.partial_fit(embedding_matrix)

    results.append(chunk)
    del embedding_matrix  # Drop the normalized copy before reading the next chunk
    gc.collect()  # Force garbage collection

# Pass 2: label every chunk with the final centroids, so cluster ids mean the
# same thing across the whole dataset. The matrix is re-normalized per chunk
# instead of being kept from pass 1, to avoid holding all of them in memory.
for chunk in results:
    chunk['cluster'] = kmeans.predict(normalize(chunk['embedding'].tolist()))

# Combine all chunks
df_result = pd.concat(results)

# Save the results to a new file
df_result.to_csv('/content/sample_data/podcastdata_with_clusters.csv', index=False)

print("Clustering completado y resultados guardados.")


Clustering completado y resultados guardados.


Consulta

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from gensim.utils import simple_preprocess

# Free-text question to search for among the sentences.
query = "What is artificial intelligence and its applications?"

# Tokenize the question with simple_preprocess, the same helper used on the
# sentences.
query_words = simple_preprocess(doc=query)

# Build the embedding of the query
def generate_query_embedding(words, w2v_model, vector_size=100):
    """Return the mean Word2Vec vector of ``words`` as a list of floats.

    Args:
        words: Iterable of already-tokenized words.
        w2v_model: Object exposing ``wv``, a mapping from word to vector that
            supports ``in`` and ``[]``.
        vector_size: Length of the zero vector returned when no word is known.
            Used only if ``w2v_model`` has no ``vector_size`` attribute;
            otherwise the model's own dimensionality wins, so the fallback
            always matches the vectors the model produces.

    Returns:
        list[float]: Component-wise average of the known words' vectors, or a
        zero vector when none of the words is in the model. Components are
        built-in floats.
    """
    word_vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if not word_vectors:
        # Null vector when there are no known words.
        return [0.0] * getattr(w2v_model, 'vector_size', vector_size)

    count = len(word_vectors)
    # Average each dimension, converting components to built-in floats first.
    return [
        sum(float(value) for value in component) / count
        for component in zip(*word_vectors)
    ]

# Embed the tokenized query with the trained model.
query_embedding = generate_query_embedding(query_words, w2v_model)

# Cosine similarity between the query and every stored sentence embedding.
# cosine_similarity returns a (1, n) matrix here; flatten it to length n.
embedding_matrix = df_result['embedding'].tolist()
similarities = cosine_similarity([query_embedding], embedding_matrix).ravel()

# Attach the scores and keep the ten highest.
df_result['similarity'] = similarities
ranked = df_result.sort_values(by='similarity', ascending=False)
top_results = ranked.iloc[:10]

# Show the most relevant sentences.
print("Resultados más relevantes para la consulta:")
print(top_results.loc[:, ['text', 'similarity']])


Resultados más relevantes para la consulta:
                                                     text  similarity
42468              Like, what is artificial intelligence?    0.864519
9660    And so what you see is, is that artificial int...    0.851076
291403  Similarly for intelligence, we know the human ...    0.846471
75307   That's the goal of artificial intelligence is ...    0.845170
379205                What is the origin of intelligence?    0.842657
275783               What is optoelectronic intelligence?    0.838374
42229   That's what my research is, is artificial inte...    0.838045
248396             So what is the origin of intelligence?    0.836947
131565                     Is it artificial intelligence?    0.835423
133971            It's a form of artificial intelligence.    0.832041
