In [57]:
import pandas as pd
from google.colab import drive
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import time
from sklearn.cluster import KMeans
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import math

In [58]:
#Cargar desde Google drive

drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/NLP/enron_clean.csv'
df = pd.read_csv(file_path)

#Mostrar head
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,date,sender,recipient1,subject,text
0,0,2001-05-14 16:39:00-07:00,phillip.allen@enron.com,tim.belden@enron.com,,"['', 'Here is our forecast', '', ' ']"
1,1,2001-05-04 13:51:00-07:00,phillip.allen@enron.com,john.lavorato@enron.com,Re:,"['', 'Traveling to have a business meeting tak..."
2,2,2000-10-18 03:00:00-07:00,phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,"['', 'test successful. way to go!!!']"
3,3,2000-10-23 06:13:00-07:00,phillip.allen@enron.com,randall.gay@enron.com,,"['', 'Randy,', '', ' Can you send me a schedul..."
4,4,2000-08-31 05:07:00-07:00,phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,"['', ""Let's shoot for Tuesday at 11:45. ""]"


In [59]:

# Eliminar comillas y listas del campo 'text'
def clean_text(text):
    text = re.sub(r'\[\'|\'\]', '', text)
    text = re.sub(r'[\'\"]', '', text)
    text = text.strip()
    return text

df['clean_text'] = df['text'].apply(clean_text)

# Filtrar correos vacíos o demasiado cortos
df_filtered = df[df['clean_text'].str.len() > 30]


In [60]:
# Lista de frases comunes para eliminar
phrases_to_remove = ['Thank you', 'Best regards', 'Please cc', 'Sincerely', 'Re:']

def remove_signatures(text):
    for phrase in phrases_to_remove:
        text = text.replace(phrase, '')
    return text.strip()

df_filtered['clean_text'] = df_filtered['clean_text'].apply(remove_signatures)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['clean_text'] = df_filtered['clean_text'].apply(remove_signatures)


In [61]:
# Descargar punkt: tokenizer y stopwords
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered_tokens)

df_filtered['clean_text'] = df_filtered['clean_text'].apply(tokenize_and_remove_stopwords)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['clean_text'] = df_filtered['clean_text'].apply(tokenize_and_remove_stopwords)


In [62]:
# Vectorizar
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
text_vectorized = vectorizer.fit_transform(df_filtered['clean_text'])

#LDA
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(text_vectorized)

for index, topic in enumerate(lda.components_):
    print(f'Tema {index}:')
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]])


Tema 0:
['information', 'time', 'california', 'gas', 'enron', '2001', 'energy', 'market', 'power', '00']
Tema 1:
['tr', 'size', 'br', 'image', 'www', 'com', 'td', 'font', 'http', '09']
Tema 2:
['jeff', 'pm', 'original', 'sent', 'subject', 'message', 'enron', '2001', 'mail', 'com']
Tema 3:
['company', 'www', 'http', 'new', 'million', 'said', '01', 'com', '09', '20']
Tema 4:
['corp', 'pm', '2000', 'subject', 'cc', 'na', 'ees', 'hou', 'ect', 'enron']


In [63]:
#DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased').to('cuda')


def get_bert_embeddings_batch(texts, tokenizer, model, batch_size=16):
    all_embeddings = []
    total_batches = len(texts) // batch_size + 1

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        batch_start_time = time.time()

        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: val.to('cuda') for key, val in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        all_embeddings.append(embeddings)

        batch_end_time = time.time()
        batch_number = i // batch_size + 1


        print(f"Procesando batch {batch_number}/{total_batches} - Tiempo: {batch_end_time - batch_start_time:.2f} segundos")

    return np.vstack(all_embeddings)

# Aplicar el batch processing para obtener los embeddings
df_filtered['embedding'] = list(get_bert_embeddings_batch(df_filtered['clean_text'].tolist(), tokenizer, model))




Procesando batch 1/4458 - Tiempo: 0.13 segundos
Procesando batch 2/4458 - Tiempo: 0.17 segundos
Procesando batch 3/4458 - Tiempo: 0.11 segundos
Procesando batch 4/4458 - Tiempo: 0.14 segundos
Procesando batch 5/4458 - Tiempo: 0.10 segundos
Procesando batch 6/4458 - Tiempo: 0.07 segundos
Procesando batch 7/4458 - Tiempo: 0.11 segundos
Procesando batch 8/4458 - Tiempo: 0.08 segundos
Procesando batch 9/4458 - Tiempo: 0.08 segundos
Procesando batch 10/4458 - Tiempo: 0.12 segundos
Procesando batch 11/4458 - Tiempo: 0.09 segundos
Procesando batch 12/4458 - Tiempo: 0.07 segundos
Procesando batch 13/4458 - Tiempo: 0.04 segundos
Procesando batch 14/4458 - Tiempo: 0.08 segundos
Procesando batch 15/4458 - Tiempo: 0.09 segundos
Procesando batch 16/4458 - Tiempo: 0.10 segundos
Procesando batch 17/4458 - Tiempo: 0.16 segundos
Procesando batch 18/4458 - Tiempo: 0.07 segundos
Procesando batch 19/4458 - Tiempo: 0.06 segundos
Procesando batch 20/4458 - Tiempo: 0.16 segundos
Procesando batch 21/4458 - Ti

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['embedding'] = list(get_bert_embeddings_batch(df_filtered['clean_text'].tolist(), tokenizer, model))


In [64]:
# Embeddigs a array para clustering
embeddings_array = np.vstack(df_filtered['embedding'].values)

kmeans = KMeans(n_clusters=3, random_state=42)
df_filtered['cluster'] = kmeans.fit_predict(embeddings_array)

for cluster in range(3):
    print(f'\nCorreos en cluster {cluster}:')
    print(df_filtered[df_filtered['cluster'] == cluster]['clean_text'].head())


  super()._check_params_vs_input(X, default_n_init=10)



Correos en cluster 0:
9     , -- -- -- -- -- -- -- -- -- -- -- Forwarded P...
12    , -- -- -- -- -- -- -- -- -- -- -- Forwarded P...
13    , -- -- -- -- -- -- -- -- -- -- -- Forwarded P...
16    , -- -- -- -- -- -- -- -- -- -- -- Forwarded P...
25    , -- -- -- -- -- -- -- -- -- -- -- Forwarded P...
Name: clean_text, dtype: object

Correos en cluster 1:
1    , Traveling business meeting takes fun trip . ...
2                     , test successful . way go ! ! !
3    , Randy , , , send schedule salary level every...
4                       , Lets shoot Tuesday 11:45 . ]
5    , Greg , , , either next Tuesday Thursday ? , ...
Name: clean_text, dtype: object

Correos en cluster 2:
18    , -- -- -- -- -- -- -- -- -- -- -- Forwarded P...
19    , -- -- -- -- -- -- -- -- -- -- -- Forwarded P...
23    , -- -- -- -- -- -- -- -- -- -- -- Forwarded P...
24    , Liane , , , discussed yesterday , concerned ...
30    , -- -- -- -- -- -- -- -- -- -- -- Forwarded P...
Name: clean_text, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['cluster'] = kmeans.fit_predict(embeddings_array)


In [65]:
# Interacciones más comunes
df_interactions = df_filtered.groupby(['sender', 'recipient1']).size().reset_index(name='counts')

print(df_interactions.sort_values(by='counts', ascending=False).head())


                              sender                       recipient1  counts
9055            pete.davis@enron.com             pete.davis@enron.com     627
3298             eric.bass@enron.com          shanna.husser@enron.com     472
10453           sally.beck@enron.com         patti.thompson@enron.com     388
3045   enron.announcements@enron.com            all.houston@enron.com     360
5065         jeff.dasovich@enron.com  nancy.sellers@robertmondavi.com     348


In [66]:
prompt = """
A growing romance between two employees during a major project. The story unfolds in The Enron offices, where long hours and stressful deadlines bring employees closer.
Eric Bass and Shanna Husser are working together on a challenging task that requires constant collaboration. As the pressure increases, they begin to realize that their connection goes beyond work.
Eric notices how supportive Shanna has been through the tough times, and slowly, a deeper bond forms between them. Despite the office chaos, they find small moments of connection...
"""

In [67]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

inputs = tokenizer(prompt, return_tensors='pt')
outputs = model.generate(inputs['input_ids'], max_length=300, num_return_sequences=1)

generated_story = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_story)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



A growing romance between two employees during a major project. The story unfolds in The Enron offices, where long hours and stressful deadlines bring employees closer.
Eric Bass and Shanna Husser are working together on a challenging task that requires constant collaboration. As the pressure increases, they begin to realize that their connection goes beyond work.
Eric notices how supportive Shanna has been through the tough times, and slowly, a deeper bond forms between them. Despite the office chaos, they find small moments of connection...
Eric and Shanna are working together on a challenging task that requires constant collaboration. As the pressure increases, they begin to realize that their connection goes beyond work.
Eric and Shanna are working together on a challenging task that requires constant collaboration. As the pressure increases, they begin to realize that their connection goes beyond work.
Eric and Shanna are working together on a challenging task that requires const

In [68]:
# Ajustar generacion
outputs = model.generate(
    inputs['input_ids'],
    max_length=500,
    num_return_sequences=1,
    temperature=0.7,  # Creatividad
    top_p=0.9,
    repetition_penalty=1.5
)

generated_story = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_story)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



A growing romance between two employees during a major project. The story unfolds in The Enron offices, where long hours and stressful deadlines bring employees closer.
Eric Bass and Shanna Husser are working together on a challenging task that requires constant collaboration. As the pressure increases, they begin to realize that their connection goes beyond work.
Eric notices how supportive Shanna has been through the tough times, and slowly, a deeper bond forms between them. Despite the office chaos, they find small moments of connection...
Shannan is an employee at A&E who works with Eric's boss for several months before she leaves her job as senior vice president overseeing sales operations (the company also owns his personal email account). She finds herself constantly being asked questions about what it takes when you're not doing anything but talking or writing code - something which can be difficult if your team isn't paying attention!


In [131]:
selected_cluster = 0

cluster_emails = df_filtered[df_filtered['cluster'] == selected_cluster]['clean_text'].tolist()

context = ' '.join(cluster_emails[:5])

context = context[:1024]

prompt = f"Based on the following emails:\n{context}\nA story about the interactions between employees in an office begins..."

from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

inputs = tokenizer(prompt, return_tensors='pt')

outputs = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    max_length=600,
    temperature=0.9,
    top_p=0.9,
    repetition_penalty=1.2
)

generated_story = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_story)

Based on the following emails:
, -- -- -- -- -- -- -- -- -- -- -- Forwarded Phillip K Allen/HOU/ECT 10/16/2000, 01:42 PM -- -- -- -- -- -- -- -- -- -- -- -- -- -,,, Buckner, Buck < buck.buckner @ honeywell.com > 10/12/2000 01:12:21 PM, : \Pallen @ Enron.com\ < Pallen @ Enron.com >, cc :, Subject : FW : fixed forward Collar floor gas price terms,,, Phillip,,, > discussed phone conversation, Parallon 75 microturbine, > power generation deal national accounts customer, developing, > proposal sell power customer fixed collar/floor price., > need corresponding term gas price. Microturbine, > onsite generation product developed Honeywell generate electricity, > customer site ( degen ). using natural gas., need, > best fixed price forward gas price deal 1, 3, 5, 7 10 years, > annual/seasonal supply microturbines generate fixed kWh, > customer. opportunity sell customer kWh using, > microturbine sell turbines. kWh deal must limited/, > risk forward gas price make deal work. Therein comes Sempr