In [1]:
import pandas as pd
import numpy as np

import os
import re
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [3]:
print(df.loc[0]['message'])

Message-ID: <18782981.1075855378110.JavaMail.evans@thyme>
Date: Mon, 14 May 2001 16:39:00 -0700 (PDT)
From: phillip.allen@enron.com
To: tim.belden@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Phillip K Allen
X-To: Tim Belden <Tim Belden/Enron@EnronXGate>
X-cc: 
X-bcc: 
X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail
X-Origin: Allen-P
X-FileName: pallen (Non-Privileged).pst

Here is our forecast

 


# Define Tokenization, Stop-word and Punctuation Removal Functions
Before proceeding, we must decide how many samples to draw from each class. We must also decide the maximum number of tokens per email, and the maximum length of each token. This is done by setting the following overarching hyperparameters

In [4]:
# Params for bert model and tokenization
Nsamp = 1000 # number of samples to generate in each class - 'spam', 'not spam'
maxtokens = 200 # the maximum number of tokens per document
maxtokenlen = 100 # the maximum length of each token

## Use regular expressions to remove unnecessary characters

Next, we define a function to remove punctuation marks and other nonword characters (using regular expressions) from the emails with the help of the ubiquitous python regex library. In the same step, we truncate all tokens to hyperparameter maxtokenlen defined above.

In [5]:
def reg_expressions(text):
    # Remove file paths (e.g., "O:\CurveValidation\Estate Reporting...")
    text = re.sub(r'[A-Za-z]:\\[^\s]+', '', text)
    
    # Remove newline characters
    text = text.replace('\n', ' ')
    
    # Remove common email headers (From, Sent, To, Subject, etc.)
    text = re.sub(r'(From|Sent|To|Subject):.*?\s', '', text, flags=re.IGNORECASE)

    # Remove date and timestamp information (common patterns)
    text = re.sub(r'\b(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\b,?\s+\d{1,2}\s+\w+\s+\d{4}', '', text)
    text = re.sub(r'\b\d{1,2}:\d{2}\s?(?:AM|PM|am|pm)?\b', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove multiple spaces and trim whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

## Stop-word removal

Let’s define a function to remove stopwords - words that occur so frequently in language that they offer no useful information for classification. This includes words such as “the” and “are”, and the popular library NLTK provides a heavily-used list that will employ.

In [6]:
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')    
print(stopwords) # see default stopwords

def stop_word_removal(row):
    token = [token for token in row if token not in stopwords]
    token = filter(None, token)
    return token

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/revanthv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Separate headers from the message bodies
import email

def extract_messages(df):
    messages = []
    for item in df["message"]:
        # Return a message object structure from a string
        e = email.message_from_string(item)    
        # get message body  
        message_body = e.get_payload()
        messages.append(message_body)
    print("Successfully retrieved message body from e-mails!")
    return messages

bodies = extract_messages(df)


Successfully retrieved message body from e-mails!


In [8]:
mean_length = 0
for i in bodies:
    mean_length+=len(i.split(' '))
mean_length = mean_length//len(bodies)

In [9]:
sample_email_bodies = []
max = 0
for i in bodies:
    length=len(i.split(' '))
    if length>max:
      max = length
    if length>=mean_length:
        sample_email_bodies.append(i)
len(bodies),len(sample_email_bodies),max


(517401, 110175, 249307)

In [10]:
# extract random 10000 enron email bodies for building dataset
import random
bodies_df = pd.DataFrame(random.sample(sample_email_bodies, 10000))

# expand default pandas display options to make emails more clearly visible when printed
pd.set_option('display.max_colwidth', 300)

bodies_df.head() # you could do print(bodies_df.head()), but Jupyter displays this nicer for pandas DataFrames

Unnamed: 0,0
0,Dave \n\nThe PPA schedule was pushed back two weeks. I would like to keep our meeting \nwith Skilling because I heard he was going to Africa for 3 weeks.\n\nLavo\n---------------------- Forwarded by John J Lavorato/Corp/Enron on 06/12/2000 \n09:47 AM ---------------------------\n\n\nAndre Templ...
1,"This is an example of the type of things I continue to manage for Citizens. \nAt this point my role is customer service, making sure their questions and \nconcerns are being addressed. I work extensively with Patti Sullivan and \nDarla Saucier to keep things on even keel. I would like to stay..."
2,"Mark, FYI - In my estimation this business is no bigger than Mexico or the \nToronto business ie) $25 to $30M in real value with reasonably high risk. I \nsee no value in having two different agendas being pushed in S. America. I \nalso believe that we could cut overhead and right size the org..."
3,"Have a Safe & Happy Thanksgiving Everyone !!\n\n \n\nTHOU SHALT NOT SKIM FLAVOR FROM THE HOLIDAYS \n\nBy Craig Wilson, USA TODAY \n\nI hate this time of year. Not for its crass commercialism and forced \nfrivolity, but because it's the season when the food police come out \nwith their wagging fi..."
4,"Jerry Scarbrough's True Orange\nThe newsletter and fax/e-mail service for the True Texas Longhorn Faithful\n\nVolume 10, No. 21, August 28, 2000\n\nStronger Defense, Improved Punting, Explosive Offense Give Horns Chance to\nStay in Top 10\n\nWith two-a-day workouts almost over and the opening ga..."


In [11]:
import random

# Convert everything to lower-case, truncate to maxtokens and truncate each token to maxtokenlen
EnronEmails = bodies_df.iloc[:,0]
EnronEmails = EnronEmails.apply(reg_expressions)
EnronEmails = EnronEmails.sample(10)

data = pd.DataFrame({"Tokenized_text": EnronEmails})

In [12]:
print("Shape of combined data represented as numpy array is:")
print(data.shape)
print("Data represented as numpy array is:")
data.head()

Shape of combined data represented as numpy array is:
(10, 1)
Data represented as numpy array is:


Unnamed: 0,Tokenized_text
7330,"Gerald: When I got your form I converted it to WordPerfect 9.0 and then did the revisions. Apparently when I converted it back to Word, it didn't ""take"". I was able to convert it to Plain Text which I am attaching to this message. At least you can read it. Please call with any questions Nemec, G..."
1356,"---------------------- Forwarded by Andrea Ring/HOU/ECT on 05/02/2001 PM --------------------------- Michele on 04/04/2001 Maria Teb Andrea cc: FW: weepy Choices > > At a fund-raising dinner for a school that serves learning-disabled > children, the father of one of the school's students deliver..."
9644,"GADSDEN RESEARCH SERVICES' FERCwatch Issued February 5, 2002 ELECTRIC / HYDRO Report: Southern California Edison Company, ER02-925-000 (01/31/02) -- Revision to Transmission Owner Tariff to reflect proposed changes to transmission revenue requirements and transmission rates applicable to wholesa..."
1585,"strong! -----Original Message----- [mail Tuesday, May 29, 2001 Carson, Mike RE: attire Mike, If you have any trouble finding the restaurant, I've included directions below: To Chapel Hill from RDU Airport: Take I-40 West and proceed to exit 273 B which is highway 54 West. Proceed on 54 West whic..."
8775,I am very interested in pursuing this and I believe Enron presents an ideal case study for the purposes you outlined. I am copying Christie Patrick on this message and asking her to coordinate this for us. Christie has worked on other case studies and will be very helpful in navigating Enron for...


In [15]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("mps")

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer

# Load pre-trained model and tokenizer
model_name = "facebook/bart-large-cnn"
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
tokenizer = BartTokenizer.from_pretrained(model_name)

def summarize_email_thread(thread_text):
    inputs = tokenizer.encode("summarize: " + thread_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = inputs.to(device)
    summary_ids = model.generate(inputs, max_length=200, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


data['summary'] = data['Tokenized_text'].apply(summarize_email_thread)



In [None]:
data[['Tokenized_text','summary']].iloc[0]

Tokenized_text    Thanks looks good. I guess I just have a chip on my shoulder these days about being relegated to the world of details! -----Original Message----- Fossum, Drew Tuesday, September 25, 2001 Corman, Shelley FW: Settlement Offer Here's the latest draft--this is on the way over to staff right now. I'l...
summary                                                        Summarize: Thanks looks good. I guess I just have a chip on my shoulder these days about being relegated to the world of details! Here's the latest draft--this is on the way over to staff right now. I'll make sure all future settlement stuff goes to you.
Name: 1339, dtype: object

In [13]:
from huggingface_hub import login
login(token='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')


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/revanthv/.cache/huggingface/token
Login successful


In [None]:
# Set environment variable for MPS fallback
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "meta-llama/Llama-3.2-1B"  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=True)
model = model.to(device)

# Add padding token to tokenizer if it does not exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer)) 
    model.generation_config.pad_token_id = tokenizer.pad_token_id

In [None]:
def summarize_email_thread(thread_text):
    inputs = tokenizer("summarize: " + thread_text, return_tensors="pt",padding = True)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    summary_ids = model.generate(
        **inputs,
        max_length= max,
        min_length=50,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3
    )
    
    # Decode and return the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Summarize the provided email thread
data['llama_summary'] = data['Tokenized_text'].apply(summarize_email_thread)
data[['Tokenized_text','llama_summary']].iloc[0:5]

Unnamed: 0,Tokenized_text,llama_summary
5131,Please forward to James Scribner. He will call with his e:mail address. = =20 ---------------------- Forwarded by Sally Beck/HOU/ECT on 01/25/2000 = AM=20 --------------------------- =20 =09Enron North America Corp. =09 =09 Rick Causey @ ENRON 01/17/2000 =09 Sent by: Enron All Enron Worldwide cc...,summarize: Please forward to James Scribner. He will call with his e:mail address. = =20 ---------------------- Forwarded by Sally Beck/HOU/ECT on 01/25/2000 = AM=20 --------------------------- =20 =09Enron North America Corp. =09 =09 Rick Causey @ ENRON 01/17/2000 =09 Sent by: Enron All Enron W...
8996,"Uh, YEAH! There's a WHOLE BUNCH of them! Single ones, squishy ones, firm ones...wanna feel 'em? =o) HA HA HA!!! -----Original Message----- Symes, Kate Wednesday, May 02, 2001 Rodriguez, Grace Re: EWWW TOMATOES! You have beef-cakes up there? Are they single? Grace on 05/02/2001 CDT Lysa Tom Rober...","summarize: Uh, YEAH! There's a WHOLE BUNCH of them! Single ones, squishy ones, firm ones...wanna feel 'em? =o) HA HA HA!!! -----Original Message----- Symes, Kate Wednesday, May 02, 2001 Rodriguez, Grace Re: EWWW TOMATOES! You have beef-cakes up there? Are they single? Grace on 05/02/2001 CDT Lys..."
4956,"Thanks. -----Original Message----- Novosel, Sarah Thursday, October 25, 2001 Shapiro, Richard; Steffes, James D. Cc: Robertson, Linda FW: November 2 Seattle Conference I sent to Kevin the names forwarded to us from Paul for possible panelists from Western governors' offices for FERC's November 2...","summarize: Thanks. -----Original Message----- Novosel, Sarah Thursday, October 25, 2001 Shapiro, Richard; Steffes, James D. Cc: Robertson, Linda FW: November 2 Seattle Conference I sent to Kevin the names forwarded to us from Paul for possible panelists from Western governors' offices for FERC's..."
1787,"-----Original Message----- Stein, Neil [mailMonday, September 24, 2001 undisclosed-recipients CSFB Independent Power Weekly--Issue #44 <<IPW092401.pdf>> Good Morning, Attached, please find the latest issue of our Independent Power Weekly. Also note that there is a replay available of our confere...","summarize: -----Original Message----- Stein, Neil [mailMonday, September 24, 2001 undisclosed-recipients CSFB Independent Power Weekly--Issue #44 <<IPW092401.pdf>> Good Morning, Attached, please find the latest issue of our Independent Power Weekly. Also note that there is a replay available of ..."
1285,"John or Marissa, please call me at your convenience on my cell, 713 304 8716. Thanks, Kay ""Keffer, John"" on 03/23/2001 :47 PM cc: ""Reuter, Marisa"" RE: ENA/Blue Dog: Revised Letter Agreement I failed to include the conference call information, which is noted below: number: 877-232-0064 pin: 25581...","summarize: John or Marissa, please call me at your convenience on my cell, 713 304 8716. Thanks, Kay ""Keffer, John"" on 03/23/2001 :47 PM cc: ""Reuter, Marisa"" RE: ENA/Blue Dog: Revised Letter Agreement I failed to include the conference call information, which is noted below: number: 877-232-0064..."


In [None]:
from transformers import pipeline

# Load a zero-shot-classification pipeline
classifier = pipeline("zero-shot-classification", model = 'facebook/bart-large-mnli',device=device)

# Define classes for classification
categories = ["Meeting Request", "Status Update", "General Query"]

# Sample email content
email_content = data.iloc[0]['Tokenized_text']

# Classify the email
result = classifier(
    email_content,
    candidate_labels=categories,
    multi_label=False  # Set to True if you expect multiple relevant classes
)

result = {result['labels'][0]}
print(result)



# # Response generation based on classification
# responses = {
#     "Meeting Request": "Thank you for your meeting request. Please provide the agenda and your availability.",
#     "Status Update": "Thank you for your update. Please let me know if there is any more information to be shared.",
#     "General Query": "Thank you for reaching out. How can I assist you further?"
# }

# response = responses.get(result['labels'][0], "Thank you for your email.")
# print(f"Generated Response: {response}")


{'Status Update'}


In [None]:
model_name = "meta-llama/Llama-3.2-1B"  
# Load a pre-trained text generation model
generator = pipeline('text-generation', model=model_name)

def generate_response(email_content, prompt_prefix=f"Generate a response mail to the following email based on the category {result['labels'][0]}"):
    prompt = f"{prompt_prefix} {email_content}"
    responses = generator(prompt, max_length=max, num_return_sequences=1)
    return responses[0]['generated_text']


generate_response(email_content)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'Generate a response mail to the following email based on the category Meeting Request Can we schedule a meeting tomorrow? Can we schedule a meeting this week? Can we schedule a meeting next week?\nGenerate a response mail to the following email based on the category Meeting Request Can we schedule a meeting tomorrow? Can we schedule a meeting this week? Can we schedule a meeting next week?'

In [25]:
from transformers import pipeline
from torch import cuda

device = 0 if cuda.is_available() else -1

# Load a zero-shot-classification pipeline
classifier = pipeline("zero-shot-classification", model='facebook/bart-large-mnli', device=device,)

# Define classes for classification
categories = ["Meeting Request", "Status Update", "General Query"]

# Sample email content (ensure it's defined and not too large)
email_content = "Hi Reavanth ,As discussed, Please find the assignment, which you need to complete in 2 days. Kindly complete and share it." # Truncate if necessary

# Classify the email
print("Starting classification...")
result = classifier(email_content, candidate_labels=categories, multi_label=False,truncation=False)
print("Classification complete.")
result = {result['labels'][0]}
print("Classification result:", result)

model_name = "meta-llama/Llama-3.2-1B"  # Ensure this model exists
generator = pipeline('text-generation', device=device,pad_token_id=generator.tokenizer.eos_token_id)

prompt_prefix = f"Generate a response mail to the following email in {result} tone"
prompt = f"{prompt_prefix} {email_content}"
print("Starting text generation...")
responses = generator(prompt, max_length=513, num_return_sequences=1,truncation=True)  # Adjust max_length
print("Text generation complete.")
print(responses[0])


Starting classification...


No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


Classification complete.
Classification result: {'Status Update'}


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Starting text generation...
Text generation complete.
{'generated_text': 'Generate a response mail to the following email in {\'Status Update\'} tone Hi Reavanth ,As discussed, Please find the assignment, which you need to complete in 2 days. Kindly complete and share it. :| Your Name, Email Address, and Reception Email Address: Hello? :| Your Phone Number, Number of Days, and Deceased Persons: (A) You: :| Your Title; (B) The Date in which you were Killed, Or Killed.\n\nDo you mean that the "No Deaths After September 12 Incident" rule is in place? :| It is not. :| We have yet to see the reply. :| What sort of response is still required to complete the assignment? I\'d like to include my name, the names as well as the e-mail address. Thanks for your input. :|\n\n\nDear Reavanth, I read the assignment with no further interest in contacting a government to retrieve a data file, so I wrote to your organization that is at that time collecting data on American citizen assassinations and has 

In [35]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load pre-trained LLaMA model and tokenizer (example with LLaMA-2)
model_name = "Yihui/t5-small-text-summary-generation"  # Specify the appropriate model path
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Yihui/t5-small-text-summary-generation")
model = AutoModelForSeq2SeqLM.from_pretrained("Yihui/t5-small-text-summary-generation")
model = model.to(device)

# Add padding token to tokenizer if it does not exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer)) # Resize model embeddings to accommodate the new padding token
    model.generation_config.pad_token_id = tokenizer.pad_token_id


pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [None]:
def summarize_email_thread(thread_text):
    # Tokenize the input text
    inputs = tokenizer("summarize: " + thread_text, return_tensors="pt",padding = True)
    print(inputs)
    # inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Generate summary using model.generate()
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length= max,
        min_length=50,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=3
    )
    
    # Decode and return the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


prefi
email = "Hi Hello how are you"
summarize_email_thread(email)

{'input_ids': tensor([[21603,    10,  2018,  8774,   149,    33,    25,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


"Hi Hi How are you? Hi Hi, how are you doing? Hi, Hi, How are You doing? Hello, Hello, how can you do it? Hi! Hi, I'm not a big fan of my blog!"