In [1]:
!pip install -U -q pdfplumber tiktoken openai chromaDB sentence-transformers



In [2]:
# Import the required libraries
import pandas as pd
import openai
import chromadb
import os
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from sentence_transformers import CrossEncoder, util

  from tqdm.autonotebook import tqdm, trange


In [3]:
# mount the drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Set the API key
filepath = "/content/drive/My Drive/GenAI/" # Change filepath as required. Nothing else needs to change

with open(filepath + "OPENAI_API_Key.txt", "r") as f:
  openai.api_key = ' '.join(f.readlines())

In [5]:
# Read the email details file
dfDetails = pd.read_csv(filepath + 'email_thread_details.csv')
dfDetails.head()


Unnamed: 0,thread_id,subject,timestamp,from,to,body
0,1,FW: Master Termination Log,2002-01-29 11:23:42,"Gossett, Jeffrey C. JGOSSET","['Giron', 'Darron C. Dgiron', 'Love', 'Phillip...",\n\n -----Original Message-----\nFrom: =09Ther...
1,1,FW: Master Termination Log,2002-01-31 12:50:00,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Gossett', 'Jeff...",\n\n -----Original Message-----\nFrom: =09Panu...
2,1,FW: Master Termination Log,2002-02-05 15:03:35,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Anderson', 'Dia...",Note to Stephanie Panus....\n\nStephanie...ple...
3,1,FW: Master Termination Log,2002-02-05 15:06:25,"Theriot, Kim S. KTHERIO","['Hall', 'D. Todd Thall', 'Sweeney', 'Kevin Ks...",\n\n -----Original Message-----\nFrom: =09Panu...
4,1,FW: Master Termination Log,2002-05-28 07:20:35,"Kelly, Katherine L. KKELLY","['Germany', 'Chris Cgerman']",\n\n -----Original Message-----\nFrom: =09McMi...


In [6]:
# Read the email summary file
dfSummary = pd.read_csv(filepath + 'email_thread_summaries.csv')
dfSummary.head()

Unnamed: 0,thread_id,summary
0,1,The email thread discusses the Master Terminat...
1,2,A lunch meeting has been scheduled for May 5th...
2,3,Ben is updating a friend on his progress with ...
3,4,The recipient of the email thread initially ex...
4,5,The email thread discusses the long form confi...


In [7]:
# Remove the "Original Message" from body of emails since that is repeating text
dfDetails[['body_modified', 'body_modified_2']] = dfDetails['body'].str.split('-----Original Message-----', n=1, expand=True)
dfDetails.head()

Unnamed: 0,thread_id,subject,timestamp,from,to,body,body_modified,body_modified_2
0,1,FW: Master Termination Log,2002-01-29 11:23:42,"Gossett, Jeffrey C. JGOSSET","['Giron', 'Darron C. Dgiron', 'Love', 'Phillip...",\n\n -----Original Message-----\nFrom: =09Ther...,\n\n,"\nFrom: =09Theriot, Kim S. =20\nSent:=09Tuesda..."
1,1,FW: Master Termination Log,2002-01-31 12:50:00,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Gossett', 'Jeff...",\n\n -----Original Message-----\nFrom: =09Panu...,\n\n,"\nFrom: =09Panus, Stephanie =20\nSent:=09Thurs..."
2,1,FW: Master Termination Log,2002-02-05 15:03:35,"Theriot, Kim S. KTHERIO","['Murphy', 'Melissa Mmurphy', 'Anderson', 'Dia...",Note to Stephanie Panus....\n\nStephanie...ple...,Note to Stephanie Panus....\n\nStephanie...ple...,"\nFrom: =09Panus, Stephanie =20\nSent:=09Tuesd..."
3,1,FW: Master Termination Log,2002-02-05 15:06:25,"Theriot, Kim S. KTHERIO","['Hall', 'D. Todd Thall', 'Sweeney', 'Kevin Ks...",\n\n -----Original Message-----\nFrom: =09Panu...,\n\n,"\nFrom: =09Panus, Stephanie =20\nSent:=09Tuesd..."
4,1,FW: Master Termination Log,2002-05-28 07:20:35,"Kelly, Katherine L. KKELLY","['Germany', 'Chris Cgerman']",\n\n -----Original Message-----\nFrom: =09McMi...,\n\n,"\nFrom: =09McMichael Jr., Ed =20\nSent:=09Tues..."


In [8]:
# Remove the new lines characters
dfDetails['body_modified'] = dfDetails['body_modified'].str.replace('\n',' ')

In [9]:
# Create the message by combining timestamp, from, to and body
dfDetails['message'] = "Subject: " + dfDetails['subject'] + " ; Time Stamp: " + dfDetails['timestamp'] + " ; From: " + dfDetails['from'] + " ; To: " + dfDetails['to'] + " ; Body: " + dfDetails['body_modified']

In [10]:
# Drop the columns that are not required
dfDetails.drop(columns=['timestamp', 'from', 'to', 'body', 'body_modified', 'body_modified_2'], axis = 0, inplace = True)

In [11]:
# Join the two tables
dfSummary.set_index('thread_id', inplace=True)
dfSummary['message'] = dfDetails.groupby('thread_id')['message'].apply(' '.join)

dfSummary.head()

Unnamed: 0_level_0,summary,message
thread_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The email thread discusses the Master Terminat...,Subject: FW: Master Termination Log ; Time Sta...
2,A lunch meeting has been scheduled for May 5th...,Subject: Credit Group Lunch ; Time Stamp: 2000...
3,Ben is updating a friend on his progress with ...,Subject: New Address ; Time Stamp: 2000-01-09 ...
4,The recipient of the email thread initially ex...,Subject: EOL Data ; Time Stamp: 2001-02-01 09:...
5,The email thread discusses the long form confi...,Subject: RE: long form confirm/MDEA ; Time Sta...


In [12]:
# Create the subject column for metadata
dfSummary['subject'] = dfSummary['message'].str.split(';', n=1).str[0].to_dict()
dfSummary['subject'] = dfSummary['subject'].apply(lambda x: {'Subject' : x.split(':')[-1]})
dfSummary.head()

Unnamed: 0_level_0,summary,message,subject
thread_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,The email thread discusses the Master Terminat...,Subject: FW: Master Termination Log ; Time Sta...,{'Subject': ' Master Termination Log '}
2,A lunch meeting has been scheduled for May 5th...,Subject: Credit Group Lunch ; Time Stamp: 2000...,{'Subject': ' Credit Group Lunch '}
3,Ben is updating a friend on his progress with ...,Subject: New Address ; Time Stamp: 2000-01-09 ...,{'Subject': ' New Address '}
4,The recipient of the email thread initially ex...,Subject: EOL Data ; Time Stamp: 2001-02-01 09:...,{'Subject': ' EOL Data '}
5,The email thread discusses the long form confi...,Subject: RE: long form confirm/MDEA ; Time Sta...,{'Subject': ' long form confirm/MDEA '}


**Generate and Store Embeddings using OpenAI and ChromaDB**

In [13]:
# Initialize the vector store
client = chromadb.Client()

In [14]:
# set the file path for vector data store
chroma_data_path = filepath

In [15]:
# Set up the embedding function using the OpenAI embedding model as provided by instructor
model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name=model)

In [16]:
# Create the vector store collection
email_collection = client.get_or_create_collection(name='RAG_on_emails', embedding_function=embedding_function)

In [17]:
# Create a list of documents and metadata
documents_list = dfSummary["summary"].tolist()
metadata_list = dfSummary["subject"].tolist()

In [18]:
# Adding records one by one. Directly providing a list is throwing an error

i = 1
for doc in documents_list:
  email_collection.add (
    documents = doc,
    ids = str(i),
    metadatas = metadata_list[i-1]
    )
  i = i + 1
  if ((i % 100) == 0):
    print ("Completed ==> " + str(i))

Completed ==> 100
Completed ==> 200
Completed ==> 300
Completed ==> 400
Completed ==> 500
Completed ==> 600
Completed ==> 700
Completed ==> 800
Completed ==> 900
Completed ==> 1000
Completed ==> 1100
Completed ==> 1200
Completed ==> 1300
Completed ==> 1400
Completed ==> 1500
Completed ==> 1600
Completed ==> 1700
Completed ==> 1800
Completed ==> 1900
Completed ==> 2000
Completed ==> 2100
Completed ==> 2200
Completed ==> 2300
Completed ==> 2400
Completed ==> 2500
Completed ==> 2600
Completed ==> 2700
Completed ==> 2800
Completed ==> 2900
Completed ==> 3000
Completed ==> 3100
Completed ==> 3200
Completed ==> 3300
Completed ==> 3400
Completed ==> 3500
Completed ==> 3600
Completed ==> 3700
Completed ==> 3800
Completed ==> 3900
Completed ==> 4000
Completed ==> 4100


**Semantic Search**



In [19]:
query = input()

How does Governor's executive orders impact Enron?


In [20]:
# Get the top 10 results from the vector store
results = email_collection.query(
  query_texts=query,
  n_results=10
)

In [21]:
# Create a dataframe from the results; as shown by the instructor
result_dict = {'Metadatas': results['metadatas'][0], 'Documents': results['documents'][0], 'Distances': results['distances'][0], "IDs":results["ids"][0]}
results_df = pd.DataFrame.from_dict(result_dict)
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs
0,{'Subject': ' Response Statement to PG&E's Ch....,Enron's Jeff Dasovich informs the group that h...,0.287397,1081
1,{'Subject': ' GOVERNOR DAVIS CALLS ON FEDERAL ...,Governor Davis is calling on the Federal Energ...,0.290999,2257
2,"{'Subject': ' Meeting with Governor Davis, nee...",The email thread discusses a meeting between E...,0.291428,341
3,{'Subject': ' Notice to Employees of Bankruptc...,"On January 25, 2002, Judge Arthur Gonzalez iss...",0.302308,1850
4,{'Subject': ' Reply to Paul Krugman's column (...,Enron's Executive Vice President and Chief of ...,0.303071,2024
5,{'Subject': ' see transcript for details '},Enron Corp. has been ordered by a California j...,0.310855,443
6,{'Subject': ' CONFIDENTIAL - Residential in CA '},The email thread discusses the decision regard...,0.313714,2990
7,{'Subject': ' news/updates '},The email thread discusses two news stories re...,0.314781,3660
8,{'Subject': ' California Energy Crisis '},The email thread discusses the power crisis in...,0.316983,777
9,{'Subject': ' Attorney General May File Amicus...,The email thread discusses the possibility of ...,0.317349,941


**Re-ranking the Results**

In [22]:
# Initialise the cross encoder model
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [23]:
# Rerank as shown by the instructor
cross_inputs = [[query, response] for response in results_df['Documents']]
cross_rerank_scores = cross_encoder.predict(cross_inputs)
results_df['Reranked_scores'] = cross_rerank_scores
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores
0,{'Subject': ' Response Statement to PG&E's Ch....,Enron's Jeff Dasovich informs the group that h...,0.287397,1081,0.364372
1,{'Subject': ' GOVERNOR DAVIS CALLS ON FEDERAL ...,Governor Davis is calling on the Federal Energ...,0.290999,2257,3.498632
2,"{'Subject': ' Meeting with Governor Davis, nee...",The email thread discusses a meeting between E...,0.291428,341,1.521301
3,{'Subject': ' Notice to Employees of Bankruptc...,"On January 25, 2002, Judge Arthur Gonzalez iss...",0.302308,1850,-0.118311
4,{'Subject': ' Reply to Paul Krugman's column (...,Enron's Executive Vice President and Chief of ...,0.303071,2024,-0.673714
5,{'Subject': ' see transcript for details '},Enron Corp. has been ordered by a California j...,0.310855,443,-0.642507
6,{'Subject': ' CONFIDENTIAL - Residential in CA '},The email thread discusses the decision regard...,0.313714,2990,-3.447961
7,{'Subject': ' news/updates '},The email thread discusses two news stories re...,0.314781,3660,-1.5848
8,{'Subject': ' California Energy Crisis '},The email thread discusses the power crisis in...,0.316983,777,-3.573337
9,{'Subject': ' Attorney General May File Amicus...,The email thread discusses the possibility of ...,0.317349,941,-3.323893


In [24]:
# Add the message column to results
j = 0
col_list = results_df['IDs'].tolist()
results_df['Message'] = ""
for id in col_list:
  results_df['Message'][j] = dfSummary['message'][int(id)]
  j = j + 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['Message'][j] = dfSummary['message'][int(id)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['Message'][j] = dfSummary['message'][int(id)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['Message'][j] = dfSummary['message'][int(id)]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [25]:
results_df

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores,Message
0,{'Subject': ' Response Statement to PG&E's Ch....,Enron's Jeff Dasovich informs the group that h...,0.287397,1081,0.364372,Subject: Response Statement to PG&E's Ch. 11 F...
1,{'Subject': ' GOVERNOR DAVIS CALLS ON FEDERAL ...,Governor Davis is calling on the Federal Energ...,0.290999,2257,3.498632,Subject: GOVERNOR DAVIS CALLS ON FEDERAL ENERG...
2,"{'Subject': ' Meeting with Governor Davis, nee...",The email thread discusses a meeting between E...,0.291428,341,1.521301,"Subject: Meeting with Governor Davis, need for..."
3,{'Subject': ' Notice to Employees of Bankruptc...,"On January 25, 2002, Judge Arthur Gonzalez iss...",0.302308,1850,-0.118311,Subject: Notice to Employees of Bankruptcy Cou...
4,{'Subject': ' Reply to Paul Krugman's column (...,Enron's Executive Vice President and Chief of ...,0.303071,2024,-0.673714,Subject: FW: Reply to Paul Krugman's column (E...
5,{'Subject': ' see transcript for details '},Enron Corp. has been ordered by a California j...,0.310855,443,-0.642507,Subject: FW: Returned mail: see transcript for...
6,{'Subject': ' CONFIDENTIAL - Residential in CA '},The email thread discusses the decision regard...,0.313714,2990,-3.447961,Subject: CONFIDENTIAL - Residential in CA ; Ti...
7,{'Subject': ' news/updates '},The email thread discusses two news stories re...,0.314781,3660,-1.5848,Subject: news/updates ; Time Stamp: 2001-03-12...
8,{'Subject': ' California Energy Crisis '},The email thread discusses the power crisis in...,0.316983,777,-3.573337,Subject: California Energy Crisis ; Time Stamp...
9,{'Subject': ' Attorney General May File Amicus...,The email thread discusses the possibility of ...,0.317349,941,-3.323893,Subject: Attorney General May File Amicus in U...


In [26]:
# Return the top 3 results from semantic search
top_3_rank = results_df.sort_values(by='Reranked_scores', ascending = False)
top_3_rank[:3]

Unnamed: 0,Metadatas,Documents,Distances,IDs,Reranked_scores,Message
1,{'Subject': ' GOVERNOR DAVIS CALLS ON FEDERAL ...,Governor Davis is calling on the Federal Energ...,0.290999,2257,3.498632,Subject: GOVERNOR DAVIS CALLS ON FEDERAL ENERG...
2,"{'Subject': ' Meeting with Governor Davis, nee...",The email thread discusses a meeting between E...,0.291428,341,1.521301,"Subject: Meeting with Governor Davis, need for..."
0,{'Subject': ' Response Statement to PG&E's Ch....,Enron's Jeff Dasovich informs the group that h...,0.287397,1081,0.364372,Subject: Response Statement to PG&E's Ch. 11 F...


In [27]:
# Get the required information for top 3 results
top_3_RAG = top_3_rank[["Documents", "Metadatas", "Message"]][:3]
top_3_RAG

Unnamed: 0,Documents,Metadatas,Message
1,Governor Davis is calling on the Federal Energ...,{'Subject': ' GOVERNOR DAVIS CALLS ON FEDERAL ...,Subject: GOVERNOR DAVIS CALLS ON FEDERAL ENERG...
2,The email thread discusses a meeting between E...,"{'Subject': ' Meeting with Governor Davis, nee...","Subject: Meeting with Governor Davis, need for..."
0,Enron's Jeff Dasovich informs the group that h...,{'Subject': ' Response Statement to PG&E's Ch....,Subject: Response Statement to PG&E's Ch. 11 F...


**Retrieval Augmented Generation**

In [28]:
# Function for calling OpenAI. As provided by the instructor.
def get_chat_completions(input):
    MODEL = 'gpt-3.5-turbo'

    chat_completion = openai.chat.completions.create(
            model = MODEL,
            messages = input,
            seed = 2345)

    output = chat_completion.choices[0].message.content

    return output

In [47]:
# Function to provide prompt to OpenAI.
def generate_response(query, top_3_RAG):
  delimiter = "####"

  system_message = f"""

    You are a helpful assistant and your goal is to find useful information from the email content and email summary provided to you.

    {delimiter}
    Use the documents in '{top_3_RAG}' to answer the query '{query}'
    {delimiter}

    {delimiter}
    Here are some instructions around the email requirements
    To find the perfect response, look for more information in the provided data
    1. Document provides the summary of the email threads
    2. Metadata provides the subject of the email threads
    3. Email threads provide all the threads. The threads have timestamp, from, to and body. From field is the person sending the email and to field is the list of recipients
    4. Analyse the document, metadata and email threads to answer the query
    5. You are given 3 set of documents, metadata and email threads. Analyze all of them to answer the query
    6. Only answer the relevant information. If you do not have required information, please provide information that may help user to find the required response to the query
    7. You are an assistant. Do not provide any information on internal workings. Only provide final response.
    {delimiter}

    """
  conversation = [{"role": "system", "content": system_message}]

  return conversation

In [48]:
# Generate the response
response = get_chat_completions(generate_response(query, top_3_RAG))

In [49]:
# Print the response
print (response)

Governor Davis's executive orders impact Enron by calling on the Federal Energy Regulatory Commission (FERC) to take action to prevent Enron from taking advantage of the California energy crisis. Governor Davis is urging FERC to investigate Enron's role in the crisis and take measures to protect Californians from further harm. The email threads likely contain more detailed information about how Governor Davis's executive orders directly impact Enron and the actions that are being requested from FERC to address the situation.
