# **Model Validation**

In [1]:
import openai
import os
import getpass

# Prompt (without echoing) only when the key is not already configured, so a
# fresh "Restart & Run All" can proceed unattended when OPENAI_API_KEY is
# supplied by the environment.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

OpenAI API Key: ········


In [None]:
# Read the key once. `openai_api_key` is always bound afterwards, so later cells
# never hit a NameError; an empty value is treated the same as a missing one.
openai_api_key = os.environ.get('OPENAI_API_KEY') or None
if openai_api_key is not None:
    print("OpenAI API Key is set")
else:
    print("OPENAI_API_KEY environment variable is not set.")

In [3]:
# Account for the deprecation of the pinned LLM model.
import datetime

# Cutoff: strictly after this date the notebook uses "gpt-3.5-turbo".
target_date = datetime.date(2024, 6, 12)

# Today's date as reported by datetime.now().
current_date = datetime.datetime.now().date()

# On or before the cutoff keep the pinned snapshot; afterwards use the alias.
llm_model = "gpt-3.5-turbo-0301" if current_date <= target_date else "gpt-3.5-turbo"

In [4]:
import pandas as pd
import numpy as np
import ast  # Import the ast module for literal evaluation
import seaborn as sns

pd.set_option('display.max_colwidth', 100)
# %pip install pandarallel
import multiprocessing

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

# Only the `pandarallel` class is needed; a preceding `import pandarallel` would
# be rebound by this line anyway.
from pandarallel import pandarallel

# Use all CPUs but one (presumably to keep the notebook responsive), while never
# requesting fewer than one worker: cpu_count() == 1 would otherwise ask for 0.
pandarallel.initialize(nb_workers=max(1, num_processors - 1), use_memory_fs=False)

Available CPUs: 8
INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


-----
## Retrieval Dataframe

In [5]:
# Location of the emails CSV (a gs:// URI); the 'time' column is parsed as datetimes.
messages_csv_uri = 'gs://user-scripts-msca310019-capstone-49b3/data/20231009_Emails_with_Previous_as_List.csv'
df_messages = pd.read_csv(
    messages_csv_uri,
    parse_dates=['time'],
)
# Preview the first five rows.
df_messages.head(5)

Unnamed: 0.1,Unnamed: 0,Email_ID,time,subject,thread,sender,repliers,message,reply_time,replier,reply_recipient,reply_message,Sender_Receiver_Emails
0,0,0,1999-06-09 04:18:00-07:00,RE: test,2,5552,[40034],How about this Friday ? Julie has not left yet. She was 2 days away from leaving to start school...,1999-06-09 08:06:00-07:00,40034,[5552],when? how are you and your family? is julie gone?,
1,1,1,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-06-10 03:54:00-07:00,5552,[40034],Today is bad. Tommorrow I will call you.,"['Do you have lunch plans today?', ""Really? I'd feel like a mooch. Lets have lunch next week. An..."
2,2,2,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-11-23 01:38:00-08:00,5552,[40034],Do you have lunch plans today?,"['Today is bad. Tommorrow I will call you.', ""Really? I'd feel like a mooch. Lets have lunch nex..."
3,3,3,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-11-23 03:13:00-08:00,5552,[40034],Really? I'd feel like a mooch. Lets have lunch next week. Any day but Monday or Thurs.,"['Today is bad. Tommorrow I will call you.', 'Do you have lunch plans today?', ""Tues.is good. I'..."
4,4,4,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-11-23 03:58:00-08:00,5552,[40034],Tues.is good. I'll call you.,"['Today is bad. Tommorrow I will call you.', 'Do you have lunch plans today?', ""Really? I'd feel..."


In [7]:
def _parse_email_list(raw_value):
    """Evaluate the stringified list stored in the CSV; missing cells become []."""
    if pd.isna(raw_value):
        return []
    return ast.literal_eval(raw_value)


df_messages['Sender_Receiver_Emails_list'] = df_messages['Sender_Receiver_Emails'].apply(_parse_email_list)

In [8]:
def _count_relevant_emails(parsed_emails):
    """Return the number of entries in a parsed email list, or NaN for non-list values."""
    if not isinstance(parsed_emails, list):
        return np.nan
    return len(parsed_emails)


# Count on the parsed list column (not the raw string column, whose len() would
# be a character count).
df_messages['num_relevant_emails'] = df_messages['Sender_Receiver_Emails_list'].apply(_count_relevant_emails)
df_messages.head()

Unnamed: 0.1,Unnamed: 0,Email_ID,time,subject,thread,sender,repliers,message,reply_time,replier,reply_recipient,reply_message,Sender_Receiver_Emails,Sender_Receiver_Emails_list,num_relevant_emails
0,0,0,1999-06-09 04:18:00-07:00,RE: test,2,5552,[40034],How about this Friday ? Julie has not left yet. She was 2 days away from leaving to start school...,1999-06-09 08:06:00-07:00,40034,[5552],when? how are you and your family? is julie gone?,,[],0
1,1,1,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-06-10 03:54:00-07:00,5552,[40034],Today is bad. Tommorrow I will call you.,"['Do you have lunch plans today?', ""Really? I'd feel like a mooch. Lets have lunch next week. An...","[Do you have lunch plans today?, Really? I'd feel like a mooch. Lets have lunch next week. Any d...",3
2,2,2,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-11-23 01:38:00-08:00,5552,[40034],Do you have lunch plans today?,"['Today is bad. Tommorrow I will call you.', ""Really? I'd feel like a mooch. Lets have lunch nex...","[Today is bad. Tommorrow I will call you., Really? I'd feel like a mooch. Lets have lunch next w...",3
3,3,3,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-11-23 03:13:00-08:00,5552,[40034],Really? I'd feel like a mooch. Lets have lunch next week. Any day but Monday or Thurs.,"['Today is bad. Tommorrow I will call you.', 'Do you have lunch plans today?', ""Tues.is good. I'...","[Today is bad. Tommorrow I will call you., Do you have lunch plans today?, Tues.is good. I'll ca...",3
4,4,4,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-11-23 03:58:00-08:00,5552,[40034],Tues.is good. I'll call you.,"['Today is bad. Tommorrow I will call you.', 'Do you have lunch plans today?', ""Really? I'd feel...","[Today is bad. Tommorrow I will call you., Do you have lunch plans today?, Really? I'd feel like...",3


In [9]:
# Cast both id columns to str so they can be string-concatenated into a key.
for id_column in ('sender', 'replier'):
    df_messages[id_column] = df_messages[id_column].astype('str')

In [10]:
# Build the "<sender>-<replier>" lookup key used for retrieval.
# Note: the thread id is not part of this key (an earlier variant that appended
# it is disabled).
sender_ids = df_messages['sender']
replier_ids = df_messages['replier']
df_messages['sender_replier_thread'] = sender_ids.str.cat(replier_ids, sep='-')

In [11]:
# Widen the column display so the long email text is readable.
pd.set_option('display.max_colwidth', 500)
# Inspect one example row (position 15): its lookup key, the raw stringified
# email list, and the parsed count. A preceding bare `.head(30)` expression was
# dropped: only a cell's last expression is rendered, so it had no effect.
df_messages[['sender_replier_thread','Sender_Receiver_Emails','num_relevant_emails']].iloc[15]

sender_replier_thread                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                37911-43
Sender_Receiver_Emails    ['try having no work to do for two months and then having to stick around and "appear" busy  i know what you mean, in some ways its harder when you\'re not used to not doing things. i\'ve gotten good at finding things to do with my time, but when i first had nothing to do, i was going crazy. fridays are always slow around here and i\'m guessing its the same there, although i don\'t know cnrl specifcally, oil and gas producer/marketer? in terms 

### De-Duplication for Retrieval ##

In [12]:
# Keep a single row per sender/replier key: the last occurrence in the frame.
df_messages_deduped = df_messages.drop_duplicates(
    subset=['sender_replier_thread'],
    keep='last',
)

-----
## Ranking Framework

In [14]:
# Importing Packages
import chromadb
from chromadb.utils import embedding_functions

import langchain
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

In [15]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

In [17]:
# Open the on-disk Chroma store under ./vectorstores and check that it responds.
vectorstore_path = "vectorstores"
chroma_client = chromadb.PersistentClient(path=vectorstore_path)
chroma_client.heartbeat()

1698106661234455844

In [18]:
# Embedding function for Chroma, backed by OpenAI's "text-embedding-ada-002".
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=openai_api_key,
    model_name="text-embedding-ada-002",
)
openai_ef

<chromadb.utils.embedding_functions.OpenAIEmbeddingFunction at 0x7f96bd3f1030>

----
## Langchain Function v2

In [19]:
# Example inputs; they are captured as the default arguments of
# get_email_response_personalized when that function is defined.
sender_id, replier_id = '171', '414'
sender_email = 'Where is my gift?'
# Dataset and vector-store handles used by the same defaults.
email_dataset = df_messages_deduped
vector_db_client = chroma_client

In [20]:
async def get_email_response_personalized(sender_id=sender_id,
                                    replier_id=replier_id,
                                    sender_email=sender_email,
                                    email_dataset=df_messages_deduped,
                                    api_key=openai_api_key,
                                    vector_db_client=chroma_client,
                                    embedding_function=None):
    """Draft a reply to `sender_email` with an LLM, print it, and return it.

    Steps:
      1. Look up the row of `email_dataset` whose `sender_replier_thread` equals
         "<sender_id>-<replier_id>" and take the last 10 entries of its
         `Sender_Receiver_Emails_list`.
      2. Run an MMR search for `sender_email` against the Chroma collection
         named "user<replier_id>".
      3. Fill the prompt template with both and run it through the chat model
         named by the notebook-level `llm_model`.

    Args:
        sender_id: Id of the sender; combined with `replier_id` into the lookup key.
        replier_id: Id of the replier; also selects the Chroma collection.
        sender_email: Text of the email to answer.
        email_dataset: DataFrame with `sender_replier_thread` and
            `Sender_Receiver_Emails_list` columns.
        api_key: OpenAI API key used for the chat model and, when
            `embedding_function` is not given, for the embeddings.
        vector_db_client: Chroma client holding the per-user collections.
        embedding_function: Embedding object handed to the Chroma wrapper.
            Defaults to a new `OpenAIEmbeddings` created at call time rather
            than once when the function is defined.

    Returns:
        The value returned by `llm_chain.run` (the generated reply), which is
        also printed.
    """
    if embedding_function is None:
        embedding_function = OpenAIEmbeddings(openai_api_key=api_key)

    # First getting retrieved emails to understand conversation --------
    # f-string formatting also accepts non-string ids (str.join would raise).
    sender_replier_id = f"{sender_id}-{replier_id}"
    thread_rows = email_dataset.loc[
        email_dataset.sender_replier_thread == sender_replier_id,
        'Sender_Receiver_Emails_list',
    ]
    # A pair with no stored conversation yields an empty history instead of an
    # IndexError.
    previous_emails = thread_rows.iloc[0][-10:] if len(thread_rows) else []

    # Second, getting ranked responses as per context ------------------
    # Building the Langchain vectorstore using chroma collections
    user_vector_store = Chroma(
        client=vector_db_client,
        collection_name='user'+str(replier_id),
        embedding_function=embedding_function)
    # Getting ranked responses using MMR
    # NOTE(review): fetch_k equals k, so MMR can only reorder the fetched
    # candidates; raise fetch_k if diversification is intended.
    found_rel_emails = await user_vector_store.amax_marginal_relevance_search(sender_email, k=10, fetch_k=10)
    list_rel_emails = [doc.page_content for doc in found_rel_emails]

    # Setting up LangChain
    llm=ChatOpenAI(temperature=0.1, model=llm_model, openai_api_key=api_key)
    template_string="""Create a response to the following email {sender_email}, deriving context from these relevant emails {relevant_emails} ,using a conversation style similar to following past emails between these 2 users {prev_emails}"""
    prompt_template = ChatPromptTemplate.from_template(template=template_string)
    llm_chain=LLMChain(llm=llm, prompt=prompt_template)
    response = llm_chain.run(sender_email=sender_email, prev_emails=previous_emails, relevant_emails=list_rel_emails)
    # Keep the original print, and also return the text so callers can store it.
    print(response)
    return response
    

In [26]:
# results = await get_email_response_personalized(sender_id=sender_id,
#                                     replier_id=replier_id,
#                                     sender_email=sender_email,
#                                     email_dataset=df_messages_deduped,
#                                     api_key=openai_api_key,
#                                     vector_db_client=chroma_client,
#                                     embedding_function=OpenAIEmbeddings())
# print(results)

In [25]:
import pandas as pd
import ast
import chromadb
from chromadb.utils import embedding_functions
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
import asyncio

# Load the data
# NOTE(review): .head() keeps only the first 5 rows, which looks like a
# debugging leftover; confirm before a full run.
df_messages=pd.read_csv('gs://user-scripts-msca310019-capstone-49b3/data/20231019_Emails_with_Sender_Reply.csv', parse_dates=['time']).head()

# Fail early and name every missing column, instead of surfacing a bare
# KeyError from whichever line touches the first absent column.
required_columns = ['Sender_Receiver_Emails', 'sender', 'replier']
missing_columns = [column for column in required_columns if column not in df_messages.columns]
if missing_columns:
    raise KeyError(f"Missing expected column(s) {missing_columns}; found {list(df_messages.columns)}")

# Parse the stringified email lists; missing cells become empty lists.
df_messages['Sender_Receiver_Emails_list'] = df_messages['Sender_Receiver_Emails'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
# Ids as strings so they can be concatenated into the lookup key.
df_messages['sender'] = df_messages['sender'].astype('str')
df_messages['replier'] = df_messages['replier'].astype('str')
df_messages['sender_replier_thread'] = df_messages['sender'].str.cat(df_messages['replier'], sep='-')
# One row per sender/replier key (the last occurrence).
df_messages_deduped = df_messages.drop_duplicates(subset='sender_replier_thread', keep='last')

# Setting up the chroma client
# `openai_api_key` is reused as set earlier in the notebook; the former
# self-assignment here was a no-op and has been removed.
chroma_client = chromadb.PersistentClient(path="vectorstores")

async def get_email_response_personalized(sender_id, replier_id, sender_email, email_dataset, api_key, vector_db_client):
    """Generate a reply to `sender_email` and return it.

    Steps:
      1. Look up the row of `email_dataset` whose `sender_replier_thread` equals
         "<sender_id>-<replier_id>" and take the last 10 entries of its
         `Sender_Receiver_Emails_list`.
      2. Run an MMR search for `sender_email` against the Chroma collection
         named "user<replier_id>".
      3. Fill the prompt template with both and run it through the chat model
         named by the notebook-level `llm_model`.

    Args:
        sender_id: Id of the sender; combined with `replier_id` into the lookup key.
        replier_id: Id of the replier; also selects the Chroma collection.
        sender_email: Text of the email to answer.
        email_dataset: DataFrame with `sender_replier_thread` and
            `Sender_Receiver_Emails_list` columns.
        api_key: OpenAI API key used for both the embeddings and the chat model.
        vector_db_client: Chroma client holding the per-user collections.

    Returns:
        The value returned by `llm_chain.run` (the generated reply).
    """
    # First getting retrieved emails to understand conversation --------
    # f-string formatting also accepts non-string ids (str.join would raise).
    sender_replier_id = f"{sender_id}-{replier_id}"
    thread_rows = email_dataset.loc[
        email_dataset.sender_replier_thread == sender_replier_id,
        'Sender_Receiver_Emails_list',
    ]
    # A pair with no stored conversation yields an empty history instead of an
    # IndexError.
    previous_emails = thread_rows.iloc[0][-10:] if len(thread_rows) else []

    # Second, getting ranked responses as per context ------------------
    # Building the Langchain vectorstore using chroma collections
    user_vector_store = Chroma(
        client=vector_db_client,
        collection_name='user'+str(replier_id),
        embedding_function=OpenAIEmbeddings(openai_api_key=api_key))
    # Getting ranked responses using MMR
    # NOTE(review): fetch_k equals k, so MMR can only reorder the fetched
    # candidates; raise fetch_k if diversification is intended.
    found_rel_emails = await user_vector_store.amax_marginal_relevance_search(sender_email, k=10, fetch_k=10)
    list_rel_emails = [doc.page_content for doc in found_rel_emails]

    # Setting up LangChain
    # Use the key passed by the caller rather than the notebook-level global.
    llm=ChatOpenAI(temperature=0.1, model=llm_model, openai_api_key=api_key)
    template_string="""You are an employee of Enron and receiver of this email: {sender_email},
    Reply to the email as the receiver,
    deriving the context from these relevant emails : {relevant_emails}
    and copying the writing style of the receiver's past emails : {prev_emails}"""
    prompt_template = ChatPromptTemplate.from_template(template=template_string)
    llm_chain=LLMChain(llm=llm, prompt=prompt_template)
    return llm_chain.run(sender_email=sender_email, prev_emails=previous_emails, relevant_emails=list_rel_emails)
    

KeyError: 'replier'

In [23]:
async def process_all_emails(df, delay_seconds=60):
    """Generate one reply per row of `df`, pausing between requests.

    Each row's 'sender', 'replier' and 'message' values are passed to
    `get_email_response_personalized`, together with `df` itself as the lookup
    dataset and the notebook-level `openai_api_key` and `chroma_client`.

    Args:
        df: DataFrame with 'sender', 'replier' and 'message' columns.
        delay_seconds: Pause inserted between consecutive requests (default 60,
            the original hard-coded value). A falsy value disables the pause.

    Returns:
        A list of responses in row order; empty when `df` has no rows.
    """
    # Local import so the function does not depend on which cells ran before it.
    import asyncio

    responses = []
    for position, (_, row) in enumerate(df.iterrows()):
        # Pause before every request except the first: the spacing between
        # calls is unchanged, without a pointless wait after the final row.
        # asyncio.sleep yields to the event loop instead of blocking it.
        if position and delay_seconds:
            await asyncio.sleep(delay_seconds)
        response = await get_email_response_personalized(row['sender'], row['replier'], row['message'], df, openai_api_key, chroma_client)
        responses.append(response)
    return responses

In [24]:
# Run the asynchronous function and get all the responses
all_responses = await process_all_emails(df_messages_deduped.head(100))

# Append the responses to the original dataframe
df_messages_deduped['responses'] = all_responses

Add of existing embedding ID: 0
Number of requested results 10 is greater than number of elements in index 1, updating n_results = 1
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for text-embedding-ada-002 in organization org-3uPIm3xy4QJopNFYHTe9obWT on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for text-embedding-ada-002 in organization org-3uPIm3xy4QJopNFYHTe9obWT on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to hav

CancelledError: 

In [None]:
df_messages_deduped

Unnamed: 0.1,Unnamed: 0,Email_ID,time,subject,thread,sender,repliers,message,reply_time,replier,reply_recipient,reply_message,Sender_Receiver_Emails,Sender_Receiver_Emails_list,sender_replier_thread,responses
0,0,0,1999-06-09 04:18:00-07:00,RE: test,2,5552,[40034],"How about this Friday ? Julie has not left yet. She was 2 days away from leaving to start school on June 7th when she, quite by accident, looked at the school schedule for the first time in several weeks. Now mind you she had lined up a babysitter ,lined up a place to stay, arranged to get a key to start the 7th. The schedule says the school starts the 14th. Pretty funny. How many years will it take her to live this down?. Enron Capital & Trade Resources Corp.",1999-06-09 08:06:00-07:00,40034,[5552],when? how are you and your family? is julie gone?,,[],5552-40034,"Hey there!\n\nThanks for reaching out. How about this Friday for what exactly? Sorry, I'm a bit confused.\n\nMy family and I are doing well, thanks for asking. Is Julie still around? I heard about her school schedule mishap, that's quite funny. I'm sure she'll be able to laugh about it eventually.\n\nTake care!\n\n[Previous emails between the two users:\n\nUser 1: Hey, are you free this weekend?\n\nUser 2: Hey! Yeah, I should be. What's up?\n\nUser 1: Just wanted to catch up and maybe grab l..."
4,4,4,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-11-23 03:58:00-08:00,5552,[40034],Tues.is good. I'll call you.,"['Today is bad. Tommorrow I will call you.', 'Do you have lunch plans today?', ""Really? I'd feel like a mooch. Lets have lunch next week. Any day but Monday or Thurs.""]","[Today is bad. Tommorrow I will call you., Do you have lunch plans today?, Really? I'd feel like a mooch. Lets have lunch next week. Any day but Monday or Thurs.]",40034-5552,"Thanks for asking, we're doing well. Julie is no longer with us."
