# **Model Validation Playground**

In [1]:
import openai
import os
import getpass
import time
import asyncio

In [2]:
import pandas as pd
import numpy as np
import ast  # Import the ast module for literal evaluation
import seaborn as sns
import matplotlib.pyplot as plt


# Show up to 100 characters per cell so email bodies stay readable in previews.
pd.set_option('display.max_colwidth', 100)
# %pip install pandarallel
import multiprocessing
num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

import pandarallel
from pandarallel import pandarallel
# Leave one core free for the notebook kernel, but never request fewer than one
# worker: on a single-CPU machine `num_processors-1` would be 0.
pandarallel.initialize(nb_workers=max(1, num_processors-1), use_memory_fs=False)

Available CPUs: 8
INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
#LANGCHAIN
import langchain
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

#CHROMA
import chromadb
from chromadb.utils import embedding_functions
from langchain.vectorstores import Chroma

# Setting up the chroma client.
# The store location "vectorstores" is a relative path, so it is resolved against the
# notebook's current working directory.
# NOTE(review): the per-user collections ('user' + replier id) are opened from this client
# later in the notebook; presumably they were populated by a separate process — confirm.
chroma_client = chromadb.PersistentClient(path="vectorstores")

# MODEL ASSESSMENT
# %pip install rouge-score
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

-----
## Entire Dataframe

In [4]:
# Load the full email/reply table; `time` is parsed to datetime while reading.
df_messages = pd.read_csv(
    'gs://user-scripts-msca310019-capstone-49b3/data/20231019_Emails_with_Sender_Reply.csv',
    parse_dates=['time'],
)

# `Sender_Receiver_Emails` holds a stringified Python list; decode it and use an
# empty list where the cell is missing.
df_messages['Sender_Receiver_Emails_list'] = df_messages['Sender_Receiver_Emails'].apply(
    lambda raw: [] if not pd.notna(raw) else ast.literal_eval(raw)
)

# Number of decoded emails per row (NaN if the decoded value is not a list).
df_messages['num_relevant_emails'] = df_messages['Sender_Receiver_Emails_list'].apply(
    lambda emails: np.nan if not isinstance(emails, list) else len(emails)
)

# Build the "<sender>-<reply_sender>" key from the string form of both ids.
df_messages['sender'] = df_messages['sender'].astype('str')
df_messages['reply_sender'] = df_messages['reply_sender'].astype('str')
df_messages['sender_replier_thread'] = df_messages['sender'] + '-' + df_messages['reply_sender']

df_messages.head(2)

Unnamed: 0.1,Unnamed: 0,Email_ID,time,subject,thread,sender,recipient,message,reply_time,reply_sender,reply_recipient,reply_message,Sender_Receiver_Emails,Sender_Receiver_Emails_list,num_relevant_emails,sender_replier_thread
0,0,0,1999-06-09 04:18:00-07:00,RE: test,2,5552,[40034],How about this Friday ? Julie has not left yet. She was 2 days away from leaving to start school...,1999-06-09 08:06:00-07:00,40034,[5552],when? how are you and your family? is julie gone?,,[],0,5552-40034
1,1,1,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-06-10 03:54:00-07:00,5552,[40034],Today is bad. Tommorrow I will call you.,"['Do you have lunch plans today?', ""Really? I'd feel like a mooch. Lets have lunch next week. An...","[Do you have lunch plans today?, Really? I'd feel like a mooch. Lets have lunch next week. Any d...",3,40034-5552


----
## Deduped Retrieval Dataframe ##

In [12]:
# Load the de-duplicated table used as the retrieval dataset; `time` is parsed on read.
df_messages_deduped = pd.read_csv(
    'gs://user-scripts-msca310019-capstone-49b3/data/20231026_Emails_Deduped.csv',
    parse_dates=['time'],
)

# Decode the stringified list column; missing cells become an empty list.
df_messages_deduped['Sender_Receiver_Emails_list'] = df_messages_deduped['Sender_Receiver_Emails'].apply(
    lambda raw: [] if not pd.notna(raw) else ast.literal_eval(raw)
)

# Number of decoded emails per row (NaN if the decoded value is not a list).
df_messages_deduped['num_relevant_emails'] = df_messages_deduped['Sender_Receiver_Emails_list'].apply(
    lambda emails: np.nan if not isinstance(emails, list) else len(emails)
)

df_messages_deduped.head(20)

Unnamed: 0.1,Unnamed: 0,Email_ID,time,subject,thread,sender,recipient,message,reply_time,reply_sender,reply_recipient,reply_message,Sender_Receiver_Emails,Sender_Receiver_Emails_list,sender_replier_thread,num_relevant_emails
0,0,0,1999-06-09 04:18:00-07:00,RE: test,2,5552,[40034],How about this Friday ? Julie has not left yet. She was 2 days away from leaving to start school...,1999-06-09 08:06:00-07:00,40034,[5552],when? how are you and your family? is julie gone?,,[],5552-40034,0
1,4,4,1999-06-09 08:06:00-07:00,RE: test,2,40034,[5552],when? how are you and your family? is julie gone?,1999-11-23 03:58:00-08:00,5552,[40034],Tues.is good. I'll call you.,"['Today is bad. Tommorrow I will call you.', 'Do you have lunch plans today?', ""Really? I'd feel...","[Today is bad. Tommorrow I will call you., Do you have lunch plans today?, Really? I'd feel like...",40034-5552,3
2,10,13,2001-11-08 14:18:15-08:00,test,2,39298,[402],test Dave,2001-11-09 09:03:31-08:00,402,[39298],"Dave, Take a look at this before you pass it on. I haven't looked at it in a while. Let me know ...",,[],39298-402,0
3,11,14,2001-11-09 09:03:31-08:00,RE: test,2,402,[39298],"Dave, Take a look at this before you pass it on. I haven't looked at it in a while. Let me know ...",2001-11-09 12:06:40-08:00,39298,[402],looks good. Metz says it looks fine too. You'll probably get more questions about Pub Ventures ...,,[],402-39298,0
4,12,15,2001-11-13 09:08:48-08:00,test,2,39322,[402],test Do You Yahoo!? Find the one for you at Yahoo! Personals http://personals.yahoo.com,2001-11-15 09:27:59-08:00,402,[39322],"sorry, but i didn't get your message until late. i'll call you at home tonight.",,[],39322-402,0
5,27,31,2001-10-29 14:01:05-08:00,RE: Hello,3,17189,[5409],"great send it tommorrow, and I will pass it on to Maria.",2001-11-01 08:48:56-08:00,5409,[17189],I sent this from home Tuesday night. I'm sending it again because the phone service at my apart...,"[""Good morning. I'm only halfway through my resume but I should finish it this evening and I wil...",[Good morning. I'm only halfway through my resume but I should finish it this evening and I will...,17189-5409,1
6,28,32,2001-11-01 08:48:56-08:00,RE: Hello,3,5409,[17189],I sent this from home Tuesday night. I'm sending it again because the phone service at my apart...,2001-11-01 08:54:29-08:00,17189,[5409],I will pass it on to Maria.,,[],5409-17189,0
7,29,33,2001-11-08 08:24:47-08:00,RE: Hello,3,189,[51569],I think Enron and my career are finished.,2001-11-08 08:41:11-08:00,51569,[189],give me a call if you can talk? 212 469 7839 n. This e mail may contain confidential and/or p...,,[],189-51569,0
8,35,39,2001-11-08 08:41:11-08:00,RE: Hello,3,51569,[189],give me a call if you can talk? 212 469 7839 n. This e mail may contain confidential and/or p...,2002-01-22 11:21:05-08:00,189,[51569],i don't really know yet check back with me in a weeks time. az,"['Let me check on it and get back to you. I had mine done in Houston, but I can probably recomme...","[Let me check on it and get back to you. I had mine done in Houston, but I can probably recommen...",51569-189,1
9,38,42,2002-02-03 17:56:51-08:00,RE: Hello,3,307,[27130],Hello Glad you got yourself sorted with a new job. Things here are getting better we have near...,2002-02-04 01:15:22-08:00,27130,[307],Setting up oil n products options book. Nice people alot of ex enron. It feels very much like e...,,[],307-27130,0


----
## Make Changes here

In [6]:
# Prompt for the key with hidden input. A blank entry falls back to a key that is
# already present in the environment, so the cell can be re-run without retyping.
entered_key = getpass.getpass('OpenAI API Key:').strip()
openai_api_key = entered_key or os.environ.get('OPENAI_API_KEY', '').strip()

if openai_api_key:
    # Export it so libraries that read the environment variable pick it up too.
    os.environ['OPENAI_API_KEY'] = openai_api_key
    print("OpenAI API Key: is set")
else:
    # Fail here with a clear message instead of letting later cells hit a
    # NameError / authentication error on a missing or empty key.
    raise ValueError("OPENAI_API_KEY environment variable is not set.")

OpenAI API Key: ········


OpenAI API Key: is set


In [7]:
# INPUT VARIABLES
# The three bare strings below are no-op expression statements; they only name the
# inputs of the reply pipeline (example values are assigned in the "Email Environment" cell).
'sender_id'
'replier_id'
'sender_email'


# PERSONALIZATION VARIABLES
# NOTE: get_email_response_personalized captures several of these names as default
# argument values when its cell is executed, so re-run the BACKEND FUNCTIONS cells
# after changing anything here.
num_emails= 10 # FOR RETRIEVAL + RANKING
email_retrieval_dataset=df_messages_deduped # FOR RETRIEVAL DATABASE
vector_db_client=chroma_client # FOR RANKING VECTOR DATABASE


# TEXT GENERATION CONTROL
# Prompt template with two placeholders: {sender_email} and {relevant_emails}.
# NOTE(review): the literal opens with four double-quote characters, so the prompt text
# itself starts with a '"' character and ends with the lone '"' line near the bottom.
# NOTE(review): the prompt contains misspellings ("recieving", "recieved", "revelant")
# that are sent to the model verbatim — confirm whether they should be corrected.
template_string=""""You are the person recieving this email enclosed in the angle brackets: <{sender_email}>,

    Write a reply to the email as the person who recieved it,
    
    deriving context and writing style and email length from previous relevant emails from the person given in the angle brackets : <{relevant_emails}>
    
    Make sure to use salutation and signature style similar to the revelant emails above.
    
    
    "
    """

# Key entered in the API-key cell above.
api_key=openai_api_key
llm_model='gpt-3.5-turbo-0301' # CAN CHANGE
# Chat model used to write the reply; temperature=0 is set on the endpoint.
llm_endpoint=ChatOpenAI(temperature=0, model=llm_model, openai_api_key=openai_api_key) # CAN CHANGE


# PROMPT


# VALIDATION VARIABLES
# Ten rows with the largest num_relevant_emails.
# NOTE(review): the "Run Tests" cell passes df_high_messages and a literal sample size to
# process_all_emails, so these two values are not what that cell uses — confirm intent.
df_validate = df_messages_deduped.sort_values(by='num_relevant_emails', ascending=False).head(10)
sample_size = 10


In [8]:
# Validation pool: rows at positions 4..53 after sorting by num_relevant_emails
# in descending order (the first four rows of that ordering are skipped).
df_high_messages = (
    df_messages_deduped
    .sort_values(by='num_relevant_emails', ascending=False)
    .iloc[4:54]
)

In [9]:
#df_high_messages


---
## BACKEND FUNCTIONS, DO NOT CHANGE

In [10]:
# Email Environment
# Example inputs; they also become the default argument values of
# get_email_response_personalized when that function is defined.
sender_id, replier_id, sender_email = '171', '414', 'Where is my gift?'

In [11]:
# SINGLE RESPONSE GENERATION
async def get_email_response_personalized(sender_id=sender_id,
                                    replier_id=replier_id,
                                    sender_email=sender_email,
                                    email_retrieval_dataset=df_messages_deduped,
                                    num_emails=num_emails,
                                    vector_db_client=chroma_client,
                                    api_key=openai_api_key,
                                    llm_endpoint=llm_endpoint,
                                    template_string=template_string,
                                    max_email_chars=300):
    """Generate a reply to ``sender_email`` written in the replier's style.

    Steps:
      1. Look up the sender->replier thread in ``email_retrieval_dataset`` and keep
         the last ``num_emails`` entries of its ``Sender_Receiver_Emails_list``.
      2. Query the replier's Chroma collection (``'user' + replier_id``) with MMR
         search for emails related to ``sender_email``; keep those shorter than
         ``max_email_chars`` characters.
      3. If the template references ``{synthesis}``, ask an LLM for keywords of the
         retrieved emails.
      4. Fill ``template_string`` and run it through ``llm_endpoint``.

    Parameters
    ----------
    sender_id, replier_id : str
        Ids joined as ``"<sender_id>-<replier_id>"`` to match ``sender_replier_thread``.
    sender_email : str
        Text of the email to answer.
    email_retrieval_dataset : pandas.DataFrame
        Must contain ``sender_replier_thread`` and ``Sender_Receiver_Emails_list``.
    num_emails : int
        Number of previous emails kept and number of documents retrieved.
    vector_db_client
        Chroma client holding the per-user collections.
    api_key : str
        OpenAI key used for the embeddings and the keyword-synthesis model.
    llm_endpoint
        Chat model that writes the final reply.
    template_string : str
        Prompt template; the variables ``sender_email``, ``prev_emails``,
        ``relevant_emails`` and ``synthesis`` are available to it.
    max_email_chars : int, default 300
        Retrieved emails with at least this many characters are dropped from
        ``relevant_emails``.

    Returns
    -------
    str
        The generated reply text.
    """
    # First: previous emails of this sender->replier thread ---------------
    sender_replier_id = f"{sender_id}-{replier_id}"
    thread_rows = email_retrieval_dataset.loc[
        email_retrieval_dataset.sender_replier_thread == sender_replier_id,
        'Sender_Receiver_Emails_list']
    # A pair with no recorded thread yields no history instead of an IndexError.
    previous_emails = thread_rows.iloc[0][-num_emails:] if len(thread_rows) else []

    # Second: ranked, context-relevant emails from the replier's collection
    user_vector_store = Chroma(
        client=vector_db_client,
        collection_name='user' + str(replier_id),
        # Use the key passed to this function rather than relying on the environment.
        embedding_function=OpenAIEmbeddings(openai_api_key=api_key))
    # NOTE(review): fetch_k equals k, so MMR has no surplus candidates to choose
    # from — confirm whether a larger fetch_k was intended.
    found_rel_emails = await user_vector_store.amax_marginal_relevance_search(
        sender_email, k=num_emails, fetch_k=num_emails)
    list_rel_emails_limit = [doc.page_content for doc in found_rel_emails
                             if len(doc.page_content) < max_email_chars]

    # Reply prompt is built first so we know which variables it actually needs.
    reply_prompt = ChatPromptTemplate.from_template(template=template_string)

    # Third: keyword synthesis of the retrieved emails. This is an extra LLM call,
    # so it is only made when the template actually uses {synthesis}.
    synthesis = None
    if 'synthesis' in reply_prompt.input_variables:
        keyword_template = """Give me the most relevant keywords from the following:
    "{text}"
    SYNTHESIS:"""
        keyword_prompt = PromptTemplate.from_template(keyword_template)
        keyword_llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k",
                                 openai_api_key=api_key)
        stuff_chain = StuffDocumentsChain(
            llm_chain=LLMChain(llm=keyword_llm, prompt=keyword_prompt),
            document_variable_name="text"
        )
        # Async variant so the event loop is not blocked inside this coroutine.
        synthesis = await stuff_chain.arun(found_rel_emails)

    # Finally: generate the reply. Variables the template does not reference are ignored.
    reply_chain = LLMChain(llm=llm_endpoint, prompt=reply_prompt)
    return await reply_chain.arun(sender_email=sender_email,
                                  prev_emails=previous_emails,
                                  relevant_emails=list_rel_emails_limit,
                                  synthesis=synthesis)
    

In [12]:
# Multiple Response Generation
async def process_all_emails(df_validate, sample_size):
    """Generate a reply for a reproducible random sample of ``df_validate``.

    Parameters
    ----------
    df_validate : pandas.DataFrame
        Candidate rows; must contain ``sender``, ``reply_sender`` and ``message``.
    sample_size : int
        Number of rows to sample (without replacement).

    Returns
    -------
    pandas.DataFrame
        The sampled rows with an added ``generated_reply`` column.

    Notes
    -----
    The notebook-level settings (``email_retrieval_dataset``, ``num_emails``,
    ``vector_db_client``, ``api_key``, ``llm_endpoint``, ``template_string``) are read
    when this function is called and passed on explicitly, so edits made in the
    configuration cell take effect without re-defining the backend functions.
    """
    # Seed only this draw: random_state=42 selects the same rows as seeding the
    # global NumPy RNG with 42 right before sampling, without the global side effect.
    df_emails = df_validate.sample(sample_size, random_state=42)

    responses = []
    # Replies are awaited one at a time, in sampled row order.
    for _, row in df_emails.iterrows():
        response = await get_email_response_personalized(
            sender_id=str(row['sender']),
            replier_id=str(row['reply_sender']),
            sender_email=row['message'],
            email_retrieval_dataset=email_retrieval_dataset,
            num_emails=num_emails,
            vector_db_client=vector_db_client,
            api_key=api_key,
            llm_endpoint=llm_endpoint,
            template_string=template_string)
        responses.append(response)

    df_emails['generated_reply'] = responses
    return df_emails

In [13]:
# Model Performance Assessment

def calculate_scores(df_validate_out, 
                     actual_col, 
                     generated_col,
                     smoothing_function=None):
    """Score generated text against reference text with ROUGE-1, ROUGE-L and BLEU.

    Parameters
    ----------
    df_validate_out : pandas.DataFrame
        Must contain ``actual_col``, ``generated_col`` and ``num_relevant_emails``.
        The frame is not modified; scoring is done on a copy.
    actual_col : str
        Column holding the reference text.
    generated_col : str
        Column holding the generated text.
    smoothing_function : callable, optional
        Passed to ``sentence_bleu`` (for example ``SmoothingFunction().method1``).
        The default ``None`` keeps NLTK's default behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns: ``actual_col``, ``generated_col``, ``num_relevant_emails``,
        ``rouge_1`` / ``rouge_L`` (dicts holding the score tuple), ``bleu``,
        ``rouge_1_f1`` and ``rouge_L_f1``.

    Side effects
    ------------
    Draws histograms of the BLEU, ROUGE-1 F1 and ROUGE-L F1 scores.
    """
    def _as_text(value):
        # Missing text is scored as an empty string instead of crashing on .split().
        return '' if pd.isna(value) else str(value)

    scored = df_validate_out.copy()

    # One scorer for both metrics, built once instead of twice per row.
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    rouge_1_scores = []
    rouge_L_scores = []
    bleu_scores = []
    rouge_1_f1 = []
    rouge_L_f1 = []

    for actual, generated in zip(scored[actual_col], scored[generated_col]):
        reference = _as_text(actual)
        candidate = _as_text(generated)

        rouge = scorer.score(reference, candidate)
        # Keep the one-key dict layout for the raw score columns.
        rouge_1_scores.append({'rouge1': rouge['rouge1']})
        rouge_L_scores.append({'rougeL': rouge['rougeL']})
        rouge_1_f1.append(rouge['rouge1'].fmeasure)
        rouge_L_f1.append(rouge['rougeL'].fmeasure)

        bleu_scores.append(sentence_bleu([reference.split()],
                                         candidate.split(),
                                         smoothing_function=smoothing_function))

    scored['rouge_1'] = rouge_1_scores
    scored['rouge_L'] = rouge_L_scores
    scored['bleu'] = bleu_scores
    scored['rouge_1_f1'] = rouge_1_f1
    scored['rouge_L_f1'] = rouge_L_f1

    # Programmatic form of `%config InlineBackend.figure_format = 'svg'`, so the
    # function body is plain Python and also works outside IPython.
    try:
        get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'svg'")
    except NameError:
        pass

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, metric, title in zip(axes,
                                 ('bleu', 'rouge_1_f1', 'rouge_L_f1'),
                                 ('BLEU', 'ROUGE-1 F1', 'ROUGE-L F1')):
        sns.histplot(data=scored, x=metric, kde=True, ax=ax)
        ax.set_title(title)

    plt.show()

    df_validate_out_fin = scored[[actual_col, generated_col, 'num_relevant_emails',
                                  'rouge_1', 'rouge_L', 'bleu', 'rouge_1_f1', 'rouge_L_f1']]
    return df_validate_out_fin

----
## Run Tests

In [29]:
import warnings

# Suppress a specific warning
warnings.filterwarnings("ignore", category=DeprecationWarning)

# This will take a sample of data from df_messages_deduped, and generate email replies for it
test_df= await process_all_emails(df_validate=df_high_messages, sample_size=10)
test_df

# Reset warnings to their default behavior
warnings.resetwarnings()

In [30]:
# Flatten each reply onto one line. Newlines become spaces (not empty strings) so the
# words on either side of a line break stay separate tokens for BLEU/ROUGE.
test_df['generated_reply'] = test_df['generated_reply'].str.replace('\n', ' ', regex=False)

In [None]:
# This will calculate scores for the generated replies
# Score each generated reply against the reply that was actually sent
# ('reply_message'); 'message' is the incoming email the reply answers.
test_df_scores=calculate_scores(df_validate_out=test_df, 
                     actual_col='reply_message', 
                     generated_col='generated_reply')

In [None]:
# Display the per-row score table returned by calculate_scores.
test_df_scores


In [None]:
# Summary statistics of the score table.
test_df_scores.describe()

In [20]:
# Inspect the second scored row whose num_relevant_emails equals 32.
# NOTE(review): 32 is hard-coded and .iloc[1] raises IndexError when fewer than two
# sampled rows match — confirm these values against the current sample.
test_df_scores[test_df_scores.num_relevant_emails==32].iloc[1]

In [None]:
# Show the generated reply of the second scored row whose num_relevant_emails equals 31.
# NOTE(review): 31 is hard-coded and .iloc[1] raises IndexError when fewer than two
# sampled rows match — confirm these values against the current sample.
test_df_scores[test_df_scores.num_relevant_emails==31].iloc[1].generated_reply