# **Model Pip**

In [101]:
import openai
import os
import getpass
import asyncio

In [102]:
import pandas as pd
import numpy as np
import ast  # Import the ast module for literal evaluation
import seaborn as sns
import matplotlib.pyplot as plt

# Show up to 100 characters per cell when DataFrames are displayed.
pd.set_option('display.max_colwidth', 100)
import multiprocessing
num_processors = multiprocessing.cpu_count()

# The second import rebinds the name `pandarallel` from the package to the
# object of the same name inside it, which is what `.initialize` is called on.
import pandarallel
from pandarallel import pandarallel
# Leave one CPU free for the main process.
# NOTE(review): on a single-CPU host this evaluates to nb_workers=0 — confirm
# that pandarallel accepts that, or clamp to at least 1.
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False)

# Silences every Python warning for the rest of the notebook, including
# pandas SettingWithCopyWarning raised by the helper functions below.
import warnings
warnings.filterwarnings("ignore")

from nltk.tokenize import word_tokenize
import nltk
import string

Available CPUs: 8
INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [103]:
#LANGCHAIN
import langchain
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from langchain.memory import SimpleMemory

#CHROMA
import chromadb
from chromadb.utils import embedding_functions
from langchain.vectorstores import Chroma

# Setting up the chroma client.
# The path is relative, so the "vectorstores" directory is resolved against the
# kernel's current working directory.
chroma_client = chromadb.PersistentClient(path="vectorstores")

-----
## Entire Dataframe

In [104]:
# Load the validation dataset, drop rows without a sender, and give the two
# email-list columns their replier-centric names, all in one chain.
df_messages = (
    pd.read_csv(
        'human_validation_with_relevent_date.csv',
        parse_dates=['sender_date', 'replier_date'],
    )
    .dropna(subset=['sender'], axis=0)
    .rename(columns={
        'Sender_Receiver_Emails': 'Replier_Emails_Sender',
        'Sender_Emails_All': 'Replier_Emails_All',
    })
)

In [105]:
# df_messages=pd.read_csv('Hedwig/07_HumanValidation/20231104_human_validation_dataset.csv')

# The email-list columns arrive as stringified Python literals; parse them back
# into objects and treat missing cells as empty lists.
df_messages['Replier_Emails_Sender'] = df_messages['Replier_Emails_Sender'].apply(
    lambda raw: [] if pd.isna(raw) else ast.literal_eval(raw)
)
# Count of parsed emails plus one (NaN if the parsed value is not a list).
df_messages['num_emails_toSender'] = df_messages['Replier_Emails_Sender'].apply(
    lambda emails: len(emails) if isinstance(emails, list) else np.nan
).add(1)

df_messages['Replier_Emails_All'] = df_messages['Replier_Emails_All'].apply(
    lambda raw: [] if pd.isna(raw) else ast.literal_eval(raw)
)
df_messages['num_emails_all'] = df_messages['Replier_Emails_All'].apply(
    lambda emails: len(emails) if isinstance(emails, list) else np.nan
).add(1)

# "<sender>-<replier>" key identifying the pair on each row.
df_messages['sender_replier_thread'] = df_messages['sender'].str.cat(
    df_messages['replier'], sep='-'
)

----
## Enter API Key

In [106]:
# Prompt for the key without echoing it. Surrounding whitespace is stripped so a
# stray space or newline from copy/paste does not end up in the credential.
openai_api_key = getpass.getpass('OpenAI API Key:').strip()

# Validate the entered value itself. The previous check tested for the presence
# of the environment variable right after assigning it, so it could never fail
# and an empty entry was still reported as "set".
if openai_api_key:
    os.environ['OPENAI_API_KEY'] = openai_api_key
    print("OpenAI API Key: is set")
else:
    print("OPENAI_API_KEY environment variable is not set.")

OpenAI API Key: ········


OpenAI API Key: is set


-----
## User Environment

In [107]:
# Email Environment
# Inputs describing the incoming email the pipeline will answer.
# sender_id / replier_id are compared against the `sender` / `replier` columns
# of the dataset by the helper functions further down.
sender_id='Kshitij'
replier_id='Scott'

# subject_email is used to select the matching thread; sender_email is the
# message body that a reply is generated for.
subject_email='New Member Onboarding'
sender_email='Hey Scott, were you able to check if Aarushi would be available for the Friday meeting?'

In [108]:
# base_dataset is another name for df_messages (no copy is made).
base_dataset=df_messages
vector_db_client=chroma_client # FOR RANKING VECTOR DATABASE
num_emails = 5 # Constant number of emails being retrieved for MMR, Threads, Past Emails

# TEXT GENERATION CONTROL
# The same chat model instance is shared by every LLMChain defined below.
api_key=openai_api_key
llm_model='gpt-3.5-turbo-0301' # CAN CHANGE
llm_endpoint=ChatOpenAI(temperature=0.1, model=llm_model, openai_api_key=api_key) # CAN CHANGE

----
## Setting up Memory

In [109]:
# Display names for the prompts; they are the same values as the ids used to
# filter the dataset.
sender_name=sender_id
replier_name=replier_id

def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # A translation table that maps a code point to None deletes that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    stripped = text.translate(drop_table)
    return stripped

def get_tokens(replier_id,
               df):
    """Return the median token count of the replies written by ``replier_id``.

    Despite the historical "average" wording, the statistic is the median of
    the per-reply token counts.

    Parameters
    ----------
    replier_id : value compared against ``df.replier`` to select rows.
    df : DataFrame with ``replier`` and ``reply_message`` columns.

    Returns
    -------
    The median number of tokens per reply, where each reply is stripped of
    punctuation with ``remove_punctuation`` and tokenized with
    ``word_tokenize``; NaN when the replier has no non-missing replies.
    """
    # Work on the selected column only instead of adding helper columns to a
    # filtered slice of the caller's frame (chained assignment). Missing
    # replies are skipped because they cannot be tokenized.
    replies = df.loc[df.replier == replier_id, 'reply_message'].dropna()
    if replies.empty:
        return float('nan')

    token_counts = replies.map(
        lambda message: len(word_tokenize(remove_punctuation(message)))
    )
    return token_counts.median()

-----
## Zeroth LLM Endpoint - Running MMR

In [110]:
# Building the Langchain vectorstore using chroma collections
user_vector_store = Chroma(
    client=vector_db_client, 
    collection_name='user'+str(replier_id),
    embedding_function=OpenAIEmbeddings())

    # Getting ranked responses using MMR
found_rel_emails = await user_vector_store.amax_marginal_relevance_search(sender_email, k=num_emails, fetch_k=num_emails)
list_rel_emails=[]
for i, doc in enumerate(found_rel_emails):
    list_rel_emails.append(doc.page_content)

---
## First LLM Endpoint - Global Context Email

In [111]:
# PROMPT
# Placeholders: {sender_email} and {relevant_emails} are supplied when the chain
# is invoked; {sender_name} and {replier_name} come from the SequentialChain's
# SimpleMemory defined later in the notebook.
template_string_globalcontext="""You are the person recieving this email {sender_email},
Write a reply to the email as the person who recieved it, 
deriving context and writing style and email length from previous relevant emails from the person given: {relevant_emails}, 
Make sure to use salutation and signature style similar to the revelant emails above.
You are replying to {sender_name} on behalf of {replier_name}."""


In [112]:
# Setting up LangChain: wrap the prompt text in a chat prompt template and bind
# it to the shared LLM endpoint. The chain's result is published under the
# 'Global_Context_Email' key.
prompt_template_globalcontext = ChatPromptTemplate.from_template(
    template=template_string_globalcontext,
)
llm_chain_globalcontext = LLMChain(
    llm=llm_endpoint,
    prompt=prompt_template_globalcontext,
    output_key='Global_Context_Email',
)

----
## Second LLM Endpoint - Thread (Local Context Email)

In [113]:
## Inputs from Data

import math
def get_threads(sender,
                replier,
                subject,
                df,
                num_emails_past):
    """Format the most recent messages of one email thread as a single string.

    Rows are selected where ``sender`` wrote to ``replier`` under ``subject``;
    if there are none, the reverse direction (``replier`` wrote to ``sender``)
    is tried instead. Every selected row contributes two messages: the original
    ``message`` (dated ``sender_date``) and the ``reply_message`` (dated
    ``replier_date``).

    Parameters
    ----------
    sender, replier : values compared against ``df.sender`` / ``df.replier``.
    subject : value compared against ``df.subject``.
    df : DataFrame with columns ``sender``, ``replier``, ``subject``,
        ``message``, ``reply_message``, ``sender_date``, ``replier_date``.
    num_emails_past : maximum number of messages to include.

    Returns
    -------
    str or None
        Newest-first concatenation of ``"<name> Email <n>: <message> \\n \\n"``
        entries, where ``n`` increases by one every two entries. Returns
        ``None`` when no row matches in either direction. When the thread holds
        fewer than ``num_emails_past`` messages, all of them are returned
        (previously this raised ``IndexError``).
    """
    same_subject = df.subject == subject
    relevant_df = df[(df.sender == sender) & (df.replier == replier) & same_subject]

    if relevant_df.empty:
        # Nothing in the requested direction: fall back to the reverse one.
        relevant_df = df[(df.sender == replier) & (df.replier == sender) & same_subject]

    if relevant_df.empty:
        return None

    # Stack originals and replies into one long table. Dates are converted on
    # the fly rather than written back into a slice of the caller's frame.
    thread_df = pd.DataFrame({
        'message': pd.concat([relevant_df['message'],
                              relevant_df['reply_message']], ignore_index=True),
        'date': pd.concat([pd.to_datetime(relevant_df['sender_date']),
                           pd.to_datetime(relevant_df['replier_date'])], ignore_index=True),
        'name': pd.concat([relevant_df['sender'],
                           relevant_df['replier']], ignore_index=True),
    }).sort_values(by='date', ascending=False)

    ordered_names = list(thread_df.name)
    ordered_messages = list(thread_df.message)

    # Never index past the messages that actually exist.
    limit = max(0, min(num_emails_past, len(ordered_messages)))

    # i // 2 + 1 equals ceil((i + 1) / 2): entries 0 and 1 are "Email 1", etc.
    return ''.join(
        f"{ordered_names[i]} Email {i // 2 + 1}: {ordered_messages[i]} \n \n"
        for i in range(limit)
    )

# Thread context for the current sender/replier/subject. Each dataset row
# contributes a message and its reply, hence the 2*num_emails budget.
# NOTE(review): get_threads returns None when no row matches in either
# direction, and that None is later passed to the chain as `past_threads` —
# confirm the prompt should receive it in that case.
past_threads=get_threads(sender=sender_id,
            replier=replier_id,
            subject=subject_email,
            df=base_dataset,
            num_emails_past=2*num_emails)


In [114]:
# Prompt for the second step. {Global_Context_Email} is the output key of the
# first chain; {past_threads} is supplied when the chain is invoked.
template_string_thread="""Take this LLM generated email: {Global_Context_Email}. 
This email might have some trailing emails, stored in the email thread here: {past_threads}.
Rewrite the LLM Generated Email, by deprioritizing topics which are not present in the past email thread.
Otherwise don't make major changes to the LLM generated email"""

# The chain's result is published under the 'Local_Context_Email' key.
prompt_template_thread=ChatPromptTemplate.from_template(template=template_string_thread)
llm_chain_thread=LLMChain(llm=llm_endpoint, prompt=prompt_template_thread, output_key='Local_Context_Email')

-----
## Third LLM Chain - Extracting Pairwise Writing Style Langchain

In [115]:
#Inputs From Data

def get_replier_sender_past_emails(sender,
                                   replier,
                                   df,
                                   num_past_emails):
    """Format the replier's most recent emails exchanged with the sender.

    All rows between the two people (either direction, any subject) are
    selected. Each row contributes its ``message`` (author ``sender`` column,
    dated ``sender_date``) and its ``reply_message`` (author ``replier``
    column, dated ``replier_date``). Only messages authored by ``replier`` are
    kept.

    Parameters
    ----------
    sender, replier : values compared against ``df.sender`` / ``df.replier``.
    df : DataFrame with columns ``sender``, ``replier``, ``message``,
        ``reply_message``, ``sender_date``, ``replier_date``.
    num_past_emails : maximum number of emails to include.

    Returns
    -------
    str
        Newest-first concatenation of ``"Replier Email <n>: <message> \\n \\n"``
        entries. When fewer than ``num_past_emails`` emails exist, all of them
        are returned, and an empty string when there are none (previously both
        cases raised ``IndexError``).
    """
    forward = (df.sender == sender) & (df.replier == replier)
    backward = (df.sender == replier) & (df.replier == sender)
    relevant_df = df[forward | backward]

    # Stack originals and replies into one long table. Dates are converted on
    # the fly rather than written back into a slice of the caller's frame.
    relationship_df = pd.DataFrame({
        'message': pd.concat([relevant_df['message'],
                              relevant_df['reply_message']], ignore_index=True),
        'date': pd.concat([pd.to_datetime(relevant_df['sender_date']),
                           pd.to_datetime(relevant_df['replier_date'])], ignore_index=True),
        'name': pd.concat([relevant_df['sender'],
                           relevant_df['replier']], ignore_index=True),
    }).sort_values(by='date', ascending=False)

    # Keep only what the replier wrote, newest first.
    ordered_messages = list(relationship_df.loc[relationship_df.name == replier, 'message'])

    # Never index past the emails that actually exist.
    limit = max(0, min(num_past_emails, len(ordered_messages)))

    return ''.join(
        f"Replier Email {i+1}: {ordered_messages[i]} \n \n"
        for i in range(limit)
    )

# The replier's latest emails to/from this sender (any subject); used as the
# {past_emails} input of the writing-style prompt.
past_emails=get_replier_sender_past_emails(sender=sender_id,
                               replier=replier_id,
                               df=base_dataset,
                               num_past_emails=num_emails)

In [116]:
# Prompt for the third step; {past_emails} is supplied when the chain is invoked.
template_string_pairstyle="""Extract Email Writing Style in 3 words that best decribe the replier by analyzing these past emails between the sender and replier: {past_emails}"""

# The chain's result is published under the 'pair_style' key.
prompt_template_pairstyle = ChatPromptTemplate.from_template(template=template_string_pairstyle)
llm_chain_pairstyle=LLMChain(llm=llm_endpoint, prompt=prompt_template_pairstyle, output_key='pair_style')

---
## Fourth LLM Chain - Personalizing Local Context Email

In [117]:
# Prompt for the fourth step. {Local_Context_Email} and {pair_style} are output
# keys of the second and third chains; {avg_tokens} is expected to come from
# the SequentialChain's SimpleMemory.
template_string_personalization="""Take this email :<{Local_Context_Email}>, update the email and create one single email which is {pair_style}. 
Remember that these adjectives collectively describe your writing style,
DO NOT add any more information, just tweak the style a little.
Don't be dramatic, and the output should have approximately {avg_tokens} number of tokens"""

# The chain's result is published under the 'Personalized_Email' key.
prompt_template_personalization=ChatPromptTemplate.from_template(template=template_string_personalization)
llm_chain_personalization=LLMChain(llm=llm_endpoint, prompt=prompt_template_personalization, output_key='Personalized_Email')

---
## Sequential LLM Chain for Pair and Email Gen

#### Memory

In [119]:
# Wire the four chains into one pipeline.
# - memory: fixed values (names and the replier's token-count statistic from
#   get_tokens) made available to the prompts alongside the call-time inputs.
# - chains: global context -> thread rewrite -> style extraction -> personalization.
# - input_variables: keys the caller must pass when invoking the chain.
# - output_variables: every intermediate result is returned, not only the final email.
super_chain = SequentialChain(memory=SimpleMemory(memories={"sender_name":sender_name,
                                                           "replier_name":replier_name,
                                                           "avg_tokens":get_tokens(replier_id=replier_name, df=df_messages)}),
                              chains=[llm_chain_globalcontext, llm_chain_thread,llm_chain_pairstyle, llm_chain_personalization],
                              input_variables=['relevant_emails','sender_email','past_threads','past_emails'],
                              output_variables=['Global_Context_Email','Local_Context_Email','pair_style','Personalized_Email']
                             )

In [120]:
# Run the full pipeline. The keys match the chain's declared input_variables;
# the returned mapping is displayed as the cell output.
super_chain({"relevant_emails": list_rel_emails, 
             "sender_email": sender_email,
             "past_threads": past_threads,
             "past_emails":past_emails})

{'relevant_emails': ['This sounds like a fun plan bro. Just to confirm, Aarushi can come right?',
  "Yo bro, do you think aarushi could help on our project. She's my homie and I think we could use her help",
  'Okay u can ask her when she gets here. So does Friday at noon work then?',
  'For sure bro, we can do that. Do we know who is gonna be in our car yet? If Radhika isnt gonna be then we need to figure that out. I dont really care who else is as long as Aarushi is in our car? Do u think we need anything else for the trip?',
  ':( Oh, didnt know she had a boyfriend. All good tho, she said can help with ranking our responses. She said she avaiable on Friday at noon. U cool with that?'],
 'sender_email': 'Hey Scott, were you able to check if Aarushi would be available for the Friday meeting?',
 'past_threads': "Kshitij Email 1: Hey Scott, yes absolutely we can bring her! \n \nScott Email 1: This sounds like a fun plan bro. Just to confirm, Aarushi can come right? \n \nKshitij Email 2: