In [4]:
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Ollama
from langchain.vectorstores import Chroma, FAISS
import pandas as pd
from tqdm import tqdm

In [5]:
# Load the evaluation set, using the CSV's "Unnamed: 0" column as the row index,
# and show the resulting frame as the cell output.
evals_df = pd.read_csv(
    "eval_set.csv",
    index_col="Unnamed: 0",
)
evals_df

Unnamed: 0,title,selftext,summary
0,Lifemiles and united upgrade question,Hi I'm thinking of flying on Nov 18 from tys t...,No Award tickets are not upgradedable with UA ...
1,[Iwantout] 25F Canada -> UK,"Hello, \nI am currently doing a PhD in politi...",It looks like this post is about CanadannI am ...
2,Is 5h enough to go over self transfer in Dubai?,"Hello everyone, I got an itinerary in which th...",Qatar is pretty reliable into DOH and usually ...
3,Is an inflatable foot rest allowed at window s...,My kid and I will have an 18+ hour flight very...,Inflatable anything is generally a nono on man...
4,1st International solo trip - 2 weeks Malaysia...,Hey I'm going to Malaysia in a couple of weeks...,It looks like youre planning a solo Southeast ...
5,Looking for help with Peru Itinerary,Hi all! My husband and I are going to Peru thi...,PerunnRead what redditors had to say in the we...
6,"[Travel with a violin at Air Dolomiti, w/ a ca...","As the post I made before, I would like to kn...",Its clearly explained on their websitennhttpsw...
7,Place with warm ocean during summer and fall c...,"I've been to 50 countries, 6 continents, and 4...",I live in LA less than 5 mi from the beach and...
8,LAX -> BKK award options,"Long time award travel lurker, first time post...",I’m looking at LAXISTBKK via Turkish airline f...
9,Phoenix to LA County,In March after some spring training baseball g...,"Id choose that route and check out La Jolla,Sa..."


In [7]:
# evals_df = pd.read_csv("final_eval_set_02_22.csv",index_col="Unnamed: 0")
# evals_df.head()

In [6]:
def load_model(model_option):
    """Create an Ollama LLM wrapper for the given model name.

    Args:
        model_option: Value passed to ``Ollama(model=...)``, e.g. ``'mistral'``
            or ``'llama2'``.

    Returns:
        An ``Ollama`` instance whose callback manager holds a single
        ``StreamingStdOutCallbackHandler``.
    """
    streaming_callbacks = CallbackManager([StreamingStdOutCallbackHandler()])
    return Ollama(model=model_option, callback_manager=streaming_callbacks)

def create_context_from_docs(docs):
    """Concatenate retrieved documents into one context string.

    Every document contributes ``"Question: <page_content>"`` immediately
    followed by ``"Response: <metadata['comments']>"``; the per-document
    pieces are appended in order with no separator between them.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute and a
            ``metadata`` mapping that contains a ``"comments"`` key.

    Returns:
        str: The combined context, or ``""`` when ``docs`` is empty.

    Raises:
        KeyError: If a document's ``metadata`` has no ``"comments"`` entry.
    """
    # The return used to sit inside the loop body, so only the first document
    # was ever used and an empty ``docs`` produced None instead of a string.
    return "".join(
        "Question: {}Response: {}".format(doc.page_content, doc.metadata["comments"])
        for doc in docs
    )
    
def create_prompt(user_query, context):
    """Fill the ``[INST] ... [/INST]`` instruction template for a RAG query.

    Args:
        user_query: Text substituted into the first ``{}`` placeholder
            (the question).
        context: Text substituted into the second ``{}`` placeholder
            (the retrieved question/answer context).

    Returns:
        str: The formatted prompt.
    """
    # One literal per line of the prompt. Leading and trailing spaces are part
    # of the text sent to the model, so they are spelled out explicitly here.
    template_lines = [
        "[INST] ",
        "",
        "        If the additional context is irrelevent, ignore them. Otherwise, use them inform your answers, please. ",
        "        ",
        "        Generate an answer to the question: {} based on the question and answer pairs ",
        "        provided in the context: {} ",
        "        ",
        "        Please only consider the context if its relevant to the question, else answer on your own, PLEASE!",
        "        ",
        "        [/INST]",
    ]
    return "\n".join(template_lines).format(user_query, context)
        

In [11]:
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# reddit_db = Chroma(persist_directory="/Users/christopherton/Desktop/SJSU_MSDA/WanderChat/vector_db/reddit_chroma_db",embedding_function=embedding_function)

# Embedding model and on-disk location of the FAISS index, named so they are
# easy to spot and change in one place.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
REDDIT_FAISS_DIR = "/Users/christopherton/Desktop/SJSU_MSDA/WanderChat/vector_db/reddit_q_db"

# Build the embedding function on CPU, then open the saved FAISS index with it.
embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs={'device': 'cpu'},
)
reddit_db = FAISS.load_local(REDDIT_FAISS_DIR, embeddings)

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


In [None]:
# Baseline without retrieval: send each row's 'selftext' directly to Mistral and
# store the reply in the 'non_rag_Mistral' column.
model = load_model('mistral')
# Iterate over the 'selftext' column instead of evals_df.iterrows() so the frame
# being written to is not the object being iterated. `total` gives tqdm a known
# length, so the bar shows progress out of the row count.
for i, post_text in tqdm(evals_df['selftext'].items(), total=len(evals_df)):
    evals_df.loc[i, 'non_rag_Mistral'] = model(post_text)

In [None]:
# Retrieval-augmented run using whatever `model` currently holds (Mistral when
# the cells are executed top to bottom). For each row: retrieve similar
# documents for its 'selftext', build a context string from them, wrap both in
# the instruction prompt, and store the model's reply.
#
# The result column is 'rag_Mistral' (capital M) because that is the name the
# ROUGE scoring loop and the ensemble prompts read; it was previously written
# as 'rag_mistral'.
#
# Iterating over the 'selftext' column (not evals_df.iterrows()) avoids
# modifying the frame that is being iterated; `total` lets tqdm show progress.
for i, post_text in tqdm(evals_df['selftext'].items(), total=len(evals_df)):
    docs = reddit_db.similarity_search(post_text)
    context = create_context_from_docs(docs)
    prompt = create_prompt(' According to users from Reddit, ' + post_text, context)
    evals_df.loc[i, 'rag_Mistral'] = model(prompt)

In [None]:
# Baseline without retrieval: send each row's 'selftext' directly to Llama 2 and
# store the reply in the 'non_rag_llama2' column.
model = load_model('llama2')
# Iterate over the 'selftext' column instead of evals_df.iterrows() so the frame
# being written to is not the object being iterated. `total` gives tqdm a known
# length, so the bar shows progress out of the row count.
for i, post_text in tqdm(evals_df['selftext'].items(), total=len(evals_df)):
    evals_df.loc[i, 'non_rag_llama2'] = model(post_text)

In [None]:
# Retrieval-augmented run using whatever `model` currently holds (Llama 2 when
# the cells are executed top to bottom). For each row: retrieve similar
# documents for its 'selftext', build a context string from them, wrap both in
# the instruction prompt, and store the model's reply in 'rag_Llama2'.
#
# Iterating over the 'selftext' column (not evals_df.iterrows()) avoids
# modifying the frame that is being iterated; `total` lets tqdm show progress.
for i, post_text in tqdm(evals_df['selftext'].items(), total=len(evals_df)):
    docs = reddit_db.similarity_search(post_text)
    context = create_context_from_docs(docs)
    prompt = create_prompt(' According to users from Reddit, ' + post_text, context)
    evals_df.loc[i, 'rag_Llama2'] = model(prompt)

In [19]:
# Show the frame's column labels, including any prediction columns added so far.
evals_df.columns

Index(['title', 'selftext', 'summary', 'non_rag_Mistral', 'rag_mistral',
       'non_rag_llama2', 'rag_Llama2'],
      dtype='object')

In [20]:
#evaluation_df = evals_df.drop('input_input',axis=1)

# Write the frame (with its index) to a CSV file in the working directory.
evals_df.to_csv("evaluation_df_FAISS.csv")

In [40]:
# NOTE(review): `evaluation_df` is not assigned in any visible cell (the only
# assignment is commented out), so this fails on a fresh kernel — confirm where
# it is meant to come from (possibly a saved CSV) and load it explicitly.
evaluation_df.columns

Index(['title', 'selftext', 'summary', 'rag_Mistral', 'non_rag_Mistral',
       'non_rag_llama2', 'rag_Llama2'],
      dtype='object')

## Evaluate all model outputs against ground truths

In [34]:
# The `evaluate` package is imported here rather than with the notebook's other
# imports at the top.
import evaluate

# Load the "rouge" metric; its `.compute(predictions=..., references=...)` is
# used by the scoring loop.
rouge_score = evaluate.load("rouge")

In [38]:
# For every row, score each model's prediction against that row's reference
# 'summary' and print the row title, the model column, and the ROUGE result.
prediction_columns = ['rag_Mistral', 'non_rag_Mistral', 'non_rag_llama2', 'rag_Llama2']
for _, row in evaluation_df.iterrows():
    for column in prediction_columns:
        result = rouge_score.compute(
            predictions=[row[column]],
            references=[row['summary']],
        )
        print(f"{row['title']}, ground-truths vs. {column}")
        print(result)
        print('\n')

Lifemiles and united upgrade question, ground-truths vs. rag_Mistral
{'rouge1': 0.2634730538922156, 'rouge2': 0.07272727272727272, 'rougeL': 0.16766467065868262, 'rougeLsum': 0.16766467065868262}


Lifemiles and united upgrade question, ground-truths vs. non_rag_Mistral
{'rouge1': 0.14792899408284024, 'rouge2': 0.005952380952380952, 'rougeL': 0.07692307692307693, 'rougeLsum': 0.10650887573964499}


Lifemiles and united upgrade question, ground-truths vs. non_rag_llama2
{'rouge1': 0.11313131313131312, 'rouge2': 0.016227180527383367, 'rougeL': 0.05656565656565656, 'rougeLsum': 0.08888888888888888}


Lifemiles and united upgrade question, ground-truths vs. rag_Llama2
{'rouge1': 0.30526315789473685, 'rouge2': 0.10638297872340426, 'rougeL': 0.2105263157894737, 'rougeLsum': 0.1894736842105263}


[Iwantout] 25F Canada -> UK, ground-truths vs. rag_Mistral
{'rouge1': 0.22508038585209003, 'rouge2': 0.045307443365695796, 'rougeL': 0.135048231511254, 'rougeLsum': 0.16077170418006428}


[Iwantout] 

## Ensemble

In [9]:
# Load the saved predictions, using the CSV's "Unnamed: 0" column as the row
# index, and preview the first rows.
all_preds = pd.read_csv(
    "../app/evals_qlora_preds_feb_29.csv",
    index_col="Unnamed: 0",
)
all_preds.head()

Unnamed: 0,title,selftext,summary,rag_Mistral,non_rag_Mistral,non_rag_llama2,rag_Llama2,non_rag_llama2_qlora,rag_llama2_qlora
0,Lifemiles and united upgrade question,Hi I'm thinking of flying on Nov 18 from tys t...,No Award tickets are not upgradedable with UA ...,According to user experiences shared on Reddi...,I cannot provide you with a definitive answer...,\nIt's possible to use United Plus Points or U...,According to the information provided in the R...,Thanks in advance for any help.\n\nAnswer:\n\n...,"Sure, I'd be happy to help! Based on the info..."
1,[Iwantout] 25F Canada -> UK,"Hello, \nI am currently doing a PhD in politi...",It looks like this post is about CanadannI am ...,"Based on the context provided, it appears tha...","Hello, I understand that making the decision ...",Hello! It sounds like you're considering a mov...,Hello! Based on the information provided in th...,Hello! It's great that you're considering your...,Thank you for sharing your thoughts and exper...
2,Is 5h enough to go over self transfer in Dubai?,"Hello everyone, I got an itinerary in which th...",Qatar is pretty reliable into DOH and usually ...,Based on the information provided in the cont...,Transiting through Dubai International Airpor...,Hello! I can understand your concern about the...,"According to the Reddit users, the connection ...","Hello everyone, I got an itinerary in which th...",Thank you for providing more context. Based o...
3,Is an inflatable foot rest allowed at window s...,My kid and I will have an 18+ hour flight very...,Inflatable anything is generally a nono on man...,"Based on the context provided, it appears tha...","Based on my research and understanding, the u...","As a flight attendant, I must inform you that ...",Thank you for providing more information about...,Answer:\n\nI understand your concern about inf...,Thank you for providing more context and info...
4,1st International solo trip - 2 weeks Malaysia...,Hey I'm going to Malaysia in a couple of weeks...,It looks like youre planning a solo Southeast ...,"Based on the conversation, it appears that OP...",Hi there! It's great that you're excited abou...,"Hey there! 😊\n\nDon't worry, I totally underst...","According to the Reddit users, there are sever...",**Things to do**\n\nI've been looking at the P...,Hey there! Thanks for the detailed response! ...


In [13]:
# Rebind `model` to a Mistral LLM so the cells that follow do not depend on
# whichever model an earlier cell loaded last.
model = load_model('mistral')

In [18]:
# Both prompt templates are loop-invariant, so they are defined once up front.
# The continuation lines keep their four-space indentation (and trailing
# spaces) because that whitespace is part of the prompt text sent to the model.

# Six placeholders: question, title, three model responses, reference summary.
# Asks for an answer plus a ranking of the three responses against the summary.
inference_eval_template = """Answer the following question as if you're speaking to a fellow traveler: "{}" by using the provided title: "{}", to understand the subject of discussion, and summarizing the corresponding responses as context:
    response #1 (mistral): "{}"
    response #2 (llama2): "{}"
    response #3 (qlora-llama): "{}"
    If the context is irrelevant, doesn't make sense or is too short, ignore them. Otherwise, do your best to answer with sincerity and integrity. Remain true to the
    question and do not make up facts. Admit if you do not know.
    
    Finally rank which responses (#1, #2, or #3) matches the actual context: "{}" the most as a list. For example: [llama2,qlora-llama,mistral]
    If none of the responses are contextually relevant, do not use them - mention this in your response.
    """

# Five placeholders: question, title, three model responses. No reference
# summary is used here; this prompt only asks for the combined answer.
ensemble_template = """Answer the following question as if you're speaking to a fellow traveler: "{}" by using the provided title: "{}", to understand the subject of discussion, and summarizing the corresponding responses as context:
    response #1 (mistral): "{}"
    response #2 (llama2): "{}"
    response #3 (qlora-llama): "{}"
    If the context is irrelevant, doesn't make sense or is too short, ignore them. Otherwise, do your best to answer with sincerity and integrity. Remain true to the
    question and do not make up facts. Admit if you do not know. Don't reference the context in your answer by saying things like "Based on the provided context." just 
    get straight to the point.
    
    If none of the responses are contextually relevant, do not use them - mention this in your response.
    """

# Iterate over a separate frame holding only the input columns, so adding the
# two result columns to all_preds does not modify the object being iterated.
ensemble_inputs = all_preds[
    ['selftext', 'title', 'rag_Mistral', 'rag_Llama2', 'rag_llama2_qlora', 'summary']
]
for i, r in ensemble_inputs.iterrows():
    shared_fields = (
        r['selftext'],
        r['title'],
        r['rag_Mistral'],
        r['rag_Llama2'],
        r['rag_llama2_qlora'],
    )
    prompt_inference_eval = inference_eval_template.format(*shared_fields, r['summary'])
    # Previously r['summary'] was also passed here; the template has only five
    # placeholders, so str.format silently dropped it.
    prompt_ensemble = ensemble_template.format(*shared_fields)

    # Two model calls per row: one for answer + ranking, one for the plain answer.
    ensemble_pred = model(prompt_inference_eval)
    ensemble_results = model(prompt_ensemble)
    all_preds.loc[i, 'ensemble_pred_eval'] = ensemble_pred
    all_preds.loc[i, 'ensemble_response'] = ensemble_results
    

 Based on the provided context, all three responses (#1, #2, and #3) accurately answer the question about upgrading an award ticket from economy to Polaris class using United miles or United Premier Points with United Airlines. The responses indicate that it is unlikely that non-Global Services members will be able to upgrade their award tickets to Polaris class using miles or points.

Therefore, [llama2,qlora-llama,mistral] are the most contextually relevant responses. Based on the question you've asked and without referring to any specific context, my understanding is that you're trying to determine if you can upgrade a Lifemiles award ticket from economy to Polaris class using United miles or United Plus points on the IAD to AMS leg of your flight.

The information shared in the responses suggests that it might not be possible to use United miles or United Plus points for an upgrade on a non-016 (non-Global Services) award ticket, even if a Polaris class seat is available. This mean

In [23]:
# Write the frame (with its index and the two ensemble columns, if the loop has
# run) to a CSV file in the working directory.
all_preds.to_csv("ensemble_results.csv")

## LangSmith Evals


In [11]:
import os
from getpass import getpass

# API keys must not be typed into the notebook source, where they end up in
# version control and shared copies. Keys already present in the environment
# are kept as they are; any that are missing are prompted for without echoing.
for key_name in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY"):
    if not os.environ.get(key_name):
        os.environ[key_name] = getpass(f"Enter {key_name}: ")

In [12]:
import langsmith

from langchain import chat_models, smith

# Replace with the LLM you want to test
my_llm = load_model('mistral')

# Criteria scored through smith.RunEvalConfig.LabeledCriteria, in the order
# they are passed to the evaluator list ("cot_qa" is inserted after the first).
labeled_criteria = [
    "conciseness",
    "relevance",
    "coherence",
    "harmfulness",
    "maliciousness",
    "helpfulness",
    "controversiality",
    "misogyny",
    "criminality",
    "insensitivity",
]
criteria_evaluators = [smith.RunEvalConfig.LabeledCriteria(name) for name in labeled_criteria]

# Define the evaluators to apply.
# The grading LLM takes its key from the OPENAI_API_KEY environment variable.
# The literal placeholder that used to be passed as `openai_api_key` has been
# removed, because it overrode the environment value and invited pasting a real
# key into the notebook.
eval_config = smith.RunEvalConfig(
    evaluators=[criteria_evaluators[0], "cot_qa", *criteria_evaluators[1:]],
    custom_evaluators=[],
    eval_llm=chat_models.ChatOpenAI(model="gpt-4", temperature=0),
)

# Run `my_llm` over the named LangSmith dataset and apply the evaluators above.
client = langsmith.Client()
chain_results = client.run_on_dataset(
    dataset_name="WanderChat_evals",
    llm_or_chain_factory=my_llm,
    evaluation=eval_config,
    project_name="test-unnatural-ATM-44",
    concurrency_level=5,
    verbose=True,
)

View the evaluation results for project 'test-unnatural-ATM-44' at:
https://smith.langchain.com/o/a9b5fab5-953e-5e04-886c-34605078dbcd/datasets/26463be8-0ded-40cb-8eb9-607cca7bd011/compare?selectedSessions=afa4a170-0299-44d5-b9a4-6708dafa22e2

View all tests for Dataset WanderChat_evals at:
https://smith.langchain.com/o/a9b5fab5-953e-5e04-886c-34605078dbcd/datasets/26463be8-0ded-40cb-8eb9-607cca7bd011
[>                                                 ] 0/10 According to the context provided, the person is looking to travel from some spring training baseball games towards Los Angeles (LA) in March, with a flexible schedule of leaving on a Wednesday and flying out by Sunday. They have two options based on their initial Google Maps search: heading south on I-8 towards San Diego or going west on I-10 towards Palm Springs. The person is also interested in exploring the Salton Sea and Slab City before making their way to LA.

Based on this information, a possible answer could be: "Exploring

Unnamed: 0,feedback.conciseness,feedback.COT Contextual Accuracy,feedback.relevance,feedback.coherence,feedback.harmfulness,feedback.maliciousness,feedback.helpfulness,feedback.controversiality,feedback.misogyny,feedback.criminality,feedback.insensitivity,error,execution_time,run_id
count,6.0,5.0,5.0,5.0,4.0,4.0,5.0,6.0,6.0,7.0,9.0,0.0,10.0,10
unique,,,,,,,,,,,,0.0,,10
top,,,,,,,,,,,,,,898acbb7-eeec-40f9-b7c0-6e56a87eb210
freq,,,,,,,,,,,,,,1
mean,0.5,1.0,1.0,1.0,0.75,0.25,1.0,0.0,0.0,0.142857,0.0,,37.524836,
std,0.547723,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.377964,0.0,,16.261459,
min,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,,17.042383,
25%,0.0,1.0,1.0,1.0,0.75,0.0,1.0,0.0,0.0,0.0,0.0,,25.192395,
50%,0.5,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,,32.974638,
75%,1.0,1.0,1.0,1.0,1.0,0.25,1.0,0.0,0.0,0.0,0.0,,45.797217,
