In [34]:
import os
import requests
import pandas as pd
from langchain_community.llms import Ollama
from tqdm import tqdm
import time
from langchain_community.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain_community.embeddings import HuggingFaceEmbeddings # import hf embedding
from sentence_transformers import SentenceTransformer, util
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from openai import OpenAI

from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

load_dotenv()

pd.set_option('display.max_columns', None)


In [2]:
hf_token = os.getenv("hf_token")
llama_qlora = os.getenv("llama_qlora_api")
llama_raft = os.getenv("llama_raft_api")
mistral_qlora = os.getenv("mistral_qlora_api")
mistral_raft = os.getenv("mistral_raft_api")

llama_qlora

'https://stombfggw2li7usf.us-east-1.aws.endpoints.huggingface.cloud'

In [4]:


headers = {
	"Accept" : "application/json",
	"Authorization": f"Bearer {hf_token}",
	"Content-Type": "application/json" 
}

model_dict = {'pretrained Llama':"", #https://huggingface.co/meta-llama/Llama-2-7b-hf
              'pretrained Mistral':"", #https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2
              'Llama QLORA':llama_qlora, #https://huggingface.co/beraht/llama-2-7b_qlora_falcon_417
              'Llama RAFT':llama_raft, #https://huggingface.co/beraht/Llama2_Falcon_RAFT_50e_10s/tree/main
              'Mistral QLORA':mistral_qlora, #https://huggingface.co/sherrys/mistral-2-7b_qlora_falcon_426/tree/main
              'Mistral RAFT':mistral_raft #https://huggingface.co/sherrys/426_mistral_RAFT_50e_10s
              }

In [5]:
API_URL = model_dict.get('Llama QLORA')
API_URL

'https://stombfggw2li7usf.us-east-1.aws.endpoints.huggingface.cloud'

In [6]:

def query(payload):
  response = requests.post(API_URL, headers=headers, json=payload)
  return response.json()

In [12]:
raw_evals = pd.read_csv("inferences_5_2.csv",index_col="Unnamed: 0")
raw_evals.columns

Index(['Index', 'Notes', 'Golden Question', 'Golden Answer', 'MPC',
       'Llama QLORA Output', 'Llama QLORA Output Time',
       'pretrained Mistral Output', 'pretrained Mistral Output Time',
       'Mistral QLORA Output', 'Mistral QLORA Output Time',
       'Llama RAFT Output', 'Llama RAFT Output Time', 'Mistral RAFT Output',
       'Mistral RAFT Output Time', 'pretrained Llama Output',
       'pretrained Llama Output Time', 'Llama_RAG_CRC', 'Llama_RAG_CRC_time',
       'mistral_RAG_CRC', 'mistral_RAG_CRC_time', 'mistral_RAG_RR',
       'mistral_RAG_RR_time', 'llama_RAG_RR', 'llama_RAG_RR_time',
       'Llama RAFT RAG CRC Output', 'Llama RAFT RAG CRC Output Time',
       'Mistral RAFT RAG CRC GPT prompt summary Output',
       'Mistral RAFT RAG CRC GPT prompt summary Output Time',
       'Mistral QLORA RAG CRC GPT prompt summary Output',
       'Mistral QLORA RAG CRC GPT prompt summary Output Time'],
      dtype='object')

In [13]:
rr_model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

def augment_prompt(query: str,source_knowledge:str):
    return f'''Using the contexts below, answer the question as if you are a travel agent and your goal is to provide excellent customer service and to provide
            personalized travel recommendations with reasonings based on their question. Do not repeat yourself or include any links or HTML. Say "I don't know" if you
            are uncertain.
            Contexts:
            {source_knowledge}

            Query: {query}'''

In [14]:
db = FAISS.load_local("../vector_db/funcheap_2024-04-26_2024-06-25_db",
                      HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', model_kwargs={'device': 'cpu'})).as_retriever()

https://huggingface.co/posts/andrewrreed/656761313696494

In [10]:
for i,r in tqdm(raw_evals.iterrows()):
    start = time.time()
    q = r['Golden Question']
    print(q)
    rag_contexts = db.invoke(q)
    rag_contexts = ' '.join([d.page_content for d in rag_contexts])
    prompt = augment_prompt(q,rag_contexts)
    input_len = len(prompt.split())
    print(input_len)
    max_token_len = 1500-input_len-100 #100 buffer
    start_time = time.time()
    while True: #while loop for token
        answer = query({'inputs': f"<s>[INST] {prompt} [/INST]",
                    'parameters': {"max_new_tokens": max_token_len}})
        if 'error' not in answer:
            break  #exit the while loop if there is no error
        max_token_len -= 100 #reduce by 100 in while loop
        print(f"Failed to process prompt with token length: {max_token_len}")
        if max_token_len <= 0:
            break
    end_time = time.time()
    duration = end_time - start_time
    
    print(answer)
    
    answer = answer[0]['generated_text'].replace(f"<s>[INST] {prompt} [/INST]","")
    answer = answer.replace(" . ",". ").strip()

    raw_evals.loc[i,'Mistral  RAG CRC Output'] = answer
    raw_evals.loc[i,'Mistral RAFT RAG CRC Output Time'] =  time.time() - start

0it [00:00, ?it/s]

What are your favorite luxury camp meals for a beginner-friendly 10-mile, 3-day backpacking trip, considering I'm willing to carry 10lbs of food and have basic cookware?
441
Failed to process prompt with token length: 859
Failed to process prompt with token length: 759
Failed to process prompt with token length: 659
Failed to process prompt with token length: 559
Failed to process prompt with token length: 459
Failed to process prompt with token length: 359
Failed to process prompt with token length: 259
Failed to process prompt with token length: 159
Failed to process prompt with token length: 59


0it [00:03, ?it/s]

Failed to process prompt with token length: -41
{'error': 'Input validation error: `inputs` must have less than 1024 tokens. Given: 1126', 'error_type': 'validation'}





KeyError: 0

In [15]:
client = OpenAI()

def generate_response(question):
    msg = [{"role": "user","content": question,}]
    res = client.chat.completions.create(messages=msg,model="gpt-3.5-turbo",temperature=0,)
    return res.choices[0].message.content

In [16]:
for i,r in tqdm(raw_evals.iterrows()):
    start = time.time()
    q = r['Golden Question']
    print(q)
    rag_contexts = db.invoke(q)
    rag_contexts = ' '.join([d.page_content for d in rag_contexts])
    prompt = augment_prompt(q,rag_contexts)
    input_len = len(prompt.split())
    summarized_prompt = generate_response(f"""Summarize the text delimited by triple backticks \ into a single sentence.```{prompt}```""")
    print(len(q),len(prompt),len(summarized_prompt))
    max_token_len = 1500-input_len-100 #100 buffer
    start_time = time.time()
    # while True: #while loop for token
    answer = query({'inputs': f"<s>[INST] {summarized_prompt} [/INST]",
                'parameters': {"max_new_tokens": max_token_len}})
        # if 'error' not in answer:
        #     break  #exit the while loop if there is no error
        # max_token_len -= 100 #reduce by 100 in while loop
        # print(f"Failed to process prompt with token length: {max_token_len}")
        # if max_token_len <= 0:
        #     break

    end_time = time.time()
    duration = end_time - start_time
    
    answer = answer[0]['generated_text'].replace(f"<s>[INST] {prompt} [/INST]","")
    answer = answer.replace(" . ",". ").strip()

    raw_evals.loc[i,'Llama QLORA RAG CRC GPT prompt summary Output'] = answer
    raw_evals.loc[i,'Llama QLORA RAG CRC GPT prompt summary Output Time'] =  time.time() - start

0it [00:00, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


What are your favorite luxury camp meals for a beginner-friendly 10-mile, 3-day backpacking trip, considering I'm willing to carry 10lbs of food and have basic cookware?
169 2706 277


1it [00:31, 31.78s/it]

I'm taking a solo trip with my dog around Thanksgiving. I understand some National Parks only let you take dogs in specific areas. I have about 10 days and could potentially extend 1-2 days if necessary. Any suggestions or considerations?
 
 Plan:
 Day 1 - drive 10-12 hours (sleep at rest stops)
 Day 2 - drive 10-12 hours and reach Albuquerque
 Day 3 - Monument Valley
 Day 4 - Capital Reef to Bryce, stay in Hatch
 Day 5 - Grand Canyon, stay in Sedona
 Day 6 - Phoenix
 Day 7 - White Sand Dunes and Carlsbad Caverns
 Day 8 - Fort Worth, eat at Goldee's BBQ
 Day 9 - drive 10-12 hours home
591 3070 291


2it [01:02, 31.14s/it]

I'm planning a 3-4 week family road trip from Vancouver to Quebec and then Nova Scotia this summer, aiming to make it an unforgettable cross-Canada adventure for my teenagers before they grow up. We're considering driving through the US via I90 to check out attractions like Yellowstone, and returning through Canada, balancing our stay between camping, hotels (50%), and family friends. We'd appreciate any advice on what to consider and places to explore along this route.
474 2986 390


3it [01:36, 32.28s/it]

What are the main differences between visiting Seoul and Tokyo regarding cost, public transit, weather, food, foreigner friendliness, people, and nightlife based on personal experiences?
186 3000 542


4it [02:12, 33.93s/it]

Seeking destination and activity suggestions for a first solo trip over my birthday weekend. I'm from Kalamazoo, MI, and can travel for 3-4 nights at the end of March. Looking for nature, hiking, possibly beach locations, plus nightlife, museums, good food, and botanical gardens. Willing to fly and rent a car, aiming for straightforward, not too lengthy travel, and budget-friendly options. Any advice?
404 3354 288


5it [02:42, 32.59s/it]

Where's the best place to go "monk mode" for three months to focus on a project? I'm seeking a quiet neighborhood with the option to rent a pleasant living space. Additionally, access to fresh, affordable produce and a form of physical activity (other than hiking) for breaks would be ideal.
291 2810 346


6it [03:14, 32.44s/it]

I'm a 28-year-old male heading to Guangzhou for the first time this December for a three-week stay. Without Mandarin or Cantonese skills, I'm looking for advice on engaging activities and places to explore. My interests include spas, nightlife, good food, video games, anime, and technology. Since post-Covid information seems scarce, could you recommend what to do, see, or learn? I'll be with my significant other and in-laws, seeking to make the most of this trip both together and on my own.
495 3199 485


7it [03:47, 32.45s/it]

Is Ho Chi Minh safe for a girl's solo trip?
43 2508 373


8it [04:20, 32.65s/it]

I'm considering booking a trip through Costco Travel and am curious about others' experiences and costs. What destinations have you visited with Costco packages, how long was your trip, and what was included? Did you find these packages provided good value for money, and would you recommend them for a specific type of traveler?
329 2867 412


9it [04:51, 32.11s/it]

I'm planning a two-month trip to Europe in May and June and am looking for recommendations on cities or regions that are hidden gems, beyond the well-known tourist attractions. Given the overwhelming amount of amazing suggestions, I'm even considering extending my trip by a month. What unique places do you recommend?
318 3120 267


10it [05:24, 32.47s/it]

I want to take a cruise to Alaska in August and want to sail from San Francisco. I'll be traveling with my husband and toddler. There are so many options though, how do I know which itinerary to pick? I guess, what are the typical ports? Pros/cons? Which are must sees?
269 3222 325


11it [05:55, 31.82s/it]

What are the current travel restrictions to Japan?
50 2487 426


12it [06:28, 32.47s/it]

Do you have tips for finding a cheap flight from New York to Paris for next month?
82 2417 244


13it [07:02, 32.70s/it]

What is the baggage allowance for my flight to Brazil?
54 2795 566


14it [07:34, 32.64s/it]

Are there any good hotel deals in Rome for the summer?
54 1955 170


15it [08:10, 33.54s/it]

Do I need a visa to visit Australia for a week?
47 2305 305


16it [08:43, 33.46s/it]

How do I change my flight reservation?
38 2568 308


17it [09:16, 33.25s/it]

What are the top attractions in New York City?
46 2924 288


18it [09:50, 33.65s/it]

Can you recommend a family-friendly resort in Florida?
54 2916 212


19it [10:24, 33.63s/it]

What is the best way to get around in Tokyo?
44 2548 377


20it [10:57, 33.52s/it]

My college roommate is getting married in Israel next month, and I would like to attend her wedding, but Is it safe to visit Israel right now?
142 2281 241


21it [11:31, 33.68s/it]

i go to san jose state. where can i grab some cheap eats in between classes?
76 2613 265


22it [12:04, 33.41s/it]

i'd like to visit stanford this weekend. can you help me plan my trip. how can i get there? i live in san jose. are there any events that i can attend. and also any good food place?
181 2734 503


23it [12:37, 33.25s/it]

i'm planning to visit a friend in berkeley. what would be the most affordable way to get there from my place in san jose
120 2638 243


24it [13:11, 33.37s/it]

are there any good places for me and my friends to hang out in bay area. we are both foodies and asians
103 3053 238


25it [13:42, 32.75s/it]

i'm a berkeley student who just started my french learning this semseter. are there any events nearby that i can attend that's both fun and educational?
152 2721 266


26it [14:14, 32.50s/it]

I am a super movie fan. and i know that san francisco has a movie festival this weekend. help me plan a day so that i can use up my time most efficiently.
154 2522 299


27it [14:47, 32.68s/it]

i live in san jose and i love hiking. any good recommendations for a weekend hike with friends and my dog
105 2687 204


28it [15:24, 34.02s/it]

I'm visiting Oakland in May and heard about a unique event called HellaSecret Comedy & Cocktail Night. Can you tell me how I can attend this event and what to expect?
166 2885 301


29it [15:54, 32.91s/it]

Where can I take my mother for window shopping and to see a local market in San Francisco on Sunday, May 12?
108 2924 239


30it [16:28, 32.97s/it]

My friends are visiting California in June and heard about a Free National Parks Day. Can you tell me when this occurs and if it includes popular parks like Muir Woods and Yosemite?
181 2654 291


31it [17:01, 33.06s/it]

Can I bring my dog to just watch the events at the 17th Annual DogFest without participating in the competitions, and what things would be there to see?


154 2611 323


32it [17:34, 33.19s/it]

Are there any comedy shows in San Francisco where I can take my cousin for a good time on June 19th? 

102 2607 210


33it [18:06, 32.81s/it]

Is there a special event in Golden Gate Park for Mother's Day?
62 2465 192


34it [18:39, 32.64s/it]

As an astronomy enthusiast, where can I get the best experience of the Eta Aquarids meteor shower in 2024 in the Bay Area, and what should I expect?
148 2514 243


35it [19:15, 33.68s/it]


I'm new to exercising and looking to attend events to stay motivated. Are there any beginner-friendly events in the Bay Area coming up?
136 2476 323


36it [19:48, 33.48s/it]

I recently heard about the Yerba Buena Gardens Festival 2024. Can you tell me what this event entails and if there are any free concerts? 
138 2654 256


37it [20:20, 33.00s/it]


In [24]:
raw_evals.drop(['Mistral RAFT RAG CRC Output', 'Mistral RAFT RAG CRC Output Time'],axis=1,inplace=True)

In [20]:
raw_evals.to_csv("inferences_5_2.csv")

In [21]:
raw_evals.columns

Index(['Index', 'Notes', 'Golden Question', 'Golden Answer', 'MPC',
       'Llama QLORA Output', 'Llama QLORA Output Time',
       'pretrained Mistral Output', 'pretrained Mistral Output Time',
       'Mistral QLORA Output', 'Mistral QLORA Output Time',
       'Llama RAFT Output', 'Llama RAFT Output Time', 'Mistral RAFT Output',
       'Mistral RAFT Output Time', 'pretrained Llama Output',
       'pretrained Llama Output Time', 'Llama_RAG_CRC', 'Llama_RAG_CRC_time',
       'mistral_RAG_CRC', 'mistral_RAG_CRC_time', 'mistral_RAG_RR',
       'mistral_RAG_RR_time', 'llama_RAG_RR', 'llama_RAG_RR_time',
       'Llama RAFT RAG CRC Output', 'Llama RAFT RAG CRC Output Time',
       'Mistral RAFT RAG CRC GPT prompt summary Output',
       'Mistral RAFT RAG CRC GPT prompt summary Output Time',
       'Mistral QLORA RAG CRC GPT prompt summary Output',
       'Mistral QLORA RAG CRC GPT prompt summary Output Time',
       'Llama QLORA RAG CRC GPT prompt summary Output',
       'Llama QLORA RAG CRC 

In [49]:
crit = pd.read_csv("criteria_res_27.csv",index_col="Unnamed: 0").reset_index()
crit.head()

Unnamed: 0,index,Question,Context,Llama QLORA Output_conciseness_score,Llama QLORA Output_relevance_score,Llama QLORA Output_coherence_score,Llama QLORA Output_helpfulness_score,pretrained Mistral Output_conciseness_score,pretrained Mistral Output_relevance_score,pretrained Mistral Output_coherence_score,pretrained Mistral Output_helpfulness_score,Mistral QLORA Output_conciseness_score,Mistral QLORA Output_relevance_score,Mistral QLORA Output_coherence_score,Mistral QLORA Output_helpfulness_score,Llama RAFT Output_conciseness_score,Llama RAFT Output_relevance_score,Llama RAFT Output_coherence_score,Llama RAFT Output_helpfulness_score,Mistral RAFT Output_conciseness_score,Mistral RAFT Output_relevance_score,Mistral RAFT Output_coherence_score,Mistral RAFT Output_helpfulness_score,pretrained Llama Output_conciseness_score,pretrained Llama Output_relevance_score,pretrained Llama Output_coherence_score,pretrained Llama Output_helpfulness_score,Llama_RAG_CRC_conciseness_score,Llama_RAG_CRC_relevance_score,Llama_RAG_CRC_coherence_score,Llama_RAG_CRC_helpfulness_score,mistral_RAG_CRC_conciseness_score,mistral_RAG_CRC_relevance_score,mistral_RAG_CRC_coherence_score,mistral_RAG_CRC_helpfulness_score,mistral_RAG_RR_conciseness_score,mistral_RAG_RR_relevance_score,mistral_RAG_RR_coherence_score,mistral_RAG_RR_helpfulness_score,llama_RAG_RR_conciseness_score,llama_RAG_RR_relevance_score,llama_RAG_RR_coherence_score,llama_RAG_RR_helpfulness_score,Llama RAFT RAG CRC Output_conciseness_score,Llama RAFT RAG CRC Output_relevance_score,Llama RAFT RAG CRC Output_coherence_score,Llama RAFT RAG CRC Output_helpfulness_score,Mistral RAFT RAG CRC GPT prompt summary Output_conciseness_score,Mistral RAFT RAG CRC GPT prompt summary Output_relevance_score,Mistral RAFT RAG CRC GPT prompt summary Output_coherence_score,Mistral RAFT RAG CRC GPT prompt summary Output_helpfulness_score,Mistral QLORA RAG CRC GPT prompt summary Output_conciseness_score,Mistral QLORA RAG CRC GPT prompt summary Output_relevance_score,Mistral QLORA RAG CRC GPT prompt summary Output_coherence_score,Mistral QLORA RAG CRC GPT prompt summary Output_helpfulness_score,Llama QLORA RAG CRC GPT prompt summary Output_conciseness_score,Llama QLORA RAG CRC GPT prompt summary Output_relevance_score,Llama QLORA RAG CRC GPT prompt summary Output_coherence_score,Llama QLORA RAG CRC GPT prompt summary Output_helpfulness_score,GPT4_MQR_conciseness_score,GPT4_MQR_relevance_score,GPT4_MQR_coherence_score,GPT4_MQR_helpfulness_score,GPT4_conciseness_score,GPT4_relevance_score,GPT4_coherence_score,GPT4_helpfulness_score
0,0,What are your favorite luxury camp meals for a...,The text promotes joining the Funcheap email l...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1,I'm taking a solo trip with my dog around Than...,The text provides information about DogFest 20...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
2,2,I'm planning a 3-4 week family road trip from ...,The text includes information about two free e...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0
3,3,What are the main differences between visiting...,The text discusses various popular events and ...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
4,4,Seeking destination and activity suggestions f...,The text is a list of different categories of ...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


In [48]:
[c for c in crit.columns if 'score' in c]

['Llama QLORA Output_conciseness_score',
 'Llama QLORA Output_relevance_score',
 'Llama QLORA Output_coherence_score',
 'Llama QLORA Output_helpfulness_score',
 'pretrained Mistral Output_conciseness_score',
 'pretrained Mistral Output_relevance_score',
 'pretrained Mistral Output_coherence_score',
 'pretrained Mistral Output_helpfulness_score',
 'Mistral QLORA Output_conciseness_score',
 'Mistral QLORA Output_relevance_score',
 'Mistral QLORA Output_coherence_score',
 'Mistral QLORA Output_helpfulness_score',
 'Llama RAFT Output_conciseness_score',
 'Llama RAFT Output_relevance_score',
 'Llama RAFT Output_coherence_score',
 'Llama RAFT Output_helpfulness_score',
 'Mistral RAFT Output_conciseness_score',
 'Mistral RAFT Output_relevance_score',
 'Mistral RAFT Output_coherence_score',
 'Mistral RAFT Output_helpfulness_score',
 'pretrained Llama Output_conciseness_score',
 'pretrained Llama Output_relevance_score',
 'pretrained Llama Output_coherence_score',
 'pretrained Llama Output_help

In [142]:
crit_melt = pd.melt(crit, id_vars='index', value_vars=[c for c in crit.columns if 'score' in c]).dropna()
crit_melt['model'] = crit_melt['variable'].apply(lambda x:' '.join(x.replace('_score','').split('_')[:-1]).replace('Output','').replace(' GPT prompt summary ','').strip())
crit_melt['metric'] = crit_melt['variable'].apply(lambda x:x.rsplit('_')[-2])

crit_melt

Unnamed: 0,index,variable,value,model,metric
0,0,Llama QLORA Output_conciseness_score,0.0,Llama QLORA,conciseness
1,1,Llama QLORA Output_conciseness_score,0.0,Llama QLORA,conciseness
2,2,Llama QLORA Output_conciseness_score,0.0,Llama QLORA,conciseness
3,3,Llama QLORA Output_conciseness_score,0.0,Llama QLORA,conciseness
4,4,Llama QLORA Output_conciseness_score,0.0,Llama QLORA,conciseness
...,...,...,...,...,...
2354,23,GPT4_helpfulness_score,1.0,GPT4,helpfulness
2355,24,GPT4_helpfulness_score,1.0,GPT4,helpfulness
2356,25,GPT4_helpfulness_score,1.0,GPT4,helpfulness
2357,26,GPT4_helpfulness_score,1.0,GPT4,helpfulness


In [145]:
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]

In [144]:
crit_melt_df = crit_melt.groupby(['metric','model'])['value'].mean().to_frame('score').unstack()

In [148]:
crit_melt_df.T.style.apply(highlight_max,axis=0)

Unnamed: 0_level_0,metric,coherence,conciseness,helpfulness,relevance
Unnamed: 0_level_1,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
score,GPT4,0.892857,0.714286,1.0,0.25
score,GPT4 MQR,0.964286,1.0,0.357143,0.714286
score,Llama QLORA,0.214286,0.142857,0.464286,0.035714
score,Llama QLORA RAG CRC,0.107143,0.107143,0.464286,0.142857
score,Llama RAFT,0.444444,0.25,0.678571,0.142857
score,Llama RAFT RAG CRC,0.464286,0.035714,0.5,0.142857
score,Llama RAG CRC,0.571429,0.214286,0.892857,0.25
score,Mistral QLORA,0.25,0.214286,0.428571,0.071429
score,Mistral QLORA RAG CRC,0.142857,0.214286,0.357143,0.214286
score,Mistral RAFT,0.714286,0.25,0.928571,0.214286
