In [2]:
# Build and install llama-cpp-python with the cuBLAS backend enabled (GPU support).
# CMAKE_ARGS and FORCE_CMAKE are set as environment variables for this one pip invocation only.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.2.28  --force-reinstall --upgrade --no-cache-dir -q
# huggingface_hub provides hf_hub_download, used later to fetch the model file from the HF Hub
!pip install huggingface_hub==0.23.2 -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/9.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/9.4 MB[0m [31m14.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m5.2/9.4 MB[0m [31m78.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m243.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m245.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━

In [40]:
# Install datasets, evaluate, rouge_score and bert_score (uncomment the line below on a fresh runtime)
#!pip install -q datasets==2.16.1 evaluate==0.4.1 rouge_score==0.1.2 bert_score==0.3.12

In [4]:
import os
import google.generativeai as genai
from google.colab import userdata
import json
import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import hf_hub_download
import evaluate

# Read the Gemini API key from Colab's `userdata` secret store instead of hard-coding it in the notebook
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))

## Loading Dataset

In [5]:
# Load the "knkarthick/dialogsum" dataset (train / validation / test splits)
dataset = load_dataset("knkarthick/dialogsum")

Downloading readme:   0%|          | 0.00/4.65k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
# Convert the training split to a pandas DataFrame
data = dataset['train'].to_pandas()

In [7]:
# Preview the first rows (columns: id, dialogue, summary, topic)
data.head()

Unnamed: 0,id,dialogue,summary,topic
0,train_0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...",get a check-up
1,train_1,"#Person1#: Hello Mrs. Parker, how have you bee...",Mrs Parker takes Ricky for his vaccines. Dr. P...,vaccines
2,train_2,"#Person1#: Excuse me, did you see a set of key...",#Person1#'s looking for a set of keys and asks...,find keys
3,train_3,#Person1#: Why didn't you tell me you had a gi...,#Person1#'s angry because #Person2# didn't tel...,have a girlfriend
4,train_4,"#Person1#: Watsup, ladies! Y'll looking'fine t...",Malik invites Nikki to dance. Nikki agrees if ...,dance


In [8]:
# Use the dialogue in row 0 of the training data as the text to summarize
user_input = data.loc[0, 'dialogue']

## Call Gemini for summarization

In [9]:
def fetchResponseFromLLM(question, prompt, model_name='gemini-pro'):
    """Send an instruction and a user input to a Gemini model and return its text reply.

    Args:
        question: The user-supplied content (in this notebook, the dialogue to summarize).
        prompt: The instruction text; it is placed ahead of ``question`` in the request.
        model_name: Name of the Gemini model to call. Defaults to 'gemini-pro',
            the model this function previously hard-coded, so existing calls are unchanged.

    Returns:
        The ``text`` attribute of the generated response.
    """
    model = genai.GenerativeModel(model_name)
    # Instruction first, content second: the order in which the parts are sent to the model.
    response = model.generate_content([prompt, question])
    return response.text

In [10]:
# Instruction sent to Gemini ahead of the dialogue
prompt = """
Summarize the dialogue mentioned in the user input. Be specific and concise in your summary.
"""
# The user message is the dialogue stored in `user_input`
user_message = user_input
# Ask Gemini to summarize the dialogue; the returned text is the candidate summary evaluated below
model_prediction=fetchResponseFromLLM(user_message,prompt)

In [11]:
# Load the BLEU metric through the `evaluate` library
bleu_scorer = evaluate.load('bleu')

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [12]:
# Reference summary for the same row (row 0) as the dialogue being summarized
ground_truth = data.loc[0, 'summary']
ground_truth

"Mr. Smith's getting a check-up, and Doctor Hawkins advises him to have one every year. Hawkins'll give some information about their classes and medications to help Mr. Smith quit smoking."

## BLEU Score

In [13]:
# BLEU for the single Gemini summary against its single reference summary
bleu_scorer.compute(references=[ground_truth], predictions=[model_prediction])

{'bleu': 0.11361141853169658,
 'precisions': [0.24,
  0.12162162162162163,
  0.0821917808219178,
  0.06944444444444445],
 'brevity_penalty': 1.0,
 'length_ratio': 2.142857142857143,
 'translation_length': 75,
 'reference_length': 35}

## ROUGE-L Score

In [14]:
# Load the ROUGE metric through the `evaluate` library
rouge_scorer = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [15]:
# ROUGE (rouge1 / rouge2 / rougeL / rougeLsum) for the Gemini summary against the reference
rouge_scorer.compute(references=[ground_truth], predictions=[model_prediction])

{'rouge1': 0.326530612244898,
 'rouge2': 0.14583333333333334,
 'rougeL': 0.24489795918367346,
 'rougeLsum': 0.24489795918367346}

## BERT Score

In [16]:
# Load the BERTScore metric through the `evaluate` library
bert_scorer = evaluate.load("bertscore")

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [17]:
# Compute BERTScore for the Gemini summary against the reference summary.
# `lang` and `rescale_with_baseline` are passed through to the metric; the result's
# 'f1' entry is a list with one value per prediction.
bert_score = bert_scorer.compute(
    predictions = [model_prediction],
    references=[ground_truth],
    lang="en",
    rescale_with_baseline=True
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Mean BERTScore F1 over all predictions (only one prediction here, so this equals that single value)
sum(bert_score['f1'])/len(bert_score['f1'])

0.4298917055130005

## Using Mistral Model as a Judge

In [21]:
# Hugging Face Hub repository that hosts the GGUF builds of Mistral-7B-Instruct-v0.2
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
# File inside that repository to download (the Q5_K_M variant)
model_basename = "mistral-7b-instruct-v0.2.Q5_K_M.gguf"

In [22]:
# Download the GGUF file from the Hub and keep the local path that hf_hub_download returns
model_path = hf_hub_download(filename=model_basename, repo_id=model_name_or_path)

mistral-7b-instruct-v0.2.Q5_K_M.gguf:   0%|          | 0.00/5.13G [00:00<?, ?B/s]

In [23]:
from llama_cpp import Llama

# Load the downloaded GGUF model with llama.cpp; this instance is used below both as the
# judge and as a summarizer.
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=2, # CPU cores
    n_batch=512, # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    n_gpu_layers=43, # Change this value based on your model and your GPU VRAM pool.
    n_ctx=4096 # Context window
)

AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


In [24]:
# Single-turn instruction wrapper for the judge call: the whole prompt goes between [INST] and [/INST]
mistral_template = "<s>[INST]{prompt}[/INST]"

In [25]:
# Instructions for the LLM-as-judge call: rate a summary from 1 to 5 and output only the rating.
# NOTE(review): the prompt text contains a few typos ("rate on", "dialgoue", "Do not to explain");
# it is left byte-identical here because editing it changes what the judge model receives.
mistral_system_message = """
Your task is to rate on AI-generated summaries of dialogues.
You will be presented a dialogue and an AI generated summary of the dialgoue as the input.
In the input, the dialogue will begin with '''Dialogue and the AI generated summary will begin with '''Summary.

Instructions:
1. Observe carefully the dialogue and its AI-generated summary presented in the input
2. Rate the quality of the summary generated by the AI system on a scale of 1 to 5, 1 being low quality and 5 being high quality.
The AI system was asked to generate a concise summary reflecting the most important points mentioned in the dialogue.
In your rating, focus your assessment on whether the summary was an accurate, concise reflection of key points in the dialogue.

Do not to explain your rating, the output should only be the rating between 1 to 5.
"""

In [26]:
# Input layout for the judge: the two sections use the '''Dialogue and '''Summary markers
# that the judge instructions refer to.
user_message_template = """
'''Dialogue
{dialogue}

'''Summary
{summary}
"""

In [27]:
# Assemble the judge prompt: instructions followed by the filled-in input sections.
# NOTE(review): the {dialogue} slot is filled with `ground_truth` (the reference summary),
# not with the original dialogue (`user_input`), although the instructions tell the judge it
# is reading a dialogue. Confirm this is intended; if the judge should see the source
# dialogue, pass `dialogue=user_input` instead.
# NOTE: this rebinds `prompt`, which earlier held the Gemini summarization instruction.
prompt = mistral_template.format(
    prompt=mistral_system_message + user_message_template.format(
            dialogue=ground_truth,
            summary=model_prediction
        )
)

In [28]:
# Run the judge. The instructions ask for a bare 1-5 rating, so generation is capped at
# 3 tokens; temperature is set to 0 for this call.
response = lcpp_llm(
    prompt=prompt,
    max_tokens=3,
    temperature=0,
    top_p=0.95,
    repeat_penalty=1.2,
    echo=False # do not return the prompt
)

**Hence, the Mistral LLM rated the AI-generated summary by Gemini, compared against the ground truth, as 5 out of 5.**

In [29]:
# Take the generated text of the first choice, then drop surrounding whitespace and any '.'
# characters so only the rating text remains (kept as a string)
prediction = response["choices"][0]["text"].strip().replace(".", "")
prediction

'5'

### Let's use Mistral LLM to generate the summary of a dialog

In [30]:
# Summarization instruction for Mistral; it tells the model the dialogue is wrapped in triple backticks
system_message = """

Summarize the dialogue mentioned in the user input below. Be specific and concise in your summary.
The dialogue will be delimited by triple backticks, that is, ```.
"""

In [31]:
# Single-turn Mistral-Instruct prompt for summarization. The prompt must end at [/INST] so
# that the model generates the answer next; the end-of-sequence token </s> marks the end of a
# finished assistant reply and must not be placed before the reply has been generated.
# NOTE: this rebinds `mistral_template`, which earlier held the judge template with a
# single {prompt} field.
mistral_template = """<s>[INST]{system_message}```{dialog}```[/INST]"""

In [32]:
# Fill the summarization template with the instruction and the dialogue to summarize
mistral_prompt = mistral_template.format(
    system_message=system_message,
    dialog=user_input
)

# Start from None so that a failed generation cannot leave `mistral_prediction` undefined
# (NameError in later cells) or silently holding the value from an earlier run of this cell.
mistral_prediction = None
try:
    response = lcpp_llm(
        prompt=mistral_prompt,
        max_tokens=140,
        temperature=0,
        top_p=0.95,
        repeat_penalty=1.2,
        echo=False  # do not return the prompt
    )

    mistral_prediction = response["choices"][0]["text"]

except Exception as e:
    print(e) # Log error and continue; mistral_prediction stays None

Llama.generate: prefix-match hit


In [41]:
#!pip install mlflow

In [42]:
#!pip install openai
#!pip install tiktoken
#!pip install dagshub

In [36]:
import mlflow
import openai
import os
import pandas as pd
import dagshub



In [37]:
# Point MLflow at the DagsHub-hosted tracking server for this repository
dagshub.init(repo_owner='sarup.etceju', repo_name='ML_Flow', mlflow=True)

mlflow.set_tracking_uri("https://dagshub.com/sarup.etceju/ML_Flow.mlflow")
mlflow.set_experiment("LLM Evaluation")

# The genai judge metrics used below (answer_similarity / answer_relevance) call an OpenAI
# model by default and read the key from the OPENAI_API_KEY environment variable. No earlier
# cell sets it, so populate it from the Colab secret store when it is missing; otherwise this
# cell only works if the variable happens to be left over from a previous session.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

# One-row evaluation set: the dialogue, its reference summary, and Gemini's summary
eval_data = pd.DataFrame(
    {
        "inputs": [
            user_message
        ],
        "ground_truth": [
            ground_truth,
        ],
        "predictions": [
            model_prediction,
        ],
    }
)

with mlflow.start_run() as run:
    # Evaluate the already-generated predictions; no model is invoked here other than the
    # ones used internally by the metrics.
    results = mlflow.evaluate(
        data=eval_data,
        targets="ground_truth",
        predictions="predictions",
        extra_metrics=[mlflow.metrics.genai.answer_similarity(), mlflow.metrics.genai.answer_relevance(),mlflow.metrics.toxicity()],
        evaluators="default",
    )
    print(f"See aggregated evaluation results below: \n{results.metrics}")

    # Per-row scores and justifications; also saved locally as CSV
    eval_table = results.tables["eval_results_table"]
    print(f"See evaluation table below: \n{eval_table}")
    eval_df=pd.DataFrame(eval_table)
    eval_df.to_csv('eval_local.csv')

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=344d878f-7e43-449e-9ab7-1f95fd11cf59&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=dec587a191da333354f02f8240f655eec1d65b703875d817a560d23428e524ef




2024/09/15 01:50:49 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/6.08k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

See aggregated evaluation results below: 
{'answer_similarity/v1/mean': 4.0, 'answer_similarity/v1/variance': 0.0, 'answer_similarity/v1/p90': 4.0, 'answer_relevance/v1/mean': 5.0, 'answer_relevance/v1/variance': 0.0, 'answer_relevance/v1/p90': 5.0, 'toxicity/v1/mean': 0.0002993065572809428, 'toxicity/v1/variance': 0.0, 'toxicity/v1/p90': 0.0002993065572809428, 'toxicity/v1/ratio': 0.0}


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

2024/09/15 01:51:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run traveling-whale-99 at: https://dagshub.com/sarup.etceju/ML_Flow.mlflow/#/experiments/0/runs/cb5e24b57aba4e1aa7d313e8ff545c0d.
2024/09/15 01:51:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/sarup.etceju/ML_Flow.mlflow/#/experiments/0.


See evaluation table below: 
                                              inputs  \
0  #Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...   

                                        ground_truth  \
0  Mr. Smith's getting a check-up, and Doctor Haw...   

                                         predictions  \
0  During a medical checkup, Dr. Hawkins encourag...   

   answer_similarity/v1/score  \
0                           4   

                  answer_similarity/v1/justification  \
0  The provided output aligns closely with the ta...   

   answer_relevance/v1/score  \
0                          5   

                   answer_relevance/v1/justification  toxicity/v1/score  
0  The output accurately summarizes the conversat...           0.000299  


In [38]:
# Show the per-row evaluation table (metric scores and judge justifications)
eval_df.head()

Unnamed: 0,inputs,ground_truth,predictions,answer_similarity/v1/score,answer_similarity/v1/justification,answer_relevance/v1/score,answer_relevance/v1/justification,toxicity/v1/score
0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...","During a medical checkup, Dr. Hawkins encourag...",4,The provided output aligns closely with the ta...,5,The output accurately summarizes the conversat...,0.000299


### Use MLflow to call the LLM (OpenAI) for summarization, then use MLflow evaluation metrics and upload the results to DagsHub for data analysis

In [None]:
# Now set the API key for OpenAI client.
# MLflow's OpenAI model flavor and its genai judge metrics read the key from the
# OPENAI_API_KEY environment variable rather than from `openai.api_key`, so set both.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [39]:

# Evaluation set for the GPT-4 run: one dialogue and its reference summary.
# No "predictions" column this time: the logged model produces the outputs.
eval_data = pd.DataFrame({"inputs": [user_message], "ground_truth": [ground_truth]})

mlflow.set_experiment("LLM Inference Evaluation")
with mlflow.start_run() as run:
    system_prompt = "Summarize the dialogue mentioned in the user input. Be specific and concise in your summary."
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "{question}"},
    ]

    # Log "gpt-4" as an MLflow OpenAI chat-completions model with the summarization messages.
    logged_model_info = mlflow.openai.log_model(
        model="gpt-4",
        task=openai.chat.completions,
        artifact_path="model",
        messages=chat_messages,
    )

    # Evaluate the logged model as a text-summarization model, adding the extra metrics
    # listed here on top of the defaults for that model type.
    summarization_metrics = [
        mlflow.metrics.toxicity(),
        mlflow.metrics.latency(),
        mlflow.metrics.genai.answer_similarity(),
        mlflow.metrics.ari_grade_level(),
        mlflow.metrics.genai.answer_relevance(),
    ]
    results = mlflow.evaluate(
        logged_model_info.model_uri,
        eval_data,
        targets="ground_truth",
        model_type="text-summarization",
        extra_metrics=summarization_metrics,
    )
    print(f"See aggregated evaluation results below: \n{results.metrics}")

    # Per-record results live in `results.tables`; keep a CSV copy as well.
    eval_table = results.tables["eval_results_table"]
    df = pd.DataFrame(eval_table)
    df.to_csv('eval.csv')
    print(f"See evaluation table below: \n{eval_table}")

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2024/09/15 01:53:47 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/09/15 01:53:50 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...


  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

See aggregated evaluation results below: 
{'latency/mean': 3.141493558883667, 'latency/variance': 0.0, 'latency/p90': 3.141493558883667, 'toxicity/v1/mean': 0.0005440807435661554, 'toxicity/v1/variance': 0.0, 'toxicity/v1/p90': 0.0005440807435661554, 'toxicity/v1/ratio': 0.0, 'rouge1/v1/mean': 0.37837837837837834, 'rouge1/v1/variance': 0.0, 'rouge1/v1/p90': 0.37837837837837834, 'rouge2/v1/mean': 0.1834862385321101, 'rouge2/v1/variance': 0.0, 'rouge2/v1/p90': 0.1834862385321101, 'rougeL/v1/mean': 0.2882882882882883, 'rougeL/v1/variance': 0.0, 'rougeL/v1/p90': 0.2882882882882883, 'rougeLsum/v1/mean': 0.2882882882882883, 'rougeLsum/v1/variance': 0.0, 'rougeLsum/v1/p90': 0.2882882882882883, 'answer_similarity/v1/mean': 5.0, 'answer_similarity/v1/variance': 0.0, 'answer_similarity/v1/p90': 5.0, 'answer_relevance/v1/mean': 5.0, 'answer_relevance/v1/variance': 0.0, 'answer_relevance/v1/p90': 5.0}


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

2024/09/15 01:54:10 INFO mlflow.tracking._tracking_service.client: 🏃 View run placid-robin-2 at: https://dagshub.com/sarup.etceju/ML_Flow.mlflow/#/experiments/1/runs/6c00693826904f12b58072e27faae2c0.
2024/09/15 01:54:10 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/sarup.etceju/ML_Flow.mlflow/#/experiments/1.


See evaluation table below: 
                                              inputs  \
0  #Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...   

                                        ground_truth  \
0  Mr. Smith's getting a check-up, and Doctor Haw...   

                                             outputs   latency  token_count  \
0  Mr. Smith, who has not had a check-up in five ...  3.141494           95   

   toxicity/v1/score  rouge1/v1/score  rouge2/v1/score  rougeL/v1/score  \
0           0.000544         0.378378         0.183486         0.288288   

   rougeLsum/v1/score  answer_similarity/v1/score  \
0            0.288288                           5   

                  answer_similarity/v1/justification  \
0  The provided output closely aligns with the pr...   

   answer_relevance/v1/score  \
0                          5   

                   answer_relevance/v1/justification  
0  The output accurately summarizes the conversat...  


In [43]:
# Show the per-row evaluation table for the GPT-4 run (outputs, latency, ROUGE and judge scores)
df.head()

Unnamed: 0,inputs,ground_truth,outputs,latency,token_count,toxicity/v1/score,rouge1/v1/score,rouge2/v1/score,rougeL/v1/score,rougeLsum/v1/score,answer_similarity/v1/score,answer_similarity/v1/justification,answer_relevance/v1/score,answer_relevance/v1/justification
0,"#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. ...","Mr. Smith's getting a check-up, and Doctor Haw...","Mr. Smith, who has not had a check-up in five ...",3.141494,95,0.000544,0.378378,0.183486,0.288288,0.288288,5,The provided output closely aligns with the pr...,5,The output accurately summarizes the conversat...
