# Using IBM watsonx.governance metrics toolkit to evaluate the quality of your Prompt Template

In [None]:
# Install the IBM SDKs used by this notebook. `| tail -n 1` keeps only the last line of pip's output.
!pip install --upgrade ibm-watson-machine-learning   | tail -n 1
# Disabled alternative: latest ibm-watson-openscale / ibm-metrics-plugin from pip's default index.
#!pip install --upgrade ibm-watson-openscale --no-cache | tail -n 1
#!pip install --upgrade ibm-metrics-plugin --no-cache | tail -n 1
# Active path: exact pinned builds fetched from the TestPyPI index (test.pypi.org).
!pip install -i https://test.pypi.org/simple/ ibm-watson-openscale==3.0.34.8  | tail -n 1
!pip install -i https://test.pypi.org/simple/ ibm-metrics-plugin==4.8.1.13  | tail -n 1

In [None]:
# Third-party packages installed for the metric computations (presumably required by ibm-metrics-plugin — TODO confirm).
# Only `datasets` is pinned (2.10.0); the others resolve to whatever release is current at install time.
!pip install --upgrade evaluate --no-cache | tail -n 1
!pip install --upgrade textstat --no-cache | tail -n 1
!pip install --upgrade sacrebleu --no-cache | tail -n 1
!pip install --upgrade sacremoses --no-cache | tail -n 1
!pip install --upgrade datasets==2.10.0 --no-cache | tail -n 1
# Note: torchmetrics is installed without --upgrade, so an already-installed version is left as-is.
!pip install torchmetrics --no-cache | tail -n 1

In [1]:
import warnings
warnings.filterwarnings('ignore')

## Provision services and configure credentials

If you have not already, provision an instance of IBM Watson OpenScale using the [OpenScale link in the Cloud catalog](https://cloud.ibm.com/catalog/services/watson-openscale).

Your Cloud API key can be generated by going to the [**Users** section of the Cloud console](https://cloud.ibm.com/iam#/users). From that page, click your name, scroll down to the **API Keys** section, and click **Create an IBM Cloud API key**. Give your key a name and click **Create**, then copy the created key and paste it below.

**NOTE:** You can also get the OpenScale `API_KEY` using the IBM Cloud CLI.

How to install the IBM Cloud (formerly Bluemix) CLI: [instructions](https://console.bluemix.net/docs/cli/reference/ibmcloud/download_cli.html#install_use)

How to create an API key using the CLI:
```
bx login --sso
bx iam api-key-create 'my_key'
```

In [85]:
import os
from getpass import getpass

# Keep the secret out of the saved notebook: take the key from the CLOUD_API_KEY
# environment variable when it is set, otherwise prompt for it without echoing.
CLOUD_API_KEY = os.environ.get("CLOUD_API_KEY") or getpass("IBM Cloud API key: ")
# IAM token endpoint, value unchanged; no cell shown in this notebook reads it.
IAM_URL="https://iam.ng.bluemix.net/oidc/token"

## IBM watsonx.governance authentication

In [86]:
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator,BearerTokenAuthenticator

from ibm_watson_openscale import *
from ibm_watson_openscale.supporting_classes.enums import *
from ibm_watson_openscale.supporting_classes import *


# Build an IAM authenticator from the API key configured in the credentials cell.
authenticator = IAMAuthenticator(apikey=CLOUD_API_KEY)
# APIClient is not imported by name; presumably it comes from the wildcard ibm_watson_openscale imports.
client = APIClient(authenticator=authenticator)
# Bare last expression: the notebook displays the client's SDK version string.
client.version

'3.0.34.8'

# Common Imports

In [87]:
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMTextMetricGroup
from ibm_metrics_plugin.metrics.llm.utils.constants import  LLMGenerationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMSummarizationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMQAMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMClassificationMetrics
from ibm_metrics_plugin.metrics.llm.utils.constants import HAP_SCORE
from ibm_metrics_plugin.metrics.llm.utils.constants import PII_DETECTION

# Evaluating Summarization output from IBM watsonx.ai Granite Model

## Test data containing the summarization output from model and the reference data

In [88]:
# Delete any previous copy, then download a fresh llm_content.csv into the working directory.
!rm -fr llm_content.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content.csv"

--2024-01-14 04:19:47--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31230 (30K) [text/plain]
Saving to: ‘llm_content.csv’


2024-01-14 04:19:48 (21.1 MB/s) - ‘llm_content.csv’ saved [31230/31230]



In [89]:
import pandas as pd
import numpy as np
# Load the downloaded summarization sample; the frame holds the source text, the generated
# summary and two reference summaries per row (see the preview below).
llm_data_all = pd.read_csv("llm_content.csv")
# Preview the first rows.
llm_data_all.head()

Unnamed: 0,input_text,generated_summary,reference_summary_1,reference_summary_2
0,Scientists have discovered a new species of de...,New bioluminescent fish species found in deep ...,Discovery of deep-sea fish emitting soothing l...,Scientists find new bioluminescent fish specie...
1,An international team of astronomers has ident...,Distant exoplanet\'s water vapor-filled atmosp...,Astronomers identify exoplanet with water vapo...,Discovery of exoplanet with water vapor in its...
2,Researchers have developed a novel nanotechnol...,New nanotechnology-based cancer treatment demo...,Researchers create cancer treatment using nano...,Innovative cancer treatment utilizing nanotech...
3,A new app is aiming to reduce food waste by co...,App connects local restaurants with customers ...,New sustainability-focused app facilitates sal...,Initiative to reduce food waste involves app c...
4,Archaeologists have uncovered an ancient city ...,"Ancient city dating back over 4,000 years disc...",Archaeological find in Iraq reveals ancient ci...,"Discovery of 4,000-year-old ancient city in Ku..."


In [90]:
# Evaluate only the first 10 rows of the sample.
llm_data = llm_data_all.head(10)
llm_data.head()

Unnamed: 0,input_text,generated_summary,reference_summary_1,reference_summary_2
0,Scientists have discovered a new species of de...,New bioluminescent fish species found in deep ...,Discovery of deep-sea fish emitting soothing l...,Scientists find new bioluminescent fish specie...
1,An international team of astronomers has ident...,Distant exoplanet\'s water vapor-filled atmosp...,Astronomers identify exoplanet with water vapo...,Discovery of exoplanet with water vapor in its...
2,Researchers have developed a novel nanotechnol...,New nanotechnology-based cancer treatment demo...,Researchers create cancer treatment using nano...,Innovative cancer treatment utilizing nanotech...
3,A new app is aiming to reduce food waste by co...,App connects local restaurants with customers ...,New sustainability-focused app facilitates sal...,Initiative to reduce food waste involves app c...
4,Archaeologists have uncovered an ancient city ...,"Ancient city dating back over 4,000 years disc...",Archaeological find in Iraq reveals ancient ci...,"Discovery of 4,000-year-old ancient city in Ku..."


In [91]:
# Split the sample into three single-column frames: source text, model output and reference.
# Only reference_summary_1 is used as the reference; reference_summary_2 is not selected.
df_input = llm_data[['input_text']].copy()
df_output = llm_data[['generated_summary']].copy()
df_reference = llm_data[['reference_summary_1']].copy()

## Metrics configuration for evaluation

In [92]:
# Metrics requested for the summarization metric group.
# Each metric maps to an empty dict, i.e. no per-metric settings are supplied.
metric_config = {   
    "configuration": {
        LLMTextMetricGroup.SUMMARIZATION.value: {
            LLMSummarizationMetrics.ROUGE_SCORE.value: {},
            LLMSummarizationMetrics.SARI.value: {},
            LLMSummarizationMetrics.METEOR.value: {},
            LLMSummarizationMetrics.NORMALIZED_RECALL.value: {},
            LLMSummarizationMetrics.NORMALIZED_PRECISION.value: {},
            LLMSummarizationMetrics.NORMALIZED_F1_SCORE.value: {},
            LLMSummarizationMetrics.BLEU.value: {},
            LLMSummarizationMetrics.FLESCH.value: {}
        }
    }
}

In [93]:
import nltk
# Fetch the NLTK 'stopwords' package into the local nltk_data directory (no-op if already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [94]:
def extract_key_words(text):
    """Return the noun tokens of ``text`` as a list of strings.

    The text is parsed with spaCy's ``en_core_web_sm`` pipeline and every token
    whose coarse part-of-speech tag is ``NOUN`` is kept, in document order and
    including repeats.

    Args:
        text: The string to analyse.

    Returns:
        list[str]: The surface text of each noun token.

    Raises:
        ImportError: If spaCy is not installed.
        OSError: If the ``en_core_web_sm`` model is not installed
            (``python -m spacy download en_core_web_sm``).
    """
    # Imported here because no other cell in this notebook imports spaCy;
    # without it the function fails with NameError on a fresh kernel.
    import spacy

    # Loading the pipeline is expensive, so load it once and reuse it across
    # calls instead of reloading it for every text.
    nlp = getattr(extract_key_words, "_nlp", None)
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
        extract_key_words._nlp = nlp

    doc = nlp(text)
    return [token.text for token in doc if token.pos_ == 'NOUN']

In [95]:
def compute_f1_score(reference_keywords, generated_keywords):
    """Score keyword overlap between a reference and a generated keyword list.

    Both inputs are treated as sets of distinct keywords. The overlap was
    already computed on sets, so the denominators must be de-duplicated too;
    otherwise a repeated keyword lowers precision/recall even when every
    distinct keyword matches.

    Args:
        reference_keywords: Iterable of keywords taken from the reference text.
        generated_keywords: Iterable of keywords taken from the generated text.

    Returns:
        tuple: ``(precision, recall, f1_score)``. Each value is 0 when its
        denominator would be zero (empty input or no overlap).
    """
    reference_set = set(reference_keywords)
    generated_set = set(generated_keywords)
    common_keywords = reference_set & generated_set

    # Share of distinct generated keywords that also occur in the reference.
    precision = len(common_keywords) / len(generated_set) if generated_set else 0
    # Share of distinct reference keywords that the generated text recovered.
    recall = len(common_keywords) / len(reference_set) if reference_set else 0
    # Harmonic mean; guarded so that "no overlap" yields 0 instead of dividing by zero.
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score

In [96]:
def compute_averages_f1_score(precisions, recalls, f1_scores):
    """Average per-record precision, recall and F1 into one metric result.

    The function previously reported the minimum of each list although its
    name and result describe averages; it now reports the arithmetic mean,
    rounded to four decimals.

    Args:
        precisions: Per-record precision values.
        recalls: Per-record recall values.
        f1_scores: Per-record F1 values.

    Returns:
        dict: ``{"keyword_inclusions": {"precision": {"metric_value": ...},
        "recall": {...}, "f1_score": {...}}}`` with plain Python floats.
        An empty input list yields 0.0 for that metric.
    """
    import numpy as np

    def _rounded_mean(values):
        # The mean of an empty list is NaN (with a runtime warning); report 0.0 instead.
        if len(values) == 0:
            return 0.0
        # float(...) turns the numpy scalar into a built-in float for JSON output.
        return round(float(np.mean(values)), 4)

    keyword_inclusions = {
        "keyword_inclusions" : {
            "precision": {
                "metric_value": _rounded_mean(precisions)
            },
            "recall": {
                "metric_value": _rounded_mean(recalls)
            },
            "f1_score": {
                "metric_value": _rounded_mean(f1_scores)
            }
        }
    }
    return keyword_inclusions

In [97]:
def custom_metric_1(df_input, df_output, df_reference):
    """Custom evaluator: keyword overlap between each source text and its summary.

    For every row, the keywords of ``df_input['input_text']`` and of
    ``df_output['generated_summary']`` are extracted with ``extract_key_words``,
    printed, and scored with ``compute_f1_score``. The per-row scores are then
    combined by ``compute_averages_f1_score``.

    Args:
        df_input: Frame with an ``input_text`` column.
        df_output: Frame with a ``generated_summary`` column.
        df_reference: Accepted as the third argument but not read here.

    Returns:
        The dict produced by ``compute_averages_f1_score``.
    """
    row_scores = []
    source_texts = df_input['input_text']
    summaries = df_output['generated_summary']

    for source_text, summary in zip(source_texts, summaries):
        source_keywords = extract_key_words(source_text)
        print(f'Input Text Keywords: {source_keywords}')

        summary_keywords = extract_key_words(summary)
        print(f'Generated Summary Keywords: {summary_keywords}')

        # The source text plays the "reference" role for the overlap score.
        row_scores.append(compute_f1_score(source_keywords, summary_keywords))

        print('\n')

    precisions = [score[0] for score in row_scores]
    recalls = [score[1] for score in row_scores]
    f1_scores = [score[2] for score in row_scores]
    return compute_averages_f1_score(precisions, recalls, f1_scores)
    

## Summarization Metrics Evaluation

In [98]:
import json
# Run the configured summarization metrics over the source / prediction / reference frames.
# custom_metric_1 is passed in as an additional custom evaluator.
result = client.llm_metrics.compute_metrics(metric_config, 
                                            sources = df_input, 
                                            predictions = df_output, 
                                            references = df_reference, 
                                            custom_evaluators = [custom_metric_1])

[nltk_data] Downloading package wordnet to /home/hadoop/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/hadoop/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/hadoop/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Input Text Keywords: ['Scientists', 'species', 'sea', 'fish', 'light', 'fish', 'advancements', 'light', 'exploration']
Generated Summary Keywords: ['fish', 'species', 'sea', 'exploration']


Input Text Keywords: ['team', 'astronomers', 'exoplanet', 'atmosphere', 'water', 'vapor', 'discovery', 'step', 'habitability', 'system']
Generated Summary Keywords: ['water', 'vapor', 'atmosphere', 'scientists', 'habitability']


Input Text Keywords: ['Researchers', 'nanotechnology', 'cancer', 'treatment', 'tumor', 'cells', 'precision', 'innovation', 'promise', 'cancer', 'therapies']
Generated Summary Keywords: ['nanotechnology', 'cancer', 'treatment', 'precision', 'tumor', 'cells']


Input Text Keywords: ['app', 'food', 'waste', 'restaurants', 'customers', 'food', 'prices', 'initiative', 'role', 'sustainability']
Generated Summary Keywords: ['restaurants', 'customers', 'surplus', 'food', 'reduction', 'food', 'waste']


Input Text Keywords: ['Archaeologists', 'city', 'years', 'region', 'city\\', 'r

## Evaluated Metrics

In [99]:
# Pretty-print the metrics result as indented JSON.
print(json.dumps(result,indent=2))

{
  "flesch": {
    "flesch_reading_ease": {
      "metric_value": 20.854000000000003,
      "mean": 20.854000000000003,
      "min": -50.69,
      "max": 67.76,
      "std": 32.72954787344304
    },
    "flesch_kincaid_grade": {
      "metric_value": 13.64,
      "mean": 13.64,
      "min": 6.8,
      "max": 23.3,
      "std": 4.3260143319226305
    }
  },
  "bleu": {
    "precisions": [
      0.35947712418300654,
      0.11188811188811189,
      0.022556390977443608,
      0.0
    ],
    "brevity_penalty": 0.8548213791906977,
    "length_ratio": 0.864406779661017,
    "translation_length": 153,
    "reference_length": 177,
    "metric_value": 0.0
  },
  "meteor": {
    "metric_value": 0.2511393214238763
  },
  "normalized_f1": {
    "metric_value": 0.26192560128044,
    "mean": 0.26192560128044,
    "min": 0.06666666666666667,
    "max": 0.39999999999999997,
    "std": 0.104247533868887
  },
  "normalized_precision": {
    "metric_value": 0.2845611333111333,
    "mean": 0.28456113331

# Evaluating Content Generation output from the Foundation Model

## Test data containing the content generation output from model and the reference data

In [100]:
# Delete any previous copy, then download a fresh llm_content_generation.csv into the working directory.
!rm -fr llm_content_generation.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_generation.csv"

--2024-01-14 04:21:46--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_generation.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11794 (12K) [text/plain]
Saving to: ‘llm_content_generation.csv’


2024-01-14 04:21:47 (67.2 MB/s) - ‘llm_content_generation.csv’ saved [11794/11794]



In [101]:
# Load the content-generation sample (question, generated text, reference text).
# Note: `data` is reassigned by the later QA and classification sections.
data = pd.read_csv("llm_content_generation.csv")
data.head()

Unnamed: 0,question,generated_text,reference_text
0,What are the benefits of regular exercise?,"Regular exercise has numerous benefits, includ...","Regular exercise has numerous benefits, includ..."
1,What is the process of photosynthesis?,Photosynthesis is the process by which plants ...,Photosynthesis is the process by which plants ...
2,What are the key features of a smartphone?,A smartphone is a mobile device that typically...,A smartphone is a mobile device that typically...
3,How does the immune system work?,The immune system is a complex network of cell...,The immune system is a complex network of cell...
4,What is the capital of France?,"The capital of France is Paris, which is known...","The capital of France is Paris, which is known..."


In [102]:
# Rebind the three evaluation frames for the generation task: prompt, model output, reference.
df_input = data[['question']].copy()
df_output = data[['generated_text']].copy()
df_reference = data[['reference_text']].copy()

## Metrics configuration for evaluation

In [103]:
# Metrics requested for the content-generation metric group.
metric_config = {   
    # Parameters common to all metrics go here.
    "configuration": {        
        LLMTextMetricGroup.GENERATION.value: { # metric group   
            # Each metric maps to an empty dict, i.e. no per-metric settings are supplied.
            LLMGenerationMetrics.BLEU.value: {},
            LLMGenerationMetrics.ROUGE_SCORE.value: {},
            LLMGenerationMetrics.FLESCH.value: {},
            LLMGenerationMetrics.METEOR.value: {},            
            LLMGenerationMetrics.NORMALIZED_RECALL.value: {},
            LLMGenerationMetrics.NORMALIZED_PRECISION.value: {},
            LLMGenerationMetrics.NORMALIZED_F1_SCORE.value: {}            
        }    
    }
}

## Content Generation Metrics Evaluation

In [104]:
# Score the generated text against its references using the generation metric group.
result = client.llm_metrics.compute_metrics(
    metric_config,
    sources=df_input,
    predictions=df_output,
    references=df_reference,
)

[nltk_data] Downloading package wordnet to /home/hadoop/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/hadoop/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/hadoop/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Evaluated Metrics

In [105]:
# Pretty-print the content-generation metrics as indented JSON.
print(json.dumps(result,indent=2))

{
  "flesch": {
    "flesch_reading_ease": {
      "metric_value": 39.10217391304347,
      "mean": 39.10217391304347,
      "min": -11.44,
      "max": 69.62,
      "std": 20.153544505710833
    },
    "flesch_kincaid_grade": {
      "metric_value": 12.673913043478263,
      "mean": 12.673913043478263,
      "min": 8.0,
      "max": 18.6,
      "std": 3.2043743730833554
    }
  },
  "bleu": {
    "precisions": [
      1.0,
      0.9949174078780177,
      0.9947643979057592,
      0.9946018893387314
    ],
    "brevity_penalty": 0.7138823993242189,
    "length_ratio": 0.7479224376731302,
    "translation_length": 810,
    "reference_length": 1083,
    "metric_value": 0.711075655695426
  },
  "meteor": {
    "metric_value": 0.738039475927929
  },
  "normalized_f1": {
    "metric_value": 0.8428460742183366,
    "mean": 0.8428460742183366,
    "min": 0.6666666666666666,
    "max": 0.9722222222222222,
    "std": 0.06436358907082648
  },
  "normalized_precision": {
    "metric_value": 1.0,


# Evaluating Question and Answering output from the Foundation Model

## Test data containing the question and answer output from model and the reference data

In [106]:
# Delete any previous copy, then download a fresh llm_content_qa.csv into the working directory.
!rm -fr llm_content_qa.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_qa.csv"

--2024-01-14 04:21:50--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_qa.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3109 (3.0K) [text/plain]
Saving to: ‘llm_content_qa.csv’


2024-01-14 04:21:50 (28.7 MB/s) - ‘llm_content_qa.csv’ saved [3109/3109]



In [107]:
# Load the question-answering sample; it has a `question` and an `answers` column.
data = pd.read_csv("llm_content_qa.csv")
data.head()

Unnamed: 0,question,answers
0,who did chris carter play for last year,Milwaukee Brewers
1,what is the latest version of safari on mac,Safari 11
2,when did bucharest become the capital of romania,1862
3,who did jeffrey dean morgan play on supernatural,John Eric Winchester
4,who is the shortest man that ever lived,Chandra Bahadur Dangi


In [108]:
# Rebind the three evaluation frames for the QA task.
df_input = data[['question']].copy()
# NOTE(review): predictions and references are both copies of the same `answers` column,
# so any match-based score computed from them is trivially perfect. Use a separate
# model-output column for a meaningful evaluation.
df_output = data[['answers']].copy()
df_reference = data[['answers']].copy()

## Metrics configuration for evaluation

In [109]:
# Metrics requested for the question-answering metric group.
metric_config = {   
    # Parameters common to all metrics go here.
    "configuration": {        
        LLMTextMetricGroup.QA.value: { # metric group   
            # Each metric maps to an empty dict, i.e. no per-metric settings are supplied.
            LLMQAMetrics.EXACT_MATCH.value: {},
            LLMQAMetrics.ROUGE_SCORE.value: {},
            LLMQAMetrics.BLEU.value: {}          
        }    
    }
}

## Question and Answering Metrics Evaluation

In [110]:
# Score the answers against their references using the QA metric group.
result = client.llm_metrics.compute_metrics(
    metric_config,
    sources=df_input,
    predictions=df_output,
    references=df_reference,
)

Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

## Evaluated Metrics

In [111]:
# Pretty-print the question-answering metrics as indented JSON.
print(json.dumps(result,indent=2))

{
  "bleu": {
    "precisions": [
      1.0,
      1.0,
      1.0,
      1.0
    ],
    "brevity_penalty": 1.0,
    "length_ratio": 1.0,
    "translation_length": 133,
    "reference_length": 133,
    "metric_value": 1.0
  },
  "exact_match": {
    "metric_value": 1.0
  },
  "rouge_score": {
    "rouge1": {
      "metric_value": 1.0
    },
    "rouge2": {
      "metric_value": 0.74
    },
    "rougeL": {
      "metric_value": 1.0
    },
    "rougeLsum": {
      "metric_value": 1.0
    }
  }
}


# Evaluating Text Classification output from the Foundation Model

## Test data containing the text classification output from model and the reference data

In [112]:
# Delete any previous copy, then download a fresh llm_content_classification.csv into the working directory.
!rm -fr llm_content_classification.csv
!wget "https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_classification.csv"

--2024-01-14 04:21:54--  https://raw.githubusercontent.com/ravichamarthy/custom_metrics/main/llm_content_classification.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 480803 (470K) [text/plain]
Saving to: ‘llm_content_classification.csv’


2024-01-14 04:21:54 (29.8 MB/s) - ‘llm_content_classification.csv’ saved [480803/480803]



In [113]:
# Load the text-classification sample; it has a `label` (ham/spam) and a `text` column.
data = pd.read_csv("llm_content_classification.csv")
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [114]:
# Encode the string labels as integers in place: ham -> 0, spam -> 1.
data['label'] = data['label'].replace({'ham': 0, 'spam': 1})

In [115]:
# Rebind the three evaluation frames for the classification task.
df_input = data[['text']].copy()
# Predictions and references both start as independent copies of the same `label` column.
df_output = data[['label']].copy()
df_reference = data[['label']].copy()

## Shuffle the reference labels so that predictions and references differ

In [116]:
# Predictions and references start out identical, so the reference labels are
# shuffled to give the classification metrics something to measure.
# A fixed seed makes the shuffle, and therefore the reported metrics, reproducible.
SHUFFLE_SEED = 42
# Shuffle from the untouched `data` column rather than from df_reference, so
# re-running this cell yields the same reference column instead of re-shuffling it.
shuffled_column = (
    data['label']
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)  # realign to the 0..n-1 index so assignment is positional
)
df_reference['label'] = shuffled_column

## Metrics configuration for evaluation

In [117]:
# Metrics requested for the text-classification metric group.
metric_config = {   
    # Parameters common to all metrics go here.
    "configuration": {        
        LLMTextMetricGroup.CLASSIFICATION.value: { # metric group   
            # Each metric maps to an empty dict, i.e. no per-metric settings are supplied.
            LLMClassificationMetrics.ACCURACY.value: {},
            LLMClassificationMetrics.PRECISION.value: {},
            LLMClassificationMetrics.RECALL.value: {},
            LLMClassificationMetrics.F1_SCORE.value: {},
            LLMClassificationMetrics.MATTHEWS_CORRELATION.value: {},            
        }    
    }
}

## Text Classification Metrics Evaluation

In [118]:
# Score the predicted labels against the reference labels using the classification metric group.
result = client.llm_metrics.compute_metrics(
    metric_config,
    sources=df_input,
    predictions=df_output,
    references=df_reference,
)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.60k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

## Evaluated Metrics

In [119]:
# Pretty-print the text-classification metrics as indented JSON.
print(json.dumps(result,indent=2))

{
  "accuracy": {
    "accuracy": 0.7674084709260589
  },
  "f1": {
    "f1": 0.13253012048192772
  },
  "matthews_correlation": {
    "matthews_correlation": -0.001770397652787315
  },
  "precision": {
    "precision": 0.13253012048192772
  },
  "recall": {
    "recall": 0.13253012048192772
  }
}


Author: ravi.chamarthy@in.ibm.com