# For a LangChain-powered multi-chain application, demonstrate how to evaluate LLM quality metrics using the IBM watsonx.governance monitoring toolkit.

In [None]:
!pip install ibm-watson-machine-learning langchain-openai "pydantic>=1.10.0" langchain "sqlalchemy==1.4.47" | tail -n 1

In [1]:
import warnings
# Install a global 'ignore' filter: every Python warning raised after this cell is
# suppressed (presumably to keep the cell outputs uncluttered).
warnings.filterwarnings('ignore')

In [2]:
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.chains import SimpleSequentialChain
from langchain.chains import SequentialChain

# LLM 1 - using Azure OpenAI GPT Model

## Summarization prompt - summarize the mobile issue

In [3]:
# Few-shot summarization prompt: three worked Issue/Summary pairs followed by the
# `{content}` placeholder that is filled with the issue to summarize.
prompt_1_template_text = """
Please provide a summary of the following text with maximum of 20 words. Do not include the Issue in the output.

Issue: Some apps continue to run processes in the background even when not actively in use, leading to increased battery drain. Users can identify and restrict background activity for specific apps in their phone settings.
Summary: Certain apps running in the background excessively consume battery power.

Issue: Devices with insufficient RAM struggle to handle multiple apps simultaneously, leading to slow performance. Users can close background apps and consider upgrading to a device with more RAM if multitasking is essential.
Summary: Limited Random Access Memory (RAM) affecting multitasking capabilities.

Issue: Users may encounter issues where the device fails to authenticate with a Wi-Fi network, often requiring re-entry of the correct password or troubleshooting router settings.
Summary: Inability to connect to a Wi-Fi network due to authentication problems.

Issue: {content}
Summary: 
"""

In [4]:
# Wrap the summarization template; `content` is its only placeholder.
prompt_1 = PromptTemplate(template=prompt_1_template_text, input_variables=["content"])

In [5]:
import os

# Azure OpenAI connection settings.
# Real values are taken from environment variables so that no secret has to be
# stored in (or shared with) the notebook. The literals are non-functional
# placeholders used only when the corresponding variable is not set.
API_TYPE = "azure"
BASE_URL = os.environ.get("AZURE_OPENAI_ENDPOINT", "https://azure-openai-instance-default.openai.azure.com/")
API_VERSION = "2023-09-15-preview"
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", 'xxxx')
DEPLOYMENT_NAME = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME", 'azure-openai-deployment-001')

In [6]:
from langchain_openai import AzureOpenAI

# Azure OpenAI completion model used by the summarization step of the chain.
azure_openai_model = AzureOpenAI(
    # --- connection ---
    openai_api_type=API_TYPE,
    openai_api_version=API_VERSION,
    azure_endpoint=BASE_URL,
    deployment_name=DEPLOYMENT_NAME,
    openai_api_key=AZURE_OPENAI_API_KEY,
    # --- generation settings ---
    max_tokens=20,
    stop='\n',
    temperature=0.0,
    top_p=0.5,
    presence_penalty=0,
    frequency_penalty=0,
)

## First element of the LLM chain using the Azure OpenAI Model

In [7]:
# Chain step 1: feed `prompt_1` to the Azure OpenAI model and publish the result
# under the `summary` key so that later steps can consume it.
prompt_to_azure_openai = LLMChain(
    prompt=prompt_1,
    llm=azure_openai_model,
    output_key='summary',
)

# LLM 2 - using IBM watsonx.ai FLAN_T5_XXL Model

## Issue classification prompt - classify the mobile issue

### BatteryPerformance or StorageDataManagement or ConnectivityAndNetwork

In [8]:
# Few-shot classification prompt: three worked Issue/Type pairs followed by the
# `{summary}` placeholder, which receives the summary produced by the first step.
prompt_2_template_text = """
Classify the following as BatteryPerformance or StorageDataManagement or ConnectivityAndNetwork.

Issue: Certain apps running in the background excessively consume battery power.
Type: BatteryPerformance

Issue: Limited Random Access Memory (RAM) affecting multitasking capabilities.
Type: StorageDataManagement

Issue: Inability to connect to a Wi-Fi network due to authentication problems.
Type: ConnectivityAndNetwork

Issue: {summary}
Type:
"""

In [9]:
# Wrap the classification template; `summary` is its only placeholder.
prompt_2 = PromptTemplate(template=prompt_2_template_text, input_variables=["summary"])

In [10]:
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.foundation_models.extensions.langchain import WatsonxLLM
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods

# Show the names of all members of the ModelTypes enum.
print([model_type.name for model_type in ModelTypes])

['FLAN_T5_XXL', 'FLAN_UL2', 'MT0_XXL', 'GPT_NEOX', 'MPT_7B_INSTRUCT2', 'STARCODER', 'LLAMA_2_70B_CHAT', 'LLAMA_2_13B_CHAT', 'GRANITE_13B_INSTRUCT', 'GRANITE_13B_CHAT']


In [11]:
# Model id passed to the watsonx.ai `Model` that backs the classification and
# resolution steps of the chain.
model_id_2 = ModelTypes.FLAN_T5_XXL

In [12]:
import os

# watsonx.ai connection settings.
# The `test_*` literals are redacted placeholders for the YS1Dev environment (its
# account is allow-listed for repeated scoring). The values actually used are read
# from environment variables when present, so real secrets never need to be
# written into the notebook.
test_GEN_API_KEY = '_iXE92G6-xxx-xxxx-'
GEN_API_KEY = os.environ.get("WATSONX_APIKEY", test_GEN_API_KEY)

# Text-generation REST endpoint; not referenced by the other visible cells.
test_api_endpoint = 'https://wml-xxxx.ml.xxx.cloud.ibm.com/ml/v1-beta/generation/text?version=2023-05-28'
api_endpoint = test_api_endpoint

test_project_id = 'xxxx-fcdd-xxx-bbd7-xxxx'
project_id = os.environ.get("WATSONX_PROJECT_ID", test_project_id)

test_endpoint_url = "https://wml-xxxx.ml.xxx.cloud.ibm.com"
endpoint_url = os.environ.get("WATSONX_URL", test_endpoint_url)

In [13]:
# watsonx.ai model shared by the classification and resolution steps.
# Generation parameters are keyed with the SDK's GenParams constants instead of
# raw strings; the values are the same as before.
flan_t5_model = Model(
    model_id=model_id_2,
    project_id=project_id,
    credentials={"apikey": GEN_API_KEY, "url": endpoint_url},
    params={
        GenParams.DECODING_METHOD: DecodingMethods.SAMPLE.value,
        GenParams.MAX_NEW_TOKENS: 10,
        GenParams.MIN_NEW_TOKENS: 0,
        GenParams.TEMPERATURE: 0.0,
    },
)

# LangChain wrapper around the same model object.
flan_t5_llm = WatsonxLLM(model=flan_t5_model)

## Second element of the LLM chain using the watsonx model

In [51]:
# Chain step 2: classify the summary with the watsonx model and publish the label
# under the `issue_type` key.
flan_to_t5 = LLMChain(
    prompt=prompt_2,
    llm=flan_t5_model.to_langchain(),
    output_key='issue_type',
)

# LLM 3 - using watsonx.ai FLAN_T5_XXL model

## Generate issue resolution

In [52]:
# Few-shot resolution prompt: one worked Issue Type/Resolution pair per known
# category, followed by the `{issue_type}` placeholder filled by the classifier step.
prompt_3_template_text = """
Provide a resolution for this mobile issue type in a maximum of 10 words.

Issue Type: BatteryPerformance
Resolution: Optimize background app usage to conserve battery.

Issue Type: StorageDataManagement
Resolution: Optimize apps, clear cache, upgrade RAM if possible.

Issue Type: ConnectivityAndNetwork
Resolution: Check Wi-Fi password, reset router, restart device.

Issue Type: {issue_type}
Resolution: 

"""

In [53]:
# Wrap the resolution template; `issue_type` is its only placeholder.
prompt_3 = PromptTemplate(template=prompt_3_template_text, input_variables=["issue_type"])

In [61]:
# Chain step 3: ask the watsonx model for a resolution of the classified issue type
# and publish it under the `resolution` key.
issue_resolution_prompt = LLMChain(
    prompt=prompt_3,
    llm=flan_t5_model.to_langchain(),
    output_key='resolution',
)

# IBM watsonx.governance monitoring toolkit to evaluate the output of the summarization element of the LLM chain

In [62]:
import os

# IBM Cloud API key used to authenticate the watsonx.governance client. It is read
# from the environment when available so the real key is never stored in the
# notebook; the literal is a redacted placeholder.
CLOUD_API_KEY = os.environ.get("IBM_CLOUD_API_KEY", "K-xxxxx")
# IAM token endpoint; not referenced by the other visible cells.
IAM_URL="https://iam.ng.bluemix.net/oidc/token"

In [63]:
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator,BearerTokenAuthenticator

from ibm_watson_openscale import *
from ibm_watson_openscale.supporting_classes.enums import *
from ibm_watson_openscale.supporting_classes import *


# Authenticate with the IBM Cloud API key and create the client whose
# `llm_metrics.compute_metrics` is called by the callback handler further down.
authenticator = IAMAuthenticator(apikey=CLOUD_API_KEY)
client = APIClient(authenticator=authenticator)
# Last expression of the cell: displays the client's version.
client.version

'3.0.35'

In [64]:
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMTextMetricGroup
from ibm_metrics_plugin.metrics.llm.utils.constants import LLMSummarizationMetrics

# Metric configuration passed to `compute_metrics`: every listed summarization
# metric is enabled with an empty (default) option dict.
metric_config = {
    "configuration": {
        LLMTextMetricGroup.SUMMARIZATION.value: {
            summarization_metric.value: {}
            for summarization_metric in (
                LLMSummarizationMetrics.ROUGE_SCORE,
                LLMSummarizationMetrics.SARI,
                LLMSummarizationMetrics.METEOR,
                LLMSummarizationMetrics.BLEU,
                LLMSummarizationMetrics.FLESCH,
            )
        }
    }
}

# LLM Chain Callback Handler

In [70]:
# Presumably intended to collect per-call token counts; nothing in the visible
# notebook appends to or reads from these lists.
input_token_counts = []
output_token_counts = []

from langchain.callbacks.base import BaseCallbackHandler
from langchain.schema import LLMResult
from typing import Any, Dict, List, Optional, Union
import pandas as pd
import json

class MyCustomHandler(BaseCallbackHandler):
    """LangChain callback that scores a chain run against ground-truth texts.

    When a chain starts, its inputs are remembered. When it ends, the generated
    ``summary`` and ``resolution`` outputs are each compared with their ground truth
    through ``client.llm_metrics.compute_metrics`` and the resulting metrics are
    printed as JSON.

    The handler depends on two notebook-level names defined in earlier cells:
    ``client`` (the watsonx.governance API client) and ``metric_config``.

    NOTE(review): the resolution text is scored with the same summarization metric
    configuration as the summary - confirm that this is intended.
    """

    # Inputs of the most recent chain run that carried a 'content' entry.
    prompts_text = None
    summary_ground_truth = None
    resolution_ground_truth = None

    def __init__(self, summary_ground_truth: str = "", resolution_ground_truth: str = ""):
        """Store the reference texts and echo them.

        Args:
            summary_ground_truth: reference text for the ``summary`` output.
            resolution_ground_truth: reference text for the ``resolution`` output.
        """
        self.summary_ground_truth = summary_ground_truth
        self.resolution_ground_truth = resolution_ground_truth
        print(self.summary_ground_truth)
        print(self.resolution_ground_truth)

    def on_chain_start(self, serialized: Dict[str, Any], prompts: Dict[str, Any], **kwargs: Any) -> Any:
        """Remember the chain inputs so that `on_chain_end` can use them as sources.

        Inputs without a 'content' entry (e.g. those of a nested chain, should the
        handler ever be propagated to one) are ignored so they cannot overwrite the
        original issue text.
        """
        print('Inside on_chain_start')
        if isinstance(prompts, dict) and 'content' in prompts:
            self.prompts_text = prompts
        print(self.prompts_text)

    def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> Any:
        """Evaluate the generated summary and resolution of a finished chain run.

        The evaluation is skipped when no inputs were recorded or when `outputs`
        lacks the 'summary' or 'resolution' key, instead of failing on the lookup.
        """
        print('Inside on_chain_end')

        has_expected_outputs = (
            isinstance(outputs, dict) and 'summary' in outputs and 'resolution' in outputs
        )
        if self.prompts_text is None or not has_expected_outputs:
            print('Skipping evaluation: chain inputs/outputs do not contain the expected keys.')
            return

        # The original issue text is the source document for both evaluations.
        df_input = pd.DataFrame({'input_text': [self.prompts_text['content']]})

        self._evaluate('Evaluating Summarization quality..',
                       df_input,
                       'generated_summary',
                       outputs['summary'],
                       self.summary_ground_truth)
        self._evaluate('Evaluating Content generation quality..',
                       df_input,
                       'resolution',
                       outputs['resolution'],
                       self.resolution_ground_truth)

    def _evaluate(self, banner: str, sources: "pd.DataFrame", prediction_column: str,
                  prediction: str, ground_truth: str) -> Any:
        """Compute and print the configured metrics for one generated text.

        Args:
            banner: message printed before the evaluation starts.
            sources: single-row frame holding the original input text.
            prediction_column: column name under which the generated text is passed.
            prediction: the generated text.
            ground_truth: the reference text it is compared with.

        Returns:
            Whatever `client.llm_metrics.compute_metrics` returned.
        """
        print(banner)
        df_reference = pd.DataFrame({'ground_truth': [ground_truth]})
        df_output = pd.DataFrame({prediction_column: [prediction]})
        evals = client.llm_metrics.compute_metrics(metric_config,
                                                   sources=sources,
                                                   predictions=df_output,
                                                   references=df_reference)
        print(json.dumps(evals, indent=2))
        return evals

# Chaining..

In [71]:
# Assemble the three steps (summarize -> classify -> resolve) into one sequential
# chain and attach the evaluation callback together with its ground-truth texts.
chain = SequentialChain(
    chains=[prompt_to_azure_openai, flan_to_t5, issue_resolution_prompt],
    input_variables=["content"],
    output_variables=["summary", "issue_type", "resolution"],
    verbose=True,
    callbacks=[
        MyCustomHandler(
            resolution_ground_truth='Optimize background app usage to conserve battery.',
            summary_ground_truth='Push notifications from apps can drain battery life, users can manage notification settings to reduce battery consumption.',
        )
    ],
)

Push notifications from apps can drain battery life, users can manage notification settings to reduce battery consumption.
Optimize background app usage to conserve battery.


## Invoke the LLM chain with callback handler

In [74]:
# Issue description supplied as the chain's `content` input.
issue = (
    'Apps that send push notifications at high frequencies can contribute to increased battery consumption. '
    'Users can manage notification settings, disable unnecessary alerts, and set longer intervals for non-essential updates.'
)

# Run the whole chain; the returned dict is displayed as the cell output.
chain.invoke({"content": issue})

Inside on_chain_start
{'content': 'Apps that send push notifications at high frequencies can contribute to increased battery consumption. Users can manage notification settings, disable unnecessary alerts, and set longer intervals for non-essential updates.'}


[1m> Entering new SequentialChain chain...[0m
Inside on_chain_end
Evaluating Summarization quality..


[nltk_data] Downloading package wordnet to /home/hadoop/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/hadoop/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/hadoop/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


{
  "flesch": {
    "flesch_reading_ease": {
      "metric_value": 28.84,
      "mean": 28.84,
      "min": 28.84,
      "max": 28.84,
      "std": 0.0
    },
    "flesch_kincaid_grade": {
      "metric_value": 13.5,
      "mean": 13.5,
      "min": 13.5,
      "max": 13.5,
      "std": 0.0
    }
  },
  "bleu": {
    "precisions": [
      1.0,
      1.0,
      1.0,
      1.0
    ],
    "brevity_penalty": 1.0,
    "length_ratio": 1.0,
    "translation_length": 19,
    "reference_length": 19,
    "metric_value": 1.0
  },
  "meteor": {
    "metric_value": 0.9999271030762502
  },
  "rouge_score": {
    "rouge1": {
      "metric_value": 1.0
    },
    "rouge2": {
      "metric_value": 1.0
    },
    "rougeL": {
      "metric_value": 1.0
    },
    "rougeLsum": {
      "metric_value": 1.0
    }
  },
  "sari": {
    "metric_value": 99.12280701754386
  }
}
Evaluating Content generation quality..


[nltk_data] Downloading package wordnet to /home/hadoop/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/hadoop/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/hadoop/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


{
  "flesch": {
    "flesch_reading_ease": {
      "metric_value": 64.37,
      "mean": 64.37,
      "min": 64.37,
      "max": 64.37,
      "std": 0.0
    },
    "flesch_kincaid_grade": {
      "metric_value": 6.0,
      "mean": 6.0,
      "min": 6.0,
      "max": 6.0,
      "std": 0.0
    }
  },
  "bleu": {
    "precisions": [
      0.5714285714285714,
      0.16666666666666666,
      0.0,
      0.0
    ],
    "brevity_penalty": 0.8668778997501817,
    "length_ratio": 0.875,
    "translation_length": 7,
    "reference_length": 8,
    "metric_value": 0.0
  },
  "meteor": {
    "metric_value": 0.39952531645569617
  },
  "rouge_score": {
    "rouge1": {
      "metric_value": 0.5714
    },
    "rouge2": {
      "metric_value": 0.1667
    },
    "rougeL": {
      "metric_value": 0.5714
    },
    "rougeLsum": {
      "metric_value": 0.5714
    }
  },
  "sari": {
    "metric_value": 67.50228937728939
  }
}

[1m> Finished chain.[0m


{'content': 'Apps that send push notifications at high frequencies can contribute to increased battery consumption. Users can manage notification settings, disable unnecessary alerts, and set longer intervals for non-essential updates.',
 'summary': 'Push notifications from apps can drain battery life, users can manage notification settings to reduce battery consumption.',
 'issue_type': 'BatteryPerformance',
 'resolution': 'Optimize task or app settings to conserve '}

Author: ravi.chamarthy@in.ibm.com