# Environment Setup

In [None]:
import importlib
import subprocess
import sys

libraries = ["langchain", "langchain_community", "huggingface_hub", "langchain_openai"]

for library in libraries:
    try:
        # Try to import the library
        module = importlib.import_module(library)
        print(f"Library {library} version: {module.__version__}")
    except ImportError:

        # If library is not installed, attempt to install it
        print(f"Library {library} not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", library])

        # After installing, import again and print the version
        module = importlib.import_module(library)
        # print(f"Library {library} version after installation: {module.__version__}")
    except AttributeError:
        # If library doesn't have __version__ attribute
        print(f"Library {library} does not have a __version__ attribute.")

Library langchain version: 0.3.23
Library langchain_community not found. Installing...
Library huggingface_hub version: 0.30.2
Library langchain_openai not found. Installing...


In [None]:
!python --version

Python 3.11.12


In [None]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN_NEW')

In [None]:
import os
from langchain.llms import HuggingFaceHub
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from google.colab import userdata

#LLM


##Azure Open AI


In [None]:
from langchain_openai import AzureOpenAI
import os
from google.colab import userdata

os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2024-05-01-preview"
os.environ["AZURE_OPENAI_API_KEY"] = userdata.get('AZ_OPENAI_KEY')
os.environ["AZURE_OPENAI_ENDPOINT"] =  "https://azopenai-demo.openai.azure.com/"

llm = AzureOpenAI(deployment_name="dp-gpt-35-turbo-instruct", model_name="gpt-35-turbo-instruct")

Connect to HuggingFace LLM

In [None]:
# Load LLaMA 4 model from Hugging Face Hub (make sure it's a chat-compatible LLaMA4 model)
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HF_TOKEN_NEW')
llm_hf = HuggingFaceHub(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",  # Adjust if you're using LLaMA 4 when it's available
    model_kwargs={
        "temperature": 0.7,
        "max_new_tokens": 256,
        "top_p": 0.9,
        "repetition_penalty": 1.1
    }
)

  llm_hf = HuggingFaceHub(


# Chapter 7

## Sentiment Analysis and Classification

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Simple sentiment analysis chain
sentiment_prompt = ChatPromptTemplate.from_template(
    "Analyze the sentiment of the following text. Respond with only 'positive', 'negative', or 'neutral'.\n\nText: {text}"
)

sentiment_chain = sentiment_prompt | llm | StrOutputParser()

# Example usage
text = "This is a great product! I love it."
sentiment = sentiment_chain.invoke({"text": text})
print(f"Sentiment: {sentiment}")


Sentiment: 

Positive


## Building a Text Classifier


In [None]:
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate

# Multi-class classification template
classification_template = """
Classify the following text into one of these categories:
- Product Question
- Technical Support
- Billing Issue
- Feature Request
- Complaint
Text: {text}

Classification (respond with all the categories as per the text. You may show more than 1 if the text falls under multiple classess.):
"""

classification_prompt = PromptTemplate(
    input_variables=["text"],
    template=classification_template
)

classifier_chain = LLMChain(llm=llm, prompt=classification_prompt)

# llm = AzureOpenAI(deployment_name="dp-gpt-35-turbo-instruct", model_name="gpt-35-turbo-instruct")
# Example usage
category = classifier_chain.invoke("I am unable to login and My subscription was charged twice this month.")
print(f"Category: {category}")

Category: {'text': '\n- Technical Support\n- Billing Issue'}


  classifier_chain = LLMChain(llm=llm, prompt=classification_prompt)


## Advanced Classification with Structured Output


In [None]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List, Dict
# Use built-in float instead of typing.Float

class ClassificationResult(BaseModel):
    primary_category: str = Field(description="The main category of the text")
    confidence: float = Field(description="Confidence score between 0 and 1") # Changed typing.Float to float
    secondary_categories: List[Dict[str, float]] = Field( # Changed typing.Float to float
        description="Other possible categories with confidence scores"
    )

classification_parser = PydanticOutputParser(pydantic_object=ClassificationResult)
llm = AzureOpenAI(deployment_name="dp-gpt-35-turbo-instruct", model_name="gpt-35-turbo-instruct")
advanced_classification_prompt = PromptTemplate(
    template="Classify the following text:\n{text}\n{format_instructions}",
    input_variables=["text"],
    partial_variables={"format_instructions": classification_parser.get_format_instructions()}
)

advanced_classifier = advanced_classification_prompt | llm | classification_parser

In [None]:
# Example usage:
user_input = "I'm having trouble logging into my account. I think my password might be incorrect."

# Invoke the advanced classifier
result = advanced_classifier.invoke({"text": user_input})

# Print the classification results
print(result)

# Access specific fields of the result
print(f"Primary Category: {result.primary_category}")
print(f"Confidence: {result.confidence}")
print(f"Secondary Categories: {result.secondary_categories}")

primary_category='IT' confidence=0.9 secondary_categories=[{'security': 0.7}, {'technology': 0.6}]
Primary Category: IT
Confidence: 0.9
Secondary Categories: [{'security': 0.7}, {'technology': 0.6}]


## Integrating Traditional ML Models


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import numpy as np

# Training data
texts = ["I love this product", "This doesn't work", "How do I install this?"]
labels = ["positive", "negative", "question"]

# Create a scikit-learn pipeline
ml_classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

ml_classifier.fit(texts, labels)
# Wrap in a LangChain tool
from langchain.tools import Tool

def classify_text(text):
    pred = ml_classifier.predict([text])[0]
    proba = ml_classifier.predict_proba([text])[0]
    confidence = np.max(proba)
    return {"classification": pred, "confidence": float(confidence)}

classification_tool = Tool(
    name="TextClassifier",
    func=classify_text,
    description="Classifies text as positive, negative, or question"
)

In [None]:
#example usage
text_to_classify = "This is an awesome product!"
result = classification_tool.run(text_to_classify)
print(result)

{'classification': np.str_('positive'), 'confidence': 0.44264572877572594}


In [None]:
# Importing the GPT-4 LLM and setting it up
os.environ['OPENAI_API_KEY']= "<Use OpenAI API Key>"
os.environ["OPENAI_API_BASE"] = "<Use OpenAI URL>"

## Model Selection Strategies

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.evaluation.criteria import LabeledCriteriaEvalChain, Criteria
from langchain_openai import ChatOpenAI
import pandas as pd

def evaluate_models_on_task(models, task_examples, evaluation_criteria, evaluator_llm):
    results = []

    evaluator = LabeledCriteriaEvalChain.from_llm(
        llm=evaluator_llm,
        criteria=evaluation_criteria
    )

    for model_name, model in models.items():
        chain = LLMChain(llm=model, prompt=task_examples["prompt"])

        scores = []
        for example in task_examples["examples"]:
            prediction = chain.invoke(example["input"])

            evaluation_result = evaluator.evaluate_strings(
                input=example["input"]["text"],
                prediction=prediction,
                reference=example["expected"]
            )

            scores.append(evaluation_result.get("score", 0))

        results.append({
            "model": model_name,
            "avg_score": sum(scores) / len(scores),
            "min_score": min(scores),
            "max_score": max(scores)
        })

    return pd.DataFrame(results)

llm_gpt4o_mini = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.0
)

models = {
    "gpt-3.5-turbo": llm,
    "gpt-4o-mini": llm_gpt4o_mini,
}

# Define classification task
classification_examples = {
    "prompt": PromptTemplate(
        template="Classify the sentiment: {text}",
        input_variables=["text"]
    ),
    "examples": [
        {"input": {"text": "I love this product"}, "expected": "positive"},
        {"input": {"text": "This is terrible"},    "expected": "negative"},
    ]
}

# Run evaluation using gpt-3.5-turbo as the evaluator
results = evaluate_models_on_task(
    models=models,
    task_examples=classification_examples,
    evaluation_criteria=Criteria.CORRECTNESS,
    evaluator_llm=llm
)

print(results)


OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
import pandas

# Replace 'your_file_id' with the actual file ID from the Google Drive URL
file_id = "1mPP2UvJAkGjqF78rLf8S1izVoE7-5dfl"
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

data_for_fine_tuning = pandas.read_csv(download_url)

In [None]:
data_for_fine_tuning.head()

Unnamed: 0,query,response,category,priority
0,I can't log into my account after three attempts,Your account may be temporarily locked for sec...,Authentication,High
1,How do I cancel my subscription?,You can cancel your subscription by logging in...,Billing,Medium
2,When will my order be delivered?,Thank you for your patience. Based on your ord...,Billing,Medium
3,The mobile app keeps crashing when I try to up...,I'm sorry you're experiencing this issue. Plea...,Technical Support,High
4,Do you offer student discounts?,Yes! We offer a 15% discount for verified stud...,StudentBeans.,Pricing


## Preparing Fine-tuning Data with LangChain


In [None]:
import pandas as pd
import json

# Google Drive file ID and download URL
file_id = "1mPP2UvJAkGjqF78rLf8S1izVoE7-5dfl"
download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

# Load the CSV using pandas
data_df = pd.read_csv(download_url)

# Function to prepare data in OpenAI fine-tuning format
def prepare_openai_fine_tuning_data(df, instruction, input_column="query", response_column="response"):
    formatted_data = []

    for _, row in df.iterrows():
        # Check if response exists
        if pd.isna(row[response_column]):
            print(f"Warning: Missing response for query: {row[input_column]}. Skipping...")
            continue

        example = {
            "messages": [
                {"role": "system", "content": instruction},
                {"role": "user", "content": row[input_column]},
                {"role": "assistant", "content": row[response_column]}
            ]
        }
        formatted_data.append(example)

    return formatted_data

# Instruction for the model
instruction = "You are a customer support assistant. Provide helpful, accurate, and concise responses."

# Generate fine-tuning data
fine_tuning_data = prepare_openai_fine_tuning_data(
    data_df,
    instruction,
    input_column="query",
    response_column="response"
)

# Save as JSONL
with open("fine_tuning_data.jsonl", "w", encoding="utf-8") as f:
    for item in fine_tuning_data:
        f.write(json.dumps(item) + "\n")

In [None]:
# ft_data = json.loads("")
import json

# Correct way to load a JSONL file
file_path = "/content/fine_tuning_data.jsonl"

docs = []
with open(file_path, "r") as f:
    for line in f:
        doc = json.loads(line)
        docs.append(doc)

# Now you can print some examples
for doc in docs[:2]:
    print(doc)

{'messages': [{'role': 'system', 'content': 'You are a customer support assistant. Provide helpful, accurate, and concise responses.'}, {'role': 'user', 'content': "I can't log into my account after three attempts"}, {'role': 'assistant', 'content': "Your account may be temporarily locked for security. Please use the 'Forgot Password' link on the login page to reset your password or wait 30 minutes before trying again. If the issue persists contact our security team at security@example.com."}]}
{'messages': [{'role': 'system', 'content': 'You are a customer support assistant. Provide helpful, accurate, and concise responses.'}, {'role': 'user', 'content': 'How do I cancel my subscription?'}, {'role': 'assistant', 'content': 'You can cancel your subscription by logging into your account'}]}


In [None]:
pip install --upgrade transformers -q

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import json
import torch

# Load tokenizer and model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Load the dataset
data_files = {"train": "/content/fine_tuning_data.jsonl"}
dataset = load_dataset("json", data_files=data_files)

# Prepare prompt-response pairs
def process_messages(example):
    system_prompt = ""
    user_input = ""
    assistant_response = ""

    for message in example["messages"]:
        if message["role"] == "system":
            system_prompt = message["content"]
        elif message["role"] == "user":
            user_input = message["content"]
        elif message["role"] == "assistant":
            assistant_response = message["content"]

    # Combine into a single training sample
    full_prompt = f"System: {system_prompt}\nUser: {user_input}\nAssistant: "
    full_completion = assistant_response

    return {
        "input_text": full_prompt,
        "output_text": full_completion
    }

# Apply processing
processed_dataset = dataset.map(process_messages, remove_columns=["messages"])

# Tokenize
def tokenize_function(examples):
    inputs = [i + o for i, o in zip(examples["input_text"], examples["output_text"])]
    model_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)
    return model_inputs

tokenized_dataset = processed_dataset.map(tokenize_function, batched=True)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    # evaluation_strategy="no",
    weight_decay=0.01,
    push_to_hub=False,
    fp16=torch.cuda.is_available(),
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
)

# Train
trainer.train()

# Save
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmnitin3[0m ([33mmnitin3-testbook[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [None]:
  from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

  # 1. Load the fine-tuned GPT-2 model
  model_path = "./gpt2-finetuned"  # path where your fine-tuned model is saved
  tokenizer = GPT2Tokenizer.from_pretrained(model_path)
  model = GPT2LMHeadModel.from_pretrained(model_path)

  # Create a pipeline for text generation
  generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=512)

  # 2. Define the function to handle text generation
  def generate_response(input_text):
      # Generate the response using the model pipeline
      result = generator(input_text, return_full_text=False)
      return result[0]['generated_text']

  # 3. Example usage
  query = "I can't reset my password"
  response = generate_response(query)

  print("\nResponse from fine-tuned model:\n")
  print(response)


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Response from fine-tuned model:

 or anything so I can't open it. Please, please try again. I'm sorry, there was a problem. There was nothing wrong.

12/9/2018

I used the "Help". I have been using "Login" (I'm guessing you are a customer support rep?), and I see nothing wrong. Sorry. I asked if they have a free account and they said they do, but no. You can purchase a free trial subscription at the checkout, or by entering your name above. If you'd like to use this for other purposes see the link by clicking on that link. Thank you.

12/1/2018

Sorry for the inconvenience! Please try again. After a period of time, the download will stop. Please try again.

12/2/2018

Just finished signing up, and I'm very happy with the experience. I tried a number of things, but my device has nothing to do with battery performance or anything. So, I've tried the app, and it does indeed work. I'm a bit frustrated. And after getting my device back to normal again, my device seems to have an occasional