In [None]:
!pip install -q transformers accelerate langchain langchain-huggingface langchain-chroma # RUN ONCE -> RESTART SESSION

In [1]:
import os
import sys
from google.colab import drive
from google.colab import userdata

drive.mount("/content/drive")

HF_TOKEN = userdata.get("HF_TOKEN")
os.environ["HF_TOKEN"] = HF_TOKEN

print("\nHugging Face Token successfully set.")

OPENAI_API_KEY = userdata.get("OPENAI_API")
os.environ["OPENAI_API"] = OPENAI_API_KEY

print("\nOpenAI API successfully set.\n")

sys.path.append("/content/drive/MyDrive/ES-CSA/src")

%cd /content/drive/MyDrive/ES-CSA/data/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Hugging Face Token successfully set.

OpenAI API successfully set.

/content/drive/MyDrive/ES-CSA/data


In [2]:
import torch
import shutil
import textwrap
import sqlite3
from torch import bfloat16
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import LLMChain
from langchain_chroma import Chroma
from embedding_utils import LocalEmbedding
from intent_classifier import BGEIntentClassifier, INTENT_LABELS, INTENT_TO_METADATA
from rag_pipeline import run_rag_pipeline, rag_pipeline_with_llm
from llm_pipeline import create_pipeline
from llm_wrapper import CoherePromptWrapper

In [3]:
# Initializing MSISDN Mapping Database Path
# (relative path: resolved against the current working directory)

db_path = "msisdn_mapping.db"
# NOTE(review): `conn` is not used by any cell visible in this notebook (the query cell
# passes `db_path` instead) and is never closed — confirm whether it is still needed.
conn = sqlite3.connect(db_path)

# Initializing Embedding Model (all-MiniLM-L6-v2)
# NOTE(review): the model name comes from this notebook's labels; LocalEmbedding() is
# called with its defaults — confirm which model it actually loads.

embedding_model = LocalEmbedding()

# Initializing Vector Store (ChromaDB)
# Opens the "consumer_db" collection from the on-disk store under embeddings/.

vectorstore = Chroma(
    persist_directory="embeddings/chromadb_embeddings",
    embedding_function=embedding_model,
    collection_name="consumer_db"
)

# Initializing Intent Classifier (BAAI/bge-m3)
# NOTE(review): the backing model is not visible in this call — confirm in intent_classifier.

classifier = BGEIntentClassifier(
    intent_labels=INTENT_LABELS,
    intent_metadata_map=INTENT_TO_METADATA
)

# Initialize Tokenizer & LLM Model (command-r7b-12-2024)

model_id = "CohereForAI/c4ai-command-r7b-12-2024"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=bfloat16,
    device_map="auto",
    low_cpu_mem_usage=True
)

print("\nEmbedding Model:   all-MiniLM-L6-v2")
print("\nVector Store:      ChromaDB")
print("\nIntent Classifier: BAAI/bge-m3")
# Reuse model_id so the summary cannot drift from the checkpoint that was actually loaded.
print(f"\nLLM Model:         {model_id}")

# `model.device` is a torch.device, not a string. The previous check compared a set
# holding the device object with a set holding the string "cuda:0"; those never compare
# equal, so "CPU" was always reported. Checking the device type also covers any CUDA index.
print("\n\nUsing Device:", "GPU" if model.device.type == "cuda" else "CPU")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]


Embedding Model: all-MiniLM-L6-v2

Vector Store: ChromaDB

Intent Classifier: BAAI/bge-m3

LLM Model: CohereForAI/c4ai-command-r7b-12-2024


Using Device: cuda:0


In [8]:
# Build the text-generation pipeline from the already-loaded model and tokenizer.
llm_pipeline = create_pipeline(tokenizer=tokenizer, model=model)

# Wrap the pipeline in CoherePromptWrapper; the wrapper is given the same tokenizer.
llm = CoherePromptWrapper(tokenizer=tokenizer, pipeline=llm_pipeline)

# The "RAG chain" is simply another name for the wrapped LLM object.
rag_chain = llm

print("\nRAG Chain initialized with LLM and System Prompt.")

Device set to use cuda:0



RAG Chain initialized with LLM and System Prompt.


In [12]:
# Test: Querying the RAG integrated LLM model
#
# Every query is sent through rag_pipeline_with_llm for the same subscriber and printed
# in the same layout. Previously this stanza was pasted three times and only the first
# call passed `db_path`; the loop passes it on every call so all queries use the same
# MSISDN mapping database.

TEST_QUERIES = (
    "What region has the most amount of customers?",                          # Query 1
    "How many purchases have I made and how much was I charged for them?",    # Query 2
    "What is the status of my most recent complaint?",                        # Query 3
)

session_context = {"msisdn": "9230610000463"}

for query_index, query in enumerate(TEST_QUERIES):
    response = rag_pipeline_with_llm(
        query=query,
        session_context=session_context,
        classifier=classifier,
        vectorstore=vectorstore,
        rag_chain=rag_chain,
        llm=llm,
        db_path=db_path,
    )

    # Blank-line separator between results; none is printed before the first one.
    if query_index > 0:
        print("\n")

    print("=" * 65)
    print("Query:", textwrap.fill(query, width=90))
    print("=" * 65)
    print("\nGenerated Response:")
    print("-" * 19)
    print(textwrap.fill(response.strip(), width=90))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Query: What region has the most amount of customers?

Generated Response:
-------------------
Quetta has the most customers with 75 active users.


Query: How many purchases have I made and how much was I charged for them?

Generated Response:
-------------------
You have made 5 purchases, totaling 3164 PKR.


Query: What is the status of my most recent complaint?

Generated Response:
-------------------
Your most recent complaint (Ticket ID T48828) was resolved on November 24, 2023, with a
detailed explanation under the General Inquiry category.
