In [None]:
pip install langchain_community langchain_text_splitters langchain_openai langchain_chroma gradio pypdf openai

Collecting langchain_community
  Downloading langchain_community-0.3.15-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.3.1-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting gradio
  Downloading gradio-5.12.0-py3-none-any.whl.metadata (16 kB)
Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.15 (from langchain_community)
  Downloading langchain-0.3.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.31 (from langchain_community)
  Downloading langchain_core-0.3.31-py3-none-any.whl.metadata (6.3 kB)
Co

In [None]:
import json
import time
from collections import defaultdict
from uuid import uuid4

import gradio as gr
from google.colab import userdata
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI

In [None]:
# get api key
# Read from Colab's `userdata` store under the name "openai_api_key", so the
# key itself never appears in the notebook.
api_key = userdata.get("openai_api_key")

# Fine Tuning

In [None]:
# Convert the FAQ dataset into the chat-format JSONL used for OpenAI fine-tuning
def convert_json_to_jsonl(input_file, output_file, system_prompt=None):
    """Rewrite a question/answer JSON file as one chat example per line.

    Args:
        input_file: Path to a JSON file whose top-level object has a
            "questions" list; each entry needs "question" and "answer" keys.
        output_file: Path of the JSONL file to create (overwritten if present).
        system_prompt: Optional system message placed at the start of every
            example. Defaults to the UTM_FC hospitality persona.

    Raises:
        KeyError: If "questions", "question" or "answer" is missing.
    """
    if system_prompt is None:
        system_prompt = "You are a friendly hospitality chatbot named UTM_FC who loves to help people, and you're not satisfied unless the customer is satisfied."

    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding (the notebook reads the JSONL back as UTF-8 too).
    with open(input_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    with open(output_file, "w", encoding="utf-8") as file:
        # iterate over each question/answer pair
        for que_ans in data["questions"]:
            chat_format = {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": que_ans["question"]},
                    {"role": "assistant", "content": que_ans["answer"]},
                ]
            }
            # one JSON document per line, following the chat training format
            file.write(json.dumps(chat_format) + "\n")

In [None]:
# Source FAQ dataset (JSON) and the fine-tuning file derived from it (JSONL).
# NOTE(review): both paths are absolute under /content, presumably the Colab
# working directory — adjust when running elsewhere.
input_file_path = "/content/Ecommerce_FAQ_Chatbot_dataset.json"
output_file_path = "/content/Ecommerce_FAQ_Chatbot_dataset.jsonl"

In [None]:
# Call the function to perform the conversion: writes the JSONL file at
# output_file_path (overwriting any previous copy)
convert_json_to_jsonl(input_file_path, output_file_path)

In [None]:
# Read the generated JSONL back in: every line is one training example
with open(output_file_path, "r", encoding="utf-8") as f:
    dataset = list(map(json.loads, f))

# Sanity check: how many examples there are and what the first one looks like
print("Num examples:", len(dataset))
print("First example:")
first_example = dataset[0]
for message in first_example["messages"]:
    print(message)

Num examples: 79
First example:
{'role': 'system', 'content': "You are a friendly hospitality chatbot named UTM_FC who loves to help people, and you're not satisfied unless the customer is satisfied."}
{'role': 'user', 'content': 'How can I create an account?'}
{'role': 'assistant', 'content': "To create an account, click on the 'Sign Up' button on the top right corner of our website and follow the instructions to complete the registration process."}


In [None]:
# Format validation: count every way an example deviates from the chat
# fine-tuning format, keyed by error name
ALLOWED_MESSAGE_KEYS = ("role", "content", "name", "function_call", "weight")
ALLOWED_ROLES = ("system", "user", "assistant", "function")

format_errors = defaultdict(int)

for ex in dataset:
    # every example must be a dict ...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ... carrying a non-empty "messages" list
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # both mandatory keys have to be present
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        # no keys outside the allowed set
        if not all(key in ALLOWED_MESSAGE_KEYS for key in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ALLOWED_ROLES:
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # content must be a string, and content and function_call may not both be empty
        if not isinstance(content, str) or not (content or function_call):
            format_errors["missing_content"] += 1

    # each example needs at least one assistant turn
    roles = [message.get("role", None) for message in messages]
    if "assistant" not in roles:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_name, count in format_errors.items():
        print(f"{error_name}: {count}")

No errors found


In [None]:
client = OpenAI(api_key=api_key)

# upload file to OpenAI
# The context manager closes the file handle once the upload call returns.
# NOTE: despite its name, `file_id` holds the whole object returned by the
# upload; later cells read the identifier from `file_id.id`.
with open(output_file_path, "rb") as training_file:
    file_id = client.files.create(file=training_file, purpose="fine-tune")

In [None]:
# train model
# Starts a fine-tuning job on the uploaded training file. `job_id` holds the
# whole job object returned by the API; later cells use `job_id.id`.
job_id = client.fine_tuning.jobs.create(
    training_file=file_id.id, model="gpt-4o-mini-2024-07-18"
)

In [None]:
# display the job object as returned right after creation
job_id

FineTuningJob(id='ftjob-gYeZfrEYqSeH5pzQVTTHJWAm', created_at=1737432406, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-PVSOt5Au4VPWuHNCbVr0FrUT', result_files=[], seed=775058313, status='validating_files', trained_tokens=None, training_file='file-Nu5tvesXw7tCS4eGgbEbxm', validation_file=None, estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto')), type='supervised'), user_provided_suffix=None)

In [None]:
# check training status
# List up to 3 events from the fine-tuning job (in the recorded output below,
# the most recent event comes first)
client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id.id, limit=3)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-PEFtOIITo9P4UfCfcHnDaPgM', created_at=1737432871, level='info', message='The job has successfully completed', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-coA4y8FDa5ufYxfuB90mzqo6', created_at=1737432864, level='info', message='New fine-tuned model created', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-BBaKK5Y7B0Rfh6TAKMdbIsYt', created_at=1737432864, level='info', message='Checkpoint created at step 158', object='fine_tuning.job.event', data={}, type='message')], object='list', has_more=True)

In [None]:
# get fine-tuned model name
# Fine-tuning runs asynchronously: `fine_tuned_model` is None on a job that has
# not finished (see the job object printed right after creation). Poll until
# the job reaches a terminal status so this cell also works under "Run All".
terminal_statuses = ("succeeded", "failed", "cancelled")
model_object = client.fine_tuning.jobs.retrieve(job_id.id)
while model_object.status not in terminal_statuses:
    time.sleep(30)  # job progress is slow; no need to poll more often
    model_object = client.fine_tuning.jobs.retrieve(job_id.id)

# Fail here with a clear message rather than later with model=None
if model_object.status != "succeeded":
    raise RuntimeError(
        f"Fine-tuning job {job_id.id} ended with status "
        f"{model_object.status!r}; no fine-tuned model is available."
    )

model_name = model_object.fine_tuned_model

In [None]:
# display the fine-tuned model identifier that the chatbot section passes to ChatOpenAI
model_name

'ft:gpt-4o-mini-2024-07-18:personal::ArzwebFg'

# Retrieval Augmented Generation (RAG)

In [None]:
# initiate the embeddings model
# This object is handed to the vector store as its embedding function.
embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-small", openai_api_key=api_key
)

In [None]:
# initiate the vector store
# The collection is persisted in the local "chroma_db" directory and uses the
# embeddings model created above.
vector_store = Chroma(
    collection_name="vector_collection",
    embedding_function=embeddings_model,
    persist_directory="chroma_db",
)

In [None]:
# loading the pdf documents
# The loader is pointed at the relative "data" directory, which must exist in
# the current working directory.
loader = PyPDFDirectoryLoader("data")

raw_documents = loader.load()

In [None]:
# splitting the document
# Sizes are measured with `len`, i.e. in characters: chunks of at most 300
# characters, with 100 characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False,
)

In [None]:
# creating the chunks
# Applies the splitter configured above to the loaded PDF documents.
chunks = text_splitter.split_documents(raw_documents)

In [None]:
# one fresh random UUID string per chunk, used as its ID in the vector store
uuids = [str(uuid4()) for _ in chunks]

In [None]:
# add chunks to vector store
# NOTE(review): the IDs are random, so re-running the notebook against the
# same persisted "chroma_db" directory presumably stores every chunk again
# under new IDs — confirm, and clear the collection before re-ingesting.
vector_store.add_documents(documents=chunks, ids=uuids)

['66351c63-2076-4523-8457-67f65e137376',
 'eaf64e26-1a2d-40a4-ad74-3e33e3a4c8a8',
 '98c5cd6e-f1b9-4bfd-a955-ee3c8302e635',
 '7ede7c19-c45b-4e1e-b4b8-1476aeeae8a2',
 'e8848f8e-5fb9-421a-9f4a-f9d318dbd4aa',
 '05751688-1474-41b3-bccb-73a10c2006b6',
 '6fe9408c-0c07-41d5-bab0-9d4f3ee8ec99',
 '02d18bf7-f351-4028-9747-d6ac24e8e802',
 'e66cf784-9ecd-45c0-854b-0e4813ab5b8f',
 '861361e3-fe8f-4602-ada9-8e58a6a2e0ea',
 '1069849a-c753-42e5-8774-0003187e44d0',
 '0aeb3213-eb5b-4e1b-922d-c46fcef348ad',
 'ab300ae1-2218-45d8-8c42-8b93655c2492',
 'a49a19c0-ca6c-4d55-91bf-5d7697f635ca',
 'ad71e313-8e4b-4bf8-8990-28a84316a24b',
 'bbe2c6a4-3cb9-47cf-9f19-8728f3099659',
 '51c4d6ad-f7b4-4961-971c-14e96f55da86',
 'c2ace69f-5254-4aee-b9fa-11ed3b04a394',
 'f33fb3d2-d139-4620-ae3b-cf36ab0648f3',
 'e6e34ed9-1738-4d3f-b044-20200bcc80ed',
 '8a125031-de5b-4700-8b70-84821180a654',
 '3499b278-6b00-4485-add7-e9c881f14cb0',
 'f4dc203e-66b6-43c8-b81f-5c75e79a0bc2',
 '0e916bf4-a36b-44cb-9537-af141df66d8e',
 '7f89951e-c749-

In [None]:
# set up the vector store as the retriever, returning the top 5 most relevant chunks for a question
num_results = 5
retriever = vector_store.as_retriever(search_kwargs={"k": num_results})

# Chatbot

In [None]:
# initiate the model: `model_name` is the fine-tuned model obtained in the
# fine-tuning section; a low temperature keeps answers focused rather than
# creative, and max_tokens caps each reply at 500 tokens
large_language_model = ChatOpenAI(
    temperature=0.1, model=model_name, max_tokens=500, openai_api_key=api_key
)

In [66]:
# this func is called for every message in the chatbot
def output(message, history):
    """Stream a RAG answer for one chat message.

    Retrieves the chunks most relevant to `message`, embeds them together with
    the question and the conversation history in a prompt, and streams the
    fine-tuned model's reply.

    Args:
        message: The user's latest message; `None` produces no output.
        history: The conversation so far, as supplied by the chat UI. It is
            interpolated into the prompt using its default string form.

    Yields:
        The answer accumulated so far, growing with every streamed piece, so
        the UI can re-render the partial reply.
    """
    # Nothing to answer: stop before retrieval, which needs a real query.
    if message is None:
        return

    # retrieve the relevant chunks based on the question
    docs = retriever.invoke(message)

    # concatenate the chunks into the knowledge section of the prompt
    knowledge = "".join(doc.page_content + "\n\n" for doc in docs)

    # The whitespace inside this literal is part of the prompt sent to the model.
    rag_prompt = f"""
        You are a chatbot assistant named UTM_FC which answers questions based on knowledge which is provided to you.
        When responding, rely solely on the information provided in the "The knowledge" section and
         do not mention the source of this information to the user.

        The question: {message}
        Conversation history: {history}
        The knowledge: {knowledge}
        """

    # stream the answer to gradio, yielding the growing text after each piece
    response = ""
    for answer in large_language_model.stream(rag_prompt):
        response += answer.content
        yield response

In [67]:
# initiate the Gradio app
# `output` is the handler for each submitted message; it is a generator, so
# the reply is streamed into the chat window as it grows.
chatbot = gr.ChatInterface(
    output,
    type='messages',
    textbox=gr.Textbox(
        placeholder="Question about our ecommerce system...",
        container=False,
        autoscroll=True,
        scale=7,
    ),
    title="Chat with UTM_FC",
)

In [68]:
# launch the chatbot
# In Colab, sharing is switched on automatically and the app is served at a
# temporary public URL (see the output below).
chatbot.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://3e5202951fb9bb7db3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [69]:
# Baseline for comparison: the stock gpt-4o-mini model, without RAG or fine-tuning
baseline_system_prompt = "You are a friendly hospitality chatbot named UTM_FC who loves to help people, and you're not satisfied unless the customer is satisfied."
baseline_question = "I'm a buyer, when can I request for a return or refund?"

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": baseline_system_prompt},
        {"role": "user", "content": baseline_question},
    ],
)

# show the text of the first (and only requested) choice
baseline_answer = completion.choices[0].message.content
print(baseline_answer)

Hello! I’m here to help you with that. The timeframe for requesting a return or refund can vary depending on the store or service you purchased from. Generally, many retailers allow returns within 30 to 90 days of the purchase date. It’s always a good idea to check the specific return policy of the retailer or service to ensure you’re within the set guidelines.

If you need help with a specific purchase or store, feel free to provide more details, and I'll do my best to assist you!


In [70]:
# Same baseline model, second question. The resulting `completion` is the one
# scored in the BERTScore comparison further down.
baseline_system_prompt = "You are a friendly hospitality chatbot named UTM_FC who loves to help people, and you're not satisfied unless the customer is satisfied."
baseline_question = "What payment methods can I use in the system?"

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": baseline_system_prompt},
        {"role": "user", "content": baseline_question},
    ],
)

baseline_answer = completion.choices[0].message.content
print(baseline_answer)

Great question! The payment methods available can vary depending on the specific hospitality service or platform you're using. However, common options usually include:

1. **Credit/Debit Cards** (Visa, MasterCard, American Express, etc.)
2. **Mobile Payment Services** (Apple Pay, Google Pay, etc.)
3. **PayPal** or other online payment systems
4. **Bank Transfers**
5. **Gift Cards** or vouchers specific to the service
6. **Cash**, if applicable at the point of service

If you let me know the specific service or platform you're inquiring about, I can provide more detailed information! Your satisfaction is my top priority! 😊


In [None]:
pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers>=3.0.0->bert_score)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, bert_score
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installe

In [72]:
# Imported here rather than at the top because bert_score is only installed
# by the cell just above.
from bert_score import BERTScorer

# instantiate the BERTScorer object for English text
scorer = BERTScorer(lang="en")

# Expected answer that both candidates are scored against
reference_answers = ["We accept major credit cards, debit cards, and PayPal as payment methods for online orders."]

# before RAG & fine tuning: the reply held in `completion` from the baseline cell
base_answers = [completion.choices[0].message.content]

# after fine tuning the model and using RAG
# NOTE(review): this answer is a hard-coded string, presumably copied from a
# chat session with the enhanced bot — confirm.
enhanced_answers = ["The payment methods accepted in the system include credit cards, debit cards, and PayPal."]

# Calculate BERTScore: each call returns Precision, Recall and F1 respectively
P1, R1, F1_1 = scorer.score(base_answers, reference_answers)
P2, R2, F2_2 = scorer.score(enhanced_answers, reference_answers)

# One row per metric: heading, then label/score pairs for base and enhanced
score_report = (
    ("\nPrecision:", "Base model - Precision:", P1, "Enhanced model -Precision:", P2),
    ("\nRecall:", "Base model - Recall:", R1, "Enhanced model - Recall:", R2),
    ("\nF1 Score:", "Base model - F1 Score:", F1_1, "Enhanced model F1 Score:", F2_2),
)
for heading, base_label, base_scores, enhanced_label, enhanced_scores in score_report:
    print(heading)
    # one candidate was scored per call, so take the single element
    print(base_label, base_scores.tolist()[0])
    print(enhanced_label, enhanced_scores.tolist()[0])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Precision:
Base model - Precision: 0.8022609949111938
Enhanced model -Precision: 0.9357295036315918

Recall:
Base model - Recall: 0.8841315507888794
Enhanced model - Recall: 0.9297795295715332

F1 Score:
Base model - F1 Score: 0.8412089943885803
Enhanced model F1 Score: 0.9327449798583984
