**Note : If you’re using Google Colab to run the code. In your notebook, go to Runtime > Change runtime type > Hardware accelerator > GPU > GPU type > T4. You will need ~8GB of GPU RAM for inference and running on CPU is practically impossible.**

# Install required packages

In [1]:
!pip install typing-extensions==3.10.0.2
!pip install -q gradio

Collecting typing-extensions==3.10.0.2
  Using cached typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.8.0
    Uninstalling typing_extensions-4.8.0:
      Successfully uninstalled typing_extensions-4.8.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires kaleido, which is not installed.
sqlalchemy 2.0.23 requires typing-extensions>=4.2.0, but you have typing-extensions 3.10.0.2 which is incompatible.
arviz 0.15.1 requires typing-extensions>=4.1.0, but you have typing-extensions 3.10.0.2 which is incompatible.
chex 0.1.7 requires typing-extensions>=4.2.0; python_version < "3.11", but you have typing-extensions 3.10.0.2 which is incompatible.
fastapi 0.104.1 requires typing-extensions>=4.8.0, but

In [2]:
!pip install unstructured
!pip install -qU transformers accelerate einops langchain xformers bitsandbytes faiss-gpu sentence_transformers



# Import necessary libraries

In [3]:
from torch import cuda, bfloat16
import transformers
import torch

# Define model information

In [4]:
model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory : This requires the `bitsandbytes` library

In [5]:
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)

# Begin initializing HF items, you need an access token

In [6]:
hf_auth = 'ADD_HUGGINGFACE_AUTH_KEY'                   # Add your Hugging Face auth-key here
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

# Load the pre-trained model

In [7]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)




model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

# Enable evaluation mode for model inference

In [8]:
model.eval()
print(f"Model loaded on {device}")

Model loaded on cuda:0


# Initialize tokenizer

In [9]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

# Define stop tokens

In [10]:
stop_list = ['\nHuman:', '\n```\n']
stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids

[[1, 29871, 13, 29950, 7889, 29901], [1, 29871, 13, 28956, 13]]

# Convert stop tokens to tensor and move to device

In [11]:
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
stop_token_ids

[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

# Import necessary libraries for stopping criteria

In [12]:
from transformers import StoppingCriteria, StoppingCriteriaList

# Define custom stopping criteria object

In [13]:
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

# Create stopping criteria list

In [14]:
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# Create text generation pipeline

In [15]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

# Import necessary libraries for langchain

In [16]:
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain

import locale
locale.getpreferredencoding = lambda: "UTF-8"

# Initialize llm variable with pipeline model

In [17]:
llm = HuggingFacePipeline(pipeline=generate_text)

# Note: Please upload your csv dataset on colab files folder before running the below cell and accordingly check file_path

# Load CSV dataset

In [18]:
loader = CSVLoader(
    file_path="/content/Hackathon_TD_Dataset.csv", encoding= 'utf-8',  #copy file path and paste here
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
        "fieldnames": ["instructions", "input", "output", "text"],
    },
)
documents = loader.load()
documents[0]

Document(page_content='instructions: \ufeffinstruction\ninput: input\noutput: output\ntext: text', metadata={'source': '/content/Hackathon_TD_Dataset.csv', 'row': 0})

# Split documents into chunks

In [19]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

# Initialize Hugging Face embeddings and create vector store

In [20]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

# Create Conversational Retrieval Chain

In [21]:
chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

# Test the model output


In [22]:
chat_history = []
query = "Who are my TD Champions?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 Based on the input provided, it seems that the TD Champions are the individuals responsible for managing and implementing the Talent Development (TD) program within their respective Business Units/Markets. To access the information regarding TD Champions, please refer to the attachment provided or follow the practice of the requestor.


# Define a function to get chatbot answer

In [23]:
def get_chatbot_answer(query, chat_history):
    result = chain({"question": query, "chat_history": chat_history})
    chat_history.append({"question": query, "answer": result['answer']})
    return result['answer']

# Integrating Gradio as user interface here:

In [24]:
import gradio as gr

def get_chatbot_answer(query, chat_history):
    # Ensure chat_history is initialized properly
    if chat_history is None:
        chat_history = []

    result = chain({"question": query, "chat_history": chat_history})
    chat_history.append({"question": query, "answer": result['answer']})
    return result['answer']

# Define Gradio Interface
iface = gr.Interface(
    fn=get_chatbot_answer,
    inputs=gr.Textbox(),
    outputs=gr.Textbox(),
    live=False,  # Set to False if you want to manually click a button to get a response
    title="Welcome to BristleBot",
    theme="huggingface",
    allow_flagging=True,  # Enable flagging of answers as bad or good
    examples=[
        ["How do I login to POSH module?", []],
        ["How can I access the TD Portal?", []],
        ["How can I request for a training?", []]
    ]
)



# Launch the Gradio interface

In [25]:
iface.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://cea6840df49f195f9f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://cea6840df49f195f9f.gradio.live


