<a href="https://colab.research.google.com/github/shreeyut1905/InterIIT/blob/main/IITI_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dependencies

In [None]:
!pip install datasets transformers accelerate bitsandbytes sentence-transformers faiss-cpu



In [None]:
!pip install git+https://github.com/openai/whisper.git


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-2p245m2n
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-2p245m2n
  Resolved https://github.com/openai/whisper.git to commit 423492dda7806206abe56bdfe427c1096473a020
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


# Functions

In [None]:
import whisper
import os
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
import whisper
import os
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch
from IPython.display import Audio
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# JavaScript that is injected into the notebook output frame. It defines a
# global `record(ms)` function that captures microphone audio for `ms`
# milliseconds and resolves with the recording as a base64 data URL.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = () => resolve(reader.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.onstop = async () => {
    // Release the microphone so it does not stay open between recordings.
    stream.getTracks().forEach(track => track.stop())
    const blob = new Blob(chunks)
    const text = await b2text(blob)
    resolve(text)
  }
  recorder.start()
  await sleep(time)
  recorder.stop()
})
"""

def record(sec=3):
  """Record microphone audio in the browser and save it to ``audio.wav``.

  The RECORD script is displayed so that its ``record`` JavaScript function
  exists in the output frame, then that function is evaluated with the
  duration in milliseconds. It returns a data URL whose base64 payload
  (everything after the first comma) is decoded and written to disk as-is;
  no audio format conversion happens here.

  Args:
    sec: Recording length in seconds.

  Returns:
    The path of the written file, ``'audio.wav'``.
  """
  # Report the real duration instead of a fixed "5 seconds".
  print(f"recording for {sec} seconds")
  display(Javascript(RECORD))
  data_url = output.eval_js('record(%d)' % (sec*1000))
  audio_bytes = b64decode(data_url.split(',')[1])
  with open('audio.wav','wb') as f:
    f.write(audio_bytes)
  return 'audio.wav'

def transcribe(model_name="base"):
  """Record a short audio clip and return its Whisper transcription.

  Args:
    model_name: Name of the Whisper checkpoint to use (default ``"base"``).

  Returns:
    The ``"text"`` field of Whisper's transcription result.
  """
  audio = record()
  # Loading a checkpoint is expensive and this function runs once per chat
  # turn, so loaded models are kept on the function object and reused.
  loaded_models = transcribe.__dict__.setdefault("_models", {})
  if model_name not in loaded_models:
    loaded_models[model_name] = whisper.load_model(model_name)
  result = loaded_models[model_name].transcribe(audio)
  return result["text"]
# Text-to-speech pipeline backed by the SpeechT5 checkpoint.
synthesiser = pipeline(task="text-to-speech", model="microsoft/speecht5_tts")

# Speaker x-vectors; entry 7306 of the validation split is used as the voice.
embeddings_dataset = load_dataset(path="Matthijs/cmu-arctic-xvectors", split="validation")
# Add a leading dimension of size 1 in front of the x-vector.
speaker_embedding = torch.unsqueeze(torch.tensor(embeddings_dataset[7306]["xvector"]), 0)

def text_to_speech(key):
  """Synthesise ``key`` as speech, save it to ``speech.wav`` and play it.

  Args:
    key: The text to speak; it is converted with ``str()`` first.
  """
  audio_file = "speech.wav"
  speech = synthesiser(str(key), forward_params={"speaker_embeddings": speaker_embedding})
  sf.write(audio_file, speech["audio"], samplerate=speech["sampling_rate"])

  # Play back exactly the file written above. A hard-coded
  # "/content/speech.wav" would only match when the working directory
  # happens to be /content.
  display(Audio(audio_file))



# Chatbot — CPU-compatible (Llama-3.2-1B-Instruct, no quantization)

In [None]:



import os

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Hugging Face access token for the gated Llama checkpoints. It is read from
# the HF_TOKEN environment variable so that no secret lives in the notebook;
# the value is None when the variable is not set.
# SECURITY: the token previously hard-coded on this line has been exposed and
# should be revoked at https://huggingface.co/settings/tokens.
TOKEN = os.environ.get("HF_TOKEN")


def read_text(file_path, delimiter='&&'):
    """Read a UTF-8 text file and split it into knowledge-base sections.

    Args:
        file_path: Path of the text file to read.
        delimiter: Marker that separates sections in the file.

    Returns:
        A list of section strings with surrounding whitespace removed.
        Sections that are empty after stripping (for example the one after
        a trailing delimiter) are dropped so they are never indexed.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read()
    stripped = (section.strip() for section in data.split(delimiter))
    return [section for section in stripped if section]


def initialize_models():
    """Load the sentence-embedding model, the LLM tokenizer and the LLM.

    Returns:
        A tuple ``(embedding_model, tokenizer, llm_model)``. The embedding
        model and the LLM are both placed on the module-level ``device``.
    """
    embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

    llm_name = "meta-llama/Llama-3.2-1B-Instruct"
    # `token` is the current keyword for the access token; `use_auth_token`
    # is deprecated in transformers.
    tokenizer = AutoTokenizer.from_pretrained(llm_name, token=TOKEN)
    llm_model = AutoModelForCausalLM.from_pretrained(llm_name, token=TOKEN).to(device)

    return embedding_model, tokenizer, llm_model


def create_index(sections, embedding_model):
    """Embed all sections and store the vectors in a flat L2 FAISS index.

    Args:
        sections: The text sections to embed.
        embedding_model: Model whose ``encode`` method produces the vectors.

    Returns:
        A tuple ``(index, embeddings)``: the populated FAISS index and the
        NumPy array of embeddings that was added to it.
    """
    encoded = embedding_model.encode(sections, convert_to_tensor=True, device=device)
    # FAISS works on NumPy arrays, so bring the tensor back to host memory.
    embeddings = encoded.cpu().numpy()

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings


def search_query(query, embedding_model, index, sections, embeddings, top_k=3):
    """Return the sections whose embeddings are closest to the query.

    Args:
        query: The user's question as text.
        embedding_model: Model whose ``encode`` method embeds the query.
        index: FAISS index built over the section embeddings.
        sections: The section texts, in the order they were indexed.
        embeddings: Unused; kept so existing callers keep working.
        top_k: Maximum number of sections to return.

    Returns:
        Up to ``top_k`` section strings in the order FAISS ranks them; an
        empty list when there are no sections or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are sections: FAISS fills the
    # missing slots with -1, which Python indexing would silently turn into
    # "the last section".
    k = min(top_k, len(sections))
    if k <= 0:
        return []

    query_embedding = embedding_model.encode(query, convert_to_tensor=True, device=device).cpu().numpy()
    distances, indices = index.search(np.array([query_embedding]), k)

    # Defensive filter in case the index and `sections` are out of sync.
    return [sections[idx] for idx in indices[0] if 0 <= idx < len(sections)]


def generate_response(query, relevant_sections, tokenizer, llm_model, max_tokens=100):
    """Generate an answer to ``query`` grounded in the retrieved sections.

    Args:
        query: The user's question.
        relevant_sections: Retrieved context strings; joined with spaces.
        tokenizer: Tokenizer matching ``llm_model``.
        llm_model: Causal language model used for generation.
        max_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        The generated continuation only (prompt removed), stripped of
        surrounding whitespace.
    """
    context = " ".join(relevant_sections)
    prompt = f"Context: {context}\n\nUser Query: {query}\nResponse:"

    # Send the inputs to the device the model actually lives on.
    inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    prompt_len = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,
            # Cap the total length at the model's position limit.
            max_length=min(prompt_len + max_tokens, llm_model.config.max_position_embeddings),
            num_return_sequences=1,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
            # Set explicitly to avoid the "Setting `pad_token_id`" warning.
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt by token count. Slicing the decoded string by
    # len(prompt) is only correct if decoding reproduces the prompt
    # character for character.
    generated_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


def main():
    """Run the voice chatbot loop: listen, retrieve, generate, speak.

    Sections are loaded from ``data.txt`` and indexed once. Each turn then
    records and transcribes a spoken query, retrieves the most relevant
    sections, generates a response and plays it as speech. Saying "exit"
    ends the loop.
    """
    file_path = 'data.txt'
    sections = read_text(file_path)

    embedding_model, tokenizer, llm_model = initialize_models()

    index, embeddings = create_index(sections, embedding_model)

    # Input is spoken, not typed, so tell the user to speak.
    print("Chatbot is ready! Speak your question after each recording prompt (or say 'exit' to quit):")

    while True:
        query = transcribe()

        # A transcript may carry surrounding whitespace or punctuation
        # (e.g. " Exit."), so normalise it before comparing.
        command = query.strip().strip(".!?,").lower()
        if not command:
            # Nothing intelligible was recorded; listen again rather than
            # sending an empty query to the LLM.
            print("No speech detected, please try again.")
            continue
        if command == 'exit':
            print("Exiting chatbot. Goodbye!")
            break

        relevant_sections = search_query(query, embedding_model, index, sections, embeddings)

        response = generate_response(query, relevant_sections, tokenizer, llm_model)

        text_to_speech(response)

if __name__ == "__main__":
    main()


Using device: cpu




Chatbot is ready! Type your questions (or type 'exit' to quit):
recording for 5 seconds


<IPython.core.display.Javascript object>

  checkpoint = torch.load(fp, map_location=device)
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Assistant: IIT Indore has a state-of-the-art campus internet facility. The campus LAN is extended to all the hostels and residences.


<IPython.core.display.Javascript object>

recording for 5 seconds


  checkpoint = torch.load(fp, map_location=device)


KeyboardInterrupt: 

# Chatbot — GPU only (Llama-3.2-3B-Instruct, 8-bit quantized with bitsandbytes)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np


import os

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Hugging Face access token for the gated Llama checkpoints. It is read from
# the HF_TOKEN environment variable so that no secret lives in the notebook;
# the value is None when the variable is not set.
# SECURITY: the token previously hard-coded on this line has been exposed and
# should be revoked at https://huggingface.co/settings/tokens.
TOKEN = os.environ.get("HF_TOKEN")


def read_text(file_path, delimiter='&&'):
    """Read a UTF-8 text file and split it into knowledge-base sections.

    Args:
        file_path: Path of the text file to read.
        delimiter: Marker that separates sections in the file.

    Returns:
        A list of section strings with surrounding whitespace removed.
        Sections that are empty after stripping (for example the one after
        a trailing delimiter) are dropped so they are never indexed.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.read()
    stripped = (section.strip() for section in data.split(delimiter))
    return [section for section in stripped if section]


def initialize_models():
    """Load the embedding model and the 8-bit quantized LLM with its tokenizer.

    Returns:
        A tuple ``(embedding_model, tokenizer, llm_model)``.

    Raises:
        RuntimeError: If no CUDA device is available. 8-bit loading through
            bitsandbytes needs one, so this fails up front with a clear
            message instead of part-way through model loading.
    """
    if not torch.cuda.is_available():
        raise RuntimeError(
            "8-bit quantization with bitsandbytes requires a CUDA GPU, but none "
            "is available. Switch to a GPU runtime or use the CPU variant."
        )

    embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_threshold=6.0,
    )

    print("Loading LLM with 8-bit quantization...")
    llm_name = "meta-llama/Llama-3.2-3B-Instruct"
    # `token` is the current keyword for the access token; `use_auth_token`
    # is deprecated in transformers.
    tokenizer = AutoTokenizer.from_pretrained(llm_name, token=TOKEN)
    llm_model = AutoModelForCausalLM.from_pretrained(
        llm_name,
        quantization_config=quantization_config,
        device_map="auto",
        token=TOKEN
    )

    return embedding_model, tokenizer, llm_model


def create_index(sections, embedding_model):
    """Embed all sections and store the vectors in a flat L2 FAISS index.

    Args:
        sections: The text sections to embed.
        embedding_model: Model whose ``encode`` method produces the vectors.

    Returns:
        A tuple ``(index, embeddings)``: the populated FAISS index and the
        NumPy array of embeddings that was added to it.
    """
    print("Creating embeddings...")
    encoded = embedding_model.encode(sections, convert_to_tensor=True, device=device)
    # FAISS works on NumPy arrays, so bring the tensor back to host memory.
    embeddings = encoded.cpu().numpy()

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings


def search_query(query, embedding_model, index, sections, embeddings, top_k=3):
    """Return the sections whose embeddings are closest to the query.

    Args:
        query: The user's question as text.
        embedding_model: Model whose ``encode`` method embeds the query.
        index: FAISS index built over the section embeddings.
        sections: The section texts, in the order they were indexed.
        embeddings: Unused; kept so existing callers keep working.
        top_k: Maximum number of sections to return.

    Returns:
        Up to ``top_k`` section strings in the order FAISS ranks them; an
        empty list when there are no sections or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are sections: FAISS fills the
    # missing slots with -1, which Python indexing would silently turn into
    # "the last section".
    k = min(top_k, len(sections))
    if k <= 0:
        return []

    query_embedding = embedding_model.encode(query, convert_to_tensor=True, device=device).cpu().numpy()
    distances, indices = index.search(np.array([query_embedding]), k)

    # Defensive filter in case the index and `sections` are out of sync.
    return [sections[idx] for idx in indices[0] if 0 <= idx < len(sections)]


def generate_response(query, relevant_sections, tokenizer, llm_model, max_tokens=100):
    """Generate an answer to ``query`` grounded in the retrieved sections.

    Args:
        query: The user's question.
        relevant_sections: Retrieved context strings; joined with spaces.
        tokenizer: Tokenizer matching ``llm_model``.
        llm_model: Causal language model used for generation.
        max_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        The generated continuation only (prompt removed), stripped of
        surrounding whitespace.
    """
    context = " ".join(relevant_sections)
    prompt = f"Context: {context}\n\nUser Query: {query}\nResponse:"

    # The model is loaded with device_map="auto", so ask the model where it
    # lives rather than assuming the notebook-level default device.
    inputs = tokenizer(prompt, return_tensors="pt").to(llm_model.device)
    prompt_len = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,
            # Cap the total length at the model's position limit.
            max_length=min(prompt_len + max_tokens, llm_model.config.max_position_embeddings),
            num_return_sequences=1,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
            # Set explicitly to avoid the "Setting `pad_token_id`" warning.
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt by token count. Slicing the decoded string by
    # len(prompt) is only correct if decoding reproduces the prompt
    # character for character.
    generated_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()




Using device: cpu


In [None]:
def main():
    """Run the voice-input chatbot loop: listen, retrieve, generate, print.

    Sections are loaded from ``data.txt`` and indexed once. Each turn then
    records and transcribes a spoken query, retrieves the most relevant
    sections, generates a response and prints it. Saying "exit" ends the
    loop.
    """
    file_path = 'data.txt'
    sections = read_text(file_path)

    embedding_model, tokenizer, llm_model = initialize_models()

    index, embeddings = create_index(sections, embedding_model)

    # Input is spoken, not typed, so tell the user to speak.
    print("Chatbot is ready! Speak your question after each recording prompt (or say 'exit' to quit):")

    while True:
        query = transcribe()

        # A transcript may carry surrounding whitespace or punctuation
        # (e.g. " Exit."), so normalise it before comparing.
        command = query.strip().strip(".!?,").lower()
        if not command:
            # Nothing intelligible was recorded; listen again rather than
            # sending an empty query to the LLM.
            print("No speech detected, please try again.")
            continue
        if command == 'exit':
            print("Exiting chatbot. Goodbye!")
            break

        relevant_sections = search_query(query, embedding_model, index, sections, embeddings)

        response = generate_response(query, relevant_sections, tokenizer, llm_model)

        print(f"Assistant: {response}")

if __name__ == "__main__":
    main()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading LLM with 8-bit quantization...




tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend