In [None]:
# Install the third-party packages this notebook relies on.
# No versions are pinned, so each run fetches whatever release is current.
!pip install transformers
!pip install pyngrok
!pip install flask
!pip install bs4
!pip install llama-index
!pip install bitsandbytes
!pip install sentencepiece
!pip install accelerate

In [None]:
# Install the LlamaIndex Hugging Face integrations that back the
# `llama_index.embeddings.huggingface` and `llama_index.llms.huggingface` imports.
!pip install llama_index.embeddings.huggingface
!pip install llama_index.llms.huggingface

In [4]:
import os
from datetime import datetime, timezone

import requests
import torch
from bs4 import BeautifulSoup
from flask import Flask, request, jsonify
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from pyngrok import ngrok
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

In [2]:
from llama_index.core import Document

In [5]:
# Read the ngrok auth token from the environment instead of embedding it in the
# notebook, where it would be leaked to anyone the file is shared with.
# NOTE: any token that was previously committed in this cell should be revoked.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN")
if not ngrok_auth_token:
    raise RuntimeError(
        "Set the NGROK_AUTH_TOKEN environment variable before running this cell."
    )
ngrok.set_auth_token(ngrok_auth_token)

In [11]:
# 4-bit NF4 quantization with double quantization; compute dtype is float16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# Load the tokenizer and the quantized causal LM.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-3B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config,
)

# Write both to local directories so later cells can load them from disk.
tokenizer.save_pretrained("./local_tokenizer")
model.save_pretrained("./local_model")

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [8]:
# Counter used to hand out auto-incrementing message IDs
message_id = 0

# Chat history, held in process memory only
chat_history = []

# Flask application that the route handlers below attach to
app = Flask(__name__)

class RAGChatbot:
    """Retrieval-augmented chatbot that answers queries over scraped web pages.

    The constructor loads a locally saved tokenizer and causal LM, wraps them in
    a LlamaIndex ``HuggingFaceLLM`` and configures the LlamaIndex ``Settings``
    object. ``create_index_from_websites`` must be called before
    ``get_response``.
    """

    def __init__(self, tokenizer_path="/content/local_tokenizer",
                 model_path="/content/local_model"):
        """Load the models and configure LlamaIndex.

        Args:
            tokenizer_path: Directory holding the saved tokenizer.
            model_path: Directory holding the saved model.
        """
        # Initialize models
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)

        self.embed_model = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")
        self.llm = HuggingFaceLLM(
            model=self.model,
            tokenizer=self.tokenizer,
            max_new_tokens=128,
        )

        # Settings configuration.
        # `Settings` is the object imported from llama_index.core, so these
        # assignments modify that shared object, not a per-instance copy.
        self.settings = Settings
        self.settings.llm = self.llm
        self.settings.embed_model = self.embed_model
        self.settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
        self.settings.num_output = 50
        self.settings.context_window = 2048
        # NOTE(review): `do_sample=False` combined with a temperature looks
        # contradictory, and it is unverified whether these kwargs reach the
        # model when passed to `as_query_engine` — confirm before relying on them.
        self.settings.generate_kwargs = {
            "do_sample": False,
            "temperature": 0.1,
            "max_new_tokens": 128
        }

        # Populated by create_index_from_websites(); None until it has run.
        self.index = None
        self.query_engine = None

    def scrape_website(self, url, timeout=30):
        """Fetch a page and return its visible text as one whitespace-normalised string.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up, so an
                unresponsive host cannot block the caller forever.

        Returns:
            The cleaned text, or ``None`` if anything goes wrong (the error is
            printed rather than raised so one bad URL does not abort indexing).
        """
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()

            # Strip each line, split on double spaces, drop empty fragments and
            # re-join everything with single spaces.
            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = ' '.join(chunk for chunk in chunks if chunk)

            return text
        except Exception as e:
            print(f"Error scraping website: {e}")
            return None

    def create_index_from_websites(self, urls):
        """Scrape ``urls``, build a vector index over the text and create the query engine.

        Pages that fail to scrape (or yield no text) are skipped.

        Args:
            urls: Iterable of page addresses to scrape.
        """
        texts = []
        for url in urls:
            text = self.scrape_website(url)
            if text:
                texts.append(text)

        # Create documents from scraped text
        documents = [Document(text = text) for text in texts]

        # Create index
        self.index = VectorStoreIndex.from_documents(
            documents,
            embed_model=self.settings.embed_model,
            node_parser=self.settings.node_parser,
            show_progress=True
        )

        # Create query engine
        self.query_engine = self.index.as_query_engine(
            llm=self.settings.llm,
            similarity_top_k=5,
            response_mode="compact",
            verbose=True,
            generate_kwargs=self.settings.generate_kwargs,
            context_window=self.settings.context_window,
            num_output=self.settings.num_output,
            show_progress=True
        )

    def get_response(self, query):
        """Run ``query`` through the query engine and return the answer as a string.

        Raises:
            RuntimeError: If no index has been built yet.
        """
        if self.query_engine is None:
            raise RuntimeError(
                "No index has been built; call create_index_from_websites() first."
            )
        response = self.query_engine.query(query)
        return str(response)

# Pages to index - replace with your target URLs
websites = ["https://en.wikipedia.org/wiki/Transformer_(deep_learning_architecture)"]

# Build the chatbot, then scrape and index the pages listed above
chatbot = RAGChatbot()
chatbot.create_index_from_websites(websites)

def add_to_history(role, query, response):
    """Append one exchange to the in-memory chat history and return the stored record.

    Args:
        role: Label stored under the "role" key.
        query: The user's query text.
        response: The generated answer.

    Returns:
        The dict that was appended to ``chat_history``.
    """
    global message_id
    message_id += 1
    message = {
        "id": message_id,
        # Timezone-aware UTC timestamp: `datetime.utcnow()` is deprecated and
        # produces a naive value whose ISO string carries no offset.
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "role": role,
        "User_query": query,
        "generated_response": response
    }
    chat_history.append(message)
    return message

@app.route('/chat', methods=['POST'])
def chat():
    """Answer a chat query posted as JSON of the form ``{"query": "..."}``.

    Returns:
        200 with ``response`` and ``message_id`` on success,
        400 if the body is not a JSON object or has no ``query``,
        500 with the error text if answering the query fails.
    """
    try:
        # silent=True returns None for a missing, non-JSON or malformed body
        # instead of raising, so client mistakes are reported as 400 rather
        # than falling through to the generic 500 handler below.
        data = request.get_json(silent=True)
        query = data.get('query') if isinstance(data, dict) else None

        if not query:
            return jsonify({"error": "No query provided"}), 400

        # Get response from chatbot
        response = chatbot.get_response(query)

        # Store system response
        system_message = add_to_history('system', query, response)

        return jsonify({
            "response": response,
            "message_id": system_message["id"]
        })

    except Exception as e:
        return jsonify({"error": str(e)}), 500

@app.route('/history', methods=['GET'])
def get_history():
    """Return all stored chat messages as JSON under the "history" key."""
    payload = {"history": chat_history}
    return jsonify(payload)

if __name__ == '__main__':
    # Single source of truth for the port shared by the tunnel and the server.
    FLASK_PORT = 5000

    # ngrok.connect() returns the tunnel it just opened; use its URL directly
    # rather than indexing ngrok.get_tunnels(), whose first entry is not
    # guaranteed to be this tunnel when others are already open.
    tunnel = ngrok.connect(FLASK_PORT)
    ngrok_url = tunnel.public_url
    print(f" * Public URL: {ngrok_url}")

    # Run Flask app, and close the tunnel once the server stops so repeated
    # runs of this cell do not accumulate open tunnels.
    try:
        app.run(port=FLASK_PORT)
    finally:
        ngrok.disconnect(ngrok_url)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/56 [00:00<?, ?it/s]

 * Public URL: https://c6f3-34-125-53-18.ngrok-free.app
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [30/Jan/2025 06:19:26] "POST /chat HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [30/Jan/2025 06:19:27] "GET /history HTTP/1.1" 200 -
