<a href="https://colab.research.google.com/github/ranjani277/LLM-model-/blob/main/News_Research_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ctransformers
!pip install chromadb
!pip install langchain
!pip install sentence_transformers
!pip install bitsandbytes
!pip install langchain_community
!pip install unstructured



In [2]:
# Data Handling
import pandas as pd
import numpy as np

# Torch and Transformers
import torch
from torch import bfloat16
import transformers
from transformers import AutoTokenizer

# LangChain
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.document_loaders import UnstructuredURLLoader
from langchain.llms import CTransformers
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [3]:
# Ignore every warning issued through the `warnings` module from this point on.
# This also hides deprecation notices, so remove the filter when debugging.
import warnings
warnings.filterwarnings("ignore")

In [11]:
# Collect article URLs from the user, one per line; a blank line ends the list.
user_input = []
print("Enter your URLS: ")
while True:
  # Strip surrounding whitespace so that a line holding only spaces ends the
  # input instead of being stored as a bogus URL, and so that pasted URLs with
  # stray leading/trailing blanks are stored clean.
  url = input().strip()
  if not url:
    break
  user_input.append(url)

# Echo the collected list back so the user can verify what will be fetched.
print("Given URLS: ")
print(user_input)

Enter your URLS: 
https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html

Given URLS: 
['https://www.moneycontrol.com/news/business/tata-motors-launches-punch-icng-price-starts-at-rs-7-1-lakh-11098751.html']


In [12]:


# Build a loader over the URLs collected in `user_input` and load them into
# `data`, the list of documents that the splitter cell consumes.
loaders = UnstructuredURLLoader(urls=user_input)
data = loaders.load()
# Cell output: how many documents were loaded.
len(data)


1

In [13]:

# Cut the loaded documents into overlapping chunks (chunk_size=1000,
# chunk_overlap=20) so that each piece can be embedded and retrieved on its own.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
splitted_texts = splitter.split_documents(data)

In [14]:
# Embedding model handed to the Chroma vector store to vectorise the chunks.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [15]:
# Sanity check: print how many chunks the splitter produced. A count of 0
# would mean there is nothing to index in the vector store.
print(len(splitted_texts))


4


In [16]:
# Embed every chunk with `embedding_model` and index the vectors in a Chroma
# store configured with the local 'chroma_db' persist directory.
# NOTE(review): with a fixed persist directory, re-running this cell for other
# URLs presumably adds to the existing store rather than replacing it — confirm.
chroma_database = Chroma.from_documents(
    splitted_texts,
    embedding_model,
    persist_directory='chroma_db',
)


In [17]:

# Expose the vector store as a retriever (no search options are overridden).
retriever = chroma_database.as_retriever()

In [18]:

# 4-bit NF4 quantisation settings with double quantisation and a bfloat16
# compute dtype.
# NOTE(review): nothing else visible in this notebook reads
# `quantization_config`; the CTransformers model cell does not receive it and
# loads a GGUF file that appears to be pre-quantised (Q4_K_M). This looks like
# a leftover — confirm before removing.
quantization_config = transformers.BitsAndBytesConfig(load_in_4bit = True,
                                        bnb_4bit_quant_type = 'nf4',
                                        bnb_4bit_use_double_quant = True,
                                        bnb_4bit_compute_dtype = bfloat16)

In [19]:
# Load the Mistral-7B-Instruct GGUF model file through the CTransformers
# wrapper; generation options are passed via `config`.
llm = CTransformers(
    model="TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    config={
        "max_new_tokens": 2048,
        "context_length": 4096,
        "temperature": 0,
    },
)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
# Wire the LLM and the retriever into a question-answering chain that uses
# the 'stuff' chain type, with verbose logging switched off.
QnA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    verbose=False,
)

In [21]:
def get_answers(QnA, query):
    """Run a query through a question-answering chain and print the result.

    Args:
        QnA: Object exposing a ``run(query)`` method that returns the answer.
        query: The question text passed to ``QnA.run``.

    Returns:
        None. The query and the answer are written to standard output, each
        prefixed with a bold (ANSI-escaped) label.
    """
    # The chain is invoked before anything is printed, so a failure in the
    # chain leaves no partial output behind.
    response = QnA.run(query)
    print(f"\033[1mQuery:\033[0m {query}\n")
    print("\033[1mAnswer:\033[0m ", response)

In [22]:
# Prompt for a question, then run it through the QnA chain and print the answer.
query = input("Enter your query: ")
get_answers(QnA, query)

Enter your query: what is the price of Tiago iCNG?
[1mQuery:[0m what is the price of Tiago iCNG?

[1mAnswer:[0m   The price of Tata Motors' Punch iCNG starts at Rs 7.1 lakh.
