In [None]:
# Mount Google Drive under /content/drive; the dataset path used later in this
# notebook points inside that mount. This import is only available on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install libraries

In [None]:
# Third-party dependencies for the vector-database section.
# NOTE: versions are not pinned; pin them once a known-good set is established.
!pip install langchain-community
!pip install -q cassio datasets langchain openai tiktoken
## Embedding
# Fixed: the command used to repeat the word "install", which made pip fetch an
# unrelated PyPI package literally named "install" as well.
!pip install sentence_transformers
!pip install llama-cpp-python
# langchain is already installed by the -q command above, so the separate
# duplicate install line was dropped.
!pip install ctransformers

# Load JSON data

In [None]:
import json
import os
import re


In [None]:
json_path = "/content/drive/MyDrive/Dataset/news.article.json"

In [None]:
# Load the JSON file
# The encoding is stated explicitly (matching the UTF-8 write later in this
# notebook) so non-ASCII article text decodes the same way on every platform
# instead of depending on the locale's default encoding.
with open(json_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
data[0]

{'articleBody': 'Sanjay Raut, a member of the Shiv Sena (UBT) party, responded to the Maharashtra chief minister\'s statement that Eknath Shinde "himself is Hamas" and that the Shiv Sena group led by Uddhav Thackeray is capable of collaborating with "Hamas and Lashkar-e-Taiba for their own selfishness" on Wednesday by claiming that Eknath Shinde is Hamas.\n\n\n\nRaut made fun of Shinde by claiming, "He himself is Hamas. Hamas and Lashkar-e-Taiba, two terrorist groups, are completely irrelevant in Maharashtra. But the BJP is to blame for sowing the worms in their (the Shinde faction\'s) thoughts, said Raut.\n\nWhen Shinde made a statement at the Tuesday Dussehra rally in Mumbai\'s Azad Maidan, Raut reacted to it. As part of the opposition alliance INDIA, Uddhav Thackeray\'s Shiv Sena (UBT) has formed an alliance with Congress and the Samajwadi Party. Shinde remarked of this alliance: "For their own selfishness, they will tie the knot with Hamas and Lashkar-e-Taiba."\n\nRaut highlighted 

In [None]:
len(data)

37421

# Filter and preprocess data

In [None]:
# Extract articles
# `or ''` covers records where 'articleBody' is missing *or* present but
# null/empty, so every entry is a string that the regex-based cleaning step
# can process without raising a TypeError.
articles = [(item.get('articleBody') or '') for item in data]

In [None]:
len(articles)

37421

In [None]:
# Function to clean text
def clean_text(text):
    """Normalise raw article text.

    Punctuation and other non-word characters are removed, runs of whitespace
    are collapsed to a single space, and the result is lower-cased.

    Args:
        text: The raw article body.

    Returns:
        The cleaned, lower-cased string.
    """
    # Strip punctuation *before* collapsing whitespace: deleting a symbol that
    # stood between two spaces (e.g. " - ") would otherwise leave a double
    # space behind in the supposedly normalised text.
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation and special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespaces
    text = text.lower()  # Convert to lowercase
    return text



In [None]:
exm_txt = articles[0]
exm_txt

'Sanjay Raut, a member of the Shiv Sena (UBT) party, responded to the Maharashtra chief minister\'s statement that Eknath Shinde "himself is Hamas" and that the Shiv Sena group led by Uddhav Thackeray is capable of collaborating with "Hamas and Lashkar-e-Taiba for their own selfishness" on Wednesday by claiming that Eknath Shinde is Hamas.\n\n\n\nRaut made fun of Shinde by claiming, "He himself is Hamas. Hamas and Lashkar-e-Taiba, two terrorist groups, are completely irrelevant in Maharashtra. But the BJP is to blame for sowing the worms in their (the Shinde faction\'s) thoughts, said Raut.\n\nWhen Shinde made a statement at the Tuesday Dussehra rally in Mumbai\'s Azad Maidan, Raut reacted to it. As part of the opposition alliance INDIA, Uddhav Thackeray\'s Shiv Sena (UBT) has formed an alliance with Congress and the Samajwadi Party. Shinde remarked of this alliance: "For their own selfishness, they will tie the knot with Hamas and Lashkar-e-Taiba."\n\nRaut highlighted that Shinde\'s a

In [None]:
cln_exm_txt=clean_text(exm_txt)
cln_exm_txt

'sanjay raut a member of the shiv sena ubt party responded to the maharashtra chief ministers statement that eknath shinde himself is hamas and that the shiv sena group led by uddhav thackeray is capable of collaborating with hamas and lashkaretaiba for their own selfishness on wednesday by claiming that eknath shinde is hamas raut made fun of shinde by claiming he himself is hamas hamas and lashkaretaiba two terrorist groups are completely irrelevant in maharashtra but the bjp is to blame for sowing the worms in their the shinde factions thoughts said raut when shinde made a statement at the tuesday dussehra rally in mumbais azad maidan raut reacted to it as part of the opposition alliance india uddhav thackerays shiv sena ubt has formed an alliance with congress and the samajwadi party shinde remarked of this alliance for their own selfishness they will tie the knot with hamas and lashkaretaiba raut highlighted that shindes address differed from the customary dussehra rallies conduct

In [None]:
# Run every article through clean_text, keeping the original order
cleaned_articles = list(map(clean_text, articles))

In [None]:
len(cleaned_articles)

37421

In [None]:
# Keywords related to Israel-Hamas war
keywords = ['israel', 'hamas', 'gaza', 'palestine', 'idf', 'al-shifa', 'war', 'conflict']

# The articles being filtered are already lower-cased and stripped of
# punctuation, so a keyword such as 'al-shifa' can only ever match once the
# same normalisation has been applied to it ('alshifa'). Keywords that become
# empty after normalisation are dropped, since '' is a substring of everything.
_normalized_keywords = [
    normalized
    for normalized in (re.sub(r'[^\w\s]', '', keyword).lower() for keyword in keywords)
    if normalized
]

def is_relevant(article):
    """Return True if the cleaned article mentions any topic keyword.

    Args:
        article: Article text that has already been cleaned (lower-case,
            punctuation removed).

    Returns:
        True when at least one normalised keyword occurs in ``article``.

    Note:
        Matching is by plain substring, so short keywords also hit inside
        longer words (e.g. 'war' inside 'toward'); the filter is therefore
        very permissive. NOTE(review): consider word-level matching if a
        tighter filter is wanted.
    """
    return any(keyword in article for keyword in _normalized_keywords)

# Keep only the cleaned articles that is_relevant accepts
relevant_articles = list(filter(is_relevant, cleaned_articles))

In [None]:
len(relevant_articles)

36318

# Text chunking and embedding

In [None]:
def extract_text_from_articles(relevant_articles):
    """Concatenate the given articles into one string.

    Each article is followed by a single newline, so the result ends with a
    newline whenever at least one article is supplied and is empty otherwise.
    """
    return "".join(article + "\n" for article in relevant_articles)

In [None]:
# Build the working corpus from the first 1000 relevant articles.
extracted_text = extract_text_from_articles(relevant_articles[:1000])
# Show only a short preview: printing the whole corpus exceeds the notebook's
# IOPub data-rate limit, so the output is dropped instead of displayed.
print(extracted_text[:1000])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Function to split text into chunks
def custom_text_splitter(text, chunk_size: int, chunk_overlap: int):
    """Split text into word-aligned chunks with a character-based overlap.

    Words are never cut. Each chunk is at most ``chunk_size`` characters long,
    except when a single word is itself longer than ``chunk_size`` (that word
    then forms a chunk on its own). Every chunk after the first starts with
    the trailing words of the previous chunk, up to ``chunk_overlap``
    characters, as long as the overlap plus the next word still fits within
    ``chunk_size``.

    Args:
        text: The text to split; it is tokenised on whitespace.
        chunk_size: Maximum length of a chunk, in characters.
        chunk_overlap: Maximum length, in characters, of the text repeated at
            the start of the following chunk.

    Returns:
        A list of chunk strings (empty when ``text`` contains no words).
    """
    chunks = []
    current_chunk = []
    current_len = 0  # always equals len(' '.join(current_chunk))

    for word in text.split():
        candidate_len = current_len + 1 + len(word) if current_chunk else len(word)

        # An empty chunk always accepts the word, even an oversized one; this
        # avoids emitting empty chunks and the former division by zero.
        if candidate_len <= chunk_size or not current_chunk:
            current_chunk.append(word)
            current_len = candidate_len
            continue

        chunks.append(' '.join(current_chunk))

        # Collect trailing words of the finished chunk, measured in characters
        # (the previous code divided the overlap by the word count, which gave
        # an overlap of only a couple of words and could exceed chunk_size).
        overlap_words = []
        overlap_len = 0
        for previous in reversed(current_chunk):
            grown = overlap_len + 1 + len(previous) if overlap_words else len(previous)
            if grown > chunk_overlap or grown + 1 + len(word) > chunk_size:
                break
            overlap_words.append(previous)
            overlap_len = grown
        overlap_words.reverse()

        current_chunk = overlap_words + [word]
        current_len = overlap_len + 1 + len(word) if overlap_words else len(word)

    # Add the last chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Chunking parameters, followed by the split of the extracted text
chunk_size, chunk_overlap = 800, 200
texts = custom_text_splitter(
    text=extracted_text,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

In [None]:
texts[:5]

['sanjay raut a member of the shiv sena ubt party responded to the maharashtra chief ministers statement that eknath shinde himself is hamas and that the shiv sena group led by uddhav thackeray is capable of collaborating with hamas and lashkaretaiba for their own selfishness on wednesday by claiming that eknath shinde is hamas raut made fun of shinde by claiming he himself is hamas hamas and lashkaretaiba two terrorist groups are completely irrelevant in maharashtra but the bjp is to blame for sowing the worms in their the shinde factions thoughts said raut when shinde made a statement at the tuesday dussehra rally in mumbais azad maidan raut reacted to it as part of the opposition alliance india uddhav thackerays shiv sena ubt has formed an alliance with congress and the samajwadi party',
 'samajwadi party shinde remarked of this alliance for their own selfishness they will tie the knot with hamas and lashkaretaiba raut highlighted that shindes address differed from the customary dus

In [None]:
# Specify the file path for the new text file
file_path = "Data/text_file.txt"

# Create the target directory first: open() in write mode does not create
# missing parent directories and would raise FileNotFoundError on a fresh
# runtime where "Data/" does not exist yet.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Write the content to the text file
with open(file_path, "w", encoding="utf-8") as file:
    file.write(extracted_text)

In [None]:
len(extracted_text)

135660444

# With vector database

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from huggingface_hub import hf_hub_download
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
import os
from langchain.llms import CTransformers
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

In [None]:

# Callbacks support token-wise streaming
# NOTE(review): callback_manager is not passed to any model in the cells shown
# in this notebook — confirm whether it is still needed.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Verbose is required to pass to the callback manager

In [None]:
# Credentials are read from the environment. The string defaults look like
# placeholders rather than working secrets — export real values before running.
# setdefault keeps a Hugging Face token that is already exported instead of
# overwriting it with the placeholder, consistent with how the Astra DB values
# below fall back only when the variable is unset.
os.environ.setdefault("HUGGINGFACEHUB_API_TOKEN", "Hugging Face API")
ASTRA_DB_APPLICATION_TOKEN = os.environ.get('ASTRA_DB_APPLICATION_TOKEN', 'AstraDB API')
ASTRA_DB_ID = os.environ.get('ASTRA_DB_ID', 'AstraDB DatabaseID')

In [None]:
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)



In [None]:
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')



In [None]:
# Vector store on the "qa_mini_demo" table, using `embeddings` to embed texts.
# session and keyspace are left as None — presumably so the connection set up
# by cassio.init() earlier is used; TODO confirm against the LangChain docs.
astra_vector_store = Cassandra(
    embedding=embeddings,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

In [None]:
# NOTE(review): ingestion is disabled here, so this run writes nothing to the
# table; the searches further down presumably rely on chunks inserted by an
# earlier execution — confirm before running against a fresh table.
# astra_vector_store.add_texts(texts)

# print("Inserted %i headlines." % len(texts))

# Wrap the vector store so it can be queried together with an LLM via .query().
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

In [None]:
# Fetch one GGUF file from the given Hugging Face Hub repository and keep the
# value returned by hf_hub_download in model_path.
# NOTE(review): model_path is not referenced by any other cell shown in this
# notebook, and the LLM created afterwards names a different repository —
# confirm whether this download is still needed.
model_name_or_path = "TheBloke/CodeLlama-13B-Python-GGUF"
model_basename = "codellama-13b-python.Q5_K_M.gguf"
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

In [None]:
# Llama-2 chat model served through the LangChain CTransformers wrapper.
# Generation settings go in `config`, which is the documented way to hand them
# to the underlying ctransformers model; max_new_tokens and temperature are not
# fields of the wrapper itself, so passing them as top-level keyword arguments
# does not configure generation.
llm = CTransformers(
        model = "TheBloke/Llama-2-7B-Chat-GGML",
        model_type="llama",
        config={"max_new_tokens": 800, "temperature": 0.5},
    )

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
query_text ="What happened at the Al-Shifa Hospital?"

In [None]:
astra_vector_store.similarity_search_with_score(query_text, k=4)



[(Document(page_content='shifa was originally built by british authorities in 1946 in the 1980s israel renovated and expanded the hospital as part of an initiative to improve gaza living conditions a source in gaza told the tazpit press service that during periods of heightened security tensions portions of shifa hospital are routinely sealed off to the public this restricts access for gaza residents to various buildings and structures adjacent to the hospital additionally senior hamas officials are known to receive medical treatment there especially during military operations in recent days the israeli defense forces have bombed and demolished several buildings close to shifa hospital a palestinian source in gaza told the tazpit press service that those buildings are wellknown hamas sites widely believed to be'),
  0.8070742062626122),
 (Document(page_content='at alshifa hospital speaking to al jazeera in a short documentary the network published on wednesday gazas al shifa a hospital

In [None]:
# Interactive question/answer session against the Astra-backed index.
# Typing 'quit' (any letter case) ends the session; blank input re-prompts.
first_question = True
while True:
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        continue

    first_question = False

    # Answer generated by the LLM over the retrieved context
    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    # Top 4 matches from the vector store, each with its score and a short preview
    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        preview = doc.page_content[:84]
        print("    [%0.4f] \"%s ...\"" % (score, preview))


Enter your question (or type 'quit' to exit): What happened at the Al-Shifa Hospital?

QUESTION: "What happened at the Al-Shifa Hospital?"




ANSWER: "The question:"

FIRST DOCUMENTS BY RELEVANCE:




    [0.8071] "shifa was originally built by british authorities in 1946 in the 1980s israel renova ..."
    [0.7995] "at alshifa hospital speaking to al jazeera in a short documentary the network publis ..."
    [0.7839] "at the shifa hospital celia from fishponds said that she was shocked by the conditio ..."
    [0.7731] "great honour someone is making money from this the covers are sustaining his livelih ..."


# With LlamaIndex

In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install llama_index

Collecting llama_index
  Downloading llama_index-0.10.43-py3-none-any.whl (6.8 kB)
Collecting llama-index-agent-openai<0.3.0,>=0.1.4 (from llama_index)
  Downloading llama_index_agent_openai-0.2.7-py3-none-any.whl (12 kB)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama_index)
  Downloading llama_index_cli-0.1.12-py3-none-any.whl (26 kB)
Collecting llama-index-core==0.10.43 (from llama_index)
  Downloading llama_index_core-0.10.43-py3-none-any.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting llama-index-embeddings-openai<0.2.0,>=0.1.5 (from llama_index)
  Downloading llama_index_embeddings_openai-0.1.10-py3-none-any.whl (6.2 kB)
Collecting llama-index-indices-managed-llama-cloud<0.2.0,>=0.1.2 (from llama_index)
  Downloading llama_index_indices_managed_llama_cloud-0.1.6-py3-none-any.whl (6.7 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama_index)
  Downloading llama_i

In [None]:
%pip install llama-index-llms-huggingface

Collecting llama-index-llms-huggingface
  Downloading llama_index_llms_huggingface-0.2.3-py3-none-any.whl (10 kB)
Collecting text-generation<0.8.0,>=0.7.0 (from llama-index-llms-huggingface)
  Downloading text_generation-0.7.0-py3-none-any.whl (12 kB)
Installing collected packages: text-generation, llama-index-llms-huggingface
Successfully installed llama-index-llms-huggingface-0.2.3 text-generation-0.7.0


In [None]:
from llama_index.core import VectorStoreIndex,ServiceContext,PromptTemplate,SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM

In [None]:
# System instruction handed to the LLM wrapper built later in this notebook.
system_prompt="""
You are a Q&A assistant. Your goal is to answer questions as
accurately as possible based on the instructions and context provided.
"""
## Template wrapped around each user query ({query_str} is the placeholder).
## NOTE(review): the <|USER|>/<|ASSISTANT|> tags are not the [INST] ... [/INST]
## layout usually documented for Llama-2 chat models — confirm which prompt
## format the fine-tuned checkpoint used below actually expects.
query_wrapper_prompt=PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in yo

In [None]:
import accelerate
import torch

# LlamaIndex LLM wrapper around a fine-tuned Llama-2 chat checkpoint.
# This rebinds `llm`, which earlier in the notebook held the CTransformers model.
llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="syedzaidi-kiwi/Llama-2-7b-chat-finetune",
    model_name="syedzaidi-kiwi/Llama-2-7b-chat-finetune",
    device_map="auto",
    # float16 weights with 8-bit loading requested. The captured run output
    # reports that passing `load_in_8bit` directly is deprecated in favour of a
    # `BitsAndBytesConfig` object given through `quantization_config`.
    model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

In [None]:
## Embedding
# Fixed: the command used to repeat the word "install", which made pip also
# download an unrelated PyPI package literally named "install".
!pip install sentence_transformers

Collecting install
  Downloading install-1.3.5-py3-none-any.whl (3.2 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: install, sentence_transformers
Successfully installed install-1.3.5 sentence_transformers-3.0.1


In [None]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.2.4-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: langchain-community
Successfully installed langchain-community-0.2.4


In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core.indices.service_context import ServiceContext
from llama_index.legacy.embeddings.langchain import LangchainEmbedding


embed_model=LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"))


[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/legacy/_static/nltk_cache...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/legacy/_static/nltk_cache...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Bundle the LLM, the embedding model and a chunk size of 1024 into one object;
# it is passed to VectorStoreIndex.from_documents when the index is built.
# NOTE(review): the captured output shows a warning raised on this call —
# presumably a deprecation notice for ServiceContext; confirm and migrate.
service_context=ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model
)

  service_context=ServiceContext.from_defaults(


In [None]:
service_context

ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>), prompt_helper=PromptHelper(context_window=4096, num_output=256, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=LangchainEmbedding(model_name='sentence-transformers/all-mpnet-base-v2', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7cb3988ff1f0>), transformations=[SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7cb3988ff1f0>, id_func=<function default_id_func at 0x7cb4ff6a0c10>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')], llama_logger=<llama_index.core.service_context_elements.llama_logger.LlamaLogger object at 0x7cb392c99f90>, callback_manager=<llama_index.core.callbacks.ba

In [None]:
documents = SimpleDirectoryReader('/content/Data').load_data()

In [None]:
documents

In [None]:
index=VectorStoreIndex.from_documents(documents,service_context=service_context)

In [None]:
query_engine=index.as_query_engine()

In [None]:
# llama-index 0.10+ (0.10.43 is installed earlier in this notebook) provides
# these classes under the `llama_index.core` namespace; the pre-0.10 top-level
# module paths used before no longer resolve.
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor


In [None]:
# Retriever over `index` configured with similarity_top_k=4, plus a
# SimilarityPostprocessor with a 0.80 cutoff (presumably discarding nodes that
# score lower — confirm against the LlamaIndex docs).
retriever=VectorIndexRetriever(index=index,similarity_top_k=4)
postprocessor=SimilarityPostprocessor(similarity_cutoff=0.80)

# Rebinds `query_engine` (previously index.as_query_engine()) to an engine that
# uses the retriever and post-processor configured above.
query_engine=RetrieverQueryEngine(retriever=retriever,
                                  node_postprocessors=[postprocessor])

In [None]:
# Interactive question/answer session against the LlamaIndex query engine.
# Typing 'quit' (any letter case) ends the session; blank input re-prompts.
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()

    if query_text.lower() == "quit":
        break

    if query_text == "":
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    response=query_engine.query(query_text)
    print("ANSWER: \"%s\"\n" % response)

    # List the nodes this response was actually built from. The loop used to
    # query the Astra vector store from the other section, which showed
    # documents unrelated to how the LlamaIndex answer was produced.
    print("FIRST DOCUMENTS BY RELEVANCE:")
    for source in response.source_nodes:
        # A node's score may be unset; print NaN rather than failing on %f.
        score = source.score if source.score is not None else float("nan")
        print("    [%0.4f] \"%s ...\"" % (score, source.node.get_content()[:84]))