**IMPORTING NECESSARY MODULES**

In [1]:
!pip install -i https://pypi.org/simple/ bitsandbytes

Looking in indexes: https://pypi.org/simple/


In [2]:
%pip install llama-index-llms-huggingface



In [3]:
!pip install llama-index



In [4]:
!pip install accelerate



In [38]:
!pip install llama-index



In [46]:
!pip install llama-index-embeddings-langchain

Collecting llama-index-embeddings-langchain
  Downloading llama_index_embeddings_langchain-0.1.2-py3-none-any.whl (2.5 kB)
Installing collected packages: llama-index-embeddings-langchain
Successfully installed llama-index-embeddings-langchain-0.1.2


In [5]:
!pip install langchain==0.1.2 sentence_transformers==2.2.2




In [6]:
!pip install openai tiktoken chromadb pypdf InstructorEmbedding faiss-cpu compare fitz PyMuPDF



In [8]:
!pip install frontend

Collecting frontend
  Downloading frontend-0.0.3-py3-none-any.whl (32 kB)
Collecting aiofiles (from frontend)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles, frontend
Successfully installed aiofiles-23.2.1 frontend-0.0.3


**CONVERTING GIVEN PDF INTO CLEANED PDF USING FITZ**

In [10]:
import fitz  # PyMuPDF
import re
import os

def remove_tables_and_citations_from_pdf(input_pdf_path, output_pdf_path):
    # Open the input PDF file
    pdf_document = fitz.open(input_pdf_path)

    # Create a PDF writer object
    pdf_writer = fitz.open()

    # Iterate through each page of the PDF
    for page_number in range(len(pdf_document)):
        page = pdf_document.load_page(page_number)

        # Get the text content of the page
        text = page.get_text()

        # Remove tables
        cleaned_text = remove_tables(text)

        # Remove in-text citations
        cleaned_text = remove_in_text_citations(cleaned_text)

        # Create a new page with cleaned text
        new_page = pdf_writer.new_page(width=page.rect.width, height=page.rect.height)
        new_page.insert_text((0, 0), cleaned_text)

    # Save the modified PDF to the output file
    pdf_writer.save(output_pdf_path)
    pdf_writer.close()
    pdf_document.close()

def remove_tables(text):
    # Regular expression to remove tables
    table_pattern = re.compile(r'\+\-+\+')
    cleaned_text = table_pattern.sub('', text)
    return cleaned_text

def remove_in_text_citations(text):
    # Regular expression to remove in-text citations ([1], [2], etc.)
    citation_pattern = re.compile(r'\[\d+\]')
    cleaned_text = citation_pattern.sub('', text)
    return cleaned_text


def batch_convert_and_transfer(input_folder, output_folder):
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Iterate through each PDF file in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".pdf"):
            input_pdf_path = os.path.join(input_folder, filename)
            output_pdf_path = os.path.join(output_folder, filename)
            # Convert and transfer the PDF file
            remove_tables_and_citations_from_pdf(input_pdf_path, output_pdf_path)

# Input and output folder paths
input_folder = "pdf_folder"
output_folder = "clean"

# Convert and transfer PDF files from input folder to output folder
batch_convert_and_transfer(input_folder, output_folder)



**DATA EMBEDDING AND STORE IT IN FAISS VECTOR DATABASE**  

In [11]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader

In [12]:
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


In [13]:
root_dir = "/content/sample_data/clean"

In [14]:
# loader = TextLoader('single_text_file.txt')
loader = DirectoryLoader(f'{root_dir}', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [15]:
text_splitter = RecursiveCharacterTextSplitter(
                                               chunk_size=1000,
                                               chunk_overlap=200)

texts = text_splitter.split_documents(documents)

In [16]:
import pickle
import faiss
from langchain.vectorstores import FAISS

In [17]:
def store_embeddings(docs, embeddings, sotre_name, path):

    vectorStore = FAISS.from_documents(docs, embeddings)

    with open(f"{path}/faiss_{sotre_name}.pkl", "wb") as f:
        pickle.dump(vectorStore, f)

In [18]:
def load_embeddings(sotre_name, path):
    with open(f"{path}/faiss_{sotre_name}.pkl", "rb") as f:
        VectorStore = pickle.load(f)
    return VectorStore

In [19]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl",
                                                      model_kwargs={"device": "cuda"})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [20]:
Embedding_store_path = f"{root_dir}/Embedding_store"

In [21]:
db_instructEmbedd = FAISS.from_documents(texts, instructor_embeddings)

**RETRIEVER (5 RELATED CONTENTS)**

In [22]:
retriever = db_instructEmbedd.as_retriever(search_kwargs={"k": 5})

In [23]:
retriever.search_type

'similarity'

In [24]:
retriever.search_kwargs

{'k': 5}

In [25]:
docs = retriever.get_relevant_documents("What are the variety of Multimodal and Multi-modular AI Approaches to Streamline Autism Diagnosis in Young Children?")

In [26]:
docs[0]

Document(page_content='1\nScientific Reports |         (2020) 10:5014  | https://doi.org/10.1038/s41598-020-61213-w\nwww.nature.com/scientificreports\nMulti-modular AI Approach to \nStreamline Autism Diagnosis in \nYoung Children\nHalim\xa0Abbas· ·\n1, Ford\xa0Garberson· ·\n1, Stuart\xa0Liu-Mayo· ·\n1, Eric\xa0Glover1* & Dennis\xa0P.\xa0Wall· ·\n2\nAutism has become a pressing healthcare challenge. The instruments used to aid diagnosis are time \nand labor expensive and require trained clinicians to administer, leading to long wait times for at-risk \nchildren. We present a multi-modular, machine\xa0learning-based assessment of autism comprising three \ncomplementary modules for a unified outcome of diagnostic-grade reliability: A 4-minute, parent-\nreport questionnaire delivered via a mobile app, a list of key behaviors identified from 2-minute, semi-\nstructured home videos of children, and a 2-minute questionnaire presented to the clinician at the', metadata={'source': '/content/sam

**CREATING HUGGINGFACE QA LLM**

In [27]:
from huggingface_hub import login
login(token='')#hugging face token

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [28]:
from llama_index.llms.huggingface import HuggingFaceLLM

In [35]:
import torch
from transformers import BitsAndBytesConfig

# Create a BitsAndBytesConfig object
quantization_config = BitsAndBytesConfig(
    load_in_8bit_fp32_cpu_offload=False  # Ensure to set this according to your requirements
    # Add other configurations as needed
)

# Initialize HuggingFaceLLM with quantization_config
my_custom_llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    model_name="meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",
    # Uncomment this if using CUDA to reduce memory usage
    model_kwargs={"torch_dtype": torch.float16},  # Remove "load_in_8bit" argument
    # Pass the quantization_config here
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



**QUESTION ANSWERING**

In [44]:
from llama_index.core import ServiceContext

In [47]:
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    llm=my_custom_llm,
    embed_model=instructor_embeddings
)

  service_context = ServiceContext.from_defaults(


In [48]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [58]:
query = 'What are the variety of Multimodal and Multi-modular AI Approaches to Streamline Autism Diagnosis in Young Children?'
llm_response = service_context(query)
process_llm_response(llm_response)

Multimodal AI methods combine behavioral observations, neuroimaging, and genetic data to improve the accuracy and efficiency of autism diagnosis in young children.
