In [43]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.cassandra import Cassandra
from langchain_community.document_loaders import DirectoryLoader
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_community.llms import Ollama
import cassio
import os
from PyPDF2 import PdfReader
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [5]:
def remove_figures_tables_citations(text):
    figure_regex = r"(Figure|Table) \d+(?:\.\w+)?(?:\s*[:\-–]|\\s*\(.*?\))*?"
    citation_regex = r"\([^)]+\)"
    processed_text = re.sub(figure_regex, "", text, flags=re.IGNORECASE)
    processed_text = re.sub(citation_regex, "", processed_text)
    return processed_text

In [6]:
def preprocess_text(text):
    text = text.lower()
    text = remove_figures_tables_citations(text)
    text = re.sub(r'[^\\w\\s]', '', text)
    words = text.split()
    stop_words = stopwords.words('english')
    words = [word for word in words if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

In [7]:
def process_pdfs(pdf_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            with open(os.path.join(pdf_folder, filename), 'rb') as pdf_file:
                pdf_reader = PdfReader(pdf_file)
                raw_text = ''
                for page in pdf_reader.pages:
                    content = page.extract_text()
                    if content:
                        raw_text += content
                preprocessed_words = preprocess_text(raw_text)
                output_filename = os.path.splitext(filename)[0] + ".txt"
                output_path = os.path.join(output_folder, output_filename)
                with open(output_path, 'w') as f:
                    f.write(' '.join(preprocessed_words))
                print(f"Processed: {filename}")

In [8]:
pdf_folder = "Data"
output_folder = "Dummy"

process_pdfs(pdf_folder, output_folder)

print("Preprocessing complete!")

Processed: 15_Nazneen.pdf


Multiple definitions in dictionary at byte 0x1cc6b for key /MediaBox
Multiple definitions in dictionary at byte 0x1ce61 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d014 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d1ae for key /MediaBox
Multiple definitions in dictionary at byte 0x1d33d for key /MediaBox
Multiple definitions in dictionary at byte 0x1d4af for key /MediaBox
Multiple definitions in dictionary at byte 0x1d699 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d85b for key /MediaBox
Multiple definitions in dictionary at byte 0x1db05 for key /MediaBox


Processed: 22_Ouss_ASD.pdf
Processed: Qiu.pdf
Processed: Tariq2018.pdf
Processed: Tariq_2019.pdf
Processed: carpenter2020 (1).pdf
Processed: Patten_Audio.pdf
Processed: Abbas_2020.pdf
Processed: Asd_Cry_patterns.pdf
Processed: Abbas_2018.pdf
Processed: Dawson.pdf
Processed: 1_Ramırez-Duque_.pdf
Processed: zhao2020.pdf
Processed: Young_Behavior.pdf
Processed: LEE.pdf
Preprocessing complete!


In [9]:
def read_doc(directory):
    file_loader = DirectoryLoader('pre-processed', glob="*.txt", show_progress=True, use_multithreading=True)
    docs = file_loader.load()
    return docs

In [10]:
doc = read_doc('Dummy/')

  7%|▋         | 1/15 [00:01<00:25,  1.79s/it]Need to load profiles.
Need to load profiles.
Need to load profiles.
Need to load profiles.
Need to load profiles.
No features in text.
100%|██████████| 15/15 [00:07<00:00,  1.98it/s]


In [11]:
doc

[Document(page_content='', metadata={'source': 'pre-processed/15_Nazneen.txt'}),
 Document(page_content='research application machine learning approach early detection autism combining questionnaire home video screening halim abbas1ford garberson1eric glover2and dennis p wall134 1cognoa inc palo alto ca usa wwwlinkedincominhalimabbas2eric_gericglovercom3department pediatrics stanford university stanford ca usa4department biomedical data science stanford university stanford ca usa correspondence cognoa inc palo alto ca usa halimcognoacom received 19 september 2017 revised 16 march 2018 editorial decision 25 march 2018 accepted 2 april 2018 abstract background existing screening tool early detection autism expensive cumbersome time intensive sometimes fall short predictive value work sought apply machine learning goldstandard clinical data obtained across thousand child atrisk autism spectrum disorder create alowcost quick easy apply autism screening tool method two algorithm trained ide

In [12]:
len(doc)

15

In [13]:
def chunk_data(docs, chunk_size=900, chunk_overlap = 50):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = chunk_size,chunk_overlap = chunk_overlap)
    doc = text_splitter.split_documents(docs)
    return doc

In [14]:
documents = chunk_data(docs=doc)
documents

[Document(page_content='research application machine learning approach early detection autism combining questionnaire home video screening halim abbas1ford garberson1eric glover2and dennis p wall134 1cognoa inc palo alto ca usa wwwlinkedincominhalimabbas2eric_gericglovercom3department pediatrics stanford university stanford ca usa4department biomedical data science stanford university stanford ca usa correspondence cognoa inc palo alto ca usa halimcognoacom received 19 september 2017 revised 16 march 2018 editorial decision 25 march 2018 accepted 2 april 2018 abstract background existing screening tool early detection autism expensive cumbersome time intensive sometimes fall short predictive value work sought apply machine learning goldstandard clinical data obtained across thousand child atrisk autism spectrum disorder create alowcost quick easy apply autism screening tool method two algorithm trained identify', metadata={'source': 'pre-processed/Abbas_2018.txt'}),
 Document(page_cont

In [19]:
## Embeddings
embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-large", encode_kwargs = {'normalize_embeddings': True})
embeddings

  from .autonotebook import tqdm as notebook_tqdm


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='thenlper/gte-large', cache_folder=None, model_kwargs={}, encode_kwargs={'normalize_embeddings': True}, multi_process=False, show_progress=False)

In [17]:
ASTRA_DB_APPLICATION_TOKEN = ""
ASTRA_DB_ID = ""

In [18]:
cassio.init(token = ASTRA_DB_APPLICATION_TOKEN,database_id = ASTRA_DB_ID)

In [20]:
astra_vector_store = Cassandra(
    embedding=embeddings,
    table_name="Dummy",
    session=None,
    keyspace=None
)

In [21]:
astra_vector_store.add_documents(documents)

['841f5f066f104cdd993791f1b20bc1c4',
 'f68099820a054b7f9e7d8677f1974967',
 'a99cb514c1594c41a2ef95caed46ec00',
 '2fda97e3ff774df7aa4e7150cd0a4e47',
 'd1ee2827f84f4c8f9c7d2c0e16881b2a',
 'd143ea25580e49b1b3a7e93bb51193ca',
 'da498158ffa54cd297f558942ad73380',
 'd317021a14204d7db51cfae247f22bfc',
 '25168e9f2a3e4e94b04cc6042ed9358e',
 '86be750413c94d4c8247385c38752e43',
 'a2c986bfd61248c3b666e017b2107022',
 '938cc43c22454fd59038402642649123',
 'd24111c046bd403b8f26c9d6d1b91784',
 '23c80f89cf384ee38792f0b249eb0014',
 '9f139c293100457d8e02a24d79ff3b37',
 '6fd8a4faecaa4887bb1670c510370ca0',
 '6f2a0d919d5c44ae976a29c6706fe4e4',
 'f66411851d0442b9a2420ab4a050c536',
 'e133d48754354a88aafed34108bfab95',
 'ae305a78be4d4416b6dba22caf835f1a',
 '3a2ac980c53f401b934f14ad92ed1028',
 '8a9446b95bf54e0eac79519d796ae85c',
 '152df9e1335547a2a113f2ddd61e0c59',
 '07b423ce853b4eaea8c2cfc1e296323f',
 'c9535dd7020c4fa49138851dff58a5bf',
 'bbaf63753d1648d1abe923ba7eafff25',
 '6706394ac4ba409c8767ac43df347fb1',
 

In [22]:
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

In [25]:
llm = Ollama(model="llama2")

In [37]:
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context. 
Think step by step before providing a detailed answer. 
I will tip you $1000 if the user finds the answer helpful. 
<context>
{context}
</context>
Question: {input}""")

In [39]:
from langchain.chains.combine_documents import create_stuff_documents_chain

document_chain=create_stuff_documents_chain(llm,prompt)

In [38]:
retriever = astra_vector_store.as_retriever()
retriever

VectorStoreRetriever(tags=['Cassandra', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.cassandra.Cassandra object at 0x7c3196353a50>)

In [40]:
from langchain.chains import create_retrieval_chain
retrieval_chain=create_retrieval_chain(retriever,document_chain)

In [None]:
questions = [
    "What are the variety of Multimodal and Multi-modular AI Approaches to Streamline Autism Diagnosis in Young Children",
    "What is Autism Spectrum Disorder, how it is caysed?",
    "What is the cure of Autism Spectrum Disorder",
    "What is West Syndrome?",
    "What are Stereotypical and maladaptive behaviors in Autism Spectrum, how are these detected and managed",
    "How relevant is eye contact and how it can be used to detect Autism",
    "How can cross country trials help in development of Machine learning based Multimodal solutions ",
    "How early infants cry can help in the early detection of Autism ",
    "What are various methods to detect  Atypical Pattern of Facial expression in Children ",
    "What kind of facial expressions can be used to detect Autism Disorder in children",
    "What are methods to detect Autism from home videos",
    "What is Still-Face Paradigm in Early Screening for High-Risk Autism Spectrum Disorder",
    "What is the utility of Behavior and interaction imaging at 9 months of age predict autism/intellectual disability in high-risk infants with West syndrome"
    ]

In [41]:
response=retrieval_chain.invoke({"input":"What is the cure of Autism Spectrum Disorder"})

In [42]:
response['answer']

'Based on the provided context, there is no definitive cure for Autism Spectrum Disorder (ASD). The latest research suggests that early diagnosis and intervention can significantly improve the outlook for children with ASD. However, the complexity of the diagnostic procedure and the shortage of trained specialists result in many children not receiving a timely diagnosis and appropriate treatment.\n\nThe context mentions several methods for screening and diagnosing ASD, including developmental screening, followed by a comprehensive diagnostic evaluation. These screens typically use questionnaires answered by parents, teachers, or clinicians to identify potential red flags for ASD. While these screens are generally easy and inexpensive to administer, they are not always accurate enough to help diagnose ASD early enough to receive effective behavioral therapy.\n\nTherefore, while there is no single "cure" for ASD, early detection and intervention can significantly improve the outlook for 