In [31]:
import PyPDF2
from PyPDF2 import PdfReader
import os
from glob import glob
import random as rand
import langdetect
import openai
from dotenv import load_dotenv
load_dotenv()
openai.api_key=os.environ.get('OPENAI_API_KEY')
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

# Data Preprocessing

In [2]:
pdfs = glob('Publications/*.pdf')

### Checking wether the PDF is readable or not

In [3]:
unreadable_pdfs = []
readable_pdfs = []
for pdf in pdfs:
  reader = PdfReader(pdf)
  page = reader.pages[0]
  text = page.extract_text()
  if text == '':
    unreadable_pdfs.append(pdf)
  else:
    readable_pdfs.append(pdf)
    
for pdf in readable_pdfs:
  _pdf = PdfReader(pdf)
  page = _pdf.pages[0]
  text = page.extract_text()
  if langdetect.detect(text)!='en':
    unreadable_pdfs.append(pdf)
    readable_pdfs.remove(pdf)

print(unreadable_pdfs)
print(readable_pdfs)

Multiple definitions in dictionary at byte 0x1cc6b for key /MediaBox
Multiple definitions in dictionary at byte 0x1ce61 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d014 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d1ae for key /MediaBox
Multiple definitions in dictionary at byte 0x1d33d for key /MediaBox
Multiple definitions in dictionary at byte 0x1d4af for key /MediaBox
Multiple definitions in dictionary at byte 0x1d699 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d85b for key /MediaBox
Multiple definitions in dictionary at byte 0x1db05 for key /MediaBox
Multiple definitions in dictionary at byte 0x1cc6b for key /MediaBox
Multiple definitions in dictionary at byte 0x1ce61 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d014 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d1ae for key /MediaBox
Multiple definitions in dictionary at byte 0x1d33d for key /MediaBox
Multiple definitions in dictionary

['Publications/15_Nazneen.pdf', 'Publications/Tariq_2019.pdf', 'Publications/Asd_Cry_patterns.pdf', 'Publications/Tariq2018.pdf']
['Publications/Young_Behavior.pdf', 'Publications/1_Ramırez-Duque_.pdf', 'Publications/Patten_Audio.pdf', 'Publications/zhao2020.pdf', 'Publications/carpenter2020 (1).pdf', 'Publications/LEE.pdf', 'Publications/Abbas_2020.pdf', 'Publications/Dawson.pdf', 'Publications/Qiu.pdf', 'Publications/22_Ouss_ASD.pdf', 'Publications/Abbas_2018.pdf']


In [4]:
# import torch
# import easyocr
# import cv2
# import numpy as np
# import fitz

# class OCR:
#     def __init__(self, lang:str, gpu:bool=True):
#         super().__init__()
#         self.device = 'cpu' if not gpu else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#         if self.device == 'cpu' and gpu:
#             print("GPU not available, using CPU instead")
#         self.lang = lang
#         self.reader = easyocr.Reader([lang], gpu=True if self.device=='cuda' else False)

#     def get_bbox(self, img)->np.array:
#         """Get bounding box of text in image format (x_min, x_max, y_min, y_max)"""
#         bbox = np.array(self.reader.detect(img)[0][0])
#         return bbox

#     def read_img(self, img_path)->np.array:
#         """Read image and return image"""
#         img = cv2.imread(img_path)
#         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
#         return img

#     def get_text(self, img, detail=0)->str:
#         """return text from image"""
#         text = self.reader.readtext(img, detail=detail)
#         return text

# class pdf2images:
#     def __init__(self):
#         super().__init__()

#     def get_images_from_pdf(self, idx, pdf_path, output_folder:str=None, save_images_locally:bool=False):
#         if save_images_locally:
#             if output_folder is None:
#                 raise ValueError("output_folder must be specified if save_images_locally is True")
#             if not os.path.exists(output_folder):
#                 os.makedirs(output_folder)
#         doc = fitz.open(pdf_path)
#         count = idx
#         j = 0
#         for i in doc:
#             images = i.get_pixmap()
#             images.save(str(count)+'_'+str(j)+".png")
#             j+=1
#             count+=1
#         return count

#     def pdfs_to_images(self, pdf_paths, output_folder, save_images_locally:bool=False):
#         idx=0
#         for pdf_path in pdf_paths:
#             idx = self.get_images_from_pdf(idx = idx, pdf_path = pdf_path, output_folder = output_folder, save_images_locally = save_images_locally)
# pdf_image_gen = pdf2images()
# ocr = OCR('en')
# OUTPUT_PATH = '/content/drive/MyDrive/Publications_images'
# pdf_image_gen.pdfs_to_images(unreadable_pdfs, OUTPUT_PATH, save_images_locally=True)
# images_path = glob('/content/drive/MyDrive/Publications_images/*.png')
# for image in images_path:
#   img = ocr.read_img(image)
#   text = ocr.get_text(img)
#   print(text)

In [32]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def generate_text(prompt:str, sys_prompt:str, temperature:float=0.01):
    messages=[
        {
            "role": "system",
            "content": sys_prompt
        },
        {
            'role':'user',
            'content':prompt
        }
    ]

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-1106",
        messages=messages,
        temperature=temperature,
        max_tokens=1000,
    )

    return response.choices[0].message['content']

In [27]:
system_prompt = """The input is New Text from research paper, and the Previous Text has been cleaned already. From the New Text remove the table data, the refrences, and the figure, while keeping the core of the text same. If possible summarise the table data, and what the figure was for."""
print(system_prompt)

The input is New Text from research paper, and the Previous Text has been cleaned already. From the New Text remove the table data, the refrences, and the figure, while keeping the core of the text same. If possible summarise the table data, and what the figure was for.


In [33]:
for pdf_path in readable_pdfs:
  pdf = PdfReader(pdf_path)
  cleaned_text = ''
  for page in pdf.pages:
    text = page.extract_text()
    text = text.lower()
    prompt = f"""Previous Text: {cleaned_text}
    New Text{text}"""
    cleaned_text = cleaned_text+generate_text(prompt, system_prompt, temperature=0.2)
      
  with open(os.path.join('Texts' ,f"f{pdf_path.split('/')[-1].split('.')[0]}.txt"), 'w') as f:
    f.write(cleaned_text)

Multiple definitions in dictionary at byte 0x1cc6b for key /MediaBox
Multiple definitions in dictionary at byte 0x1ce61 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d014 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d1ae for key /MediaBox
Multiple definitions in dictionary at byte 0x1d33d for key /MediaBox
Multiple definitions in dictionary at byte 0x1d4af for key /MediaBox
Multiple definitions in dictionary at byte 0x1d699 for key /MediaBox
Multiple definitions in dictionary at byte 0x1d85b for key /MediaBox
Multiple definitions in dictionary at byte 0x1db05 for key /MediaBox


# RAG

In [23]:
import pymongo
import openai
import langchain
import langchain_openai
import os
from dotenv import load_dotenv
from pymongo import MongoClient
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.document_loaders import DirectoryLoader
import pandas as pd
from langchain.chains import question_answering
from langchain.docstore.document import Document
import numpy as np
MONGO_CONNECTION_STRING = os.environ.get('MONGODB_ATLAS_CONNECTION_STRING')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [2]:
client = MongoClient(MONGO_CONNECTION_STRING)
dbName = 'Publications'
collectionName = 'Embedding_of_publications'
collection = client[dbName][collectionName]

In [32]:
loader = DirectoryLoader('./Texts', glob='./*.txt')
data = loader.load()
content = {}
for doc in data:
    content.update({doc.metadata['source'].split('/')[-1].split('.')[0] : doc.page_content.split('.\n\n')})

In [11]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [50]:
for file in content.keys():
    print(file, end=' ')
    data = []
    for text in content[file]:
        embed = embeddings.embed_documents([text])
        data.append({"embeddings":embed, "text":text})
    collection.insert_many(data)
    print('Done')

f1_Ramırez-Duque_ Done
fYoung_Behavior Done
fPatten_Audio Done
fAbbas_2018 Done
fQiu Done
fzhao2020 Done
fcarpenter2020 (1) Done
fDawson Done
f22_Ouss_ASD Done
fLEE Done
fAbbas_2020 Done


In [77]:
for doc in collection.find():
    arr = doc['embeddings']
    flat_arr = np.array(arr).flatten()
    query = {'_id':doc['_id']}
    update = {"$set":{'embeddings':flat_arr.tolist()}}
    collection.update_one(query, update)

In [None]:
# import pymongo
# import datetime

# # connect to your Atlas cluster
# client = pymongo.MongoClient("<connection-string>")

# # define pipeline
# pipeline = [
#   {
#     '$vectorSearch': {
#       'index': 'vector-search-tutorial', 
#       'path': 'plot_embedding', 
#       'filter': {
#         '$or': [
#           {
#             'genres': {
#               '$ne': 'Crime'
#             }
#           }, {
#             '$and': [
#               {
#                 'year': {
#                   '$lte': 2015
#                 }
#               }, {
#                 'genres': {
#                   '$eq': 'Action'
#                 }
#               }
#             ]
#           }
#         ]
#       }, 
#       'queryVector': [],
#       'numCandidates': 200, 
#       'limit': 10
#     }
#   }, {
#     '$project': {
#       '_id': 0, 
#       'title': 1, 
#       'genres': 1, 
#       'plot': 1, 
#       'year': 1, 
#       'score': {
#         '$meta': 'vectorSearchScore'
#       }
#     }
#   }
# ]

# # run pipeline
# result = client["sample_mflix"]["embedded_movies"].aggregate(pipeline)

# # print results
# for i in result:
#     print(i)
 

In [5]:
import pandas as pd
queries = pd.read_csv('Query.csv')

In [3]:
def make_pipline(query):
    queryVector = embeddings.embed_documents([query])
    queryVector = np.array(queryVector).flatten()
    pipeline = [{
        "$vectorSearch": {
            "index": "vector_index",
            "path": "embeddings",
            "queryVector": queryVector.tolist(),
            "numCandidates": 200,
            "limit": 10
        }
    }]
    return pipeline

def similarity_search(query):
    pipeline = make_pipline(query)
    result = collection.aggregate(pipeline)
    return result

In [21]:
def document_retrival(query):
    sample_query = queries.sample(n=1)['Questions '].to_list()[0]
    retrived_vectors = similarity_search(sample_query)
    list_vectors = [Document(i['text']) for i in retrived_vectors]
    return list_vectors

In [27]:
class PublicationsRAG:
    def __init__(self, model:str, api_key:str, ) -> None:
        self.llm = ChatOpenAI(api_key=api_key, model= model)
        self.chain = question_answering.load_qa_chain(llm = self.llm, chain_type="map_reduce")
    
    def __make_pipline(self, query)->list[dict]:
        queryVector = embeddings.embed_documents([query])
        queryVector = np.array(queryVector).flatten()
        pipeline = [{
            "$vectorSearch": {
                "index": "vector_index",
                "path": "embeddings",
                "queryVector": queryVector.tolist(),
                "numCandidates": 200,
                "limit": 10
            }
        }]
        return pipeline
    
    def __similarity_search(self, query)->list[Document]:
        pipeline = self.__make_pipline(query)
        result = collection.aggregate(pipeline)
        list_documents = [Document(i['text']) for i in result]
        return list_documents
    
    def query_run(self, query):
        documents = self.__similarity_search(query)
        return self.chain.run(input_documents = documents, question = query)

In [28]:
rag = PublicationsRAG('gpt-3.5-turbo', OPENAI_API_KEY)

In [30]:
rag.query_run(queries.sample(n=1)['Questions '].to_list()[0])

"Various methods to detect atypical patterns of facial expressions in children include:\n\n1. Behavioral assessments using structured observations and coding systems.\n2. Facial expression recognition software utilizing computer-based algorithms.\n3. Eye-tracking technology to monitor eye movements and gaze patterns.\n4. Electromyography (EMG) to measure electrical activity in facial muscles.\n5. Physiological measures like heart rate variability or skin conductance.\n6. Parent/caregiver reports on the child's facial expressions.\n7. Multimodal assessments combining different methods for enhanced accuracy."

In [31]:
sample_queries = queries['Questions '].to_list()
answer = []
for query in sample_queries:
    ans = rag.query_run(query)
    answer.append(ans)
    print(ans)

queries['answer'] = answer
queries.to_csv('Answers.csv')

The variety of multimodal and multi-modular AI approaches to streamline autism diagnosis in young children include a 4-minute parent-report questionnaire delivered via a mobile app, a list of key behaviors identified from 2-minute, semi-structured home videos of children, and a 2-minute questionnaire presented to the clinician at the time of clinical assessment. Additionally, there is a new module intended for completion in a primary care setting, based on a questionnaire answered by a clinician after examining the child and talking to the parent. These modules are described as fast and easy to administer, providing a streamlined approach to autism diagnosis in young children.
Autism Spectrum Disorder (ASD) is a complex neurodevelopmental condition that affects social skills, communication, behavior, and interests. The exact cause of ASD is not fully understood, but research suggests that a combination of genetic and environmental factors may contribute to its development. Genetic fact