In [15]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Embedding model shared by every index built in this notebook.
# The encode options request normalized embeddings from the encoder.
bge_encode_options = dict(normalize_embeddings=True)
model_norm = HuggingFaceBgeEmbeddings(
    encode_kwargs=bge_encode_options,
    model_name="BAAI/bge-large-en",
)

In [6]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
import os

# Directory containing the pilot transcript PDFs. The path is relative, so it
# is resolved against whatever working directory the kernel was started in.
# NOTE(review): 'Transripts' looks like a typo, but presumably matches the real
# folder name since the listing below succeeds - confirm before "fixing" it.
base_path = '../../Library/CloudStorage/Box-Box/DISC_AI Project/Transripts Pilot'
# Show the directory contents as the cell output (sanity check of the path).
os.listdir(base_path)

['.chroma',
 '.DS_Store',
 '6.3 Effects of Age and Disuse on Skeletal Muscle.pdf',
 '6.9 Precautions and Contraindications for Resistance Exercise.pdf',
 '6.4 Framework for Resistance Training.pdf',
 'd_db',
 '6.1 Principles of Resistance Exercise.pdf',
 '6.7 Case Application of Oddvar-Holten Method.pdf',
 '6.5 Velocity and Mode Resistsance Training Parameters.pdf',
 '6.6 Volume and Intensity of Resistance Exercise.pdf',
 '6.2 Muscle Adaptations to Resistance Exercise.pdf',
 'p_db',
 '~',
 '6.8 Order and Frequency of Resistance Exercise.pdf']

In [7]:
import glob

# Gather every PDF directly under base_path. glob's result order depends on the
# file system, so sort the paths to give the documents (and therefore the index
# built from them) a reproducible order across runs and machines.
pdfs = sorted(glob.glob(f'{base_path}/*.pdf'))

# Accumulate the documents produced by load_and_split() for each PDF into one
# flat list; `pages` is what the index-building cell consumes.
pages = []
for pdf in pdfs:
    loader = PyPDFLoader(pdf)
    pages.extend(loader.load_and_split())

In [16]:
# Build a FAISS vector store from the loaded PDF documents, embedding them
# with model_norm.
db = FAISS.from_documents(pages, model_norm)

In [17]:
# Smoke test: query the in-memory index and display the matching documents.
db.similarity_search('What is resistance training?')

[Document(page_content="mechanically. 00:01:24[BLANK_AUDIO] 00:01:29Now, since we have established what the definition is, I would like to say before we do anything else that resists training within the literature is heavily, heavily covered. 00:01:40As a matter of fact, it is so ubiquitous that it's hardly worth mentioning the fact that, yes, there is literature supporting resistance training. 00:01:50As a matter of fact, when I did a very quick search term for resistance training and rehabilitation, I had almost 44,000 hits. 00:01:59So again, when we go through the material associated with resistance training, it's assuming that we all understand that resistance training does have therapeutic benefits. 00:02:12[BLANK_AUDIO] 00:02:14Okay, so let's talk about some of the elements of muscle performance, and they are strength, endurance, and power. 00:02:20The strength of a muscle is the ability of contractile tissue to produce tension and resultant force based on the demands placed on t

In [18]:
# Persist the index under the relative path "pt_example_db" (resolved against
# the kernel's working directory) so it can be reloaded with FAISS.load_local.
db.save_local("pt_example_db")

In [19]:
# Reload the saved index, passing the same embedding object (model_norm) that
# was used to build it.
# NOTE(review): load_local is believed to unpickle part of the saved store, and
# newer langchain releases may require an explicit opt-in flag for that - only
# load indexes you created yourself, and confirm against the installed version.
new_db = FAISS.load_local("pt_example_db", model_norm)

In [20]:
# Run the same query against the reloaded index to eyeball that it matches the
# result from the in-memory index.
new_db.similarity_search('What is resistance training?')

[Document(page_content="mechanically. 00:01:24[BLANK_AUDIO] 00:01:29Now, since we have established what the definition is, I would like to say before we do anything else that resists training within the literature is heavily, heavily covered. 00:01:40As a matter of fact, it is so ubiquitous that it's hardly worth mentioning the fact that, yes, there is literature supporting resistance training. 00:01:50As a matter of fact, when I did a very quick search term for resistance training and rehabilitation, I had almost 44,000 hits. 00:01:59So again, when we go through the material associated with resistance training, it's assuming that we all understand that resistance training does have therapeutic benefits. 00:02:12[BLANK_AUDIO] 00:02:14Okay, so let's talk about some of the elements of muscle performance, and they are strength, endurance, and power. 00:02:20The strength of a muscle is the ability of contractile tissue to produce tension and resultant force based on the demands placed on t

In [21]:
# Round-trip check: issue one query against the in-memory index and the copy
# reloaded from disk; the value displayed by this cell says whether the two
# result lists compare equal.
roundtrip_query = 'What is resistance training?'
old, new = (index.similarity_search(roundtrip_query) for index in (db, new_db))
old == new

True

In [24]:
# NOTE: base_path is rebound here. Earlier it named the pilot-transcript
# *directory*; from this cell on it names a single PDF *file* (the Week 1
# Primary Care transcript). Re-running the earlier glob cell after this one
# would therefore look for PDFs "inside" a file path.
base_path = '../../Library/CloudStorage/Box-Box/DISC_AI Project/Transcripts Primary Care/Week 1 Primary Care.pdf'
# Display whether the file is reachable from the current working directory.
os.path.exists(base_path)

True

In [25]:
import pypdf

# Open the PDF that base_path currently points at; later cells read its pages
# through read_pdf.pages.
read_pdf = pypdf.PdfReader(base_path)

In [51]:
# Check that the first page's extracted text begins with '1.' - the prefix the
# page-grouping loop in a later cell uses to detect the start of a section.
read_pdf.pages[0].extract_text().startswith('1.')

True

In [55]:
import collections
import re

# A page that opens with a section number such as "1.2  The Low Back ..."
# starts a new section. The look-ahead requires whitespace after the number so
# that multi-digit sections ("1.10") get their own key instead of being
# truncated to the first three characters and merged into "1.1".
section_heading = re.compile(r'1\.\d+(?=\s)')

# Maps section key (e.g. '1.2') -> concatenated text of all pages in that
# section, in page order.
text_dict = collections.defaultdict(str)
# NOTE: this rebinds `pages`, which an earlier cell used for the list of
# documents loaded from the pilot PDFs.
pages = read_pdf.pages

# Start with no current section so a stale `curr` left in the kernel by a
# previous run can never be reused silently.
curr = None
for page in pages:
    page_text = page.extract_text()
    heading = section_heading.match(page_text)
    if heading:
        curr = heading.group()
    elif page_text.startswith('1.'):
        # Heading not followed by whitespace: keep the original 3-character key.
        curr = page_text[:3]
    if curr is None:
        # Text before any section heading has nowhere to go; fail with a clear
        # message rather than a NameError.
        raise ValueError("PDF does not start with a section heading beginning with '1.'")
    # Heading pages and continuation pages are both appended to the current section.
    text_dict[curr] += page_text

In [57]:
text_dict['1.2']

"1.2  The Low Back and Lower Extremities; Regional Pain Patterns and Associated Disorders00:00:01Lecture 1.2, regional pain patterns and associated diseases and disorders, low back and lower extremities. 00:00:09[BLANK_AUDIO] 00:00:12So the objectives here are to identify 4 forms you might give to a patient with low back pain to provide additional information about that patient's condition. 00:00:23Describe at least two non-musculoskeletal causes of low back pain and under what circumstances you might reevaluate a musculoskeletal cause of labaquine. 00:00:34Describe symptoms which might lead you to include cauda equina in your list of differential diagnoses. 00:00:42Identify five potentially serious conditions which may mimic less serious musculoskeletal conditions in the pelvis, hip, and thigh. 00:00:52And finally, identify five potentially serious conditions which may mimic less serious musculoskeletal conditions in the knee, lower leg, ankle, and foot. 00:01:03[BLANK_AUDIO] 00:01:06

In [78]:
# Create the output directory for the chunk files. exist_ok=True keeps the
# cell idempotent: re-running it (or Restart & Run All after a previous run)
# no longer raises FileExistsError when the directory is already there.
os.makedirs('./data/week_1', exist_ok=True)

In [79]:
from langchain.text_splitter import TokenTextSplitter
# Splitter used to cut each section's text into overlapping chunks
# (sizes presumably measured in tokens, per the class name - TODO confirm).
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=25)

# Write every chunk to its own file, named week_1_<section key>_<chunk index>.txt.
# NOTE(review): files left over from an earlier run with more chunks are not
# removed here and would still match the "*.txt" glob used by the loader cell.
for k, v in text_dict.items():
    text_chunks = text_splitter.split_text(v)
    for i, tc in enumerate(text_chunks):
        # Explicit UTF-8 so the files do not depend on the platform's default
        # encoding (which can fail on or mangle non-ASCII transcript characters).
        with open(f'./data/week_1/week_1_{k}_{i}.txt', 'w', encoding='utf-8') as f:
            f.write(tc)

In [80]:
from langchain.document_loaders import DirectoryLoader

# Load the chunk files written to ./data/week_1 (those matching "*.txt").
# NOTE: `loader` is rebound here; the PDF-loading cell also used this name.
loader = DirectoryLoader('./data/week_1', glob="*.txt")
# `docs` is the input to the index build in the next cell.
docs = loader.load()

In [81]:
# Build a FAISS vector store over the Week 1 chunk documents with model_norm.
# NOTE: this rebinds `db`, which earlier referred to the pilot-transcript
# index; that index is no longer reachable through `db` after this cell.
db = FAISS.from_documents(docs, model_norm)

In [82]:
# Smoke test of the Week 1 index: display the documents returned for a sample
# question about lecture content.
db.similarity_search("What is the goal of symptom investigation?")

[Document(page_content="1.1 Symptom InvestigationEffects00:00:00[MUSIC] 00:00:08[BLANK_AUDIO] 00:00:10Hi and welcome to lecture 1.1 symptom investigation. 00:00:17[BLANK_AUDIO] 00:00:24The objectives here for lecture 1.1 are to, let's see here, 00:00:32[BLANK_AUDIO] 00:00:35Summarize why a body diagram may provide useful information, describe the use of open and closed-ended questions as a component of symptom investigation. 00:00:47Describe how knowledge of potential pain location patterns associated with viscera, can be beneficial and how it can also be confounding. 00:00:56And finally, identify three questions you might ask a patient regarding the onset of their symptoms and describe the appropriate time frame over which to assess changes in their symptoms. 00:01:09[BL", metadata={'source': 'data/week_1/week_1_1.1_0.txt'}),
 Document(page_content="or burning. 00:04:18And visceral disorders could present with symptoms such as aching, squeezing, gnawing, burning or cramping. 00:04:26T

In [83]:
# Persist the Week 1 index under the relative path ./dbs/week_1_db (resolved
# against the kernel's working directory).
db.save_local("./dbs/week_1_db")