In [1]:
# Switch the working directory to the transcripts folder so the relative '*.pdf' globs and
# './<name>.pdf' paths used in later cells resolve.
# NOTE(review): the path is relative, so it only works from the directory the kernel was
# started in, and re-running this cell applies it again from the new working directory.
import os
os.chdir('../../Library/CloudStorage/Box-Box/DISC_AI Project/Transripts Pilot')

In [2]:
os.listdir()

['.chroma',
 '6.3 Effects of Age and Disuse on Skeletal Muscle.pdf',
 '6.9 Precautions and Contraindications for Resistance Exercise.pdf',
 '6.4 Framework for Resistance Training.pdf',
 'd_db',
 '6.1 Principles of Resistance Exercise.pdf',
 '6.7 Case Application of Oddvar-Holten Method.pdf',
 '6.5 Velocity and Mode Resistsance Training Parameters.pdf',
 '6.6 Volume and Intensity of Resistance Exercise.pdf',
 '6.2 Muscle Adaptations to Resistance Exercise.pdf',
 'p_db',
 '~',
 '6.8 Order and Frequency of Resistance Exercise.pdf']

# Getting the data
In this notebook, I'll try two different techniques for getting the text and see which works best:
1. Using the `LangChain` PDF loader
    * Pros: easy to use, scalable
    * Cons: not sure how it will work across pages
2. Converting every PDF into a string text and then joining them on the document level
    * Pros: allows for further processing (i.e. removing timestamps)
    * Cons: not very scalable

## `LangChain` PDF loader

In [3]:
from langchain.document_loaders import PyPDFLoader

# Try the loader on a single transcript first; `pages` is a list of Document objects
# (the first one is inspected in the next cell).
loader = PyPDFLoader('./6.1 Principles of Resistance Exercise.pdf')
pages = loader.load_and_split()

In [4]:
pages[0]

Document(page_content="6.1 Principles of Resistance Exercise00:00:00[MUSIC] 00:00:08Hello, and welcome to our lecture on the principles of resistance exercise. 00:00:14Here is our course statement. 00:00:15By the end of this lecture, I am hoping that you'll be able to define resistance training, define the following terms associated with resistance training, and they include resistance training, muscle strength, power, and endurance. 00:00:31And finally, describe the principles of overload, SAID, and reversibility. 00:00:35Here's our patient case that'll be used for this lecture. 00:00:39On general physical examination, J.D a 14-year-old has an obese appearance and presents with difficulty in standing, walking, getting up from sitting positions, and climbing stairs. 00:00:51He also presents with proximal weakness, calf hypertrophy, hamstring muscle contracture, and a positive Gower's sign. 00:00:59So you've determined that this patient is appropriate for resistance training. 00:01:03So

In [5]:
import glob

# Load every transcript PDF in the working directory into one flat list of Documents.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort the file names
# to keep the order of `pages` — and anything indexed from it — reproducible across runs.
pages = []
for file in sorted(glob.glob('*.pdf')):
    loader = PyPDFLoader(file)
    pages.extend(loader.load_and_split())

In [6]:
len(pages)

61

## Single string per document and then `LangChain` string loader

In [9]:
from langchain.docstore.document import Document
import pypdf as pdf
import re

# Prototype of the per-document cleaning on a single transcript.
# Use a context manager so the file handle is closed once the text has been extracted
# (the bare open() call previously left it open).
with open('./6.1 Principles of Resistance Exercise.pdf', 'rb') as pdf_file:
    read_pdf = pdf.PdfReader(pdf_file)
    text = ' '.join([p.extract_text() for p in read_pdf.pages])
# Raw strings stop the regex escapes from being parsed as (invalid) string escapes.
text = re.sub(r'\d+:\d+:\d+', '', text)  # strip timestamps such as 00:00:08
text = re.sub(r'\[.*?\]', '', text)      # strip bracketed cues such as [MUSIC]
text = text.replace('cielo24 | what’s in your video? | cielo24.com', '')
text[:1000]

"6.1 Principles of Resistance Exercise Hello, and welcome to our lecture on the principles of resistance exercise. Here is our course statement. By the end of this lecture, I am hoping that you'll be able to define resistance training, define the following terms associated with resistance training, and they include resistance training, muscle strength, power, and endurance. And finally, describe the principles of overload, SAID, and reversibility. Here's our patient case that'll be used for this lecture. On general physical examination, J.D a 14-year-old has an obese appearance and presents with difficulty in standing, walking, getting up from sitting positions, and climbing stairs. He also presents with proximal weakness, calf hypertrophy, hamstring muscle contracture, and a positive Gower's sign. So you've determined that this patient is appropriate for resistance training. So what do you do? Well, first, let's understand what resistance training actually is.  Resistance training is 

In [10]:
doc = Document(page_content=text)
doc

Document(page_content="6.1 Principles of Resistance Exercise Hello, and welcome to our lecture on the principles of resistance exercise. Here is our course statement. By the end of this lecture, I am hoping that you'll be able to define resistance training, define the following terms associated with resistance training, and they include resistance training, muscle strength, power, and endurance. And finally, describe the principles of overload, SAID, and reversibility. Here's our patient case that'll be used for this lecture. On general physical examination, J.D a 14-year-old has an obese appearance and presents with difficulty in standing, walking, getting up from sitting positions, and climbing stairs. He also presents with proximal weakness, calf hypertrophy, hamstring muscle contracture, and a positive Gower's sign. So you've determined that this patient is appropriate for resistance training. So what do you do? Well, first, let's understand what resistance training actually is.  R

In [12]:
def process_text(file):
    """Extract a transcript PDF as a single cleaned string.

    The text of every page is joined with single spaces. Timestamps of the form
    ``digits:digits:digits``, square-bracketed cues (e.g. ``[MUSIC]``) and the
    cielo24 branding string are then removed.

    Args:
        file: Path of the PDF file to read.

    Returns:
        The cleaned transcript text as one ``str``.
    """
    # Context manager guarantees the file handle is closed; it used to be leaked.
    with open(file, 'rb') as pdf_file:
        read_pdf = pdf.PdfReader(pdf_file)
        text = ' '.join([p.extract_text() for p in read_pdf.pages])
    # Raw strings stop the regex escapes from being parsed as (invalid) string escapes.
    text = re.sub(r'\d+:\d+:\d+', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = text.replace('cielo24 | what’s in your video? | cielo24.com', '')
    return text

# Build one Document per transcript PDF.
# Sorting makes the order of `docs` reproducible (glob order is arbitrary), and the source
# file name is stored as metadata so a retrieved chunk can be traced back to its transcript.
docs = []
for file in sorted(glob.glob('*.pdf')):
    text = process_text(file)
    doc = Document(page_content=text, metadata={'source': file})
    docs.append(doc)

In [13]:
len(docs)

9

# Set up for Retrieval QA

In [64]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain import HuggingFaceHub
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory

# Read the Hugging Face token from the environment instead of committing it to the notebook.
# The token that used to be hardcoded here has been exposed and should be revoked.
# Fails fast with KeyError if HUGGINGFACEHUB_API_TOKEN is not set.
huggingfacehub_api_token = os.environ['HUGGINGFACEHUB_API_TOKEN']
repo_id = "tiiuae/falcon-7b-instruct"
llm = HuggingFaceHub(huggingfacehub_api_token=huggingfacehub_api_token,
                     repo_id=repo_id,
                     model_kwargs={"temperature":0.1, "max_new_tokens":2000})

In [65]:
# Splitter used to chunk Documents before embedding (chunk_size=1000, no overlap).
# NOTE(review): CharacterTextSplitter splits on a separator; confirm the resulting chunks
# actually respect chunk_size for these transcripts.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# By-document splitting is disabled here; only the per-page Documents are chunked.
# d_documents = text_splitter.split_documents(docs)
p_documents = text_splitter.split_documents(pages)

In [66]:
# Sentence-transformers embedding model used to index the transcript chunks.
# The 'mps' device setting is passed through to the model; embeddings are left un-normalized.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device='mps')
encode_kwargs = dict(normalize_embeddings=False)
hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [67]:
# A bare '~' in a path is not expanded automatically — the literal '~' folder that shows up
# in the directory listing at the top of the notebook is the result — so expand it explicitly
# to point at the user's home directory.
d_persist_directory = os.path.expanduser('~/d_db')
p_persist_directory = os.path.expanduser('~/p_db')

# NOTE(review): re-running this cell against an existing persist directory may add the same
# chunks again (the results section has to de-duplicate citations) — confirm and, if so,
# load the existing store instead of rebuilding it.
# d_docsearch = Chroma.from_documents(d_documents, hf, persist_directory=d_persist_directory)
p_docsearch = Chroma.from_documents(p_documents, hf, persist_directory=p_persist_directory)

In [68]:
# Conversation memory for the by-page chain: history is exposed under "chat_history",
# reading the user turn from 'question' and the model turn from 'answer'.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    input_key='question',
    output_key='answer',
    return_messages=True,
)

# Results

## By Page

In [69]:
# Retrieval chain over the per-page index, with conversation memory attached and the
# retrieved source Documents returned alongside each answer.
pqa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=p_docsearch.as_retriever(),
    memory=memory,
    return_source_documents=True,
)

In [70]:
# Messages stored in the memory so far (expected to be empty before the first query).
memory.chat_memory.messages

[]

In [71]:
query = "What is resistance training?"
# The chain was built with `memory`, whose memory_key is "chat_history", so the history is
# supplied by the memory itself. Only the question is passed; the earlier call also handed
# the ChatMessageHistory object in under "chat_history", which is not a message list.
result = pqa({"question": query})

In [72]:
result['answer']

' Resistance training is a form of active exercise in which dynamic or static muscle contractions are resisted by an outside force applied manually or mechanically.'

In [73]:
memory.chat_memory

ChatMessageHistory(messages=[HumanMessage(content='What is resistance training?', additional_kwargs={}, example=False), AIMessage(content=' Resistance training is a form of active exercise in which dynamic or static muscle contractions are resisted by an outside force applied manually or mechanically.', additional_kwargs={}, example=False)])

In [74]:
result['source_documents']

[Document(page_content="mechanically. 00:01:24[BLANK_AUDIO] 00:01:29Now, since we have established what the definition is, I would like to say before we do anything else that resists training within the literature is heavily, heavily covered. 00:01:40As a matter of fact, it is so ubiquitous that it's hardly worth mentioning the fact that, yes, there is literature supporting resistance training. 00:01:50As a matter of fact, when I did a very quick search term for resistance training and rehabilitation, I had almost 44,000 hits. 00:01:59So again, when we go through the material associated with resistance training, it's assuming that we all understand that resistance training does have therapeutic benefits. 00:02:12[BLANK_AUDIO] 00:02:14Okay, so let's talk about some of the elements of muscle performance, and they are strength, endurance, and power. 00:02:20The strength of a muscle is the ability of contractile tissue to produce tension and resultant force based on the demands placed on t

In [78]:
# Keep the source Documents returned with the last answer so they can be shown as citations.
cites = result['source_documents']
# An inline de-duplication attempt used to sit here, disabled; duplicates are dropped
# further down via DataFrame.drop_duplicates instead.

In [76]:
memory.chat_memory

ChatMessageHistory(messages=[HumanMessage(content='What is resistance training?', additional_kwargs={}, example=False), AIMessage(content=' Resistance training is a form of active exercise in which dynamic or static muscle contractions are resisted by an outside force applied manually or mechanically.', additional_kwargs={}, example=False)])

In [81]:
import pandas as pd
# Tabulate the cited Documents, one row per Document. Downstream code indexes each cell
# with [1] to pull out its value; for column 0 that value is the page text.
# NOTE(review): column 1 presumably holds the metadata pair — confirm.
cite_df = pd.DataFrame(cites)

In [93]:
# Drop repeated citations once (keyed on column 0), then pair element [1] of each cell in
# column 0 with element [1] of the matching cell in column 1.
unique_cites = cite_df.drop_duplicates(subset=0)
[(content[1], meta[1]) for content, meta in zip(unique_cites[0], unique_cites[1])]

[("mechanically. 00:01:24[BLANK_AUDIO] 00:01:29Now, since we have established what the definition is, I would like to say before we do anything else that resists training within the literature is heavily, heavily covered. 00:01:40As a matter of fact, it is so ubiquitous that it's hardly worth mentioning the fact that, yes, there is literature supporting resistance training. 00:01:50As a matter of fact, when I did a very quick search term for resistance training and rehabilitation, I had almost 44,000 hits. 00:01:59So again, when we go through the material associated with resistance training, it's assuming that we all understand that resistance training does have therapeutic benefits. 00:02:12[BLANK_AUDIO] 00:02:14Okay, so let's talk about some of the elements of muscle performance, and they are strength, endurance, and power. 00:02:20The strength of a muscle is the ability of contractile tissue to produce tension and resultant force based on the demands placed on the muscle. 00:02:30So

## By Document

In [63]:
# `d_documents` and `d_docsearch` are only created in commented-out lines of the setup
# section, so this cell raised NameError after a kernel restart. Build the by-document
# chunks and index here so the section runs top to bottom.
d_documents = text_splitter.split_documents(docs)
d_docsearch = Chroma.from_documents(d_documents, hf, persist_directory=d_persist_directory)
dqa = ConversationalRetrievalChain.from_llm(llm, d_docsearch.as_retriever(), return_source_documents=True)

In [64]:
# This chain was built without a memory object, so the (empty) history is passed explicitly.
chat_history = []
query = "What is resistance training?"
# Note: this rebinds `result`, replacing the by-page result from the previous section.
result = dqa({"question": query, "chat_history": chat_history})

In [65]:
result['answer']

'\ns.'

In [68]:
result['source_documents']

[Document(page_content="6.1 Principles of Resistance Exercise Hello, and welcome to our lecture on the principles of resistance exercise. Here is our course statement. By the end of this lecture, I am hoping that you'll be able to define resistance training, define the following terms associated with resistance training, and they include resistance training, muscle strength, power, and endurance. And finally, describe the principles of overload, SAID, and reversibility. Here's our patient case that'll be used for this lecture. On general physical examination, J.D a 14-year-old has an obese appearance and presents with difficulty in standing, walking, getting up from sitting positions, and climbing stairs. He also presents with proximal weakness, calf hypertrophy, hamstring muscle contracture, and a positive Gower's sign. So you've determined that this patient is appropriate for resistance training. So what do you do? Well, first, let's understand what resistance training actually is.  