In [22]:
! pip install openai
! pip install srt
! pip install faiss-cpu
! pip install sentence_transformers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
import os
os.environ["LANGCHAIN_HANDLER"] = "langchain"

In [24]:
import os
from pathlib import Path
import shutil
import string

import srt

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.chains import VectorDBQAWithSourcesChain
from langchain import OpenAI


## Make documents

In [3]:
DOCS_FOLDER = Path("docs")

In [25]:
def get_lecture_titles():
    return {
        1: "lecture-1-course-vision-and-when-to-use-ml",
        2: "lecture-2-development-infrastructure-and-tooling",
        3: "lecture-3-troubleshooting-and-testing",
        4: "lecture-4-data-management",
        5: "lecture-5-deployment",
        6: "lecture-6-continual-learning",
        7: "lecture-7-foundation-models",
        8: "lecture-8-teams-and-pm",
        9: "lecture-9-ethics"
    }


def get_srt_urls():
    return {
        1: "https://www.youtube.com/watch?v=-Iob-FW5jVM",
        2: "https://www.youtube.com/watch?v=BPYOsDCZbno",
        3: "https://www.youtube.com/watch?v=RLemHNAO5Lw",
        4: "https://www.youtube.com/watch?v=Jlm4oqW41vY",
        5: "https://www.youtube.com/watch?v=W3hKjXg7fXM",
        6: "https://www.youtube.com/watch?v=nra0Tt3a-Oc",
        7: "https://www.youtube.com/watch?v=Rm11UeGwGgk",
        8: "https://www.youtube.com/watch?v=a54xH6nT4Sw",
        9: "https://www.youtube.com/watch?v=7FQpbYTqjAA"
    }


In [26]:
lecture_md_filenames = [
    elem for elem in DOCS_FOLDER.iterdir() if elem.is_file() and "lecture" in str(elem) and str(elem).endswith("md")]
lecture_md_filenames


[PosixPath('docs/lecture-04.md'),
 PosixPath('docs/lecture-01.md'),
 PosixPath('docs/lecture-05.md'),
 PosixPath('docs/lecture-08.md'),
 PosixPath('docs/lecture-09.md'),
 PosixPath('docs/lecture-02.md'),
 PosixPath('docs/lecture-06.md'),
 PosixPath('docs/lecture-07.md'),
 PosixPath('docs/lecture-03.md')]

In [27]:
lecture_titles = get_lecture_titles()
lecture_titles


{1: 'lecture-1-course-vision-and-when-to-use-ml',
 2: 'lecture-2-development-infrastructure-and-tooling',
 3: 'lecture-3-troubleshooting-and-testing',
 4: 'lecture-4-data-management',
 5: 'lecture-5-deployment',
 6: 'lecture-6-continual-learning',
 7: 'lecture-7-foundation-models',
 8: 'lecture-8-teams-and-pm',
 9: 'lecture-9-ethics'}

In [28]:
lecture_texts = {}
for fn in lecture_md_filenames:
    idx = int("".join(elem for elem in str(fn) if elem in string.digits))
    lecture = fn.open().read()
    lecture_texts[idx] = lecture


In [8]:
# lecture_texts[1]


In [29]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
lecture_texts_split = {idx: text_splitter.split_text(
    lecture_text) for idx, lecture_text in lecture_texts.items()}
website_url_base = "https://fullstackdeeplearning.com/course/2022/"
source_urls = {idx: website_url_base +
               title for idx, title in lecture_titles.items()}


In [30]:
# Source URL as the key, lecture split list as the value
url_to_text_split = dict([(url, text_split) for url, text_split in zip(
    source_urls.values(), lecture_texts_split.values())])


In [31]:
url_to_text_split


{'https://fullstackdeeplearning.com/course/2022/lecture-1-course-vision-and-when-to-use-ml': ['---\ndescription: Sourcing, storing, exploring, processing, labeling, and versioning data for deep learning.\n---\n\n# Lecture 4: Data Management\n\n<div align="center">\n<iframe width="720" height="405" src="https://www.youtube-nocookie.com/embed/Jlm4oqW41vY?list=PL1T8fO7ArWleMMI8KPJ_5D5XSlovTW_Ur" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>\n</div>\n\nLecture by [Sergey Karayev](https://sergeykarayev.com).<br />\nNotes by [James Le](https://twitter.com/le_james94) and [Vishnu Rachakonda](https://www.linkedin.com/in/vrachakonda/).<br />\nPublished August 29, 2022.\n[Download slides](https://fsdl.me/2022-lecture-04-slides).\n\n## 1 - Introduction\n\nOne thing people don\'t quite get as they enter the field of ML is how\nmuch of it deals with data - putting together datas

In [32]:

srt_filenames = [
    elem for elem in DOCS_FOLDER.iterdir() if elem.is_file() and str(elem).endswith("srt")]
srt_filenames


[PosixPath('docs/lecture-08.srt'),
 PosixPath('docs/lecture-09.srt'),
 PosixPath('docs/lecture-04.srt'),
 PosixPath('docs/lecture-05.srt'),
 PosixPath('docs/lecture-06.srt'),
 PosixPath('docs/lecture-02.srt'),
 PosixPath('docs/lecture-03.srt'),
 PosixPath('docs/lecture-01.srt')]

In [33]:
srt_urls = get_srt_urls()
srt_urls

{1: 'https://www.youtube.com/watch?v=-Iob-FW5jVM',
 2: 'https://www.youtube.com/watch?v=BPYOsDCZbno',
 3: 'https://www.youtube.com/watch?v=RLemHNAO5Lw',
 4: 'https://www.youtube.com/watch?v=Jlm4oqW41vY',
 5: 'https://www.youtube.com/watch?v=W3hKjXg7fXM',
 6: 'https://www.youtube.com/watch?v=nra0Tt3a-Oc',
 7: 'https://www.youtube.com/watch?v=Rm11UeGwGgk',
 8: 'https://www.youtube.com/watch?v=a54xH6nT4Sw',
 9: 'https://www.youtube.com/watch?v=7FQpbYTqjAA'}

In [34]:
def timestamp_from_timedelta(timedelta):
    return int(timedelta.total_seconds())

def create_srt_texts_and_metadatas(subtitles, base_url):
    query_params_format = "&t={start}s"
    texts, metadatas = [], []

    for subtitle in subtitles:
        raw_text = subtitle.content
        text = subtitle.content.strip()
        start = timestamp_from_timedelta(subtitle.start)
        url = base_url + query_params_format.format(start=start)

        texts.append(text)
        metadatas.append(url)

    return texts, metadatas


In [35]:
for fn in srt_filenames:
    idx = int("".join(elem for elem in str(fn) if elem in string.digits))
    srt_url = srt_urls[idx]

    
    srt_text = fn.open().read()
    subtitles = list(srt.parse(srt_text))
    texts, metadatas = create_srt_texts_and_metadatas(subtitles, srt_url)
    
    for text, url in zip(texts, metadatas):
        url_to_text_split[url] = [text]


In [15]:
# url_to_text_split.keys()


In [36]:
url_to_text_split["https://www.youtube.com/watch?v=a54xH6nT4Sw&t=1351s"]

["is today much less well defined than your software engineering interviews some common types of Assessments that I've seen are your normal sort of background and culture fit interviews whiteboard coding interviews similar to you'd see in software engineering pair coding like in software engineering but some more ml specific ones include pair debugging where you and an interviewer will sit down and run some ml code and try to find Hey where's the bug in this code oftentimes this is ml specific code and the goal is to test for how well is this person able to find bugs in ml code since bugs tend to be where we spend most of our time in machine learning math puzzles are often common especially involving things like linear algebra"]

In [37]:
all_text_splits = []
all_text_metadata = []

for source_url, text_splits in url_to_text_split.items():
    for text in text_splits:
        all_text_splits.append(text)
        all_text_metadata.append({"source": source_url})


In [38]:
assert len(all_text_splits) == len(all_text_metadata)

## Index documents

In [39]:
# OPENAI_KEY = "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"

# embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)
embeddings = HuggingFaceEmbeddings()
docsearch = FAISS.from_texts(all_text_splits, embeddings, all_text_metadata)


In [40]:
PERSONAL_KEY = "use your OpenAI key"
chain = VectorDBQAWithSourcesChain.from_chain_type(
    OpenAI(temperature=0, openai_api_key=PERSONAL_KEY), chain_type="stuff", vectorstore=docsearch)


In [41]:
chain({"question": "What is FSDL"}, return_only_outputs=True)


{'answer': ' Full Stack Deep Learning (FSDL) is the course and community for people who are building products that are powered by machine learning (ML).\n',
 'sources': 'https://fullstackdeeplearning.com/course/2022/lecture-2-development-infrastructure-and-tooling'}

In [80]:
help(chain.combine_documents_chain)

Help on StuffDocumentsChain in module langchain.chains.combine_documents.stuff object:

class StuffDocumentsChain(langchain.chains.combine_documents.base.BaseCombineDocumentsChain, pydantic.main.BaseModel)
 |  StuffDocumentsChain(*, memory: langchain.chains.base.Memory = None, callback_manager: langchain.callbacks.base.BaseCallbackManager = None, verbose: bool = None, input_key: str = 'input_documents', output_key: str = 'output_text', llm_chain: langchain.chains.llm.LLMChain, document_prompt: langchain.prompts.base.BasePromptTemplate = None, document_variable_name: str) -> None
 |  
 |  Chain that combines documents by stuffing into context.
 |  
 |  Method resolution order:
 |      StuffDocumentsChain
 |      langchain.chains.combine_documents.base.BaseCombineDocumentsChain
 |      langchain.chains.base.Chain
 |      pydantic.main.BaseModel
 |      pydantic.utils.Representation
 |      abc.ABC
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  combine_docs(self, docs: Li

In [81]:
chain.combine_documents_chain.document_prompt

PromptTemplate(input_variables=['page_content', 'source'], output_parser=None, template='Content: {page_content}\nSource: {source}', template_format='f-string')

In [86]:
print(chain.combine_documents_chain.llm_chain.prompt.template)


Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES"). 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESTION: Which state/country's law governs the interpretation of the contract?
Content: This Agreement is governed by English law and the parties submit to the exclusive jurisdiction of the English courts in  relation to any dispute (contractual or non-contractual) concerning this Agreement save that either party may apply to any court for an  injunction or other relief to protect its Intellectual Property Rights.
Source: 28-pl
Content: No Waiver. Failure or delay in exercising any right or remedy under this Agreement shall not constitute a waiver of such (or any other)  right or remedy.

11.7 Severability. The invalidity, illegality or unenforceability of any term (or part of a term) of this Agreement shall not affect the con

In [87]:
chain.combine_documents_chain.llm_chain.prompt



In [88]:
chain.combine_documents_chain


