# Running ETL to Build the Document Corpus

This notebook walks through the process for setting up the corpus of Full Stack documents that the bot searches over.

In each case, we have to
- Extract data from its natural habitat, like YouTube or GitHub
- Transform it into a format that is useful for our purposes
- Load it into our database in that format

hence the acronym "ETL".

In [27]:
!make secrets 

[1;38;5;214m###
# 🥞:[0m Loaded config from .env
[1;38;5;214m###[0m
python -m pip install -qqq -r requirements.txt
[1;38;5;214m###
# 🥞:[0m If you haven't gotten a Modal token yet, run make modal-token
[1;38;5;214m###[0m
Verifying token against [4;34mhttps://api.modal.com[0m
[32mToken verified successfully[0m
Token written to [35m/home/suhaspillai/[0m[95m.modal.toml[0m
bash tasks/setup_environment_modal.sh prod
[1;38;5;214m###
# 🥞:[0m Setting up modal environment prod
[1;38;5;214m###[0m
[31m╭─[0m[31m────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m─────────────────────[0m[31m─╮[0m
[31m│[0m [2;33m/home/suhaspillai/anaconda3/envs/py310/lib/python3.10/site-packages/modal/cl[0m [31m│[0m
[31m│[0m [2;33mi/[0m[1;33menvironment.py[0m:[94m51[0m in [92mcreate[0m                                                [31m│[0m
[31m│[0m                                                                        

In [1]:
from pymongo import MongoClient

In [2]:
client = MongoClient('mongodb+srv://suhaspillai:Mongodb2023@fsdl.1jw1q9u.mongodb.net/', 27017)

In [3]:
client.list_database_names()

['fsdl', 'admin', 'local']

In [4]:
db,collection="fsdl", "ask-fsdl"

In [5]:
import modal

In [6]:
#!modal  run app.py::drop_docs --db {db} --collection {collection}

In [5]:
import json
from pathlib import Path
import pprint

from etl import markdown, pdfs, shared, videos
from etl.shared import display_modal_image

pp = pprint.PrettyPrinter(indent=2)

## PDFs: arXiV Papers

```bash
!modal run etl/pdfs.py --json-path data/llm-papers.json
```

In [12]:
display_modal_image(shared.image)

In [13]:
display_modal_image(pdfs.image)

In [14]:
papers_path = Path("data") / "llm-papers.json"

with open(papers_path) as f:
    pdf_infos = json.load(f)

pdf_infos[:100:20]

[{'tags': ['Multimodal', 'Vision', 'Internals'],
  'title': 'On the Hidden Mystery of OCR in Large Multimodal Models',
  'url': 'https://arxiv.org/abs/2305.07895'},
 {'tags': ['Evaluation'],
  'title': 'Asking Crowdworkers to Write Entailment Examples: The Best of Bad Options',
  'url': 'https://aclanthology.org/2020.aacl-main.68/'},
 {'tags': ['Prompting', 'Critical', 'Philosophy', 'Simulation'],
  'title': 'Inducing anxiety in large language models increases exploration and bias',
  'url': 'https://arxiv.org/abs/2304.11111v1'},
 {'tags': ['Ahead of Its Time'],
  'title': 'PROGRAMS WITH COMMON SENSE',
  'url': 'http://jmc.stanford.edu/articles/mcc59/mcc59.pdf'},
 {'tags': ['Ahead of Its Time', 'External Memory', 'Reasoning', 'Philosophy'],
  'title': 'Heuristic Problem Solving By Computer',
  'url': 'https://iiif.library.cmu.edu/file/Simon_box00065_fld04954_bdl0001_doc0001/Simon_box00065_fld04954_bdl0001_doc0001.pdf'}]

In [15]:
with pdfs.stub.run():
    # first, we enrich the paper data by finding direct PDF URLs where we can
    paper_data = pdfs.get_pdf_url.map(
        pdf_infos[::25], #subsampling to run faster
        return_exceptions=True
    )
    # then we turn the PDFs into JSON documents
    documents = shared.unchunk(  # each pdf creates a list of documents, one per page, so we flatten
        # after we run the extract_pdf function on Modal to get those pages
        pdfs.extract_pdf.map(paper_data, return_exceptions=True)
    )

Output()

Output()

Output()

In [16]:
pp.pprint(documents[0]["metadata"])

{ 'arxiv_id': '2305.07895',
  'date': datetime.datetime(2023, 6, 19, 3, 36, 8, tzinfo=datetime.timezone.utc),
  'full-title': 'On the Hidden Mystery of OCR in Large Multimodal Models - p0',
  'ignore': False,
  'is_endmatter': False,
  'page': 0,
  'sha256': '0f8257ead372b2b93b7b1a257e70f6ed36f37a231b45a17852bb50672cc5bfef',
  'source': 'https://arxiv.org/abs/2305.07895',
  'title': 'On the Hidden Mystery of OCR in Large Multimodal Models'}


In [17]:
from IPython.display import IFrame

IFrame(src=documents[0]["metadata"]["source"], width=800, height=400)

In [18]:
with shared.stub.run():
    # we split our document list into 10 pieces, so that we don't open too many connections
    chunked_documents = shared.chunk_into(documents, 10)
    list(shared.add_to_document_db.map(chunked_documents, kwargs={"db": db, "collection": collection}))

Output()

Output()

Output()

In [19]:
with shared.stub.run():
  import docstore
   # pull only arxiv papers
  query = { "metadata.source": { "$regex": "arxiv\.org", "$options": "i" } }
  # project out the text field, it can get large
  projection = {"text": 0}
  # get just one result to show it worked
  result = docstore.query_one(query, projection, db=db, collection=collection)

pp.pprint(result)

Output()

Output()

Output()

{ '_id': ObjectId('654fde29fb7b0dac9fd0e78e'),
  'metadata': { 'arxiv_id': '2210.01848',
                'date': datetime.datetime(2023, 1, 26, 19, 14, 18),
                'full-title': 'Explaining Patterns in Data with Language '
                              'Models via Interpretable Autoprompting - p8',
                'ignore': True,
                'is_endmatter': True,
                'page': 8,
                'sha256': '102b6516a6fd533c3f4d5947832e4a78e759408e7d8c60bff26d6d09d4f7d5ae',
                'source': 'https://arxiv.org/abs/2210.01848',
                'title': 'Explaining Patterns in Data with Language Models via '
                         'Interpretable Autoprompting'},
  'type': 'Document'}


## Markdown Files: Lectures

```bash
!modal run etl/markdown.py --json-path data/lectures-2022.json
```

In [23]:
display_modal_image(markdown.image)

In [26]:
markdown_path = Path("data") / "lectures-2022.json"

with open(markdown_path) as f:
  markdown_corpus = json.load(f)

website_url, md_url = (
  markdown_corpus["website_url_base"],
  markdown_corpus["md_url_base"],
)

lectures = markdown_corpus["lectures"]

lectures[0]
print(website_url, md_url)

https://fullstackdeeplearning.com/course/2022 https://github.com/the-full-stack/website/tree/main/docs/course/2022


In [27]:
lectures

[{'slug': 'lecture-1-course-vision-and-when-to-use-ml',
  'title': 'Course Vision & When to Use ML'},
 {'slug': 'lecture-2-development-infrastructure-and-tooling',
  'title': 'Development Infrastructure & Tooling'},
 {'slug': 'lecture-3-troubleshooting-and-testing',
  'title': 'Troubleshooting & Testing'},
 {'slug': 'lecture-4-data-management', 'title': 'Data Management'},
 {'slug': 'lecture-5-deployment', 'title': 'Deployment'},
 {'slug': 'lecture-6-continual-learning', 'title': 'Continual Learning'},
 {'slug': 'lecture-7-foundation-models', 'title': 'Foundation Models'},
 {'slug': 'lecture-8-teams-and-pm', 'title': 'Teams & Product Management'},
 {'slug': 'lecture-9-ethics', 'title': 'Ethics'}]

In [25]:
with markdown.stub.run():
    documents = (
        shared.unchunk(  # each lecture creates multiple documents, one per section so we flatten
            markdown.to_documents.map(
                lectures,
                kwargs={"website_url": website_url, "md_url": md_url},
                return_exceptions=True,
            )
        )
    )

Output()

Output()

Output()

TypeError: 'UserCodeException' object is not iterable

In [None]:
pp.pprint(documents[1]["metadata"])

In [None]:
from IPython.display import IFrame

IFrame(src=documents[1]["metadata"]["source"], width=800, height=400)

In [None]:
with shared.stub.run():
    chunked_documents = shared.chunk_into(documents, 10)
    list(shared.add_to_document_db.map(chunked_documents, kwargs={"db": db, "collection": collection}))

In [None]:
with shared.stub.run():
  import docstore
  # pull only lectures
  query = { "metadata.source": { "$regex": "lecture", "$options": "i" } }
  # project out the text field, it can get large
  projection = {"text": 0}
  # get just one result to show it worked
  result = docstore.query_one(query, projection, db=db, collection=collection)

pp.pprint(result)

## Videos: YouTube Transcripts

In [6]:
display_modal_image(videos.image)

In [7]:
videos_path = Path("data") / "videos.json"

with open(videos_path) as f:
    video_infos = json.load(f)

video_ids = [video["id"] for video in video_infos]

video_infos[0]

{'id': '-Iob-FW5jVM',
 'title': 'Lecture 01: When to Use ML and Course Vision (FSDL 2022)'}

In [3]:
import requests
video_id='MyFrMFab6bo'
base_url = "https://yt.lemnoslife.com"
request_path = "/videos"
params = {"id": video_id, "part": "chapters"}
response = requests.get(base_url + request_path, params=params)

In [5]:
response.raise_for_status()

response.json()

{'error': {'code': 400,
  'message': 'YouTube has detected unusual traffic from this YouTube operational API instance. Please try your request again later or see alternatives at https://github.com/Benjamin-Loison/YouTube-operational-API/issues/11'}}

In [8]:
!pip install youtube-transcript-api==0.6.1

Collecting youtube-transcript-api==0.6.1
  Downloading youtube_transcript_api-0.6.1-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.1


In [13]:
from youtube_transcript_api import YouTubeTranscriptApi
out = YouTubeTranscriptApi.get_transcript('Or1JV2MImyg')


In [14]:
out

[{'text': '>> HAVE YOU DRIVEN IT?', 'start': 1.334, 'duration': 2.736},
 {'text': '>> NO.', 'start': 3.536, 'duration': 1.602},
 {'text': "I'M GETTING ONE.", 'start': 4.337, 'duration': 2.002},
 {'text': "THAT'S THE FIRST TIME I SAW IT", 'start': 5.171, 'duration': 3.871},
 {'text': "I'M GETTING ONE.", 'start': 7.14, 'duration': 3.403},
 {'text': ">> LET'S TALK ABOUT TESLA AND ", 'start': 9.142, 'duration': 3.27},
 {'text': 'OTHER COMPANIES YOU ARE INVESTED',
  'start': 10.577,
  'duration': 3.703},
 {'text': 'IN', 'start': 14.18, 'duration': 3.704},
 {'text': 'THERE WAS A CALL YESTERDAY, HSBC',
  'start': 14.314,
  'duration': 4.404},
 {'text': 'INITIATED COVERAGE WITH THE ', 'start': 17.917, 'duration': 3.304},
 {'text': 'PRICE TARGET OF $146', 'start': 18.752, 'duration': 4.804},
 {'text': 'I ASSUME YOU LOOKED AT THIS AND ',
  'start': 21.254,
  'duration': 3.637},
 {'text': 'LAUGHED WITH THE BULLISH VIEWS', 'start': 23.59, 'duration': 2.269},
 {'text': 'ON TESLA', 'start': 25.425, 

In [None]:
with videos.stub.run():
    documents = (
        shared.unchunk(  # each lecture creates multiple documents, one per chapter, so we flatten
            videos.extract_subtitles.map(
                video_infos[-3:],  # subsampling to run faster
                return_exceptions=True,
            )
        )
    )

Output()

Output()

Output()

In [None]:
pp.pprint(documents[1]["metadata"])

In [None]:
from IPython.display import YouTubeVideo

id_str, time_str = documents[1]["metadata"]["source"].split("?v=")[-1].split("&t=")
YouTubeVideo(id_str, start=int(time_str.strip("s")), width=800, height=400)

In [None]:
with shared.stub.run():
    chunked_documents = shared.chunk_into(documents, 10)
    list(shared.add_to_document_db.map(chunked_documents, kwargs={"db": db, "collection": collection}))

In [None]:
with shared.stub.run():
  import docstore
  # pull only lectures
  query = { "metadata.source": { "$regex": "youtube", "$options": "i" } }
  # project out the text field, it can get large
  projection = {"text": 0}
  # get just one result to show it worked
  result = docstore.query_one(query, projection, db=db, collection=collection)

pp.pprint(result)