# Parsing a single markdown file using frontmatter

In [2]:
import frontmatter

In [3]:
# Parse example.md: the front matter is read via post.metadata and the
# Markdown body via post.content in the next cell.
with open('example.md', mode='r', encoding='utf-8') as f:
    post = frontmatter.load(f)

In [4]:
# Front-matter fields are exposed through the metadata mapping
for key in ('title', 'tags'):
    print(post.metadata[key])

# Everything after the front matter is the document body
print(post.content)

Getting Started with AI
['ai', 'machine-learning', 'tutorial']
# Getting Started with AI
This is the main content of the document written in **Markdown**.
You can include code blocks, links, and other formatting here.


# Downloading and parsing Markdown files from GitHub repositories

In [5]:
import io
import zipfile
import requests
import frontmatter

In [6]:
# Zip archive of the repository's main branch, served by GitHub's codeload host
url = 'https://codeload.github.com/DataTalksClub/faq/zip/refs/heads/main'

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# right here on an HTTP error instead of later when the body is unzipped.
resp = requests.get(url, timeout=60)
resp.raise_for_status()

In [13]:
repository_data = []

# Open the downloaded archive straight from memory. The context manager
# guarantees the ZipFile is closed even if parsing one of the entries raises.
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    for file_info in zf.infolist():
        # Lower-case the path so the extension check is case-insensitive.
        # Note: this lower-cased path is also what is stored in the record.
        filename = file_info.filename.lower()

        # Only Markdown / MDX documents are of interest
        if not filename.endswith(('.md', '.mdx')):
            continue

        # Read the entry and split its front matter from the body
        with zf.open(file_info) as f_in:
            content = f_in.read()
            post = frontmatter.loads(content)

        # One flat dict per document (metadata + content), tagged with its path
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)

In [14]:
# Display the second parsed record to check what fields a document carries
repository_data[1]

{'id': '9e508f2212',
 'question': 'Course: When does the course start?',
 'sort_order': 1,
 'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the channel.",
 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md'}

In [17]:
import io
import zipfile
import frontmatter
import requests

def read_repo_data(repo_owner, repo_name, branch="main", timeout=60):
    """
    Download a GitHub repository as a zip archive and parse its Markdown files.

    Every archive entry whose name ends in ``.md`` or ``.mdx`` (compared
    case-insensitively) is decoded as UTF-8 and parsed with ``frontmatter``.
    Entries that fail to parse are reported with ``print`` and skipped, so one
    bad file does not abort the whole import.

    Args:
        repo_owner: GitHub user or organisation that owns the repository.
        repo_name: Name of the repository.
        branch: Branch to download. Defaults to ``"main"``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A list of dicts, one per parsed file: the result of the parsed
        document's ``to_dict()`` plus a ``"filename"`` key holding the
        lower-cased path of the entry inside the archive.

    Raises:
        Exception: If the download responds with a status other than 200.
    """
    prefix = "https://codeload.github.com"
    url = f"{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}"

    # A timeout keeps a stalled connection from hanging the caller forever
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Download failed: {response.status_code}")

    repo_data = []

    # The context manager closes the archive even if something unexpected
    # escapes the per-file error handling below.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
        for file_info in zf.infolist():
            # Lower-cased so that e.g. README.MD is matched as well; the same
            # lower-cased path is stored in the resulting record.
            filename = file_info.filename.lower()
            if not filename.endswith((".md", ".mdx")):
                continue

            try:
                with zf.open(file_info, "r") as file:
                    # Undecodable bytes are dropped rather than failing the file
                    content = file.read().decode("utf-8", errors="ignore")
                data = frontmatter.loads(content).to_dict()
                data["filename"] = filename
                repo_data.append(data)
            except Exception as e:
                # Best effort: report the problem and carry on with the rest
                print(f"Failed to process {file_info.filename}: {e}")

    return repo_data

In [18]:
# Pull the Markdown documents of both repositories
dtc_faq = read_repo_data(repo_owner='DataTalksClub', repo_name='faq')
evidently_docs = read_repo_data(repo_owner='evidentlyai', repo_name='docs')

# Report how many documents each repository yielded
print(f"FAQ Docs: {len(dtc_faq)}")
print(f"Evidently Docs: {len(evidently_docs)}")

FAQ Docs: 1217
Evidently Docs: 95
