# Notebook for trying out Langchain
## (Not part of solution)

In [1]:
import langchain
import pandas as pd
import numpy as np
import os
import re
import glob
import json
import matplotlib.pyplot as plt
from langchain_text_splitters import MarkdownHeaderTextSplitter

### Get data

In [2]:
folder_path = '../data/raw'
md_files = glob.glob(os.path.join(folder_path, '*.md'))

# Create an empty list to store the data
data = []

# Read the contents of each .md file and store in the list
for file in md_files:
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()
        doc_name = os.path.splitext(os.path.basename(file))[0]  
        data.append({'id': doc_name, 'text': content})

# Convert the list to a DataFrame
df = pd.DataFrame(data)

### Checkout markdown text splitter

In [3]:
# test

markdown_document = "# Foo\n\n    ## Bar\n\nHi this is Jim\n\nHi this is Joe\n\n ### Boo \n\n Hi this is Lance \n\n ## Baz\n\n Hi this is Molly"

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [4]:
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)
md_header_splits

[Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}, page_content='Hi this is Jim  \nHi this is Joe'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}, page_content='Hi this is Lance'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Baz'}, page_content='Hi this is Molly')]

In [5]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)
md_header_splits = markdown_splitter.split_text(markdown_document)
md_header_splits

[Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar'}, page_content='# Foo  \n## Bar  \nHi this is Jim  \nHi this is Joe'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Bar', 'Header 3': 'Boo'}, page_content='### Boo  \nHi this is Lance'),
 Document(metadata={'Header 1': 'Foo', 'Header 2': 'Baz'}, page_content='## Baz  \nHi this is Molly')]

### Apply the splitting function to each document in the DataFrame

In [6]:
df['splitted_text'] = df['text'].apply(markdown_splitter.split_text)

In [7]:
df.head()

Unnamed: 0,id,text,splitted_text
0,aws-properties-sagemaker-modelbiasjobdefinitio...,# AWS::SageMaker::ModelBiasJobDefinition Endpo...,[page_content='# AWS::SageMaker::ModelBiasJobD...
1,aws-properties-sagemaker-notebookinstance-inst...,# AWS::SageMaker::NotebookInstance InstanceMet...,[page_content='# AWS::SageMaker::NotebookInsta...
2,aws-resource-sagemaker-project,"# AWS::SageMaker::Project<a name=""aws-resource...",[page_content='# AWS::SageMaker::Project<a nam...
3,aws-properties-sagemaker-modelexplainabilityjo...,# AWS::SageMaker::ModelExplainabilityJobDefini...,[page_content='# AWS::SageMaker::ModelExplaina...
4,aws-properties-sagemaker-modelqualityjobdefini...,# AWS::SageMaker::ModelQualityJobDefinition Mo...,[page_content='# AWS::SageMaker::ModelQualityJ...


In [8]:
len(df.iloc[0]['splitted_text'])

5

In [15]:
df.iloc[0]['splitted_text']

[Document(metadata={'Header 1': 'AWS::SageMaker::ModelBiasJobDefinition EndpointInput<a name="aws-properties-sagemaker-modelbiasjobdefinition-endpointinput"></a>'}, page_content='# AWS::SageMaker::ModelBiasJobDefinition EndpointInput<a name="aws-properties-sagemaker-modelbiasjobdefinition-endpointinput"></a>  \nInput object for the endpoint'),
 Document(metadata={'Header 1': 'AWS::SageMaker::ModelBiasJobDefinition EndpointInput<a name="aws-properties-sagemaker-modelbiasjobdefinition-endpointinput"></a>', 'Header 2': 'Syntax<a name="aws-properties-sagemaker-modelbiasjobdefinition-endpointinput-syntax"></a>'}, page_content='## Syntax<a name="aws-properties-sagemaker-modelbiasjobdefinition-endpointinput-syntax"></a>  \nTo declare this entity in your AWS CloudFormation template, use the following syntax:'),
 Document(metadata={'Header 1': 'AWS::SageMaker::ModelBiasJobDefinition EndpointInput<a name="aws-properties-sagemaker-modelbiasjobdefinition-endpointinput"></a>', 'Header 2': 'Syntax<a

In [10]:
len(df.iloc[4]['splitted_text'])

5