In [1]:
from langchain.document_loaders import NotebookLoader
import ast

In [2]:
# Show the kernel's working directory; the relative paths used further down
# ("./langchain/...", "langchain_docs_modules.csv") resolve against it.
%pwd

'/Users/robertgshaw/dev/llms/langchain-gpt/memory/data'

In [3]:
# Prefixes that extract_text() expects at the start of every serialized cell.
md_cell_str, code_cell_str = "'markdown' cell: ", "'code' cell: "
# Prefix lengths, kept alongside so the prefixes can be sliced off cheaply.
md_cell_str_len, code_cell_str_len = len(md_cell_str), len(code_cell_str)

def find_next_index(str):
    """Return the position of the earliest cell marker in ``str``.

    Both marker kinds (``md_cell_str`` and ``code_cell_str``) are searched
    for; the smaller of the positions found is returned, or ``-1`` when
    neither marker occurs.
    """
    hits = [
        pos
        for pos in (str.find(md_cell_str), str.find(code_cell_str))
        if pos != -1
    ]
    return min(hits) if hits else -1

def extract_text(document):
    """Split a loaded notebook into text chunks, one per ``##`` section.

    ``document.page_content`` is parsed as a sequence of cells, each on its
    own line and written as ``'markdown' cell: '[...]'`` or
    ``'code' cell: '[...]'``, where ``[...]`` is the Python literal of the
    cell's list of source lines.

    Markdown lines are joined with spaces, code lines with newlines, and every
    cell is followed by an extra newline. The first code cell after a markdown
    cell is introduced with ``"Here is some code:"``. A new chunk is started
    at every markdown cell, other than the very first cell, whose first line
    begins with ``##``.

    Args:
        document: Object exposing the serialized notebook as ``page_content``.

    Returns:
        list[str]: The chunks in notebook order. The list is empty when
        ``page_content`` holds no cells at all.

    Raises:
        AssertionError: If non-whitespace text that does not start with a
            cell marker is left over where a cell (or the end) was expected.
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            cell payload is not a valid Python literal.
    """
    remaining = document.page_content
    chunks = []
    chunk = ""

    first_code_cell = True
    is_first_cell = True

    while True:
        if remaining.startswith(md_cell_str):
            remaining = remaining[md_cell_str_len:]
            is_code_cell = False
            # The next code cell opens a new run and gets the intro line again.
            first_code_cell = True
        elif remaining.startswith(code_cell_str):
            remaining = remaining[code_cell_str_len:]
            is_code_cell = True
            if first_code_cell:
                chunk += "Here is some code:\n"
                first_code_cell = False
        else:
            # No further cell: only trailing whitespace may be left over.
            assert len(remaining.rstrip()) == 0, (
                f"unexpected text where a cell marker was expected: {remaining[:40]!r}"
            )
            # An input without any cell produces no chunk at all.
            if not is_first_cell:
                chunks.append(chunk)
            break

        # The cell payload is the rest of the current line; tolerate a final
        # line that is not newline-terminated.
        line_end = remaining.find("\n")
        if line_end == -1:
            line_end = len(remaining)
        # Drop the quote characters wrapping the list literal before parsing.
        line_lst = ast.literal_eval(remaining[1:line_end - 1])

        starts_section = (
            not is_code_cell and bool(line_lst) and line_lst[0].startswith("##")
        )
        if starts_section and not is_first_cell:
            chunks.append(chunk)
            chunk = ""

        separator = "\n" if is_code_cell else " "
        chunk += "".join(line + separator for line in line_lst) + "\n"

        # Look for the next marker only AFTER the current line, so marker-like
        # text inside this cell's own payload cannot be mistaken for a cell.
        rest = remaining[line_end:]
        next_ind = find_next_index(rest)
        remaining = rest if next_ind == -1 else rest[next_ind:]

        is_first_cell = False

    return chunks

In [4]:
# Try the parser on a single notebook and print every chunk it produces.
loader = NotebookLoader(
    "./langchain/docs/modules/agents/plan_and_execute.ipynb",
    include_outputs=False,
    max_output_length=2000,
    remove_newline=True,
)

document = loader.load()[0]  # only the first loaded document is inspected
for text in extract_text(document):
    print(text)

## Plan and Execute  Plan and execute agents accomplish an objective by first planning what to do, then executing the sub tasks. This idea is largely inspired by [BabyAGI](https://github.com/yoheinakajima/babyagi) and then the ["Plan-and-Solve" paper](https://arxiv.org/abs/2305.04091).  The planning is almost always done by an LLM.  The execution is usually done by a separate agent (equipped with tools). 

## Imports 
Here is some code:
from langchain.chat_models import ChatOpenAI
from langchain.experimental.plan_and_execute import PlanAndExecute, load_agent_executor, load_chat_planner
from langchain.llms import OpenAI
from langchain import SerpAPIWrapper
from langchain.agents.tools import Tool
from langchain import LLMMathChain


## Tools 
Here is some code:
search = SerpAPIWrapper()
llm = OpenAI(temperature=0)
llm_math_chain = LLMMathChain.from_llm(llm=llm, verbose=True)
tools = [
    Tool(
        name = "Search",
        func=search.run,
        description="useful for when you nee

In [9]:
from os import walk


def find_notebook_paths(base_dir):
    """Return the paths of all ``.ipynb`` files found under ``base_dir``.

    The tree is walked top-down. Directories and file names are visited in
    sorted order so that the result is the same on every run.

    Args:
        base_dir: Directory to search recursively.

    Returns:
        list[str]: One ``"<dirpath>/<filename>"`` entry per notebook found.
    """
    notebook_paths = []
    for dirpath, dirnames, filenames in walk(base_dir):
        # Sorting in place also fixes the order in which walk() descends.
        dirnames.sort()
        for filename in sorted(filenames):
            # endswith() rather than split("."): file names with no dot or
            # with several dots must not raise while unpacking.
            if filename.endswith(".ipynb"):
                notebook_paths.append(f"{dirpath}/{filename}")
    return notebook_paths


base_dir = "./langchain/docs/modules"
python_notebook_paths = find_notebook_paths(base_dir)

In [10]:
import tiktoken

# Tokenizer used to measure the length of every chunk.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

# One record per text chunk, numbered consecutively across all notebooks.
data = []
idx = 0
for path in python_notebook_paths:
    loader = NotebookLoader(
        path,
        include_outputs=False,
        max_output_length=2000,
        remove_newline=True,
    )
    document = loader.load()[0]
    text_chunks = extract_text(document)

    for text_chunk in text_chunks:
        record = {
            "idx": idx,
            "text": text_chunk,
            "token_len": len(encoding.encode(text_chunk)),
            "source": document.metadata["source"],
        }
        data.append(record)
        idx += 1

In [31]:
import pandas as pd

# Turn the collected chunk records into a table and save it as CSV in the
# working directory.
df = pd.DataFrame.from_records(data)
df.to_csv(path_or_buf="langchain_docs_modules.csv")

In [36]:
# Reload the saved table. The file was written with the DataFrame index as its
# first, unnamed column, so read that column back as the index instead of
# letting it show up as an extra "Unnamed: 0" data column.
df = pd.read_csv("langchain_docs_modules.csv", index_col=0)