<a href="https://colab.research.google.com/github/sugarforever/LangChainSummarizeYoutubeTranscript/blob/main/LangChainSummarizeYoutubeTranscript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install openai
%pip install langchain==0.0.139
%pip install unstructured
%pip install tiktoken

In [None]:
import os
from langchain.document_loaders import UnstructuredURLLoader, UnstructuredPowerPointLoader, ReadTheDocsLoader, PyPDFLoader
from langchain.llms import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.callbacks import get_openai_callback
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def summarize_docs(docs, doc_url, chunk_size=1000, chunk_overlap=0, model_name="text-davinci-003"):
    """Summarize already-loaded documents with a map-reduce summarize chain.

    The documents are split into chunks, run through
    ``load_summarize_chain(..., chain_type="map_reduce")`` backed by an
    ``OpenAI`` LLM, and the token/cost figures reported by the OpenAI
    callback are printed.

    Args:
        docs: Sequence of loaded documents; each item must expose a
            ``page_content`` attribute.
        doc_url: Label for the source (URL or file name); only used in the
            progress messages.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.
        model_name: ``model_name`` passed to the ``OpenAI`` LLM.

    Returns:
        Whatever ``chain.run`` returns for the split documents (the summary),
        or ``""`` when ``docs`` is empty.

    Raises:
        KeyError: If the ``OPENAI_API_KEY`` environment variable is not set.
    """
    print (f'You have {len(docs)} document(s) in your {doc_url} data')
    if not docs:
        # Nothing to summarize: bail out before touching the splitter or
        # making any billable API request.
        print ('No documents to summarize')
        return ""

    # Count every document, not just the first one: loaders such as the PDF
    # and Read the Docs ones hand back many documents.
    total_chars = sum(len(doc.page_content) for doc in docs)
    print (f'There are {total_chars} characters in your document(s)')

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(docs)

    print (f'You have {len(split_docs)} split document(s)')

    # Read the key from the environment so it never appears in the notebook.
    OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
    llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY, model_name=model_name)
    chain = load_summarize_chain(llm, chain_type="map_reduce", verbose=False)

    # The callback collects usage for every request issued inside the block.
    with get_openai_callback() as cb:
        response = chain.run(input_documents=split_docs)
        print(f"Total Tokens: {cb.total_tokens}")
        print(f"Prompt Tokens: {cb.prompt_tokens}")
        print(f"Completion Tokens: {cb.completion_tokens}")
        print(f"Successful Requests: {cb.successful_requests}")
        print(f"Total Cost (USD): ${cb.total_cost}")

    return response

1. Load a web page by its URL and get its content summarized.

In [None]:
# Load the article through the URL loader, then summarize what it returned.
url = "https://edition.cnn.com/2023/04/13/business/delta-earnings/index.html"
web_loader = UnstructuredURLLoader(urls=[url])
summarize_docs(web_loader.load(), url)

2. Load a PowerPoint file and get its content summarized.

In [None]:

# Download the sample deck and save it as Web3-intro.pptx in the working directory.
!wget "https://github.com/tomw1808/truffle_eth_class2/blob/master/s08/Web3-intro.pptx?raw=true" -O Web3-intro.pptx

In [None]:
# Summarize the downloaded deck and print the resulting text.
pptx_path = "Web3-intro.pptx"
loader = UnstructuredPowerPointLoader(pptx_path)
pptx_docs = loader.load()
response = summarize_docs(pptx_docs, pptx_path)
print(response)

3. Load a Read the Docs project and get its content summarized.

In [None]:
# Recursively fetch the site (-r), keeping only .html files (-A.html),
# and store everything under the local "langchain" directory (-P).
!wget -r -A.html -P langchain "https://langchain.readthedocs.io/en/latest/"

In [None]:
# "langchain" is the directory the wget mirror above was written into.
docs_dir = "langchain"
loader = ReadTheDocsLoader(docs_dir)
rtd_docs = loader.load()
summarize_docs(rtd_docs, docs_dir)

4. Load a PDF file by URL and get its content summarized.

In [None]:
# Download the PDF and save it as tsla-20221231-gen.pdf in the working directory.
!wget "https://ir.tesla.com/_flysystem/s3/sec/000095017023001409/tsla-20221231-gen.pdf" -O tsla-20221231-gen.pdf

In [None]:
# Installed just before the PDF cell below; presumably the parser backend
# that PyPDFLoader relies on — TODO confirm and consider moving to the first install cell.
%pip install pypdf

In [None]:
# Load and split the PDF, then summarize only its first ten entries.
pdf_path = "tsla-20221231-gen.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()
first_pages = pages[:10]
summarize_docs(first_pages, pdf_path)