# Summarization

In [3]:
import langchain_openai
import tiktoken
import chromadb
import langchain

In [4]:
from langchain.chains.summarize import load_summarize_chain
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import ChatOpenAI

loader = WebBaseLoader("https://www.cnbc.com/2024/02/12/stock-market-today-live-updates.html")
docs = loader.load()

llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-1106", openai_api_key="")
chain = load_summarize_chain(llm, chain_type="stuff")

chain.run(docs)

  warn_deprecated(


"The stock market experienced a sell-off, with the Dow losing 400 points and the S&P 500 and Nasdaq Composite also sliding. This was due to higher-than-expected inflation data for January, which raised doubts about the Federal Reserve's ability to cut rates multiple times this year. Tech stocks, small-cap stocks, and corporate earnings were all impacted by the market movement. The CME FedWatch Tool showed that the probability of a rate cut in May by the Federal Reserve decreased, while the probability of holding rates steady increased. The market is also awaiting key U.S. inflation data and retail sales data to determine the future direction of the market. Additionally, the Nikkei briefly surpassed the 38,000 mark for the first time since 1990, and India's weightage on the MSCI Global Standard index rose to a record high."

<img src="summarization.png" alt="Two common approaches to pass documents into LLMs context window">

## Stuff

When we use load_summarize_chain with chain_type="stuff", we will use the StuffDocumentsChain.

In [14]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate

#define prompt
prompt_template = """write a concise summary of the following:
"{text}"
concise summry:
"""
prompt = PromptTemplate.from_template(prompt_template)

#define LLm chain 
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-16k", openai_api_key="")
llm_chain =  LLMChain(llm=llm, prompt = prompt)

#define StuffDocumentsChain
stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")

docs = loader.load()
stuff_chain.run(docs)

"On February 13, 2024, the stock market experienced a sell-off as hotter-than-expected inflation data for January raised concerns about the Federal Reserve's ability to cut rates multiple times this year. The Dow Jones Industrial Average lost 443 points, or 1.2%, while the S&P 500 and Nasdaq Composite fell 1.2% and 1.4% respectively. The consumer price index rose 0.3% in January, exceeding expectations, and core prices, which exclude volatile food and energy components, rose 0.4%. The spike in Treasury yields also contributed to the decline in tech stocks, including Microsoft and Amazon."

In [15]:
#define prompt
prompt = PromptTemplate.from_template(
'''Please provide a bullet-point summary of the key points from the following document, 
   focusing on major events and findings. Ensure the summary is concise and under 100 words:
"{text}"
Summary:''')

#define LLm chain 
#temperatures in the range of 0.1 to 0.7 might produce summaries that balance creativity and accuracy.
llm = ChatOpenAI(temperature=0.2, model_name="gpt-3.5-turbo-16k", openai_api_key="")
llm_chain =  LLMChain(llm=llm, prompt = prompt)

#define StuffDocumentsChain
stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")

docs = loader.load()
stuff_chain.run(docs)

"- Stocks dropped on Tuesday after hotter-than-expected inflation data for January spiked Treasury yields and raised doubts about the Federal Reserve's ability to cut rates multiple times this year.\n- The Dow Jones Industrial Average lost 443 points, or 1.2%, while the S&P 500 slid 1.2% and the Nasdaq Composite fell 1.4%.\n- The consumer price index rose 0.3% in January from December, with economists expecting a 0.2% increase. Core prices, excluding food and energy, rose 0.4% month over month and 3.9% from a year ago.\n- Tech shares, including Microsoft and Amazon, led the losses in trading Tuesday, with Microsoft sliding 1.4% and Amazon falling 1.4%.\n- In corporate news, JetBlue Airways spiked 12% after activist investor Carl Icahn reported a nearly 10% stake in the airline, while Hasbro lost 6% after missing analyst expectations for the fourth quarter."

# RAG  
question-answering applications
* A typical RAG had two main components: indexing and retrieval/generation
* Indexing: a pipeline for ingesting data from a source and indexing it. This usually happens offline.
* Retrieval and generation: the actual RAG chain, which takes the user query at run time and retrieves the relevant data from the index, then passes that to the model.

<img src="indexing.png" alt="indexing">
<img src="retrieval_generation.png" alt="RAG">

In [19]:
import langchain 
import langchain_community 
import langchain_openai 
import chromadb 
import bs4

In [20]:
import getpass
import os

#os.environ["OPENAI_API_KEY"] = getpass.getpass()

In [21]:
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()

········


In [49]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate

In [47]:
# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")  #can be customized 
        )
    ),
)
docs = loader.load() #load

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  #split
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings(openai_api_key=""))   #store


# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
#prompt = hub.pull("rlm/rag-prompt")                          
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the beginning of the answer.

{context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0,openai_api_key="")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

In [51]:
rag_chain.invoke("What are the approaches to Task Decomposition?")

'The approaches to Task Decomposition include using a Language Model with simple prompting, task-specific instructions, or human inputs. Thanks for asking!'

In [52]:
rag_chain.invoke("what is the content of simple prompting?")

'The content of simple prompting is the use of a prompt to guide a language model in generating a response or completing a task. It involves providing specific instructions or cues to the model to elicit the desired output. Thanks for asking!'

In [53]:
# cleanup
vectorstore.delete_collection()

### Step-by-step explaination and functions

In [None]:
# Load docs
from langchain.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://en.wikipedia.org/wiki/Jack_Ma")
data = loader.load()

len(docs[0].page_content)

# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 0)
all_splits = text_splitter.split_documents(data)

len(splits)
len(splits[0].page_content)
splits[10].metadata

# Store splits
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

# RAG prompt
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

# LLM
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": prompt}
)

question = "What are the approaches to Task Decomposition?"
result = qa_chain({"query": question})
result["result"]

In [None]:
#retrieval 
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})
retrieved_docs = retriever.invoke("What are the approaches to Task Decomposition?")
len(retrieved_docs)
print(retrieved_docs[0].page_content)

# Chatbot
* Chat models
* Prompt Templates
* Chat history
* Retrievers

In [55]:
import langchain_openai
import langchain
# import quiet

In [57]:
from langchain_openai import ChatOpenAI

chat = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0.2, openai_api_key="")

In [58]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant. Answer all questions to the best of your ability.",
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

chain = prompt | chat

In [59]:
from langchain.memory import ChatMessageHistory

demo_ephemeral_chat_history = ChatMessageHistory()

demo_ephemeral_chat_history.add_user_message("hi!")

demo_ephemeral_chat_history.add_ai_message("whats up?")

demo_ephemeral_chat_history.messages

[HumanMessage(content='hi!'), AIMessage(content='whats up?')]

In [62]:
demo_ephemeral_chat_history.add_user_message(
    "How can I build an interactive LLM application?"
)

response = chain.invoke({"messages": demo_ephemeral_chat_history.messages})

response

AIMessage(content='To build an interactive LLM (Language Model) application, you can use a programming language such as Python and a natural language processing library like TensorFlow or PyTorch. You would need to train a language model on a large dataset of text, and then use the model to generate responses based on user input. There are also pre-trained language models available, such as GPT-3, that you can use to build interactive applications. Additionally, you may want to consider using a framework like Flask or Django to create a web-based interface for your application.')

In [63]:
demo_ephemeral_chat_history.add_ai_message(response)

demo_ephemeral_chat_history.add_user_message("Do you have any learnign resources for me to study?")

chain.invoke({"messages": demo_ephemeral_chat_history.messages})

AIMessage(content='Certainly! There are many resources available for studying natural language processing and building interactive applications. Here are a few suggestions:\n\n1. Coursera and Udemy offer courses on natural language processing and machine learning that can provide a solid foundation for building language models and interactive applications.\n\n2. The TensorFlow and PyTorch websites provide extensive documentation, tutorials, and examples for building and training language models.\n\n3. "Natural Language Processing in Action" by Lane, Howard, and Hapke is a highly regarded book that covers the fundamentals of natural language processing and provides practical examples.\n\n4. GitHub is a great resource for finding open-source projects related to natural language processing and interactive applications. You can study the code and learn from the implementations of others.\n\n5. Online communities such as Stack Overflow, Reddit, and specialized forums like the TensorFlow and

# Interactive chatbox

In [66]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.31.1-py2.py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting altair<6,>=4.0
  Downloading altair-5.2.0-py3-none-any.whl (996 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m996.9/996.9 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7
  Downloading GitPython-3.1.41-py3-none-any.whl (196 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.4/196.4 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Collecting blinker<2,>=1.0.0
  Downloading blinker-1.7.0-py3-none-any.whl (13 kB)
Collecting rich<14,>=10.14.0
  Downloading rich-13.7.0-py3-none-any.whl (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.6/240.6 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tzlocal<6,>=1.1
  Downloading tzlocal-5.2-py3-none-any.w

In [67]:
import streamlit
import openai
import langchain

In [68]:
# a simple question-answering chatbox
import streamlit as st
#from langchain_community.llms import OpenAI

st.title('Auto-Summarization App')

openai_api_key = st.sidebar.text_input('OpenAI API Key', type='password')

def generate_response(input_text):
    llm = OpenAI(temperature=0.7, openai_api_key=openai_api_key)
    st.info(llm(input_text))

with st.form('my_form'):
    text = st.text_area('Enter text:', 'What are the three key pieces of advice for learning how to code?')
    submitted = st.form_submit_button('Submit')
    if not openai_api_key.startswith('sk-'):
        st.warning('Please enter your OpenAI API key!', icon='⚠')
    if submitted and openai_api_key.startswith('sk-'):
        generate_response(text)

2024-02-14 11:47:45.390 
  command:

    streamlit run /Users/rufen/anaconda3/lib/python3.10/site-packages/ipykernel_launcher.py [ARGUMENTS]


# SQL 
* sql agent
* interact with csv data file

In [None]:
# sql agent 
from langchain_community.agent_toolkits import create_sql_agent
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
agent_executor = create_sql_agent(llm, db=db, agent_type="openai-tools", verbose=True)

# then use natural language to ask your question 

# CSV

*  use pandas, this approach is not fit for production use cases unless you have extensive safeguards in place

In [None]:
# can be deleted?
ai_msg = llm.invoke(
    "I have a pandas DataFrame 'df' with columns 'Age' and 'Fare'. Write code to compute the correlation between the two columns. Return Markdown for a Python code snippet and nothing else."
)
print(ai_msg.content)

In [None]:
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from langchain_experimental.tools import PythonAstREPLTool

df = pd.read_csv("titanic.csv")
tool = PythonAstREPLTool(locals={"df": df})
tool.invoke("df['Fare'].mean()")

In [None]:
llm_with_tools = llm.bind_tools([tool], tool_choice=tool.name)
llm_with_tools.invoke(
    "I have a dataframe 'df' and want to know the correlation between the 'Age' and 'Fare' columns"
)

In [None]:
from langchain.output_parsers.openai_tools import JsonOutputKeyToolsParser

parser = JsonOutputKeyToolsParser(tool.name, return_single=True)
(llm_with_tools | parser).invoke(
    "I have a dataframe 'df' and want to know the correlation between the 'Age' and 'Fare' columns"
)

In [None]:
from operator import itemgetter

from langchain_core.messages import ToolMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough

system = f"""You have access to a pandas dataframe `df`. \
Here is the output of `df.head().to_markdown()`:

```
{df.head().to_markdown()}
```

Given a user question, write the Python code to answer it. \
Don't assume you have access to any libraries other than built-in Python ones and pandas.
Respond directly to the question once you have enough information to answer it."""
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            system,
        ),
        ("human", "{question}"),
        # This MessagesPlaceholder allows us to optionally append an arbitrary number of messages
        # at the end of the prompt using the 'chat_history' arg.
        MessagesPlaceholder("chat_history", optional=True),
    ]
)


def _get_chat_history(x: dict) -> list:
    """Parse the chain output up to this point into a list of chat history messages to insert in the prompt."""
    ai_msg = x["ai_msg"]
    tool_call_id = x["ai_msg"].additional_kwargs["tool_calls"][0]["id"]
    tool_msg = ToolMessage(tool_call_id=tool_call_id, content=str(x["tool_output"]))
    return [ai_msg, tool_msg]


chain = (
    RunnablePassthrough.assign(ai_msg=prompt | llm_with_tools)
    .assign(tool_output=itemgetter("ai_msg") | parser | tool)
    .assign(chat_history=_get_chat_history)
    .assign(response=prompt | llm | StrOutputParser())
    .pick(["tool_output", "response"])
)

In [None]:
chain.invoke({"question": "What's the correlation between age and fare"})

# Tagging (label document)
use case:
* sentiment
* language
* style (formal, informal etc.)
* covered topics
* political tendency

<img src="tagging.png" alt="tag">

In [None]:
# simple example
# Schema
schema = {
    "properties": {
        "sentiment": {"type": "string"},
        "aggressiveness": {"type": "integer"},
        "language": {"type": "string"},
    }
}

# LLM
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
chain = create_tagging_chain(schema, llm)

# Extraction
* synthetic data generation
* how to extract data using langchain
* pydantic_schema = (a class)

<img src="extraction.png" alt="ext">

In [None]:
from langchain.chains import create_extraction_chain
from langchain_openai import ChatOpenAI

# Schema
schema = {
    "properties": {
        "name": {"type": "string"},
        "height": {"type": "integer"},
        "hair_color": {"type": "string"},
    },
    "required": ["name", "height"],
}

# Input
inp = """Alex is 5 feet tall. Claudia is 1 feet taller Alex and jumps higher than him. Claudia is a brunette and Alex is blonde."""

# Run chain
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
chain = create_extraction_chain(schema, llm)
chain.run(inp)

# Web Scraping

<img src="web.png" alt="web scraping">

# Interacte with APIs

In [4]:
from langchain.chains.openai_functions.openapi import get_openapi_chain

chain = get_openapi_chain(
    "https://www.klarna.com/us/shopping/public/openai/v0/api-docs/"
)
chain("What are some options for a men's large blue button down shirt")

Attempting to load an OpenAPI 3.0.1 spec.  This may result in degraded performance. Convert your OpenAPI spec to 3.1.* spec for better support.


ValueError: Unable to parse spec from source https://www.klarna.com/us/shopping/public/openai/v0/api-docs/

In [6]:
from langchain.chains import APIChain
from langchain.chains.api import open_meteo_docs
from langchain_openai import OpenAI

llm = OpenAI(temperature=0, openai_api_key="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")
chain = APIChain.from_llm_and_api_docs(
    llm,
    open_meteo_docs.OPEN_METEO_DOCS,
    verbose=True,
    limit_to_domains=["https://api.open-meteo.com/"],
)
chain.run(
    "What is the weather like right now in Munich, Germany in degrees Fahrenheit?"
)

  warn_deprecated(




[1m> Entering new APIChain chain...[0m


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}