In [1]:
from langchain import PromptTemplate, HuggingFaceHub, LLMChain
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
from transformers import T5Tokenizer, T5ForConditionalGeneration

import os
# Never hardcode API tokens in a notebook: anyone who can read the file can use them.
# Read the Hugging Face Hub token from the environment and only prompt for it
# interactively (without echoing it) when it has not been set already.
# NOTE(review): the token that was previously committed on this line has been
# exposed and should be revoked in the Hugging Face account settings.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

# (1) Introduction: LangChain basics

## Definition of the LLM

In [5]:
# Load the tokenizer and the seq2seq model for the same `google/flan-t5-xl` checkpoint.
FLAN_T5_CHECKPOINT = "google/flan-t5-xl"
tokenizer = T5Tokenizer.from_pretrained(FLAN_T5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(FLAN_T5_CHECKPOINT)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:30<00:00, 15.05s/it]


In [15]:
class CustomLLM(LLM):
    """LangChain `LLM` subclass that answers prompts with a T5 tokenizer/model pair."""

    # Tokenizer used to encode the prompt and to decode the generated ids.
    tokenizer: T5Tokenizer
    # Seq2seq model whose `generate` method produces the completion.
    model: T5ForConditionalGeneration

    @property
    def _llm_type(self) -> str:
        """Return the type identifier string for this LLM."""
        return "custom"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Generate a sampled completion for `prompt`.

        Args:
            prompt: Text passed to the tokenizer and then to the model.
            stop: Must be None; stop sequences are rejected.

        Returns:
            The first generated sequence, decoded with special tokens skipped.

        Raises:
            ValueError: If `stop` is not None.
        """
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")

        # Encode the prompt as PyTorch tensors and keep only the token ids.
        encoded_prompt = self.tokenizer(prompt, return_tensors="pt")

        # Sample (rather than decode greedily) at most 100 new tokens.
        sampling_options = dict(do_sample=True, max_new_tokens=100, temperature=0.8)
        generated = self.model.generate(encoded_prompt.input_ids, **sampling_options)

        first_sequence = generated[0]
        return self.tokenizer.decode(first_sequence, skip_special_tokens=True)
    

In [16]:
# Wrap the tokenizer and model loaded above in the CustomLLM class so that
# LangChain chains can call them.
llm = CustomLLM(tokenizer=tokenizer, model=model)

## Create a prompt template
You can create simple hardcoded prompts using the PromptTemplate class. Prompt templates can take any number of input variables, and can be formatted to generate a prompt.

In [17]:
# Prompt text with a single placeholder, {product}, filled in at run time.
template = """
I want you to act as a naming consultant for new companies.

Here are some examples of good company names:

- search engine, Google
- social media, Facebook
- video sharing, YouTube

The name should be short, catchy and easy to remember.

What is a good name for a company that makes {product}?
"""

# Bind the template to its input variable, then attach it to the LLM.
prompt = PromptTemplate(template=template, input_variables=["product"])
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Run the chain for one product and print the suggested name.
product = "socks"
company_name = llm_chain.run(product=product)
print(company_name)

Socksify


## Load a prompt template from LangChainHub

LangChainHub contains a collection of prompts which can be loaded directly via LangChain.

In [19]:
from langchain.prompts import load_prompt

# Load the conversation prompt through its `lc://` path.
prompt_2 = load_prompt("lc://prompts/conversation/prompt.json")

# Attach the loaded prompt to the same LLM.
llm_chain_2 = LLMChain(llm=llm, prompt=prompt_2)

# The chain is called with two inputs: an (empty) history and the new user input.
question = "What is the result of 1 + 1?"
answer = llm_chain_2.run(input=question, history="")
print(answer)

No `_type` key found, defaulting to `prompt`.


The result of 1 + 1 is 2.


## Pass few shot examples to a prompt template
Few shot examples are a set of examples that can be used to help the language model generate a better response.

To generate a prompt with few shot examples, you can use the FewShotPromptTemplate. This class takes in a PromptTemplate and a list of few shot examples. It then formats the prompt template with the few shot examples.

In this example, we’ll create a prompt to generate word antonyms.

In [22]:
from langchain import PromptTemplate, FewShotPromptTemplate


# Step 1: the few-shot demonstrations, each pairing a word with its antonym.
examples = [
    dict(word="happy", antonym="sad"),
    dict(word="tall", antonym="short"),
]

# Step 2: the template used to render one demonstration.
example_formatter_template = """
Word: {word}
Antonym: {antonym}\n
"""

example_prompt = PromptTemplate(
    template=example_formatter_template,
    input_variables=["word", "antonym"],
)

# Step 3: assemble the full few-shot prompt.
few_shot_prompt = FewShotPromptTemplate(
    # Instructions placed before the demonstrations.
    prefix="Give the antonym of every input",
    # Demonstrations to insert, and the template that formats each of them.
    examples=examples,
    example_prompt=example_prompt,
    # Text placed after the demonstrations; this is where the user input goes.
    suffix="Word: {input}\nAntonym:",
    # Variables the overall prompt expects from the caller.
    input_variables=["input"],
    # String used to join the prefix, the demonstrations and the suffix.
    example_separator="\n\n",
)

llm_chain_3 = LLMChain(llm=llm, prompt=few_shot_prompt)

# The prompt has a single input variable, so the word can be passed positionally.
word = "monstruous"
antonym = llm_chain_3.run(word)
print(antonym)

benign


# (2) Loading documents from our dataset

## - Loading a document

In [2]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.docstore.document import Document
from langchain.chains.summarize import load_summarize_chain

from langchain.indexes import VectorstoreIndexCreator
from langchain.document_loaders import TextLoader

In [5]:
# Select one email of the dataset by category and number.
email_type = "announcements"
email_number = "002"
ex_email_path = f"./Dataset-v1/data/emails/email-{email_type}-{email_number}.rtf"

# Splitter configured with chunk_size=2000 and no overlap between chunks.
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=0)

# Load the file as text documents, then split them into chunks.
loader = TextLoader(ex_email_path)
documents = loader.load()
texts = text_splitter.split_documents(documents)

In [6]:
# Display the chunks returned by the text splitter.
texts

[Document(page_content="{\\rtf1\\ansi\\ansicpg1252\\cocoartf2638\n\\cocoatextscaling0\\cocoaplatform0{\\fonttbl\\f0\\fnil\\fcharset0 HelveticaNeue;}\n{\\colortbl;\\red255\\green255\\blue255;\\red42\\green49\\blue64;\\red245\\green245\\blue246;}\n{\\*\\expandedcolortbl;;\\cssrgb\\c21569\\c25490\\c31765;\\cssrgb\\c96863\\c96863\\c97255;}\n\\paperw11900\\paperh16840\\margl1440\\margr1440\\vieww11520\\viewh8400\\viewkind0\n\\deftab720\n\\pard\\pardeftab720\\sa400\\partightenfactor0\n\n\\f0\\fs32 \\cf2 \\cb3 \\expnd0\\expndtw0\\kerning0\n\\outl0\\strokewidth0 \\strokec2 Subject: Update on Office Relocation\\\nDear all,\\\nAs you know, we are in the process of relocating our corporate office to a new and improved location. We are pleased to inform you that the move is going smoothly, and we are on track to complete the relocation by the end of this month.\\\nOur new office is located at 456 Main Street, which is just a few blocks away from our current location. The new office space is larger

In [3]:
# Load the tokenizer and the seq2seq model for the same `google/flan-t5-xl` checkpoint.
FLAN_T5_CHECKPOINT = "google/flan-t5-xl"
tokenizer = T5Tokenizer.from_pretrained(FLAN_T5_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(FLAN_T5_CHECKPOINT)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
class CustomLLM(LLM):
    """LangChain `LLM` subclass that answers prompts with a T5 model, decoding greedily.

    Deterministic output is what the summarization and question-answering
    chains below need, so generation does not sample.
    """

    # Tokenizer used to encode the prompt and to decode the generated ids.
    tokenizer: T5Tokenizer
    # Seq2seq model whose `generate` method produces the completion.
    model: T5ForConditionalGeneration

    @property
    def _llm_type(self) -> str:
        """Return the type identifier string for this LLM."""
        return "custom"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Generate a deterministic completion for `prompt`.

        Args:
            prompt: Text passed to the tokenizer and then to the model.
            stop: Must be None; stop sequences are rejected.

        Returns:
            The first generated sequence, decoded with special tokens skipped.

        Raises:
            ValueError: If `stop` is not None.
        """
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
        # Greedy decoding. This replaces `do_sample=True, temperature=1e-10`,
        # which only approximated greedy decoding by dividing the logits by
        # 1e-10 before sampling; that trick can overflow in reduced precision
        # and still goes through the random sampler. `do_sample=False` selects
        # the most likely token directly.
        outputs = self.model.generate(
            input_ids,
            do_sample=False,
            max_new_tokens=500,
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
    
# Instantiate the CustomLLM defined just above with the tokenizer and model
# loaded earlier; this rebinds `llm` for the chains that follow.
llm = CustomLLM(tokenizer=tokenizer, model=model)

## - Summarization chain

### CombineDocuments Chains

CombineDocuments chains are useful when you need to run a language model over multiple documents. Common use cases include question answering, question answering with sources, summarization, and more.

**1) "Stuffing"**
Stuffing is the simplest method, whereby you simply stuff all the related data into the prompt as context to pass to the language model. This is implemented in LangChain as the StuffDocumentsChain.

Pros: Only makes a single call to the LLM. When generating text, the LLM has access to all the data at once.

Cons: Most LLMs have a limited context length, and for large documents (or many documents) this will not work, as it will result in a prompt larger than the context length.

The main downside of this method is that it only works on smaller pieces of data. Once you are working with many pieces of data, this approach is no longer feasible. The next two approaches are designed to help deal with that.

**2) "Map Reduce"**
This method involves running an initial prompt on each chunk of data (for summarization tasks, this could be a summary of that chunk; for question-answering tasks, it could be an answer based solely on that chunk). Then a different prompt is run to combine all the initial outputs. This is implemented in LangChain as the MapReduceDocumentsChain.

Pros: Can scale to larger documents (and more documents) than StuffDocumentsChain. The calls to the LLM on individual documents are independent and can therefore be parallelized.

Cons: Requires many more calls to the LLM than StuffDocumentsChain. Loses some information during the final combined call.


**3) "Refine"**
This method involves running an initial prompt on the first chunk of data, generating some output. For the remaining documents, that output is passed in, along with the next document, asking the LLM to refine the output based on the new document.

Pros: Can pull in more relevant context, and may be less lossy than MapReduceDocumentsChain.

Cons: Requires many more calls to the LLM than StuffDocumentsChain. The calls are also NOT independent, meaning they cannot be parallelized like those of MapReduceDocumentsChain. There are also some potential dependencies on the ordering of the documents.

In [8]:
# Build a summarization chain of type "map_reduce" and run it over the chunks.
chain_mr = load_summarize_chain(llm, chain_type="map_reduce")
summary = chain_mr.run(texts)
summary

Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors


"We are in the process of relocating our corporate office to a new and improved location. We are pleased to inform you that the move is going smoothly, and we are on track to complete the relocation by the end of this month. Our new office is located at 456 Main Street, which is just a few blocks away from our current location. The new office space is larger and more modern, and it includes a range of amenities such as a fitness center, a caf'e9, and a rooftop terrace. We are excited about the move and the opportunities it presents for our company and our employees. We believe that the new office will provide a more comfortable and productive work environment, and it will help us to attract and retain top talent. We will be hosting a grand opening event on Friday, April 1st, from 4:00 pm to 7:00 pm, where you can tour the new office and enjoy some refreshments. Please RSVP to this email if you plan to attend the event. Thank you for your attention, and we look forward to welcoming you 

## - Question-Answering chain

For question answering over many documents, you almost always want to create an index over the data. This can be used to smartly access the most relevant documents for a given question, allowing you to avoid having to pass all the documents to the LLM (saving you time and money).

In [9]:
from langchain.vectorstores import Chroma
from langchain.chains import VectorDBQA
from langchain.embeddings import HuggingFaceEmbeddings

# Create your index
# Embeddings used to vectorize the email chunks stored in the index.
embeddings = HuggingFaceEmbeddings()

email_type = "announcements"
# The splitter configuration is the same for every file, so build it once
# instead of re-creating it on each loop iteration.
text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=0)

texts = []
for i in range(4):  # Loading the first 4 emails (numbers 000 to 003)
    # Zero-pad with a format spec: "00" + str(i) would produce "0010" for i = 10.
    email_number = f"{i:03d}"
    ex_email_path = f"./Dataset-v1/data/emails/email-{email_type}-{email_number}.rtf"
    loader = TextLoader(ex_email_path)
    documents = loader.load()
    # Keep every chunk of the email. Taking only element [0] silently dropped
    # the rest of any email split into several chunks, and raised IndexError
    # when the splitter returned no chunk at all.
    texts.extend(text_splitter.split_documents(documents))

# Index the chunks in Chroma and build a question-answering chain of type
# "stuff" on top of that vector store.
db = Chroma.from_documents(texts, embeddings)
qa = VectorDBQA.from_chain_type(llm=llm, chain_type="stuff", vectorstore=db)

# Create your query
query = "When is the deadline for registration for the Employee Training program?"  # Should output 30th June
qa.run(query)

Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.


'Friday, June 30th'

In [10]:
# Ask the question-answering chain who sent the Employee Training email.
query = "Who is the sender of the email about the Employee Training program?" # Expected answer: Maria Rodriguez
qa.run(query)

'Maria Rodriguez'

In [11]:
# Ask the question-answering chain about the office relocation announcement.
query = "What's the update on the office relocation topic?" 
qa.run(query)

'The move is going smoothly, and we are on track to complete the relocation by the end of this month.'

In [12]:
# Ask the question-answering chain what the volunteer program involves.
query = "What will the volunteer program consist of?"
qa.run(query)

'volunteering at local schools, participating in environmental clean-up events, and assisting with fundraising initiatives for local charities'