In [2]:
from  langchain.schema import Document
import json
from typing import Iterable

def load_docs_from_jsonl(file_path)->Iterable[Document]:
    """Read a JSON Lines file and rebuild one ``Document`` per record.

    Every non-blank line is parsed with ``json.loads`` and the resulting
    mapping is passed to ``Document`` as keyword arguments.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list of ``Document`` objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which differs between machines and breaks on non-ASCII page text.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        return [
            Document(**json.loads(line))
            for line in jsonl_file
            # A blank line (typically a trailing newline at EOF) is not a
            # record; json.loads would raise on it.
            if line.strip()
        ]


# Rebuild the documents stored in data.jsonl and report how many were read.
documents = load_docs_from_jsonl('data.jsonl')
print(len(documents))

43


In [3]:
import re
# Collapse every run of three or more whitespace characters into three spaces.
# `\s` also matches `\n` and `\t`, so this single pass yields exactly the same
# text as first capping newline runs and tab runs at three characters and then
# applying it: a capped run is still three or more whitespace characters long
# and is replaced by spaces anyway.
# Note that long runs of newlines/tabs therefore do NOT survive as newlines/tabs.
for doc in documents:
    doc.page_content = re.sub(r'\s{3,}', '   ', doc.page_content)

In [4]:
from dotenv import load_dotenv
import os

# Pull the settings defined in the .env file into the process environment
load_dotenv()

# Look up the OpenAI key; the result is None when OPENAI_KEY is not set
api_key = os.environ.get("OPENAI_KEY")

In [8]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Chat model (temperature 0) shared by the map and the reduce LLM chains.
# Pass the key read from OPENAI_KEY when it is set; otherwise leave the
# argument out so ChatOpenAI keeps using its own OPENAI_API_KEY lookup.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo-16k",
    **({"openai_api_key": api_key} if api_key else {}),
)

In [19]:
# Map
# Prompt for the map step; {docs} is the placeholder filled with document text.
# The wording below is sent to the model verbatim, so typos and the unbalanced
# parenthesis in the instructions have been corrected.
map_template = """
You are a content writer for a tour operator who is writing a hotel summary to go on our website.
The following is a set of documents from the hotel website
{docs}
Based on this list of docs, summarise the docs based on the following category: 
- A paragraph on room categories (include all room category names, style and confirm the following room facilities/amenities: en suite bathroom (but not bath v shower); air conditioning; Wi-Fi)
Not every document will contain the information we require; in such a case do not return a summary.
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
# LLM chain that applies the map prompt using the shared chat model.
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [20]:
# Reduce
# Prompt for the reduce step; {docs} is the placeholder filled with the
# summaries to merge. The wording is sent to the model verbatim, so the
# grammar slips and the unbalanced parenthesis have been corrected.
reduce_template = """
You are a content writer for a tour operator who is writing a hotel summary to go on our website.
The following is a set of summaries of each webpage from a hotel's website:
{docs}
Take these and distill it into a final summary following the main theme:
- A paragraph on room categories (include all room category names, style and confirm the following room facilities/amenities: en suite bathroom (but not bath v shower); air conditioning; Wi-Fi)
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [21]:
# Build the reduce stage
# LLM chain that applies the reduce prompt with the shared chat model
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)

# Joins a list of documents into one string, bound to {docs}, and feeds it to reduce_chain
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="docs",
    llm_chain=reduce_chain,
)

# Reduce driver: groups the mapped documents and reduces them iteratively
reduce_documents_chain = ReduceDocumentsChain(
    # Upper bound on the number of tokens per group of documents
    token_max=4000,
    # Applied when the documents are too large for a single `StuffDocumentsChain` call
    collapse_documents_chain=combine_documents_chain,
    # Final chain that is called to produce the answer
    combine_documents_chain=combine_documents_chain,
)

In [22]:
# Full pipeline: map a chain over the documents, then combine the mapped results
map_reduce_chain = MapReduceDocumentsChain(
    # Name of the prompt variable in the map chain that receives the documents
    document_variable_name="docs",
    # Map step
    llm_chain=map_chain,
    # Reduce step
    reduce_documents_chain=reduce_documents_chain,
    # Leave the individual map-step results out of the output
    return_intermediate_steps=False,
)

In [23]:
# Run the map-reduce pipeline over all loaded documents.
# `Chain.run` is deprecated in current LangChain releases; `invoke` takes the
# inputs as a dict and returns a dict, so the documents are passed under the
# combine-documents chain's "input_documents" key and the final summary is
# read back from "output_text".
summarized_text = map_reduce_chain.invoke({"input_documents": documents})["output_text"]
print(summarized_text)

The Langham Hotels and Resorts offer a variety of stylish and refined room categories, including Rooms, Club Rooms, and Suites. Each room is designed with a blend of modern conveniences and timeless elegance. All rooms come with en suite bathrooms, air conditioning, and Wi-Fi facilities. Please note that specific information about room styles and amenities may vary by location, so it is recommended to visit the hotel's website or contact their reservations team for more details.
