In [9]:
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter,NLTKTextSplitter,TextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import AzureChatOpenAI
import os
from typing import Any, List
import openai
import openai_config


def get_chunks(text, max_len_per_chunk=200):
    """Split ``text`` into overlapping chunks made of whole sentences.

    The text is cut on the Chinese full stop ("。"). Sentences are accumulated
    until the joined chunk is *longer than* ``max_len_per_chunk`` characters,
    so the limit is a soft threshold: every emitted chunk except possibly the
    last one exceeds it by (part of) one sentence. The next chunk starts up to
    two sentences before the end of the previous one, so neighbouring chunks
    overlap.

    Args:
        text: The text to split.
        max_len_per_chunk: Character threshold after which a chunk is closed.

    Returns:
        A list of chunk strings. If ``text`` is not longer than
        ``max_len_per_chunk`` the list contains ``text`` unchanged.
    """
    if len(text) <= max_len_per_chunk:
        return [text]

    sentences = text.split("。")
    total = len(sentences)
    chunk_list = []
    start = 0
    covered = 0  # sentences[:covered] already appear in at least one chunk
    while start < total:
        chunk = None
        end = start
        while end < total:
            end += 1
            candidate = "。".join(sentences[start:end])
            if len(candidate) > max_len_per_chunk:
                chunk = candidate
                break

        if chunk is None:
            # Ran out of sentences before reaching the threshold. Previously
            # this tail was silently dropped; keep it when it carries content
            # that no earlier chunk contains (ignore a blank trailing piece
            # produced by a final "。").
            if any(piece.strip() for piece in sentences[covered:]):
                chunk_list.append("。".join(sentences[start:]))
            break

        chunk_list.append(chunk)
        covered = max(covered, end)
        if end == total:
            break
        # Step back up to two sentences for overlap, but always advance.
        start = max(start + 1, end - 2)
    return chunk_list


class ChineseSplitter(TextSplitter):
    """Text splitter that delegates the actual splitting to ``get_chunks``."""

    def __init__(self, **kwargs: Any):
        """Pass every keyword option through to ``TextSplitter`` unchanged."""
        super().__init__(**kwargs)

    def split_text(self, text: str) -> List[str]:
        """Return the chunks ``get_chunks`` yields for ``text``.

        ``get_chunks`` is called with its default length threshold; options
        given to the constructor are not consulted here.
        """
        chunks = get_chunks(text)
        return chunks


# Chat model used for summarisation: temperature 0, deployment "gpt-4-32k".
# NOTE(review): endpoint/key are presumably configured via `openai_config`
# or the environment -- confirm before running on a fresh kernel.
llm = AzureChatOpenAI(
    deployment_name="gpt-4-32k",
    temperature=0,
)

# Splitter instance built without arguments.
text_splitter = ChineseSplitter()

In [10]:
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
import nltk
# nltk.download('punkt')

# Read the source text with an explicit encoding so the result does not
# depend on the platform default.
# NOTE(review): assumes text.txt is stored as UTF-8 -- confirm.
with open("text.txt", encoding="utf-8") as f:
    state_of_the_union = f.read()

# Split into sentence-based chunks and wrap each one as a LangChain Document.
texts = text_splitter.split_text(state_of_the_union)
docs = [Document(page_content=t) for t in texts]



In [11]:
# Prompt (Chinese): asks the model, as a bid-document writing expert, to
# extract the requirements and items that matter for writing the bid while
# keeping chapter/structure information. `{text}` is filled with the input.
prompt_template = """你是一名标书写作专家，请根据如下描述提取其中对标书写作比较重要的要求和事项，结果文本中请保留章节等结构信息。:


{text}


摘要为:"""
PROMPT = PromptTemplate(input_variables=["text"], template=prompt_template)

# The same prompt is used for both the per-chunk (map) step and the final
# combine step of the map-reduce summarisation chain.
chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=PROMPT,
    combine_prompt=PROMPT,
    return_intermediate_steps=True,
)
abstract = chain({"input_documents": docs}, return_only_outputs=True)


# `abstract` is the dict returned by the chain call, so iterating it (as
# `writelines` did) only yields its keys. Write the combined summary text
# itself, with an explicit encoding because the content is Chinese.
with open("abstract.txt", "w", encoding="utf-8") as f:
    f.write(abstract["output_text"])

