In [21]:
# https://huggingface.co/docs/transformers/en/installation

In [22]:
import os
from dotenv import load_dotenv
# Load environment variables from the dotenv file at ../.Venv before any client
# is created; presumably this is where the OpenAI API key lives — TODO confirm.
load_dotenv("../.Venv")

True

In [23]:
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.llms import OpenAI

from transformers import GPT2TokenizerFast
from loguru import logger

In [24]:



class ModelApi:
    """Small convenience wrapper around the LangChain ``OpenAI`` LLM client.

    The generation settings are kept as public attributes and the client
    itself is reachable through the ``OpenAI`` attribute.
    """

    def __init__(self, temperature=0.5, model="gpt-3.5-turbo", max_tokens=1000):
        """
        Record the generation settings and build the underlying client.

        :param temperature: Sampling temperature forwarded to the client. Default is 0.5.
        :param model: Model identifier forwarded to the client. Default is "gpt-3.5-turbo".
            NOTE(review): the notebook overrides this with "gpt-3.5-turbo-instruct" and
            links an error about chat models on the completions endpoint — confirm the
            default is actually usable with this client.
        :param max_tokens: Token limit forwarded to the client. Default is 1000.
        """
        self.temperature, self.model, self.max_tokens = temperature, model, max_tokens
        client_kwargs = {
            "temperature": temperature,
            "model": model,
            "max_tokens": max_tokens,
        }
        self.OpenAI = OpenAI(**client_kwargs)

    def generate_text(self, prompt):
        """
        Send a prompt to the wrapped client and hand back its answer.

        :param prompt: The input text prompt to generate a response for.
        :return: Whatever the client's ``invoke`` call returns for that prompt.
        """
        return self.OpenAI.invoke(input=prompt)


In [25]:
# Read the article as ONE string rather than a list of lines: summarize_doc
# rejects any content that is not a str, and `document` is used as a str
# everywhere else in this notebook.
with open("from_stats_to_datascience.txt", encoding="utf-8") as f:
    document = f.read()

In [26]:
# The instruct model is selected explicitly instead of the ModelApi default.
# Presumably this works around the error discussed here ("this is a chat model
# and not supported in the v1/completions ..."):
# https://stackoverflow.com/questions/75774873/openai-api-error-this-is-a-chat-model-and-not-supported-in-the-v1-completions
params = dict(
    temperature=0.5,
    model="gpt-3.5-turbo-instruct",
    max_tokens=1000,
)
llm = ModelApi(**params)

In [27]:
# Quick smoke test of the wrapper with a short prompt. The typo in the prompt
# is left untouched so the call still matches the recorded output below.
llm.generate_text("hey, how ae you")

" doing\n\nI'm doing well, thank you for asking. How about you?"

In [28]:
def get_chain(llm):
    """Build a map-reduce summarization chain around a fixed instruction prompt.

    :param llm: Language model object handed to ``load_summarize_chain``.
    :return: The chain produced by ``load_summarize_chain``; one and the same
        prompt object is used for the map step and for the combine step.
    """
    # The indentation and trailing spaces inside the template are part of the
    # prompt text sent to the model, so they are kept verbatim.
    # NOTE(review): the wording targets an aircraft nacelle maintenance corpus,
    # while this notebook summarizes from_stats_to_datascience.txt — confirm
    # the prompt is the intended one.
    instruction_template = """
    ### Instruction:
    The following text is an extract from a corpus of documents relevant to an aircraft Nacelle maintenance case.
    Please write a concise and informative 100-word summary that captures the key points and critical information from the extract. 
    Ensure that the summary includes important details such as the main issues, actions taken, and outcomes related to the maintenance case. 
    Avoid repetition and focus on clarity and completeness. Results In english please.

    {text}

    ### Summary:
    """
    shared_prompt = PromptTemplate(
        template=instruction_template,
        input_variables=["text"],
    )
    chain_options = {
        "chain_type": "map_reduce",
        "map_prompt": shared_prompt,
        "combine_prompt": shared_prompt,
    }
    return load_summarize_chain(llm, **chain_options)


In [30]:


def summarize_doc(content, summarize_chain, timeout=5*60):
    """Split a text into token-bounded chunks and summarize them with a chain.

    :param content: Text to summarize. Any value that is not a ``str`` is
        rejected: a message is logged and the failure value is returned.
    :param summarize_chain: Object whose ``invoke`` method is called with the
        list of chunk documents.
    :param timeout: Accepted for caller compatibility; this function never
        reads it.
    :return: Whatever ``summarize_chain.invoke`` returns on success. On any
        failure the list ``["Summary failed"]`` is returned instead. NOTE: this
        failure value is a list, while the notebook indexes the result with
        ``"output_text"``, so callers should check the type before indexing.
    """
    summary = ["Summary failed"]
    if not isinstance(content, str):
        logger.info("content is not string")
        return summary

    # Drop trailing spaces in front of line breaks. `count` and `flags` are
    # passed by keyword because passing them positionally is deprecated since
    # Python 3.13.
    content = re.sub(r" +\n", "\n", content, count=0, flags=re.MULTILINE)
    try:
        tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
        # Separators are listed from coarsest to finest; the first one looks
        # like a page marker inserted upstream — TODO confirm.
        text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            tokenizer, chunk_size=1024, chunk_overlap=100,
            separators=["---------------page-------------", "\n\n\n", "\n\n", "\n", " ", ""]
        )
        docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(content)]

        # NOTE(review): token_max is passed as an invoke() keyword; confirm the
        # chain actually honours it when supplied this way.
        summary = summarize_chain.invoke(input=docs, token_max=1800)
    except Exception as e:
        # Best effort: log the start of the offending text plus the traceback
        # and fall through to return the failure value.
        logger.info(content[:100])
        logger.exception(e)
    return summary



In [31]:
# Load the complete article as a single string, the form summarize_doc accepts.
with open("from_stats_to_datascience.txt", encoding="utf-8") as text_file:
    document = text_file.read()

In [32]:

# Build the summarization chain on the LangChain client held by the wrapper
# (llm.OpenAI), not on the ModelApi wrapper itself.
s_chain  = get_chain(llm.OpenAI)
# NOTE: summarize_doc accepts `timeout` but its body never reads it, so the
# 3-minute value passed here has no effect.
results = summarize_doc(document, s_chain, timeout=3*60)

In [34]:
# Display the final summary text. NOTE: summarize_doc returns the list
# ["Summary failed"] on failure, in which case this string index raises
# TypeError.
results["output_text"]


'\nThe role of statisticians has evolved with the rise of digital technologies and the increasing amount of data available. Historically focused on demographic, economic, and social data, they now use advanced techniques such as data mining and text analysis. Vectorization has greatly improved the analysis of unstructured text, while techniques like convolutional neural networks and geographic information systems are used for images and geospatial data. Data scientists have emerged to handle the growing amount of unstructured data, using their skills in statistics, programming, and visualization. However, the field continues to evolve with new techniques such as deep learning and temporal data analysis, requiring professionals to stay updated. Ethical and regulatory considerations also play a significant role in this field.'