In [3]:
# from langchain.chat_models import ChatOpenAI
# from langchain.prompts import PromptTemplate
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS
from langchain.schema import Document


# Dynamic Header Generation

In [4]:
chunk = "ChatGPT is an advanced AI language model developed by OpenAI, based on the GPT (Generative Pre-trained Transformer) architecture. It is designed to understand and generate human-like text based on the input it receives. ChatGPT can engage in conversations, answer questions, write content, summarize information, and assist with a variety of tasks across domains. Trained on large datasets from the internet, it leverages deep learning to provide contextually relevant responses. Ideal for both casual use and professional applications, ChatGPT represents a significant step forward in natural language processing, making human-computer interaction more intuitive and accessible than ever before."

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

llm = ChatOpenAI(model="gpt-4")

header_prompt = PromptTemplate(
    input_variables=["content"],
    template="Create a short, descriptive title for the following content:\n\n{content}"
)

def generate_header(text):
    return llm.predict(header_prompt.format(content=text))

header = generate_header(chunk)



  return llm.predict(header_prompt.format(content=text))


In [6]:
print("Header:", header)

Header: "ChatGPT: OpenAI's Advanced AI Language Model Revolutionizing Human-Computer Interaction"


In [7]:
# combine header with chunk
chunk_with_header = f"Header: {header}\n\nContent: {chunk}"
# Create LangChain document
doc = Document(page_content=chunk_with_header, metadata={"header": header})

# Embedding-Aware Section Titles


In [9]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2", device='cpu')

topics = ["Transformer Models", "Language Generation", "Chatbot Applications"]
topic_embeddings = model.encode(topics, convert_to_tensor=True)

chunk_text = "Transformers use attention mechanisms to weigh importance of tokens..."
chunk_embedding = model.encode(chunk_text, convert_to_tensor=True)

# Find most similar topic
cos_scores = util.pytorch_cos_sim(chunk_embedding, topic_embeddings)
best_topic = topics[cos_scores.argmax()]
print("Suggested Title:", best_topic)


Suggested Title: Transformer Models


# Summarization-Based Token Reduction


In [10]:
large_chunk_text = """Transformers have revolutionized the field of machine learning, particularly in Natural Language Processing (NLP). Introduced in the 2017 paper "Attention Is All You Need," they fundamentally changed how models process sequential data like text.

Unlike earlier recurrent neural networks (RNNs) and long short-term memory (LSTMs) that processed words sequentially, Transformers leverage a mechanism called self-attention. This allows them to weigh the importance of different words in a sequence simultaneously, regardless of their position. For example, in the sentence "The quick brown fox jumped over the lazy dog," a Transformer can understand the relationship between "fox" and "dog" directly, without having to process all the words in between step-by-step.

The core components of a Transformer include encoders and decoders. Encoders process the input sequence, while decoders generate the output. Both consist of multiple layers, each containing multi-head self-attention and position-wise feed-forward networks. Positional encodings are added to the input embeddings to provide information about the order of words, as the parallel processing inherent in self-attention would otherwise lose this crucial context.

This parallel processing capability is a major advantage, leading to significantly faster training times and enabling the development of massive models like BERT and GPT. Transformers have since expanded beyond NLP into computer vision (Vision Transformers or ViTs), revolutionizing tasks like image classification and object detection. Their ability to capture long-range dependencies and handle large datasets has made them the backbone of modern AI, driving advancements in areas like text generation, machine translation, and even protein folding prediction."""

In [13]:
from langchain.chains.summarize import load_summarize_chain

print("Large Chunk Size:", len(large_chunk_text))

docs = [Document(page_content=large_chunk_text)]
chain = load_summarize_chain(llm, chain_type="map_reduce")
summary = chain.run(docs)
# print("Summarized Chunk:", summary)

print("Summarized Chunk Size:", len(summary))


Large Chunk Size: 1786
Summarized Chunk Size: 586


# Importance-Aware Trimming


In [16]:
long_chunk_text = "GPT was released in 2018. OpenAI is its developer. " \
"GPT is transformative for NLP. It can summarize text."

from sklearn.feature_extraction.text import TfidfVectorizer

def importance_trim(text, top_n=3):
    sentences = text.split('. ')
    tfidf = TfidfVectorizer().fit_transform(sentences)
    scores = tfidf.sum(axis=1).A1
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return '. '.join([sentences[i] for i in top_indices])

trimmed = importance_trim(long_chunk_text)
print("Trimmed Chunk:", trimmed)



Trimmed Chunk: GPT was released in 2018. GPT is transformative for NLP. It can summarize text.


# LLM-Assisted Compression


In [17]:
from langchain.prompts import PromptTemplate

chunk_text = "GPT-4, developed by OpenAI, is one of the most powerful models, trained on vast data..."

compression_prompt = PromptTemplate(
    input_variables=["content"],
    template="Compress the following content into a concise, information-dense paragraph:\n\n{content}"
)

def compress_text(content):
    return llm.predict(compression_prompt.format(content=content))

compressed = compress_text(chunk_text)
print("Compressed Output:", compressed)


Compressed Output: Developed by OpenAI, GPT-4 stands as one of the most potent models, benefiting from extensive training on expansive data sets.
