In [1]:
!pip install torch langchain transformers torchvision

Collecting langchain
  Downloading langchain-0.3.9-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.21 (from langchain)
  Downloading langchain_core-0.3.21-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.2-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Collecting packaging>=20.0 (from transformers)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting requests-toolbelt<2.0.0,>=1.0.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Downloading langchain-0.3.9-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading langchain_core-0.3.21-py3-none-any.whl (409 kB)


In [2]:
# Import necessary modules
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# Load the summarization pipeline (default: BART)
def load_summarizer(model_name="facebook/bart-large-cnn"):
    """
    Build a text -> summary callable for the given Hugging Face model.

    Model names starting with "t5" are loaded with AutoModelForSeq2SeqLM and
    AutoTokenizer and prompted with a "summarize: " prefix. Every other model
    name (including the default BART checkpoint) is served through the
    transformers "summarization" pipeline.

    Args:
        model_name (str): The name of the Hugging Face model.
    Returns:
        A function that takes a text string and returns its summary string.
    """
    if model_name.startswith("t5"):
        # Use T5 summarizer
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        def t5_summarizer(text):
            # Inputs longer than 512 tokens are truncated rather than rejected.
            input_ids = tokenizer.encode(f"summarize: {text}", return_tensors="pt", truncation=True, max_length=512)
            outputs = model.generate(input_ids, max_length=130, min_length=30, length_penalty=2.0, num_beams=4)
            return tokenizer.decode(outputs[0], skip_special_tokens=True)

        return t5_summarizer

    # Use the summarization pipeline for BART and any other non-T5 model.
    # No device is forced here, so the pipeline picks its own default instead
    # of failing on hosts without a GPU.
    summarizer = pipeline("summarization", model=model_name)

    def bart_summarizer(text):
        # truncation=True mirrors the T5 branch: over-long chunks are cut to
        # the model's input limit instead of raising.
        summary = summarizer(text, max_length=130, min_length=30, do_sample=False, truncation=True)
        return summary[0]["summary_text"]

    # Return the wrapper (text -> str), not the raw pipeline (text -> list of dicts).
    return bart_summarizer


In [4]:

# Split text into manageable chunks using LangChain's RecursiveCharacterTextSplitter
def split_text_with_langchain(text, chunk_size=4096, chunk_overlap=200):
    """
    Break a long string into overlapping pieces with LangChain's
    RecursiveCharacterTextSplitter.

    Args:
        text (str): The text to split.
        chunk_size (int): Upper bound on the size of each chunk, passed to the
            splitter unchanged (measured by the splitter's length function;
            presumably character count by default - confirm against the
            LangChain documentation).
        chunk_overlap (int): Overlap between consecutive chunks, passed to the
            splitter unchanged.
    Returns:
        List of text chunks.
    """
    # Separators are listed from coarsest to finest: blank line, newline,
    # space, and finally the empty string.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)


In [5]:
# Function to summarize text
def summarize_text(file, model_name="facebook/bart-large-cnn", chunk_size=4096, chunk_overlap=200):
    """
    Summarizes the content of a given .txt file.

    The file is read as UTF-8, split into chunks with
    split_text_with_langchain, each chunk is summarized independently, and
    the per-chunk summaries are joined with single spaces.

    Args:
        file (str): Path to the .txt file.
        model_name (str): Hugging Face model for summarization.
        chunk_size (int): Forwarded to split_text_with_langchain.
        chunk_overlap (int): Forwarded to split_text_with_langchain.
    Returns:
        Summary as a string. Chunks whose summarization raises are reported
        on stdout and left out of the result.
    Raises:
        ValueError: If load_summarizer does not return a callable for
            model_name.
    """
    # Load summarization model
    summarizer = load_summarizer(model_name)
    if not callable(summarizer):
        # Fail once, up front, instead of printing one error per chunk and
        # returning an empty summary.
        raise ValueError(f"No summarizer available for model '{model_name}'")

    # Read text from the file; the handle gets its own name so the `file`
    # path argument is not shadowed.
    with open(file, "r", encoding="utf-8") as handle:
        text = handle.read()

    # Split text if it's too long
    chunks = split_text_with_langchain(text, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # Summarize each chunk
    summaries = []
    for chunk in chunks:
        try:
            summaries.append(summarizer(chunk))
        except Exception as e:
            # Best effort: one failing chunk must not discard the others.
            print(f"Error summarizing chunk: {e}")

    # Combine all summaries
    return " ".join(summaries)


In [6]:

# Location of the input .txt file in this environment
input_file = "/kaggle/input/map-reduce-2/mapreduce_osdi04.txt"

# Hugging Face model name handed to summarize_text (a T5 checkpoint here)
model_name = "t5-small"

# Run the summarization and report either the result or the failure
try:
    summary = summarize_text(input_file, model_name)
    print("Summaries done", summary, sep="\n")
except Exception as err:
    print(f"An error occurred: {err}")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Summaries done
map function processes a key/value pair to generate a set of intermediate key/value pairs, and a reduce function that merges all intermediate values associated with the same intermediate key. many real world tasks are expressible in this model, as shown in the paper. the run-time system takes care of the details of partitioning the input data, scheduling the pro- gram's execution across a set of machines, handling ma- chine failures, and managing the required inter-machine communication. the user of the MapReduce library expresses the computation as two functions: Map and Reduce. Map, written by the user, takes an input pair and pro- duces a set of intermediate key/value pairs. the reducefunction sums together all counts emitted for a particular word. duce function is passed all per-document term vectors for a given host. it adds these term vectors together, throwing away infrequent terms, and then emits a nal hhostname; term vectori pair. the reduce function accepts all