In [21]:
import os
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

In [22]:
def load_preprocessed_mda(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def chunk_text(text, max_chunk_size, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_chunk_size - overlap):
        chunk = ' '.join(words[i:i + max_chunk_size])
        chunks.append(chunk)
    return chunks

def generate_summary(text, model, tokenizer, max_length=400, min_length=30):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(
        inputs, 
        max_length=max_length, 
        min_length=min_length, 
        length_penalty=2.0, 
        num_beams=4, 
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def summarize_long_text(text, model, tokenizer, max_chunk_size=400):
    chunks = chunk_text(text, max_chunk_size)
    chunk_summaries = []
    for chunk in chunks:
        chunk_summary = generate_summary(chunk, model, tokenizer)
        chunk_summaries.append(chunk_summary)
    
    full_summary = ' '.join(chunk_summaries)
    if len(tokenizer.encode(full_summary)) > 512:
        return generate_summary(full_summary, model, tokenizer)
    return full_summary


In [23]:
# Load T5 model and tokenizer
model_name = "t5-large"  # Using t5-large for better performance
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [24]:
# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [25]:
# Directories
input_dirs = ['Final_Processed_Dataset/Bankrupt', 'Final_Processed_Dataset/Healthy']
output_dirs = ['Summarized_MDA/Bankrupt', 'Summarized_MDA/Healthy']

# Ensure output directories exist
for dir in output_dirs:
    os.makedirs(dir, exist_ok=True)

In [26]:
# Process files
for input_dir, output_dir in zip(input_dirs, output_dirs):
    print(f"Processing files in {input_dir}")
    for filename in tqdm(os.listdir(input_dir)):
        if filename.endswith('.txt'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, filename)
            
            # Load and preprocess text
            mda_text = load_preprocessed_mda(input_path)
            
            
            # Generate summary
            summary = summarize_long_text(mda_text, model, tokenizer)
            
            # Save summary
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(summary)

print("Summarization complete!")

Processing files in Final_Processed_Dataset/Bankrupt


100%|██████████| 131/131 [1:48:43<00:00, 49.80s/it]  


Processing files in Final_Processed_Dataset/Healthy


100%|██████████| 130/130 [3:00:29<00:00, 83.30s/it]  

Summarization complete!



