# <strong> Problem Statement: Bankruptcy Prediction using Natural Language Processing </strong>

# Summary Generation using BART(Facebook)

This notebook contains code to generate a summary for each text file in the dataset. We experimented with different models, such as T5 and BART, and analysed their ROUGE scores to select the best model for summarization.

In [None]:
from transformers import pipeline, AutoTokenizer
import os

# Hugging Face checkpoint used for both the pipeline and the tokenizer.
MODEL_NAME = "facebook/bart-large-cnn"

# Summarization pipeline that produces the per-chunk summaries.
summarizer = pipeline("summarization", model=MODEL_NAME)

# Standalone tokenizer from the same checkpoint, used to measure how many
# tokens a piece of text occupies before it is sent to the pipeline.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [1]:
def split_text(text, max_tokens=1024, token_counter=None):
    """Split ``text`` into consecutive chunks that stay under ``max_tokens``.

    The text is cut at every ``'. '`` and the resulting sentences are greedily
    packed into chunks. The delimiter is restored on every sentence except the
    last one, so concatenating the returned chunks reproduces ``text`` exactly.

    Args:
        text: The document to split.
        max_tokens: Exclusive upper bound on the token count of a chunk.
        token_counter: Optional callable taking a string and returning its
            token count. When omitted, the module-level ``tokenizer`` is used
            (``len(tokenizer.encode(piece))``).

    Returns:
        A list of non-empty chunk strings, in document order. Empty input
        yields an empty list.

    Note:
        A single sentence that is itself at or above ``max_tokens`` cannot be
        split further here; it is returned as its own (oversized) chunk and
        must be truncated by the caller or the model.
    """
    if token_counter is None:
        def count_tokens(piece):
            return len(tokenizer.encode(piece))
    else:
        count_tokens = token_counter

    sentences = text.split('. ')
    last_index = len(sentences) - 1

    chunks = []
    current_chunk = ""
    for index, sentence in enumerate(sentences):
        # str.split drops the delimiter; put it back on every piece except the
        # final one, which never had a trailing '. ' in the source text.
        piece = sentence if index == last_index else sentence + '. '

        if count_tokens(current_chunk + piece) < max_tokens:
            current_chunk += piece
            continue

        # The piece does not fit: flush what has been gathered so far. The
        # guard avoids emitting an empty chunk when the very first sentence
        # is already over the budget.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = piece

    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [3]:
def generate_summary(input_folder, output_folder):
    """Summarize every ``.txt`` file in ``input_folder`` into ``output_folder``.

    Each document is split with ``split_text``, every chunk is summarized with
    the module-level ``summarizer`` pipeline, and the chunk summaries are
    joined with single spaces and written to ``summary_<filename>``.

    Args:
        input_folder: Directory containing the source ``.txt`` documents.
        output_folder: Directory that receives the summary files; created if
            it does not exist.

    Returns:
        None. Progress is reported with ``print``.

    Note:
        Summarization is best-effort per chunk: a chunk whose summarization
        raises is reported and skipped, and the rest of the document is still
        processed.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Only .txt files are processed, so only they count toward the progress
    # total. Sorting makes the processing order reproducible across runs.
    text_files = sorted(
        name for name in os.listdir(input_folder) if name.endswith(".txt")
    )
    n = len(text_files)

    for counter, filename in enumerate(text_files, start=1):
        file_path = os.path.join(input_folder, filename)

        # Read the financial document
        with open(file_path, "r", encoding="utf-8") as file:
            financial_text = file.read()

        print(f"Processing file: {filename} (Original text length: {len(financial_text)})")

        # Summarize each chunk and collect the partial summaries
        summaries = []
        for chunk in split_text(financial_text):
            # Skip fragments with too little content to summarize
            if len(chunk) <= 10:
                continue

            try:
                # Chunks are sized in tokens by split_text, so they must not be
                # cut by character count here. truncation=True lets the
                # pipeline trim, at the token level, the rare chunk that is
                # still too long (e.g. one very long sentence).
                summary = summarizer(
                    chunk,
                    max_length=400,
                    min_length=30,
                    do_sample=False,
                    truncation=True,
                )
                summaries.append(summary[0]['summary_text'])
            except Exception as e:
                print(f"Error summarizing chunk: {e}")
                continue

        # Combine the partial summaries into one final summary
        final_summary = " ".join(summaries)

        # Save the summary to a new file in the output folder
        output_file_path = os.path.join(output_folder, f"summary_{filename}")
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write(final_summary)

        print(f"Summary saved to: {output_file_path}")
        print(f"{counter}/{n} done!")
    print("summary generated sucessfully")
        

In [None]:
# Summarize the documents of the bankrupt companies.
input_folder = "/kaggle/input/nlp-preprocessed-dataset/Final_Processed_Dataset/Bankrupt"
output_folder = "/kaggle/working/Bankruptcy"

generate_summary(input_folder=input_folder, output_folder=output_folder)

In [None]:
# Summarize the documents of the healthy companies.
input_folder = "/kaggle/input/nlp-preprocessed-dataset/Final_Processed_Dataset/Healthy"
output_folder = "/kaggle/working/Healthy"

generate_summary(input_folder=input_folder, output_folder=output_folder)

In [None]:
# Load one source document for inspection. The context manager closes the
# handle, and the explicit encoding matches how generate_summary reads the
# same files.
with open("/kaggle/input/nlp-preprocessed-dataset/Final_Processed_Dataset/Bankrupt/UVSL_2016_MDA.txt", "r", encoding="utf-8") as file:
    original = file.read()

print(original)

## ROUGE score to evaluate the summary

In [None]:
# Import the rouge_scorer module from the rouge_score package
from rouge_score import rouge_scorer

# Reference text (ground truth) for the ROUGE comparison: a hand-written
# description of the Management Discussion and Analysis (MDA) section.
reference = """
The Management Discussion and Analysis (MDA) section discusses the consolidated financial statements of a company as audited in accordance with Section 129 of the Companies Act 2013 and Accounting Standard 21. It highlights the salient features of the financial statements, ensuring compliance with the first proviso of Section 129 and Rule 5 of the Companies (Accounts) Rules, 2014, specifically referring to the prescribed form AOC-1 in Annexure V. Additionally, the report addresses the dividend policy in the context of accumulated losses.
"""

# Machine-generated summary to be scored, hardcoded here as a placeholder.
# The generated file /kaggle/working/Bankruptcy/summary_UVSL_2016_MDA.txt is
# deliberately not opened: its contents were never read, so opening it only
# leaked a file handle and failed when the summaries had not been generated.
summary = "consolidated financial statement audit consolidated financial statement pursuant section 129 company act 2013 accounting standard 21 consolidated financial statement provide annual report. financial statement associate accordance rst proviso 3 section 129 read rule 5 company account rule 2014 prescribe form aoc 1 annex annexure v report."

def evaluate_summary(reference, summary):
    """Score ``summary`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    The scorer is created with ``use_stemmer=True`` so that words are stemmed
    (e.g. "running" -> "run") before matching, which makes matching more
    lenient.

    Args:
        reference: The ground-truth text.
        summary: The machine-generated summary to evaluate.

    Returns:
        The result of ``RougeScorer.score``: a mapping from metric name to a
        score object exposing ``precision``, ``recall`` and ``fmeasure``.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(reference, summary)


# Score the machine-generated summary against the reference text
scores = evaluate_summary(reference, summary)

# Report precision, recall and F1 for every ROUGE metric:
#   Precision - overlapping words relative to the total words in the summary
#   Recall    - overlapping words relative to the total words in the reference
#   F1 Score  - harmonic mean of precision and recall
print("ROUGE Scores:")
for metric in scores:
    score = scores[metric]
    print(f"{metric}:")
    print(f"  Precision: {score.precision:.4f}")
    print(f"  Recall: {score.recall:.4f}")
    print(f"  F1 Score: {score.fmeasure:.4f}")


## BLEU score to evaluate summary

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# load it from the 'punkt' resource, newer ones from 'punkt_tab', so both are
# requested; a resource unknown to the installed version is reported by
# nltk.download without raising.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# `original` (the source document) and `summary` (the generated summary) are
# defined in earlier cells. Tokenize each of them once.
original_tokens = nltk.word_tokenize(original)
summary_tokens = nltk.word_tokenize(summary)


def evaluate_bleu(original, summary):
    """Return the sentence-level BLEU score of ``summary`` against ``original``.

    Args:
        original: The reference text, either as a string or as an
            already-tokenized sequence of tokens.
        summary: The candidate text, either as a string or as an
            already-tokenized sequence of tokens.

    Returns:
        The BLEU score computed by ``sentence_bleu`` with ``original`` as the
        single reference.
    """
    # Work from the arguments, not module-level globals, so the function
    # scores exactly what it is given. Strings are tokenized here; token
    # sequences are used as they are.
    if isinstance(original, str):
        reference_tokens = nltk.word_tokenize(original)
    else:
        reference_tokens = list(original)

    if isinstance(summary, str):
        candidate_tokens = nltk.word_tokenize(summary)
    else:
        candidate_tokens = list(summary)

    # sentence_bleu expects a list of references (here just one)
    return sentence_bleu([reference_tokens], candidate_tokens)


# Evaluate the BLEU score, reusing the tokens computed above
bleu_score = evaluate_bleu(original_tokens, summary_tokens)

# Print the BLEU score
print(f"BLEU Score: {bleu_score:.4f}")