In [6]:
import json
import os
import re
import string
from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from docx import Document
from docx.shared import Pt, RGBColor

# Fetch the NLTK resources named 'punkt' and 'stopwords'
nltk.download('punkt')
nltk.download('stopwords')

# Word document that gets summarised
doc_path = 'sample.docx'

# Number of top-ranked sentences requested from each chunk
num_sentences = 30

# Number of paragraphs grouped into one chunk
paragraphs_per_chunk = 500

# English stopword list and the set of string.punctuation characters
stopwords_list = stopwords.words('english')
punctuation_set = {*string.punctuation}

# Preprocess the text
def preprocess_text(text):
    """Lower-case *text*, strip digit runs and tokenize it.

    Returns a list with one entry per sentence reported by
    ``sent_tokenize``; each entry is the token list ``word_tokenize``
    produces for that sentence.
    """
    # Normalise first so tokenization sees lower-case, digit-free text
    normalized = re.sub(r'\d+', '', text.lower())
    return [word_tokenize(sentence) for sentence in sent_tokenize(normalized)]


# Postprocess the summary content
def postprocess_summary_content(content,heading):
    """Tidy one piece of summary text for display.

    Each sentence found by ``sent_tokenize`` gets its first character
    upper-cased, the sentences are re-joined with single spaces, and a
    closing full stop is appended when the text lacks terminal punctuation.

    Args:
        content: The text to clean up.
        heading: The heading the text belongs to. When ``content`` is the
            heading itself, no full stop is appended.

    Returns:
        The cleaned-up text (an empty string stays empty).
    """
    sentences = sent_tokenize(content)

    # Upper-case only the first character: str.capitalize() would also
    # lower-case the remainder and mangle acronyms and proper nouns.
    sentences = [sentence[:1].upper() + sentence[1:] for sentence in sentences]

    processed_content = ' '.join(sentences).strip()

    # The original test `not content != heading` only punctuated text equal
    # to the heading; ordinary sentences are the ones that need a full stop.
    needs_full_stop = (
        processed_content
        and content != heading
        and not processed_content.endswith(('.', '!', '?'))
    )
    if needs_full_stop:
        processed_content += '.'

    return processed_content


# Extract headings from the document
def extract_headings(document):
    """Collect heading texts from ``document.paragraphs``.

    A paragraph counts as a heading when its style name starts with
    ``'Heading'``. Otherwise it is treated as a heading when any of its runs
    is bold or has an explicit font size above 12pt; consecutive paragraphs
    detected this second way are merged into one heading, separated by a
    space.

    Returns:
        A list of heading strings in document order.
    """
    collected = []
    # True while the previous paragraph was a formatting-detected heading
    continuing_heading = False

    for para in document.paragraphs:
        if para.style.name.startswith('Heading'):
            collected.append(para.text)
            continuing_heading = False
            continue

        emphasized = any(
            run.bold or (run.font.size is not None and run.font.size > Pt(12))
            for run in para.runs
        )
        if not emphasized:
            continuing_heading = False
            continue

        if continuing_heading:
            # Extend the heading started by the preceding paragraph
            collected[-1] += ' ' + para.text
        else:
            collected.append(para.text)
        continuing_heading = True

    return collected



# Generate the summary for a chunk of text
def generate_summary(headings, chunk, num_sentences):
    """Build an extractive summary for one chunk of paragraphs.

    Every paragraph is tokenized with ``preprocess_text``, the resulting
    sentences are vectorised with TF-IDF, scored with
    ``calculate_sentence_scores`` and the top ``num_sentences`` are picked
    with ``get_important_sentences``.

    A selected sentence is filed under ``headings[sentence_id // num_sentences]``
    with key ``sentence_id % num_sentences``; sentences whose heading index
    falls outside ``headings`` are dropped. The heading is therefore chosen
    purely by sentence position, not by where the sentence sits in the
    document.

    Args:
        headings: List of heading strings.
        chunk: List of paragraph strings.
        num_sentences: Number of sentences to select.

    Returns:
        ``{}`` when the chunk is empty or yields no sentences; otherwise a
        ``defaultdict(dict)`` mapping heading -> {index: sentence}. Key ``-1``
        holds a ``'\\n\\n'`` marker written whenever the heading differs from
        the one handled just before.
    """
    if not chunk:
        return {}

    # Flatten the per-paragraph sentence lists into one list of token lists
    tokenized_sentences = [
        tokens
        for paragraph in chunk
        for tokens in preprocess_text(paragraph)
    ]
    if not tokenized_sentences:
        print("No text found for summarization.")
        return {}

    vectorizer = TfidfVectorizer(stop_words=None)
    joined_sentences = [' '.join(tokens) for tokens in tokenized_sentences]
    tfidf_matrix = vectorizer.fit_transform(joined_sentences)
    if tfidf_matrix.shape[0] == 0:
        print("Insufficient data for summarization.")
        return {}

    scores = calculate_sentence_scores(tfidf_matrix)
    ranked_ids = get_important_sentences(scores, num_sentences)

    summary_data = defaultdict(dict)
    last_heading = ''
    seen_sentences = set()
    for sentence_id in ranked_ids:
        heading_id = sentence_id // num_sentences
        sentence_index = sentence_id % num_sentences
        if heading_id >= len(headings):
            continue

        heading = headings[heading_id]
        text = ' '.join(tokenized_sentences[sentence_id]).capitalize().strip()
        if not text.endswith('.'):
            text += '.'

        if heading != last_heading:
            # Marker entry (two newlines) recorded on every heading change
            summary_data[heading][-1] = '\n\n'
            last_heading = heading

        # Skip sentences whose text was already emitted
        if text in seen_sentences:
            continue
        seen_sentences.add(text)
        summary_data[heading][sentence_index] = text

    return summary_data


# Calculate sentence importance scores using cosine similarity
def calculate_sentence_scores(tfidf_matrix):
    """Score each row of *tfidf_matrix* by its total cosine similarity.

    The score of a row is the sum over its row in the pairwise cosine
    similarity matrix of ``tfidf_matrix`` with itself (the row's similarity
    to itself is part of that sum).
    """
    return cosine_similarity(tfidf_matrix, tfidf_matrix).sum(axis=1)


# Identify the most important sentences based on scores
def get_important_sentences(sentence_scores, num_sentences):
    """Return the indices of the highest-scoring sentences, best first.

    Args:
        sentence_scores: Array of scores exposing ``argsort()``.
        num_sentences: Maximum number of indices to return. Zero or a
            negative value yields an empty result.

    Returns:
        An index array of at most ``num_sentences`` entries ordered by
        descending score.
    """
    # Reverse first, then take a prefix: unlike `[-num_sentences:]`, the
    # prefix slice is empty for 0 instead of silently returning everything.
    descending = sentence_scores.argsort()[::-1]
    return descending[:max(num_sentences, 0)]


# Load the document
if os.path.exists(doc_path):
    document = Document(doc_path)
    headings = extract_headings(document)
    paragraphs = [paragraph.text for paragraph in document.paragraphs]

    # Split the paragraphs into consecutive groups of paragraphs_per_chunk
    chunks = [
        paragraphs[start:start + paragraphs_per_chunk]
        for start in range(0, len(paragraphs), paragraphs_per_chunk)
    ]
    # Count the chunks actually built (floor division missed the last,
    # partially filled chunk)
    num_chunks = len(chunks)

    # Generate summaries for each chunk.
    # NOTE: dict.update replaces a heading's whole entry when a later chunk
    # produces the same heading.
    summary_data = defaultdict(dict)
    for chunk in chunks:
        summary_data_chunk = generate_summary(headings, chunk, num_sentences)
        summary_data.update(summary_data_chunk)

    # Post-process the summary content and print.
    # This loop must live inside the `if`: at column 0 the `else` below
    # attached to the `for` (for-else), so the "does not exist" message was
    # printed after every run and a missing file raised NameError.
    for heading, sentences in summary_data.items():
        print(f"\033[1m{heading}:\033[0m \n")
        for sentence_index, sentence in sentences.items():
            if sentence_index == -1:
                # Key -1 only carries the blank-line marker, not a sentence
                continue
            if sentence_index == 0:
                sentence = sentence.lstrip("* ").rstrip(".")
            print(f" * {postprocess_summary_content(sentence, heading)}\n")
else:
    print(f"Document '{doc_path}' does not exist.")


[1mONLINE SHOPPING SYSTEM:[0m 

 * The purpose of this document is to provide a detailed outline of the requirements for the online shopping system .

 * This software requirement specification ( srs ) document outlines the requirements for the development of an online shopping system .

 * The online shopping system is a software system that allows customers to browse and purchase products online from the comfort of their own homes .

 * This document covers the functional and non-functional requirements for the online shopping system .

 * Software requirement specification of an online shopping system having the following sub topics introduction , purpose of this document , scope of this document , overview and general description .

 * This document will serve as a guide for the development team , ensuring that the final product meets the needs of both the business and the end-users .

 * The online shopping system is a web-based software application that enables customers to bro

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PRASANNA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PRASANNA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
