In [1]:
import difflib
from docx import Document
import openai
import warnings
import re
from summarizer import Summarizer

# Configure the OpenAI GPT API key. 
openai.api_key = "OpenAI Key"

def read_docx(file_path):
    """
    Reads a DOCX file to extract all non-empty paragraphs. This function is particularly useful for processing
    documents where the content is structured in paragraphs, as it skips empty paragraphs that often do not
    contain useful information.

    ### Parameters:
    - `file_path` (str): The full path to the DOCX file, which needs to be accessible by the script.

    ### Returns:
    - list[str]: Returns a list of strings, where each string is a non-empty paragraph found in the DOCX file.
    """
    doc = Document(file_path)
    return [paragraph.text for paragraph in doc.paragraphs if paragraph.text]

def compare_documents(doc1, doc2):
    """
    Compares two lists of document strings line by line and identifies changes between them. This function uses
    the `difflib.unified_diff` to generate differences in a manner that is easy to understand and process further.

    ### Parameters:
    - `doc1` (list[str]): The text content of the first document, split into lines.
    - `doc2` (list[str]): The text content of the second document, split into lines.

    ### Returns:
    - list[str]: A list of strings where each string contains a line that has been identified as either an addition
                 (prefixed with '+') or a deletion (prefixed with '-').
    """
    doc1_text = '\n'.join(doc1)
    doc2_text = '\n'.join(doc2)
    
    diff = list(difflib.unified_diff(doc1_text.splitlines(), doc2_text.splitlines(), lineterm=''))
    return [line for line in diff if line.startswith('+') or line.startswith('-')]


def summarize_changes(changes, chunk_size=200):
    """
    Processes and summarizes document changes using GPT-3.5. Given a list of changes, this function divides
    the changes into manageable chunks and uses OpenAI's API to generate summaries for each chunk, which are then
    combined into a final comprehensive summary.

    ### Parameters:
    - `changes` (list[str]): A list containing strings of document changes.
    - `chunk_size` (int): Determines how many characters each chunk will contain for summarization, affecting
                          processing load and detail in summaries.

    ### Returns:
    - str: A single string that provides a comprehensive summary of all changes.
    """
    changes_text = ' '.join(changes)
    chunks = [changes_text[i:i + chunk_size] for i in range(0, len(changes_text), chunk_size)]
    summaries = []

    for chunk in chunks:
        summary = process_chunk(chunk)
        if summary:  # Ensure non-empty summaries
            summaries.append(summary)

    combined_summary = ' '.join(summaries)
    if len(combined_summary.split()) > 16000:
        combined_summary = second_pass_summary(combined_summary)
    return combined_summary

        
def process_chunk(chunk):
    """
    Calls the OpenAI API to summarize a chunk of text. This function handles the API interaction, sending the text
    and receiving the summarized response.

    ### Parameters:
    - `chunk` (str): A string chunk of text to be summarized.

    ### Returns:
    - str: Summarized version of the chunk, or None if there was an error during processing.
    """
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Provide a concise summary."},
                {"role": "user", "content": f"Summarize this text: {chunk}"}
            ],
            max_tokens=300
        )
        return response.choices[0].message['content']
    except Exception as e:
        print(f"Error processing chunk: {e}")
        return None    

def second_pass_summary(text):
    """
    Handles a second pass of summarization if the combined text is too long.

    ### Parameters:
    - `text` (str): Combined text from initial summarizations.

    ### Returns:
    - str: A final summarized version of the text.
    """
    chunks = [text[i:i + 16000] for i in range(0, len(text), 16000)]
    final_summaries = []
    for chunk in chunks:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "Provide a final comprehensive summary."},
                    {"role": "user", "content": f"Provide a final summary of these changes: {chunk}"}
                ],
                max_tokens=1024
            )
            final_summaries.append(response.choices[0].message['content'])
        except Exception as e:
            print(f"Error in final summarization: {e}")
            final_summaries.append("Final summary process failed.")
    return ' '.join(final_summaries)


def format_summary_as_points(summary_text):
    """
    Formats a summary into a structured list of bullet points. This function enhances readability by organizing
    the summary into clearly defined points, making it easier to scan and comprehend.

    ### Parameters:
    - `summary_text` (str): The full text of the summary to be formatted into bullet points.

    ### Returns:
    - str: A string where each sentence of the summary is formatted as a bullet point.
    """
    sentences = re.split(r'\. |\.\n', summary_text)
    unique_sentences = set(filter(None, sentences))
    bullet_points = "\n".join(f"- {sentence.strip()}" for sentence in unique_sentences if len(sentence.strip()) > 3)
    return bullet_points


# Using BERT model for summarization

# Suppress warnings that are not crucial for the user to see, keeping the output clean and focused
warnings.filterwarnings("ignore", message="Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel")
warnings.filterwarnings("ignore", category=FutureWarning, message="The default value of `n_init` will change from 10 to 'auto' in 1.4.")

def summarize_changes_bert(changes):
    """
    Uses a BERT model to perform extractive summarization on a list of document changes.
    This function leverages a pre-trained BERT model specifically tuned for extractive summarization tasks.

    ### Parameters:
    - `changes` (list[str]): A list containing textual changes detected between two documents.

    ### Returns:
    - str: An extractive summary that captures the essence of the changes succinctly.
    """
    model = Summarizer()  # Initializes the summarization model
    changes_text = ' '.join(changes)  # Combines all changes into a single string for processing
    initial_summary = model(changes_text)  # Generates the summary based on the input text
    return initial_summary

def further_condense_summary_bert(text):
    """
    Reduces the length of an already summarized text by applying a higher compression ratio.
    This function is useful for creating even shorter summaries from detailed summaries, particularly when detail reduction is necessary.

    ### Parameters:
    - `text` (str): The text to condense further, typically already summarized.

    ### Returns:
    - str: A more concise version of the input summary, reduced by the specified ratio.
    """
    model = Summarizer()  # Reuses the BERT model for further condensation
    more_concise_summary = model(text, ratio=0.9)  # Applies a high compression ratio to significantly shorten the text
    return more_concise_summary
        
# Main execution starts here
if __name__ == "__main__":
    # Paths to the documents
    doc1_path = '/Users/soumyadasgupta/Desktop/simmons/data/Jan_2015.docx'
    doc2_path = '/Users/soumyadasgupta/Desktop/simmons/data/Mar_2023.docx'

    doc1 = read_docx(doc1_path)
    doc2 = read_docx(doc2_path)

    changes = compare_documents(doc1, doc2)
    final_summary = summarize_changes(changes)
    formatted_output_gpt = format_summary_as_points(final_summary)

    print("Structured Summary of Changes:")
    print(formatted_output_gpt)
    
    # Compare the documents to find changes using BERT
    if not changes:
        print("No changes detected between the documents.")
    else:
        # Summarize the changes using BERT
        summary = summarize_changes(changes)
        # Further condense the summary
        condensed_summary_bert = further_condense_summary_bert(summary)
        print("BERT Summary of Document Changes:")
        print(condensed_summary_bert)
        

Structured Summary of Changes:
- Apple disclaims liability related to issues with iCloud or the Service, and users are responsible for backing up their systems and purchased content as Apple is not obligated to continue providing content through services
- The text mentions the use of the 1-Click feature for quick transactions and outlines the process for activating in-game purchases
- Apple has the right to cooperate with legal processes regarding user use of services and handle potential issues of harmful or infringing content
- Apple may temporarily remove or cancel services for technical or operational reasons, making efforts to inform users of any issues and protect services from loss, corruption, attacks, viruses, or security breaches
- Users accessing the service from outside the designated country may encounter restrictions to ensure compliance
- Additionally, the text provides safety advice on product usage, including the Family Sharing feature and the responsibility of the or

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERT Summary of Document Changes:
This text provides a detailed overview of the terms and conditions for using Apple's digital platforms, including the iTunes Store, Mac App Store, App Store, and iBooks Store. It outlines the legal agreements users enter into by accepting the terms and conditions, covering aspects such as payment obligations, refunds, cancellations, and billing processes. The text highlights the importance of reading the agreement carefully and emphasizes the user's responsibility to comply with the terms outlined. It also delves into specific features such as Family Sharing, 1-Click purchasing, and the provision of services and content on Apple platforms. Users are advised on safety measures when using digital products, including taking breaks to avoid strain and potential health risks. The text also outlines procedures for managing payments, cancellations, and refunds, providing users with clear guidance on their rights and responsibilities when interacting with Appl