In [None]:
%pip install --force-reinstall -r requirements.txt


In [32]:
# Standard library imports
import os
import re
from datetime import datetime, timedelta, timezone
from io import StringIO
import json
import math
from typing import Dict, List
import logging

# Third-party imports
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.storage.blob import (
    BlobServiceClient,
    BlobClient,
    BlobSasPermissions,
    generate_blob_sas,
    ContainerClient,
)
from openai import AzureOpenAI
import tiktoken
import pandas as pd
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter
from IPython.display import Markdown, display
import openai

# Load environment variables
# Make sure to set all the environment variables
load_dotenv("local.env", override=True)

True

### Solution Approach
![approach](../Other/Comment_Analytics_Solution_Approach.jpg)

In [3]:
# Generates a SAS token for accessing a blob in Azure Storage.
def generate_sas_token(
    blob_service_client: BlobServiceClient, source_blob: BlobClient
) -> str:

    # Create a SAS token that's valid for one hour, as an example
    sas_token = generate_blob_sas(
        account_name=blob_service_client.account_name,
        container_name=source_blob.container_name,
        blob_name=source_blob.blob_name,
        account_key=blob_service_client.credential.account_key,
        permission=BlobSasPermissions(read=True),
        expiry=datetime.now(timezone.utc) + timedelta(hours=1),
        start=datetime.now(timezone.utc) + timedelta(hours=-1),
    )
    return sas_token


# Returns the number of tokens in a text string.
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens


# Aggregates text chunks into larger chunks based on token size, without splitting the initial chunks
def aggregate_chunks(chunks: list, chunk_tokens: int) -> list:
    agg_chunks = []
    current_chunk = ""
    text_chunk_size = chunk_tokens * 4
    # aggregate paragraphs
    for chunk in chunks:
        if len(current_chunk) + len(chunk) + 2 <= text_chunk_size:
            # Add paragraph to the current chunk with two newlines
            current_chunk += chunk + "\n\n"
        else:
            # Add the current chunk to the list and start a new chunk
            agg_chunks.append(current_chunk.strip())
            current_chunk = chunk + "\n\n"

    # Add the last chunk if it's not empty
    if current_chunk:
        agg_chunks.append(current_chunk.strip())

    return agg_chunks


# In some documents, paragraphs may not be consistently separated by double newlines or other clear markers. May be in those cases leveraging NLP packages like Spacy might be helpful.
def split_text_by_paragraphs(text):
    delimiters = [". \r\n", ".\r\n", ".  \n"]
    for delimiter in delimiters:
        try:
            # Attempt to split the text using the current delimiter
            split_result = text.split(delimiter)
            if len(split_result) > 1:
                return split_result
        except Exception as e:
            # If an error occurs, print the error and try the next delimiter
            logger.error(f"Error splitting with delimiter '{delimiter}': {e}")
    # If no valid split was found, return the original text as a single-element list
    logger.info("Could not chunk by paragraph. Returning whole text")
    return [text]


#    Splits a document into chunks based on the specified criteria.
def split_document_into_chunks(
    content_string: str,
    split_type: str,
    file_type: str,
    chunk_tokens: int,
    encoding_name: str,
) -> list:
    num_tokens = num_tokens_from_string(content_string, encoding_name)
    logger.info(f"Number of Tokens: {num_tokens}")

    agg_chunks = []
    chunks = []
    if file_type == "csv":
        df = pd.read_csv(StringIO(content_string))
        # df['combined_text'] = df['title'] + '\n' + df['comment']
        agg_chunks = df.apply(
            lambda row: f"{row['Title']}\n{row['Comment']}", axis=1
        ).tolist()

    else:
        if num_tokens < chunk_tokens:
            agg_chunks.append(content_string)
        elif (file_type == "pdf") and (split_type == "sections"):
            # print("Splitting PDF into sections")
            # Split the document into chunks base on markdown headers.
            headers_to_split_on = [
                ("#", "Header 1"),
                ("##", "Header 2"),
                ("###", "Header 3"),
            ]
            text_splitter = MarkdownHeaderTextSplitter(
                headers_to_split_on=headers_to_split_on
            )
            ts_chunks = text_splitter.split_text(content_string)
            for ts_chunk in ts_chunks:
                chunks.append(ts_chunk.page_content)
            agg_chunks = aggregate_chunks(chunks, chunk_tokens)
            # logger.debug(f"This book has {len(agg_chunks)} sections in it")
        elif file_type == "text":
            # print("Splitting TXT into paragraphs")
            # Split the text into paragraphs. Following may need to be updated
            paragraphs = split_text_by_paragraphs(content_string)
            agg_chunks = aggregate_chunks(paragraphs, chunk_tokens)

    return agg_chunks


# Gets a response from the Azure OpenAI service based on the provided prompts.
def getResponseFromAoAi(systemPrompt, userPrompt, temp=0):
    conversaion = [
        {"role": "system", "content": systemPrompt},
        {"role": "user", "content": userPrompt},
    ]
    try:
        # Send the conversation to the API
        response = client.chat.completions.create(
            model=aoai_api_deployment_name,  # The deployment name you chose when you deployed the GPT-35-Turbo or GPT-4 model.
            messages=conversaion,
            # response_format={ "type": "json_object" }, #requires ptu enabled gpt4
            temperature=temp,
        )
        responseText = response.choices[0].message.content
        return responseText
    except openai.APIError as error:
        if error.code == "content_filter":
            responseText = "Content_Filter_Error"
        return responseText


def clean_json_string(json_string):
    pattern = r"^```json\s*(.*?)\s*```$"
    cleaned_string = re.sub(pattern, r"\1", json_string, flags=re.DOTALL)
    return cleaned_string.strip()


def insights_from_ind_comments(
    chunks: list,
    temperature: float,
    response_type: str,
    output_container_client: ContainerClient,
    llm_response_file_name: str,
    file_type: str,
):
    try:
        if file_type == "csv":
            i = 1
            for chunk in chunks:
                if chunk.strip():
                    final_user_prompt = chunk_insights_usr_prompt.format(
                        document_text=chunk
                    )
                    aoai_response = getResponseFromAoAi(
                        chunk_insights_sys_prompt, final_user_prompt, temp=temperature
                    )
                    if aoai_response == "Content_Filter_Error":
                        print(f"Content Filter error with : {chunk}")
                    else:
                        cleaned_response = clean_json_string(aoai_response)
                        blob_client = output_container_client.get_blob_client(
                            llm_response_file_name + str(i) + ".json"
                        )
                        blob_client.upload_blob(cleaned_response, overwrite=True)
                i += 1
        else:
            aggregated_results = []
            for chunk in chunks:
                if chunk.strip():
                    final_user_prompt = chunk_insights_usr_prompt.format(
                        document_text=chunk
                    )
                    aoai_response = getResponseFromAoAi(
                        chunk_insights_sys_prompt, final_user_prompt, temp=temperature
                    )
                    if aoai_response == "Content_Filter_Error":
                        print(f"Content Filter error with : {chunk}")
                    else:
                        cleaned_response = clean_json_string(aoai_response)
                        json_response = json.loads(cleaned_response)
                        aggregated_results.append(json_response)

            aggregated_results_str = json.dumps(aggregated_results, indent=4)
            main_themes_count = sum(
                1 for obj in aggregated_results if "main_themes" in obj
            )
            if main_themes_count > 1:
                agg_resp_json = aggregate_individual_insights(aggregated_results)
                blob_client = output_container_client.get_blob_client(
                    llm_response_file_name
                )
                blob_client.upload_blob(agg_resp_json, overwrite=True)
            else:
                blob_client = output_container_client.get_blob_client(
                    llm_response_file_name
                )
                aggregated_results_str = json.dumps(aggregated_results[0], indent=4)
                blob_client.upload_blob(aggregated_results_str, overwrite=True)
        return True
    except Exception as e:
        logger.error(f"Failed to generate individual comment insight jsons: {e}")
        return False


def aggregate_individual_insights(ind_insights_json) -> str:
    agg_insights_json: Dict[str, List] = {
        "summary": [],
        "main_themes": [],
        "overall_sentiment": [],
        "aspect_based_sentiment": [],
        "suggestions": [],
    }
    for obj in ind_insights_json:
        agg_insights_json["summary"].append(obj["summary"].strip())
        agg_insights_json["main_themes"].extend(obj["main_themes"])
        agg_insights_json["overall_sentiment"].append(obj["overall_sentiment"])
        agg_insights_json["aspect_based_sentiment"].extend(
            obj["aspect_based_sentiment"]
        )
        suggestions = obj["suggestions"]
        if isinstance(suggestions, str):
            agg_insights_json["suggestions"].append(suggestions)
        else:
            agg_insights_json["suggestions"].extend(suggestions)
    agg_insights_json_str = json.dumps(agg_insights_json, indent=4)
    return agg_insights_json_str


def ind_comment_summary(
    container_client: ContainerClient, temperature: float, comment_folder_name: str
):
    try:
        blob_list = container_client.list_blobs(
            name_starts_with="individual/" + comment_folder_name
        )
        for blob in blob_list:
            if blob.name.endswith(".json"):
                comment_summary_file_name = (
                    blob.name.replace("individual/", "individual_summary/")
                    .replace(" - extracted insights", " - summary")
                    .replace(".json", ".md")
                )
                blob_client = container_client.get_blob_client(blob.name)
                downloaded_blob = blob_client.download_blob().readall()
                # json_data = json.loads(downloaded_blob)
                # ind_comments_json = json.dumps(json_data, indent=4)
                ind_comments_json = downloaded_blob.decode("utf-8")
                final_user_prompt = comment_summary_usr_prompt.format(
                    document_text=ind_comments_json
                )
                aoai_response = getResponseFromAoAi(
                    comment_summary_sys_prompt, final_user_prompt, temp=temperature
                )
                blob_client = container_client.get_blob_client(
                    comment_summary_file_name
                )
                blob_client.upload_blob(aoai_response, overwrite=True)
        return True
    except Exception as e:
        logger.error(f"Failed to generate individual comment summaries: {e}")
        return False


def merge_json_files(container_client: ContainerClient, comment_folder_name: str):
    try:
        aggregated_json: Dict[str, List] = {
            "summary": [],
            "main_themes": [],
            "suggestions": [],
        }
        blob_list = container_client.list_blobs(
            name_starts_with="individual/" + comment_folder_name
        )
        for blob in blob_list:
            if blob.name.endswith(".json"):
                blob_client = container_client.get_blob_client(blob.name)
                downloaded_blob = blob_client.download_blob().readall()
                json_data = json.loads(downloaded_blob)
                if json_data.get("summary") != "Comments in Attached File":
                    # Merge summary
                    if "summary" in json_data:
                        if isinstance(json_data["summary"], list):
                            aggregated_json["summary"].extend(json_data["summary"])
                        else:
                            aggregated_json["summary"].append(json_data["summary"])
                    # Merge main themes
                    if "main_themes" in json_data:
                        aggregated_json["main_themes"].extend(json_data["main_themes"])
                    # Merge suggestions
                    if "suggestions" in json_data:
                        if isinstance(json_data["suggestions"], list):
                            aggregated_json["suggestions"].extend(
                                json_data["suggestions"]
                            )
                        else:
                            aggregated_json["suggestions"].append(
                                json_data["suggestions"]
                            )

        summary_json = {"summary": aggregated_json["summary"]}
        main_themes_json = {"main_themes": aggregated_json["main_themes"]}
        suggestions_json = {"suggestions": aggregated_json["suggestions"]}
        agg_summary_blob_client = container_client.get_blob_client(
            "aggregated/" + comment_folder_name + "/aggregated_summary.json"
        )
        agg_summary_blob_client.upload_blob(
            json.dumps(summary_json, indent=4), overwrite=True
        )
        agg_themes_blob_client = container_client.get_blob_client(
            "aggregated/" + comment_folder_name + "/aggregated_themes.json"
        )
        agg_themes_blob_client.upload_blob(
            json.dumps(main_themes_json, indent=4), overwrite=True
        )
        agg_suggestions_blob_client = container_client.get_blob_client(
            "aggregated/" + comment_folder_name + "/aggregated_suggestions.json"
        )
        agg_suggestions_blob_client.upload_blob(
            json.dumps(suggestions_json, indent=4), overwrite=True
        )
        return True
    except Exception as e:
        logger.error(f"Failed to generate aggregated files: {e}")
        return False


def split_agg_json_into_chunks(item_list, num_parts, item_type):
    # Determine the length of each part
    part_length = math.ceil(len(item_list) / num_parts)
    # Split the item list into the specified number of parts
    parts = [
        item_list[i : i + part_length] for i in range(0, len(item_list), part_length)
    ]
    # Concatenate the items in each part
    concatenated_parts = []
    if item_type == "themes":
        concatenated_parts = [{"main_themes": part} for part in parts]
    else:
        concatenated_parts = [" ".join(part) for part in parts]
    return concatenated_parts


def aggregate_comment_summary(
    container_client: ContainerClient, temperature: float, comment_folder_name: str
):
    try:
        blob_client = container_client.get_blob_client(
            "aggregated/" + comment_folder_name + "/aggregated_summary.json"
        )
        downloaded_blob = blob_client.download_blob().readall()
        json_data = json.loads(downloaded_blob)
        num_tokens = num_tokens_from_string(
            "\n".join(json_data["summary"]), encoding_name
        )
        logger.info(f"Number of Tokens for aggregated_summary: {num_tokens}")
        num_chunks = round(num_tokens / chunk_tokens)
        aggregated_results = []
        if num_chunks > 1:
            chunks = split_agg_json_into_chunks(
                json_data["summary"], num_chunks, "summary"
            )
            # print(len(chunks))
            for chunk in chunks:
                final_user_prompt = agg_summary_usr_prompt.format(document_text=chunk)
                aoai_response = getResponseFromAoAi(
                    agg_summary_sys_prompt, final_user_prompt, temp=temperature
                )
                aggregated_results.append("Summary of Part:: \n" + aoai_response)
            chunk_summary_file_name = (
                "aggregated_summary/" + comment_folder_name + "/agg_summary_chunks.md"
            )
            blob_client = container_client.get_blob_client(chunk_summary_file_name)
            chunk_responses_str = "\n".join(aggregated_results)
            blob_client.upload_blob(chunk_responses_str, overwrite=True)

            final_user_prompt = agg_chunk_summary_usr_prompt.format(
                document_text=chunk_responses_str
            )
            aoai_response = getResponseFromAoAi(
                agg_chunk_summary_sys_prompt, final_user_prompt, temp=temperature
            )
            comment_summary_file_name = (
                "aggregated_summary/" + comment_folder_name + "/overall_summary.md"
            )
            blob_client = container_client.get_blob_client(comment_summary_file_name)
            blob_client.upload_blob(aoai_response, overwrite=True)
        else:
            total_summary = "\n".join(json_data["summary"])
            final_user_prompt = agg_summary_usr_prompt.format(
                document_text=total_summary
            )
            aoai_response = getResponseFromAoAi(
                agg_summary_sys_prompt, final_user_prompt, temp=temperature
            )
            comment_summary_file_name = (
                "aggregated_summary/" + comment_folder_name + "/overall_summary.md"
            )
            blob_client = container_client.get_blob_client(comment_summary_file_name)
            blob_client.upload_blob(aoai_response, overwrite=True)
        return True
    except Exception as e:
        logger.error(f"Failed to generate aggregated summary: {e}")
        return False


def aggregate_comment_suggestions(
    container_client: ContainerClient, temperature: float, comment_folder_name: str
):
    try:
        blob_client = container_client.get_blob_client(
            "aggregated/" + comment_folder_name + "/aggregated_suggestions.json"
        )
        downloaded_blob = blob_client.download_blob().readall()
        json_data = json.loads(downloaded_blob)
        num_tokens = num_tokens_from_string(
            "\n".join(json_data["suggestions"]), encoding_name
        )
        logger.info(f"Number of Tokens for aggregated_suggestions : {num_tokens}")

        num_chunks = round(num_tokens / chunk_tokens)
        aggregated_results = []
        if num_chunks > 1:
            chunks = split_agg_json_into_chunks(
                json_data["suggestions"], num_chunks, "suggestions"
            )
            # print(len(chunks))
            for chunk in chunks:
                final_user_prompt = agg_suggestions_usr_prompt.format(
                    document_text=chunk
                )
                aoai_response = getResponseFromAoAi(
                    agg_suggestions_sys_prompt, final_user_prompt, temp=temperature
                )
                aggregated_results.append(
                    "Aggregated Suggestions Report of Part:: \n" + aoai_response
                )
            chunk_suggestions_file_name = (
                "aggregated_summary/"
                + comment_folder_name
                + "/agg_suggestions_chunks.md"
            )
            blob_client = container_client.get_blob_client(chunk_suggestions_file_name)
            chunk_responses_str = "\n".join(aggregated_results)
            blob_client.upload_blob(chunk_responses_str, overwrite=True)

            final_user_prompt = agg_chunk_suggestions_usr_prompt.format(
                document_text=chunk_responses_str
            )
            aoai_response = getResponseFromAoAi(
                agg_chunk_suggestions_sys_prompt, final_user_prompt, temp=temperature
            )
            comment_suggestions_file_name = (
                "aggregated_summary/" + comment_folder_name + "/overall_suggestions.md"
            )
            blob_client = container_client.get_blob_client(
                comment_suggestions_file_name
            )
            blob_client.upload_blob(aoai_response, overwrite=True)
        else:
            total_suggestions = "\n".join(json_data["suggestions"])
            final_user_prompt = agg_suggestions_usr_prompt.format(
                document_text=total_suggestions
            )
            aoai_response = getResponseFromAoAi(
                agg_suggestions_sys_prompt, final_user_prompt, temp=temperature
            )
            comment_suggestions_file_name = (
                "aggregated_summary/" + comment_folder_name + "/overall_suggestions.md"
            )
            blob_client = container_client.get_blob_client(
                comment_suggestions_file_name
            )
            blob_client.upload_blob(aoai_response, overwrite=True)
        return True
    except Exception as e:
        logger.error(f"Failed to generate aggregated summary: {e}")
        return False


def aggregate_comment_themes(
    container_client: ContainerClient, temperature: float, comment_folder_name: str
):
    try:
        blob_client = container_client.get_blob_client(
            "aggregated/" + comment_folder_name + "/aggregated_themes.json"
        )
        downloaded_blob = blob_client.download_blob().readall()
        json_data = json.loads(downloaded_blob)
        num_tokens = num_tokens_from_string(
            json.dumps(json_data["main_themes"]), encoding_name
        )
        logger.info(f"Number of Tokens for aggregated_themes: {num_tokens}")
        num_chunks = round(num_tokens / chunk_tokens)
        aggregated_results = []
        if num_chunks > 1:
            chunks = split_agg_json_into_chunks(
                json_data["main_themes"], num_chunks, "themes"
            )
            # print(len(chunks))
            for chunk in chunks:
                final_user_prompt = agg_themes_usr_prompt.format(document_text=chunk)
                aoai_response = getResponseFromAoAi(
                    agg_themes_sys_prompt, final_user_prompt, temp=temperature
                )
                aggregated_results.append(re.sub(r"```markdown", "", aoai_response))
            chunk_themes_file_name = (
                "aggregated_summary/" + comment_folder_name + "/agg_themes_chunks.md"
            )
            blob_client = container_client.get_blob_client(chunk_themes_file_name)
            chunk_responses_str = "\n".join(aggregated_results)
            blob_client.upload_blob(chunk_responses_str, overwrite=True)

            final_user_prompt = agg_chunk_themes_usr_prompt.format(
                document_text=chunk_responses_str
            )
            aoai_response = getResponseFromAoAi(
                agg_chunk_themes_sys_prompt, final_user_prompt, temp=temperature
            )
            comment_themes_file_name = (
                "aggregated_summary/" + comment_folder_name + "/overall_themes.md"
            )
            blob_client = container_client.get_blob_client(comment_themes_file_name)
            blob_client.upload_blob(aoai_response, overwrite=True)
        else:
            agg_themes_json = downloaded_blob.decode("utf-8")
            final_user_prompt = agg_themes_usr_prompt.format(
                document_text=agg_themes_json
            )
            aoai_response = getResponseFromAoAi(
                agg_themes_sys_prompt, final_user_prompt, temp=temperature
            )
            comment_theme_file_name = (
                "aggregated_summary/" + comment_folder_name + "/overall_themes.md"
            )
            blob_client = container_client.get_blob_client(comment_theme_file_name)
            blob_client.upload_blob(aoai_response, overwrite=True)
        return True
    except Exception as e:
        logger.error(f"Failed to generate aggregated summary: {e}")
        return False


def generate_overall_summary(
    container_client: ContainerClient, temperature: float, comment_folder_name: str
):
    try:
        blob_list = container_client.list_blobs(
            name_starts_with="aggregated_summary/" + comment_folder_name
        )
        for blob in blob_list:
            if blob.name.endswith("overall_summary.md"):
                blob_client = container_client.get_blob_client(blob.name)
                downloaded_blob = blob_client.download_blob().readall()
                summary_text = downloaded_blob.decode("utf-8")
            elif blob.name.endswith("overall_suggestions.md"):
                blob_client = container_client.get_blob_client(blob.name)
                downloaded_blob = blob_client.download_blob().readall()
                suggestions_text = downloaded_blob.decode("utf-8")
            elif blob.name.endswith("overall_themes.md"):
                blob_client = container_client.get_blob_client(blob.name)
                downloaded_blob = blob_client.download_blob().readall()
                themes_text = downloaded_blob.decode("utf-8")

        num_tokens = num_tokens_from_string(
            summary_text + themes_text + suggestions_text, encoding_name
        )
        logger.info(f"Number of Tokens for over all summary is: {num_tokens}")

        final_user_prompt = overall_summary_usr_prompt.format(
            summary=summary_text, themes=themes_text, suggestions=suggestions_text
        )
        aoai_response = getResponseFromAoAi(
            overall_summary_sys_prompt, final_user_prompt, temp=temperature
        )
        overall_summary_file_name = "final/" + comment_folder_name + "/exec_summary.md"
        blob_client = container_client.get_blob_client(overall_summary_file_name)
        blob_client.upload_blob(aoai_response, overwrite=True)
        return True
    except Exception as e:
        logger.error(f"Failed to generate executive summary: {e}")
        return False



### Prompt Templates

In [4]:
# prompts extracting insights from each chunk

chunk_insights_sys_prompt = """ You are an AI assistant trained to analyze and summarize feedback regarding rules and regulations implemented by our organization. You will be provided with a piece of feedback from individuals or organizations, and your task is to:

1. Check if the feedback text mentions that the comments or submitter information are provided in an attached file (e.g., "See attached file(s)," "Please see the attached document for comments," "Please find attached comments," etc.).
2. If the feedback contains no other meaningful comments beyond references to attached files, do not process further and respond only with a summary saying "Comments in Attached File."
3. If the feedback includes meaningful comments in addition to references to attached files, proceed to:
    - Extract and identify the submitter information (which may be located at the beginning or the end of the feedback text).
    - Summarize the feedback.
    - Identify the main theme(s) mentioned in the feedback and provide one or two sentences summarizing each theme.
    - Determine the overall sentiment and its score (range: -1 to +1).
    - Identify specific aspects (which will be the same as the themes) mentioned in the feedback, and determine the sentiment and score (range: -1 to +1) for each aspect.
    - Identify any suggestions or remediations mentioned in the feedback.

Provide the output in JSON format as follows:
{
    "submitter_info": "[Submitter's name, organization, or any identifying information, if available]",
    "summary": "[Your summary here or 'Comments in Attached File']",
    "main_themes": [
        {"theme": "Theme1", "summary": "[One or two sentences summarizing Theme1]"},
        {"theme": "Theme2", "summary": "[One or two sentences summarizing Theme2]"},
        ...
    ],
    "overall_sentiment": {
        "sentiment": "[Positive/Negative/Neutral]",
        "score": [Sentiment score]
    },
    "aspect_based_sentiment": [
        {"aspect": "Theme1", "sentiment": "[Positive/Negative/Neutral]", "score": [Sentiment score]},
        {"aspect": "Theme2", "sentiment": "[Positive/Negative/Neutral]", "score": [Sentiment score]},
        ...
    ],
    "suggestions": "[Suggestions or remediations mentioned in the feedback, if any]",
}

Here is the user feedback for your analysis:
 """

chunk_insights_usr_prompt = """
User Feedback:
{document_text}
"""
# prompts generating summary from individual document's extracted insights

comment_summary_sys_prompt = """
You are an advanced AI tasked with summarizing feedback and comments regarding the rules and regulations implemented by an organization. Each comment has been processed into a structured JSON format containing the summary of the comment, themes identified in the comment along with a brief summary of each theme, and suggestions provided.

The JSON structure for each comment is as follows:

{
    "summary": "abc" or ["abc", "def"],
    "main_themes": [
        {
            "theme": "abc",
            "summary": "def"
        }
    ],
    "suggestions": "abc" or ["abc", "def"]
}

Based on the provided JSON data, you need to generate an overall summary for broader and leadership consumption. The overall summary should include the following sections:

1. **Overall Summary**: A brief overview of the general sentiment and key points from the collected feedback.
2. **Identified Themes**: A list and description of the main themes that emerged from the comments, highlighting the most frequently mentioned themes.
3. **Key Suggestions**: A summary of the most common and significant suggestions provided by the commenters.

Your task is to provide the overall summary in the following format:

**Overall Summary:**
[Overall summary here]

**Identified Themes:**
- **Theme 1**: [Theme 1 summary]
- **Theme 2**: [Theme 2 summary]
- // Add more themes as identified

**Key Suggestions:**
- [Suggestion 1]
- [Suggestion 2]
- // Add more suggestions as identified

Here is the comment's processed JSON:
"""
comment_summary_usr_prompt = """
comment's processed JSON:

{document_text}
"""

# prompt to generate aggregated summary from each comment's insights json
agg_summary_sys_prompt = """

You are an expert in text summarization and natural language processing. Your task is to create an overall summary from a given aggregated text of summaries. The goal is to capture all relevant insights and key points related to the comments about the organization's rules and regulations. Ensure that the final summary is concise, coherent, and captures the essence of the feedback provided. Follow these steps:

1. Identify the main themes and topics discussed in the aggregated text.
2. Highlight any recurring issues or concerns mentioned by multiple commenters.
3. Summarize positive feedback and suggestions for improvement.
4. Ensure that the summary is comprehensive yet concise, avoiding unnecessary details.

The input will be a long text containing aggregated summaries of comments. Your output should be a well-structured summary capturing all the essential points.

Please proceed with the summarization based on the following text .
"""
agg_summary_usr_prompt = '''
"""
{document_text}
"""
Generate well-structured summary capturing all the essential points from the text provided
'''

# prompt to generate aggregated summary from merged LLM summaries
agg_chunk_summary_sys_prompt = """
You are an expert in text summarization and natural language processing. Your task is to create a single, final summary based on a series of concise summaries that each capture feedback related to the organization's rules and regulations. The goal is to integrate all relevant insights and key points from these summaries into one cohesive summary.
Follow these steps:

1. **Aggregate Content**: Merge the content of all concise summaries into one unified summary, removing any duplication or repetitive information.
2. **Identify Core Themes**: Extract and highlight the core themes and topics that are discussed across multiple summaries.
3. **Synthesize Information**: Combine overlapping or similar insights, ensuring that all unique aspects are retained and presented in a cohesive manner.
4. **Prioritize Important Points**: Emphasize the most critical issues, concerns, and suggestions mentioned, while ensuring the final summary is both comprehensive and succinct.
5. **Ensure Coherence**: Structure the final summary logically, making sure it reads smoothly and presents the information in a clear, concise manner.

The input will be a series of concise summaries, each separated by "Summary of Part::". Your output should be a single, well-structured summary that encapsulates all the essential feedback points.
Please proceed with the summarization based on the following text.
"""
agg_chunk_summary_usr_prompt = '''
"""
{document_text}
"""
Generate well-structured summary capturing all the essential points from the text provided
'''

# prompt to generate aggregated suggestions from each comment's insights json
agg_suggestions_sys_prompt = """
You are an expert in text summarization and natural language processing. Your task is to create an overall summary from a given aggregated text of suggestions. The goal is to capture all relevant insights and key points related to the suggestions about the organization's rules and regulations. Ensure that the final summary is concise, coherent, and captures the essence of the suggestions provided. Follow these steps:

1. Identify the main themes and topics discussed in the aggregated text of suggestions.
2. Highlight any recurring suggestions or common themes mentioned by multiple commenters.
3. Summarize key suggestions for improvement and any positive feedback.
4. Ensure that the summary is comprehensive yet concise, avoiding unnecessary details.

The input will be a long text containing aggregated suggestions from various comment files. Your output should be a well-structured summary capturing all the essential points.

Please proceed with the summarization based on the following text.


"""
agg_suggestions_usr_prompt = '''
"""
{document_text}
"""
Generate well-structured summary capturing all the essential points from the text provided
'''

# prompt to generate aggregated suggestions from merged LLM suggestions
agg_chunk_suggestions_sys_prompt = """
You are an expert in text summarization and natural language processing. Your task is to create a comprehensive and coherent summary of the overall suggestions insights from multiple aggregated reports. Each report starts with "Aggregated Suggestions Report of Part:: " and summarizes suggestions related to an organization's rules and regulations. The goal is to merge these insights into a single, well-structured summary that captures all key points and avoids redundancy. It is crucial that no important information from any individual suggestion insights report is missed in the final summary.
Steps to follow:

1. Identify Main Themes and Topics: Extract the overarching themes and topics that span across the individual suggestions insights, each of which begins with "Aggregated Suggestions Report of Part:: ".
2. Summarize Recurring Suggestions: Combine recurring suggestions mentioned in multiple chunks, ensuring to capture their significance without duplicating the information.
3. Summarize Key Suggestions for Improvement: Integrate the key suggestions for improvement from all chunks, preserving the essence of each suggestion.
4. Highlight Positive Feedback: Include any positive feedback that reflects broader sentiment across the aggregated reports.
5. Ensure Coherence and Conciseness: Ensure the final summary is coherent, free of repetition, and concisely captures the critical insights from all chunks.
6. Preserve Important Details: Pay special attention to ensure that no important details from any of the individual suggestion insights are omitted.

The input will consist of multiple summaries, each starting with "Aggregated Suggestions Report of Part:: ", representing aggregated suggestions insights from different parts of the feedback. Your output should be a single, well-organized summary that retains all essential points while removing any redundant information.
Please proceed with the summarization based on the following aggregated suggestions insights.
"""
agg_chunk_suggestions_usr_prompt = '''
"""
{document_text}
"""
Generate well-structured summary capturing all the essential points from the text provided
'''

# prompt to generate aggregated themes from each comment's insights json
agg_themes_sys_prompt = """
You are an advanced AI specialized in processing feedback and identifying key themes. You are provided with a JSON input that contains a list of themes and their corresponding summaries extracted from feedback. The JSON is structured as follows:

{
    "main_themes": [
        {
            "theme": "Payment Initiation",
            "summary": "The ABA argues that Section 1033 should not be used to mandate payment initiation to and from Regulation E accounts."
        }
    ]
}

Your task is to:
1. Identify and group similar themes together, even if they have slightly different wording or phrasing but share the same context.
2. Determine the frequency of each theme (including grouped similar themes).
3. Provide the top 25 most frequently occurring themes, along with their grouped summary and frequency count.
4. Ensure that the summaries for each grouped theme are concise and reflect the core message.

Your output should be in this Markdown format without any additional explanations, just the formatted result:

Top 25 Most Occurring Themes:
- theme1: [count] [Generated summary]
- theme2: [count] [Generated summary]

Please be precise and ensure that the themes are accurately identified and counted.
"""
agg_themes_usr_prompt = """
Following is the JSON that contain the themes and their summaries: 
{document_text}
"""

# prompt to generate aggregated themes from merged LLM themes
agg_chunk_themes_sys_prompt = """ 
You are tasked with consolidating themes extracted from various text chunks, all contained within a large text file. The file includes multiple sections, each corresponding to a text chunk. Each section begins with "Top 25 Most Occurring Themes" and lists themes in the following format:
- Theme Name: [count of occurrence] Summary of the theme

Your objective is to:
1. **Consolidate Themes:** Identify and merge similar or duplicate themes across all sections. Ensure that similar themes are combined into a single theme, and their occurrence counts are summed.
2. **Generate Top 25 Themes:** From the consolidated list, generate the "Top 25 Most Occurring Themes" based on the highest total occurrence counts.
3. **Output Format:** Provide the output in the following format:
Top 25 Most Occurring Themes
- Theme Name: [count of occurrence] Summary of the theme

Ensure the summaries accurately reflect the combined themes, and the list is ranked according to the total count of occurrences.
"""
agg_chunk_themes_usr_prompt = '''
Here is the text from the file:
"""
{document_text}
"""
Ensure the summaries accurately reflect the combined themes, and the list is ranked according to the total count of occurrences.
'''

# prompt to generate executive/overall summary from aggregates summary, themes and suggestions
overall_summary_sys_prompt = """
You are a highly skilled assistant with expertise in summarizing and consolidating feedback and comments from various sources. Your task is to generate an executive summary report based on the provided text from three files. The files contain summaries of comments or feedback regarding rules and regulations implemented by an organization. The executive summary should be concise, coherent, and cover all the main themes, recurring issues, positive feedback, suggestions for improvement, and conclusions.

The provided files contain the following topics:

File 1: Overall Summary
- Core Themes and Topics
- Recurring Issues and Concerns
- Positive Feedback and Suggestions for Improvement
- Conclusion

File 2: Overall Themes
- Top 25 Most Occurring Themes

File 3: Overall Suggestions
- Main Themes and Topics
- Key Suggestions for Improvement
- Recurring Suggestions
- Positive Feedback

Please read through the text from these files and create a single, consolidated executive summary report. Ensure that the report includes all significant points and provides a clear, comprehensive overview of the feedback and suggestions.

### Executive Summary Report Structure:
1. Introduction
2. Main Themes and Topics
3. Recurring Issues and Concerns
4. Positive Feedback
5. Suggestions for Improvement
6. Conclusion

Be sure to maintain a professional tone and ensure the summary is well-organized and easy to understand.
Below is the text from the files:
"""
overall_summary_usr_prompt = """

File 1: Overall Summary
{summary}

File 2: Overall Themes
{themes}

File 3: Overall Suggestions
{suggestions}

Please read through the text from these files and generate a single, consolidated executive summary report. The report should be structured as follows:

1. Introduction
2. Main Themes and Topics
3. Recurring Issues and Concerns
4. Positive Feedback
5. Suggestions for Improvement
6. Conclusion

Ensure that the report is comprehensive, well-organized, and professional.
"""

In [34]:
aoai_api_key = os.getenv("AZURE_OPENAI_KEY")
aoai_api_endpoint =  os.getenv("AZURE_OPENAI_ENDPOINT")
aoai_api_deployment_name =  os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

account_name = os.getenv("STORAGE_ACCOUNT_NAME")
account_key = os.getenv("STORAGE_ACCOUNT_KEY")
input_container_name = os.getenv("STORAGE_INPUT_CONTAINER_NAME")
connection_string = f"DefaultEndpointsProtocol=https;AccountName={account_name};AccountKey={account_key};EndpointSuffix=core.windows.net"
output_container_name = os.getenv("STORAGE_OUTPUT_CONTAINER_NAME")

docintel_endpoint=os.getenv("AZURE_DOC_INTEL_ENDPOINT")
docintel_key= os.getenv("AZURE_DOC_INTEL_KEY")

aoai_api_version= '2024-06-01'
encoding_name = 'o200k_base'
# Change the chunk token size to fit your needs
chunk_tokens=8000
temp=0
# Name of the folder that has the comments
comment_folder_name='rule-10-21-final-2'

#logger config
logger = logging.getLogger("comment_analytics")
logger.setLevel(logging.INFO)
console_handler = logging.StreamHandler()
logger.addHandler(console_handler)

# Suppress logging from external libraries by setting the root logger level to WARNING
# logging.getLogger().setLevel(logging.WARNING)

In [36]:
try:
    start_time = datetime.now()
    logger.info(f"Starting time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    # Create a BlobServiceClient object using the connection string
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    # Create a ContainerClient object
    input_container_client = blob_service_client.get_container_client(
        input_container_name
    )
    output_container_client = blob_service_client.get_container_client(
        output_container_name
    )

    # define openai and document intelligence Clients
    client = AzureOpenAI(
        api_key=aoai_api_key,
        api_version=aoai_api_version,
        azure_endpoint=aoai_api_endpoint,
    )
    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=docintel_endpoint, credential=AzureKeyCredential(docintel_key)
    )

    # List the blobs in the container
    blob_list = input_container_client.list_blobs(name_starts_with=comment_folder_name)
    chunks = []

    for blob in blob_list:
        logger.info(f"Blob's Name: \t{blob.name}")
        logger.info(f"Blob's Content Type: \t{blob.content_settings.content_type}")

        blob_client = input_container_client.get_blob_client(blob.name)
        sas_token = generate_sas_token(
            blob_service_client=blob_service_client, source_blob=blob_client
        )
        source_blob_sas_url = blob_client.url + "?" + sas_token
        llm_response_file_name = ""
        file_type = ""
        if blob.content_settings.content_type == "application/pdf":
            file_type = "pdf"
            llm_response_file_name = "individual/" + blob.name.replace(
                ".pdf", " - extracted insights.json"
            )
            loader = AzureAIDocumentIntelligenceLoader(
                url_path=source_blob_sas_url,
                api_key=docintel_key,
                api_endpoint=docintel_endpoint,
                api_model="prebuilt-layout",
            )
            docs = loader.load()
            docs_string = docs[0].page_content
            chunks = split_document_into_chunks(
                docs_string,
                "sections",
                file_type,
                chunk_tokens,
                encoding_name="o200k_base",
            )
            logger.info(f"Number of chunks: {len(chunks)}")
        elif blob.content_settings.content_type == "text/plain":
            file_type = "text"
            llm_response_file_name = "individual/" + blob.name.replace(
                ".txt", " - extracted insights.json"
            )
            blob_data = blob_client.download_blob().readall()
            # Attempt to decode the bytes with utf-8, fallback to other encodings if it fails
            try:
                text_data = blob_data.decode("utf-8")
            except UnicodeDecodeError:
                try:
                    text_data = blob_data.decode("ISO-8859-1")
                except UnicodeDecodeError:
                    text_data = blob_data.decode("windows-1252")
            chunks = split_document_into_chunks(
                text_data,
                "paragraphs",
                file_type,
                chunk_tokens,
                encoding_name="o200k_base",
            )
            logger.info(f"Number of chunks: {len(chunks)}")
        elif blob.content_settings.content_type in [
            "text/csv",
            "application/vnd.ms-excel",
        ]:
            file_type = "csv"
            llm_response_file_name = "individual/" + blob.name.replace(
                ".csv", " - extracted insights"
            )
            blob_data = blob_client.download_blob().readall()
            # Attempt to decode the bytes with utf-8, fallback to other encodings if it fails
            try:
                text_data = blob_data.decode("utf-8")
            except UnicodeDecodeError:
                try:
                    text_data = blob_data.decode("ISO-8859-1")
                except UnicodeDecodeError:
                    text_data = blob_data.decode("windows-1252")

            chunks = split_document_into_chunks(
                text_data, "row", file_type, chunk_tokens, encoding_name="o200k_base"
            )
            logger.info(f"Number of chunks: {len(chunks)}")

        if len(chunks) > 0:
            # generate insights from individual comments file
            if insights_from_ind_comments(
                chunks,
                temp,
                "json",
                output_container_client,
                llm_response_file_name,
                file_type,
            ):
                logger.info(
                    f"Successfully generated individual comment insight jsons for blob: \t{blob.name}"
                )

            else:
                logger.info(
                    f"Failed to generate individual comment insight jsonsfor blob: \t{blob.name}"
                )

    # Task:generate summary from individual comment insights json
    if ind_comment_summary(output_container_client, temp, comment_folder_name):
        logger.info("Successfully generated individual comment summaries")
    else:
        logger.info("Failed to generate individual comment summaries")

    # Task:Merge summary/themes/suggestions from individual comment insights json

    if merge_json_files(output_container_client, comment_folder_name):
        logger.info("Successfully Merged insights from individual comment files")
    else:
        logger.info("Failed to merge insights from individual comment files")

except Exception as ex:
    logger.error(ex)

Starting time: 2024-11-16 17:06:23
Starting time: 2024-11-16 17:06:23
Starting time: 2024-11-16 17:06:23
Starting time: 2024-11-16 17:06:23
Blob's Name: 	rule-10-21-final-2
Blob's Name: 	rule-10-21-final-2
Blob's Name: 	rule-10-21-final-2
Blob's Name: 	rule-10-21-final-2
Blob's Content Type: 	application/octet-stream
Blob's Content Type: 	application/octet-stream
Blob's Content Type: 	application/octet-stream
Blob's Content Type: 	application/octet-stream
Blob's Name: 	rule-10-21-final-2/ABA.pdf
Blob's Name: 	rule-10-21-final-2/ABA.pdf
Blob's Name: 	rule-10-21-final-2/ABA.pdf
Blob's Name: 	rule-10-21-final-2/ABA.pdf
Blob's Content Type: 	application/pdf
Blob's Content Type: 	application/pdf
Blob's Content Type: 	application/pdf
Blob's Content Type: 	application/pdf
Number of Tokens: 13523
Number of Tokens: 13523
Number of Tokens: 13523
Number of Tokens: 13523
Number of chunks: 3
Number of chunks: 3
Number of chunks: 3
Number of chunks: 3
Successfully generated individual comment insigh

In [37]:
try:

    # Task:Consilidated summary from aggregated summary json
    if aggregate_comment_summary(output_container_client, temp, comment_folder_name):
        logger.info("Successfully generated aggregated summary")
    else:
        logger.info("Failed to generate aggregated summary")

    # Task:Consilidated suggestions from aggregated suggestions json
    if aggregate_comment_suggestions(
        output_container_client, temp, comment_folder_name
    ):
        logger.info("Successfully generated aggregated suggestions")
    else:
        logger.info("Failed to generate aggregated suggestions")

    # Task:Consilidated themes from aggregated themes json
    if aggregate_comment_themes(output_container_client, temp, comment_folder_name):
        logger.info("Successfully generated aggregated themes")
    else:
        logger.info("Failed to generate aggregated themes")

    # Task:Overall Summary report
    if generate_overall_summary(output_container_client, temp, comment_folder_name):
        logger.info("Successfully generated executive summary ")
    else:
        logger.info("Failed to generate executive summary ")

    end_time = datetime.now()
    elapsed_time = end_time - start_time
    logger.info(f"Total time taken: {elapsed_time}")

except Exception as ex:
    logger.error(ex)

Number of Tokens for aggregated_summary: 3228
Number of Tokens for aggregated_summary: 3228
Number of Tokens for aggregated_summary: 3228
Number of Tokens for aggregated_summary: 3228
Successfully generated aggregated summary
Successfully generated aggregated summary
Successfully generated aggregated summary
Successfully generated aggregated summary
Number of Tokens for aggregated_suggestions : 2296
Number of Tokens for aggregated_suggestions : 2296
Number of Tokens for aggregated_suggestions : 2296
Number of Tokens for aggregated_suggestions : 2296
Successfully generated aggregated suggestions
Successfully generated aggregated suggestions
Successfully generated aggregated suggestions
Successfully generated aggregated suggestions
Number of Tokens for aggregated_themes: 10420
Number of Tokens for aggregated_themes: 10420
Number of Tokens for aggregated_themes: 10420
Number of Tokens for aggregated_themes: 10420
Successfully generated aggregated themes
Successfully generated aggregated t

In [38]:
#display one individual comment summary
from IPython.display import Markdown, display
#blob_client = output_container_client.get_blob_client( )
blob_client = output_container_client.get_blob_client('individual_summary/'+comment_folder_name+'/FDX - summary.md')
downloaded_blob = blob_client.download_blob().readall()
markdown_text = downloaded_blob.decode('utf-8')
display(Markdown(markdown_text))

**Overall Summary:**
The Financial Data Exchange, LLC (FDX) has provided positive feedback on the CFPB's Notice of Proposed Rulemaking for Personal Financial Data Rights. FDX emphasizes the importance of clear, interoperable standards and seeks recognition as a Qualified Industry Standards Setting Body (QSSB). They highlight their role in developing the FDX API for secure, user-permissioned financial data sharing and their commitment to fair, open, and inclusive governance.

**Identified Themes:**
- **Recognition as QSSB**: FDX seeks to be recognized as a Qualified Industry Standards Setting Body to unify the financial services industry around common, interoperable standards for user-permissioned data sharing.
- **FDX API and Standards**: FDX highlights the success and continuous improvement of the FDX API, which has transitioned from credential-based 'screen scraping' to secure token-based access, with over 65 million consumer accounts using the API.
- **Collaboration with CFPB**: FDX appreciates the CFPB's efforts and ongoing dialogue with industry stakeholders, seeking further clarifications and expressing a commitment to align with the CFPB's principles of fair, open, and inclusive governance.
- **Standard Setting Body Characteristics**: FDX outlines the attributes of a standard setting body as recognized by the CFPB, including openness, balance, due process, appeals process, consensus, and transparency, and commits to embodying these attributes.
- **Future Capabilities and Certification**: FDX discusses its current and future capabilities, including the implementation of a technical certification program to ensure compliance with standardized data formatting and the potential use of a registry for data connections.

**Key Suggestions:**
- Clarify the process and procedures for recognizing QSSBs and the standards they issue.
- Emphasize the importance of timely recognition to ensure compliance with the Proposed Rules.
- Provide further clarification on the definition of 'Terms and Conditions' for standardized data formatting.

In [39]:
#display one Aggregated summary of individual comment summaries

blob_client = output_container_client.get_blob_client('aggregated_summary/'+comment_folder_name+'/overall_summary.md')
downloaded_blob = blob_client.download_blob().readall()
markdown_text = downloaded_blob.decode('utf-8')
display(Markdown(markdown_text))

The feedback on the Consumer Financial Protection Bureau's (CFPB) Proposed Rule on Personal Financial Data Rights, primarily from financial institutions and industry associations, highlights several key themes and concerns:

1. **General Support and Appreciation**: Many organizations, including the American Bankers Association (ABA), American Fintech Council (AFC), Bank Policy Institute, Clearing House Association, Financial Technology Association (FTA), and others, appreciate the CFPB's efforts to enhance consumer financial data rights and support the rule's intent to promote innovation, competition, and consumer protection.

2. **Concerns and Recommendations**:
   - **Scope and Definitions**: There are significant concerns about the rule's scope, definitions, and limitations on data use. Organizations like AFC and FTA suggest refining these aspects to avoid unintended consequences and ensure clarity.
   - **Liability and Compliance**: Multiple commenters, including ABA and JPMorgan Chase & Co. (JPMC), emphasize the need for clear liability frameworks and practical compliance timelines. They stress the importance of fair apportionment of liability within the financial data sharing ecosystem.
   - **Data Security and Standards**: The necessity for minimum data security standards, standardized data formats, and clear regulatory requirements is a recurring theme. Organizations advocate for industry-led standards and tokenized account numbers to enhance security.
   - **Consumer Authorization and Authentication**: There is a strong emphasis on the need for clear distinctions between identity authentication and data sharing authorization. Commenters suggest that data providers should obtain their own consumer authorizations and manage risks associated with data sharing.
   - **Prohibition on Fees**: Several organizations, including JPMC and FDATA North America, argue against the prohibition on fees for data access, citing substantial costs and the need for fair compensation for data providers.
   - **Screen Scraping**: Many commenters support prohibiting screen scraping once API access is available, highlighting the risks associated with unsafe data access practices.
   - **Implementation and Compliance Burdens**: Concerns about the technological and financial burdens on small banks and community banks are raised, with suggestions for exemptions or fee allowances to mitigate these challenges.

3. **Positive Feedback and Suggestions for Improvement**:
   - **Consumer Empowerment and Innovation**: Organizations like FTA and Plaid emphasize the importance of consumer control over financial data and the role of data access platforms in promoting competition and innovation.
   - **Standardization and Interoperability**: There is broad support for standardized data formats and interoperable standards to facilitate secure and efficient data sharing.
   - **Regulatory Clarity and Safe Harbors**: Commenters highlight the need for clear regulatory requirements, safe harbors for compliance, and practical approaches to authorization and data security.
   - **Collaboration and Supervision**: Suggestions for collaboration with other regulatory bodies and direct supervision of data access platforms are made to ensure effective implementation and enforcement.

4. **Specific Concerns and Detailed Feedback**:
   - **Data Privacy Protections**: Plaid and other organizations emphasize the importance of consistent application of privacy protections to both third parties and data providers, arguing that de-identified data should not be subject to privacy restrictions.
   - **Performance Specifications**: Concerns about the quantitative performance specifications, such as the 99.5% successful return rate, are raised, with suggestions for more practical and achievable standards.
   - **Consumer Data Rights and Revocation**: Feedback supports the proposal to allow consumers to manage their data sharing but raises concerns about potential anticompetitive behavior and consumer confusion. Recommendations include establishing guardrails and clear authorization procedures.

Overall, the feedback underscores the need for a balanced approach that ensures consumer protection, promotes innovation, and provides clear and practical regulatory requirements. The commenters advocate for refinements to the proposed rule to address their concerns and enhance its effectiveness.

In [40]:
#display one Aggregated suggestions of individual comment suggestions
blob_client = output_container_client.get_blob_client('aggregated_summary/'+comment_folder_name+'/overall_suggestions.md')
downloaded_blob = blob_client.download_blob().readall()
markdown_text = downloaded_blob.decode('utf-8')
display(Markdown(markdown_text))

The aggregated suggestions regarding the organization's rules and regulations primarily focus on the following key themes and topics:

1. **Extension and Flexibility in Compliance Timelines**:
   - Multiple commenters suggest extending the comment period and compliance timelines, particularly for small banks and large institutions, to ensure adequate preparation and implementation.

2. **Clarification of Roles and Definitions**:
   - There is a call for clear definitions of data providers, data aggregators, and third parties, as well as their respective roles and responsibilities. This includes clarifying liability rules and the scope of data collection and sharing.

3. **Prohibition of Screen Scraping and Credential-Based Access**:
   - A recurring suggestion is to explicitly prohibit screen scraping and credential-based access by third parties once API access is available, to enhance data security and privacy.

4. **Data Security and Privacy Standards**:
   - Commenters emphasize the need for robust data security standards, including minimum data security requirements, tokenized account numbers, and clear data deletion and retention policies. There is also a call for standardized data formats and machine-readable data fields.

5. **Supervision and Certification of Third Parties**:
   - Suggestions include creating a supervisory program for data aggregators, establishing a non-binding accreditation body for nonbanks, and requiring third parties to become authorized and certified. There is also a recommendation for direct CFPB supervision of these entities.

6. **Consumer Control and Transparency**:
   - Enhancing consumer control over their data is a key theme, with suggestions for opt-in/opt-out choices for data use, clear revocation methods, and transparency in data access and usage. There is also a call for model forms and guidance for small entities.

7. **Fee Structures and Cost Recovery**:
   - Commenters suggest allowing data providers to charge reasonable fees for data access and developer interfaces, and to recover costs and margins. There is also a recommendation to remove the prohibition on fees.

8. **Performance and Reporting Standards**:
   - Recommendations include adopting commercially reasonable performance specifications, modifying latency thresholds, and allowing industry bodies to publish performance statistics. There is also a call for clear public disclosure requirements for developer interface documentation.

9. **Regulatory Coordination and Avoidance of Duplication**:
   - Commenters urge the CFPB to coordinate with other regulatory bodies, avoid simultaneous rulemaking activities, and ensure new rules do not duplicate existing regulatory obligations.

10. **Secondary Data Use and Data Sharing Limitations**:
    - There are suggestions to allow responsible secondary use of data with appropriate safeguards, exempt de-identified data from restrictions, and establish stringent limitations on secondary data sharing.

11. **Feedback on Specific Provisions**:
    - Specific feedback includes withdrawing proposed § 1033.211(c), ensuring Regulation E obligations for data aggregators, supporting the use of tokenized account numbers, and recognizing industry-led standards for data sharing.

12. **Positive Feedback**:
    - Some commenters appreciate the CFPB's efforts to enhance data privacy and security, and the inclusion of consumer protection measures in the proposed rules.

Overall, the suggestions emphasize the need for clear definitions and roles, robust data security and privacy standards, consumer control and transparency, reasonable compliance timelines, and coordination with other regulatory bodies to avoid duplication and ensure effective implementation.

In [41]:
#display one Aggregated themes of individual comment themes
blob_client = output_container_client.get_blob_client('aggregated_summary/'+comment_folder_name+'/overall_themes.md')
downloaded_blob = blob_client.download_blob().readall()
markdown_text = downloaded_blob.decode('utf-8')
display(Markdown(markdown_text))

Top 25 Most Occurring Themes:
- Prohibition on Fees: [4] ABA and JPMC argue against the prohibition on fees for data providers, stating it is not supported by law and ignores the real costs of building and maintaining interfaces.
- Liability: [3] ABA and JPMC express concerns about liability in the NPRM, suggesting that liability should flow with the data and be fairly apportioned within the financial data sharing ecosystem.
- Screen Scraping: [3] ABA, ICBA, and JPMC support the prohibition of screen scraping and credential-based access, recommending explicit provisions to prohibit unauthorized data access.
- Consumer Authorization: [3] ABA and JPMC emphasize the importance of data providers obtaining their own consumer authorizations before sharing data with third parties to ensure informed consent and protect against allegations of improper data sharing.
- Compliance Timelines: [3] ABA, JPMC, and Plaid argue that the proposed compliance timelines are insufficient and suggest extending the first compliance date to 24 months after the final rule's publication.
- Data Security Standards: [2] ABA and JPMC stress that all entities in the data sharing ecosystem should meet minimum data security standards to ensure a high level of consumer data protection.
- Standardized Data Formats: [2] ABA and ICBA support the adoption of standardized data formats throughout the ecosystem to promote competition and reduce switching costs for third parties.
- Risk Management: [2] ABA and JPMC highlight the importance of allowing data providers to conduct comprehensive risk management programs and not be constrained by narrow definitions of risk management concerns.
- Consumer Data Protection: [2] ABA and Plaid emphasize that existing regulations already provide sufficient protection for consumer data, and the proposed rules do not offer additional benefits.
- Data Aggregators: [2] ABA and JPMC underscore the critical role of data aggregators in the data sharing ecosystem and call for clear regulatory obligations and oversight for these entities.
- Consumer Data Access and Security: [1] The ABA supports secure and transparent consumer access to financial information but emphasizes that all participants in the data sharing ecosystem should adhere to high standards similar to banks.
- Data Fields and Legal Authority: [1] The ABA contends that certain data fields in the NPRM exceed the statutory authority, such as 'authorized but not yet settled' transactions and 'upcoming bill information.'
- Role of Data Providers and Aggregators: [1] The ABA urges the CFPB to clarify that data providers are not responsible for ensuring third parties' compliance and to prohibit screen scraping.
- Risk Management and Fraud Prevention: [1] The ABA stresses the importance of risk management and fraud prevention, advocating for flexibility for data providers to manage risks and prevent fraud.
- Accreditation and Compliance: [1] The ABA suggests creating a non-binding accreditation body for nonbanks to meet minimum standards, which could expedite the due diligence process for data providers.
- Strict Construction of Section 1033: [1] ABA argues that the CFPB should limit the rule to facilitating access to consumer information and not extend it to enable transactions.
- Competitive Impacts on Small Banks: [1] ABA highlights the potential negative impacts on small banks and urges the CFPB to consider the collective impact of multiple regulations and allow cost recoupment.
- Standards and Certification: [1] ABA calls for clear standards and certification processes for qualified industry standards (QIS) to ensure practical operationalization and compliance.
- Simultaneous Rulemaking Activities: [1] ABA cautions against conducting simultaneous rulemaking activities on foundational matters, which could lead to unintended consequences.
- Data Provider Obligations and Compliance Timeframes: [1] ABA requests clarity on data provider obligations, compliance timeframes, and the need for reasonable deadlines to ensure successful implementation.
- Exemptions for 'Natural' Third Parties: [1] ABA recommends that 'natural' third parties, such as agents and attorneys, should be exempt from the developer interface requirements under Section 1033.
- Electronic and Wet Signatures: [1] ABA suggests that the final rule should align with ESIGN and remove the option for wet signatures to ensure consistency with electronic data access.
- Secondary Language Section Clarity: [1] ABA members seek additional clarity on how UDAAP will apply to the provision of financial products and services, particularly regarding the secondary language section in authorization disclosures.
- Data Deletion Requirements: [1] ABA believes that third parties should have explicit requirements to delete data once authorization lapses or is revoked, and consumers should be able to close accounts and have data deleted upon request.
- Retention Obligations: [1] ABA finds the retention provisions in the NPRM to be overly formalized and onerous, suggesting a reduction in the retention period to no more than 24 months and clearer guidelines on what information needs to be retained.

In [42]:
#display executive summary based on aggregated summary, suggestions and themes
blob_client = output_container_client.get_blob_client('final/'+comment_folder_name+'/exec_summary.md')
downloaded_blob = blob_client.download_blob().readall()
markdown_text = downloaded_blob.decode('utf-8')
display(Markdown(markdown_text))

### Executive Summary Report

#### 1. Introduction
This executive summary consolidates feedback and comments from various stakeholders regarding the Consumer Financial Protection Bureau's (CFPB) Proposed Rule on Personal Financial Data Rights. The feedback primarily comes from financial institutions, industry associations, and other relevant organizations. The report aims to provide a comprehensive overview of the main themes, recurring issues, positive feedback, and suggestions for improvement.

#### 2. Main Themes and Topics
The feedback highlights several core themes and topics:
- **General Support and Appreciation**: There is broad support for the CFPB's efforts to enhance consumer financial data rights, promote innovation, competition, and consumer protection.
- **Scope and Definitions**: Concerns about the rule's scope, definitions, and limitations on data use are prevalent, with calls for refinement to avoid unintended consequences.
- **Liability and Compliance**: Clear liability frameworks and practical compliance timelines are emphasized, with a focus on fair apportionment of liability within the financial data sharing ecosystem.
- **Data Security and Standards**: The necessity for minimum data security standards, standardized data formats, and clear regulatory requirements is a recurring theme.
- **Consumer Authorization and Authentication**: The need for clear distinctions between identity authentication and data sharing authorization is highlighted.
- **Prohibition on Fees**: There is significant opposition to the prohibition on fees for data access, citing substantial costs and the need for fair compensation for data providers.
- **Screen Scraping**: Support for prohibiting screen scraping once API access is available is strong, due to the associated risks with unsafe data access practices.
- **Implementation and Compliance Burdens**: Concerns about the technological and financial burdens on small banks and community banks are raised, with suggestions for exemptions or fee allowances.

#### 3. Recurring Issues and Concerns
Several recurring issues and concerns are identified:
- **Compliance Timelines**: The proposed compliance timelines are deemed insufficient, with suggestions to extend the first compliance date to 24 months after the final rule's publication.
- **Data Security Standards**: All entities in the data sharing ecosystem should meet minimum data security standards to ensure a high level of consumer data protection.
- **Standardized Data Formats**: Adoption of standardized data formats is supported to promote competition and reduce switching costs for third parties.
- **Risk Management**: Allowing data providers to conduct comprehensive risk management programs without being constrained by narrow definitions of risk management concerns is emphasized.
- **Consumer Data Protection**: Existing regulations are considered sufficient for consumer data protection, with the proposed rules not offering additional benefits.
- **Role of Data Providers and Aggregators**: Clear regulatory obligations and oversight for data aggregators are called for, along with clarification that data providers are not responsible for ensuring third parties' compliance.

#### 4. Positive Feedback
Positive feedback includes:
- **Consumer Empowerment and Innovation**: Emphasis on consumer control over financial data and the role of data access platforms in promoting competition and innovation.
- **Standardization and Interoperability**: Broad support for standardized data formats and interoperable standards to facilitate secure and efficient data sharing.
- **Regulatory Clarity and Safe Harbors**: The need for clear regulatory requirements, safe harbors for compliance, and practical approaches to authorization and data security is highlighted.
- **Collaboration and Supervision**: Suggestions for collaboration with other regulatory bodies and direct supervision of data access platforms to ensure effective implementation and enforcement.

#### 5. Suggestions for Improvement
Key suggestions for improvement include:
- **Extension and Flexibility in Compliance Timelines**: Extending the comment period and compliance timelines, particularly for small banks and large institutions.
- **Clarification of Roles and Definitions**: Clear definitions of data providers, data aggregators, and third parties, as well as their respective roles and responsibilities.
- **Prohibition of Screen Scraping and Credential-Based Access**: Explicit prohibition of screen scraping and credential-based access by third parties once API access is available.
- **Data Security and Privacy Standards**: Robust data security standards, including minimum data security requirements, tokenized account numbers, and clear data deletion and retention policies.
- **Supervision and Certification of Third Parties**: Creating a supervisory program for data aggregators, establishing a non-binding accreditation body for nonbanks, and requiring third parties to become authorized and certified.
- **Consumer Control and Transparency**: Enhancing consumer control over their data with opt-in/opt-out choices, clear revocation methods, and transparency in data access and usage.
- **Fee Structures and Cost Recovery**: Allowing data providers to charge reasonable fees for data access and developer interfaces, and to recover costs and margins.
- **Performance and Reporting Standards**: Adopting commercially reasonable performance specifications, modifying latency thresholds, and allowing industry bodies to publish performance statistics.
- **Regulatory Coordination and Avoidance of Duplication**: Coordinating with other regulatory bodies, avoiding simultaneous rulemaking activities, and ensuring new rules do not duplicate existing regulatory obligations.
- **Secondary Data Use and Data Sharing Limitations**: Allowing responsible secondary use of data with appropriate safeguards, exempting de-identified data from restrictions, and establishing stringent limitations on secondary data sharing.

#### 6. Conclusion
The feedback on the CFPB's Proposed Rule on Personal Financial Data Rights underscores the need for a balanced approach that ensures consumer protection, promotes innovation, and provides clear and practical regulatory requirements. While there is broad support for the CFPB's efforts, stakeholders advocate for refinements to address their concerns and enhance the rule's effectiveness. Key areas for improvement include extending compliance timelines, clarifying roles and definitions, enhancing data security and privacy standards, and ensuring consumer control and transparency. By addressing these suggestions, the CFPB can create a more effective and balanced regulatory framework for personal financial data rights.