In [1]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

endpoint = ""
key = ""

image_path = "example.png" # Enter image path here

def format_bounding_box(bounding_box):
    if not bounding_box:
        return "N/A"
    return ", ".join(["[{}, {}]".format(p.x, p.y) for p in bounding_box])

def analyze_read():
    # Sample document
    form_path = image_path

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    
    with open(form_path, "rb") as form_file:
        poller = document_analysis_client.begin_analyze_document(
            model_id="prebuilt-read", document=form_file
        )
        result = poller.result()

    print("Document contains content: ", result.content)

    for page in result.pages:
        for line_idx, line in enumerate(page.lines):
            continue
    
    return result

result = analyze_read()

Document contains content:  Shashwat Saini +919970013662 | shashwatsaini290@gmail.com | LinkedIn | GitHub
EDUCATION
Dayananda Sagar University B.Tech. in Computer Science Engineering- Artificial Intelligence & Machine Learning
Bengaluru, Karnataka 2022 - 2026 Chennai, Tamil Nadu 2023 - 2027
Indian Institute of Technology Madras B.S. in Data Science & Applications
EXPERIENCE
Machine Learning Intern
July 2024 - August 2024 Bengaluru, Karnataka
AIMonk Labs Private Ltd.
· Designed and evaluated multi-agent systems using Microsoft Autogen, Crew.AI, OpenAI, and LangChain.
· Developed an advanced content re-purposing system to summarize videos, generate highlights and reels, as well as Twitter tweets, and Instagram posts.
· Deployed the system on a remote server with MongoDB, incorporating result caching and a front-end for testing.
RESEARCH PUBLICATIONS
Saini, S., Vrindavanam, J., & Mondal, S. (2024). Methodological Insights into Protein Clustering Using BERT And ROBERTa. IEEE CONECCT 2024, 

In [2]:
import autogen

config_list = {
    'model': 'llama3.1',
    'base_url': 'http://localhost:11434/v1',
    'api_key': 'ollama',
}

chat_outline = {
    'message': result.content
}

In [3]:
text_assistant = autogen.ConversableAgent(
    'assistant', 
    system_message='OBJECTIVE: You are an agent designed to identify personal information from any text provided to you.\n'
                   'Your sole task is to output a Python list of words or phrases that need to be redacted based on the following categories:'
                   '[Names, Proper Nouns, Company Names, Full Dates, Phone Numbers, Email Addresses, Physical Addresses, Social Security Numbers].\n'
                   'IMPORTANT: Your output MUST strictly be a SINGLE Python list of strings representing the items that need to be redacted.'
                   'Example Output: ["John Doe", "123-456-7890"].\n'
                   'You should NOT respond with explanations, code blocks, or any other format.\n'
                   'DO NOT explain or comment on your reasoning; just output the list.\n'
                   'FAILURE EXAMPLE (do not output this): "The following need to be redacted..."\n'
                   'SUCCESS EXAMPLE (the correct output): ["John Doe", "123-456-7890"].',
    llm_config=config_list, 
    human_input_mode='NEVER',
    code_execution_config=False
)

evaluation = autogen.ConversableAgent(
    'evaluation-agent',
    system_message='OBJECTIVE: You are an evaluation agent responsible for reviewing text to ensure that all personal information has been correctly redacted.\n'
                   'Your task is to verify whether any personal information from the following LIST has been missed:'
                   ' [Names, Proper Nouns, Company Names, Full Dates, Months, Phone Numbers, Email Addresses, Physical Addresses, Country Names, Social Security Numbers].\n'
                   'If any of this information is still present, flag the specific issue and provide feedback on the missing redactions.\n'
                   'OUTPUT: Provide your feedback as bullet points, listing any additional word(s) or phrases that need to be redacted.\n'
                   'Example Output: \n'
                   '- "John Doe" (Name)\n'
                   '- "123-456-7890" (Phone Number)\n'
                   'Do not provide explanations or reasoning, only the missing redactions in this format.',
    llm_config=config_list,
    human_input_mode='NEVER',
    code_execution_config=False
)

user_proxy = autogen.UserProxyAgent(
    'user_proxy', 
    human_input_mode='NEVER',
    code_execution_config=False
)

In [4]:
conversation_state = 1
speakers = []
def text_redact_selection_func(speaker: autogen.AssistantAgent, groupchat: autogen.GroupChat):
    global conversation_state
    if speaker == user_proxy and conversation_state == 1:
        conversation_state += 1
        speakers.append(text_assistant.name)
        return text_assistant
    elif speaker == text_assistant and conversation_state == 2:
        conversation_state += 1
        speakers.append(evaluation.name)
        return evaluation
    elif speaker == evaluation and conversation_state == 3:
        conversation_state += 1
        speakers.append(text_assistant.name)
        return text_assistant

text_redaction_chat = autogen.GroupChat(
    agents=[user_proxy, text_assistant, evaluation],
    speaker_selection_method=text_redact_selection_func,
    messages=[],
    max_round=4
)

text_redaction_manager = autogen.GroupChatManager(
    groupchat=text_redaction_chat, llm_config=config_list
)

In [5]:
text_redaction_chats = user_proxy.initiate_chat(
    text_redaction_manager,
    message=chat_outline['message']
)

[33muser_proxy[0m (to chat_manager):

Shashwat Saini +919970013662 | shashwatsaini290@gmail.com | LinkedIn | GitHub
EDUCATION
Dayananda Sagar University B.Tech. in Computer Science Engineering- Artificial Intelligence & Machine Learning
Bengaluru, Karnataka 2022 - 2026 Chennai, Tamil Nadu 2023 - 2027
Indian Institute of Technology Madras B.S. in Data Science & Applications
EXPERIENCE
Machine Learning Intern
July 2024 - August 2024 Bengaluru, Karnataka
AIMonk Labs Private Ltd.
· Designed and evaluated multi-agent systems using Microsoft Autogen, Crew.AI, OpenAI, and LangChain.
· Developed an advanced content re-purposing system to summarize videos, generate highlights and reels, as well as Twitter tweets, and Instagram posts.
· Deployed the system on a remote server with MongoDB, incorporating result caching and a front-end for testing.
RESEARCH PUBLICATIONS
Saini, S., Vrindavanam, J., & Mondal, S. (2024). Methodological Insights into Protein Clustering Using BERT And ROBERTa. IEEE CO

In [6]:
import ast
redacted_text = text_redaction_chats.chat_history[-1]['content']
redacted_text = ast.literal_eval(redacted_text)
redacted_words = []
for text in redacted_text:
    for word in str(text).split(' '):
        if len(word) > 3:
            redacted_words.append(word)
redacted_words

['Shashwat',
 'Saini',
 '+919970013662',
 'shashwatsaini290@gmail.com',
 'LinkedIn',
 'GitHub',
 'Dayananda',
 'Sagar',
 'University',
 'Indian',
 'Institute',
 'Technology',
 'Madras',
 'AIMonk',
 'Labs',
 'Private',
 'Ltd.',
 '+919970013662',
 'July',
 '2024',
 'August',
 '2024',
 'B.Tech.',
 'Computer',
 'Science',
 'Engineering-',
 'Artificial',
 'Intelligence',
 'Machine',
 'Learning',
 'Bengaluru,',
 'Karnataka',
 'Chennai,',
 'Tamil',
 'Nadu',
 'January',
 '2024',
 'November',
 '2023']

In [7]:
redacted_cords = []
for page in result.pages:
        for word in page.words:
                if word.content in redacted_words:
                    cords = []
                    for polygon in word.polygon:
                           cords.append((polygon.x, polygon.y))
                    redacted_cords.append(cords)
redacted_cords[:5]

[[(598.0, 96.0), (914.0, 96.0), (914.0, 155.0), (598.0, 153.0)],
 [(941.0, 96.0), (1104.0, 95.0), (1105.0, 154.0), (941.0, 155.0)],
 [(443.0, 161.0), (629.0, 160.0), (630.0, 190.0), (444.0, 190.0)],
 [(652.0, 160.0), (998.0, 160.0), (998.0, 190.0), (653.0, 190.0)],
 [(1033.0, 160.0), (1139.0, 160.0), (1139.0, 190.0), (1033.0, 190.0)]]

In [8]:
from PIL import Image, ImageDraw

# Load the original image
image_path = image_path
image = Image.open(image_path)
draw = ImageDraw.Draw(image)
width, height = image.size

# For each set of coordinates, draw the black boxes
for coord_set in redacted_cords:
    x_coords = [point[0] for point in coord_set]
    y_coords = [point[1] for point in coord_set]
    x_min, x_max = min(x_coords), max(x_coords)
    y_min, y_max = min(y_coords), max(y_coords)
    draw.rectangle([x_min, y_min, x_max, y_max], fill='black')

image.save('modified_image.png')
