In [None]:
# Colab helper for attaching Google Drive to this runtime's filesystem.
from google.colab import drive

# Mount Drive at /content/drive/ so the cells below can reach files stored there.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
cd /content/drive/MyDrive/ML Task Assignment

/content/drive/MyDrive/ML Task Assignment


In [None]:
### Install Required Libraries
!pip install PyMuPDF tabulate

Collecting PyMuPDF
  Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.1-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.1


In [None]:
### Upload ADT-1 PDF

from google.colab import files

# Opens Colab's upload dialog; the first uploaded entry is used as the PDF path.
uploaded = files.upload()

# A cancelled dialog leaves nothing to iterate over; fail with a clear message
# instead of the bare StopIteration that next(iter(...)) would raise.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose the ADT-1 PDF.")

pdf_file = next(iter(uploaded))

Saving Form ADT-1-29092023_signed.pdf to Form ADT-1-29092023_signed.pdf


In [None]:
### Load and Read PDF

import fitz

# The document handle is intentionally left open: the attachment-extraction
# cell further down walks the pages of `doc` again.
doc = fitz.open(pdf_file)

# Concatenate the plain text of every page, in page order, with no separator.
pdf_text = "".join(page.get_text() for page in doc)
print("✅ PDF loaded and text extracted")

✅ PDF loaded and text extracted


In [None]:
###  Extract Fields from Text

import json
import re

def extract_fields(text):
    """Pull the ADT-1 appointment details out of the form's plain text.

    Each field is located with a regular expression anchored on the form's
    printed label and on the label expected to follow it; whatever lies
    between the two anchors is taken as the field's value.

    Args:
        text: Plain text of the form (for example the concatenated output of
            ``page.get_text()``). ``None`` or an empty string is accepted and
            yields all-empty fields.

    Returns:
        dict: One string per field, in a fixed key order (``company_name``,
        ``cin``, ``registered_office``, ``appointment_date``,
        ``auditor_name``, ``auditor_address``, ``auditor_frn_or_membership``,
        ``appointment_type``). A field whose pattern does not match is ``""``.
        Runs of whitespace inside a captured value, including line breaks,
        are collapsed to a single space so the value stays on one line.
    """
    # NOTE(review): every pattern assumes the value sits between its label and
    # the next label. The text printed by this notebook shows the labels
    # directly following each other, so the values may live elsewhere in the
    # extracted text (or in form widgets) - confirm against the full PDF text.
    patterns = {
        "company_name": r"Name of the company\s*\n?\s*(.*?)\s*\n?\(b\)",
        "cin": r"Corporate identity number \(CIN\) of company\s*\n?\s*(.*?)\s*\n?2\.\(a\)",
        "registered_office": r"Address of the registered office\s*\n?\s*of\s*\n?\s*the company\s*\n?\s*(.*?)\s*\n?\[Pursuant",
        "appointment_date": r"Date of appointment\s*\(DD\/MM\/YYYY\)\s*\n?\s*(.*?)\s*\n?Yes",
        "auditor_name": r"Name of the auditor or auditor's firm\s*\n?\s*(.*?)\s*\n?\(d\)",
        "auditor_address": r"Address of the Auditor\s*\n?\s*or auditor's firm\s*\n?\s*Line I\s*\n?\s*(.*?)\s*\n?Line II\s*\n?\s*(.*?)\s*\n?\*City",
        "auditor_frn_or_membership": r"Membership Number of auditor or auditor's firm's registration number\s*\n?\s*(.*?)\s*\n?\(e\)",
        "appointment_type": r"Nature of appointment\s*\*\s*\n?\s*(.*?)\s*\n?\*",
    }

    # Start from an all-empty record so every key is present even when nothing matches.
    data = dict.fromkeys(patterns, "")

    # Treat a missing document as empty text instead of letting re.search raise.
    text = text or ""

    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if not match:
            continue
        # Most patterns have one group; auditor_address has two (Line I and
        # Line II). Joining the non-empty groups and re-splitting on whitespace
        # both merges the address lines and flattens embedded line breaks.
        captured = " ".join(group for group in match.groups() if group)
        data[key] = " ".join(captured.split())

    return data

# Parse the text gathered from the PDF into the field dictionary.
extracted_data = extract_fields(pdf_text)

# Debug dump: the raw text first, then the parsed fields, each under its own banner.
print("--- Extracted PDF Text ---", pdf_text, sep="\n")
print("\n--- Extracted Data ---", extracted_data, sep="\n")

--- Extracted PDF Text ---
Page 1 of 3
  (b)  Global location number (GLN) of company
1.(a) *Corporate identity number (CIN) of company
2.(a)  Name of the company
(b) Address of the registered office  
      of  the company
[Pursuant to section 139 of the Companies Act, 
2013 and Rule 4(2) of the Companies  
(Audit and Auditors) Rules, 2014]
FORM NO. ADT-1
Notice to the Registrar by 
company for appointment of 
auditor
Refer the instruction kit for filing the form.
Form language
(c)   email id of the company
English
Hindi
*
3.(a)   Whether company is falling under any class of companies as per section 139(2)   
Yes
No
4.   Whether joint auditors have been appointed
Yes
No
Number of auditor(s) appointed
*
*
*
(b)   Nature of appointment    
*
(b) *Income Tax permanent account number of auditor or auditor's firm
I. (a) *Category of Auditor
(c) *Name of the auditor or auditor's firm
Individual
Auditor's Firm
(d) *Membership Number of auditor or auditor's firm's registration number
(e) *Ad

In [None]:
### Save to output.json

# Serialise the extracted fields with a two-space indent, write the text in one
# call, then hand the file to the browser for download.
output_json = json.dumps(extracted_data, indent=2)
with open("output.json", "w") as json_file:
    json_file.write(output_json)
files.download("output.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
### Generate AI-style Summary

# extract_fields() pre-fills every key with "", so dict.get(key, default) never
# reaches its default. `or` makes blank values fall back to the placeholder
# wording instead of leaving holes in the sentence.
company_name = extracted_data.get('company_name') or 'The company'
auditor_name = extracted_data.get('auditor_name') or 'an auditor'
appointment_date = extracted_data.get('appointment_date') or 'unknown date'
registration_number = extracted_data.get('auditor_frn_or_membership') or 'N/A'
appointment_type = extracted_data.get('appointment_type') or 'new/reappointment'

summary = (
    f"{company_name} has appointed "
    f"{auditor_name} as its statutory auditor "
    f"on {appointment_date}. "
    f"The auditor's registration number is {registration_number}. "
    f"This is a {appointment_type} appointment."
)

# Explicit encoding so names with non-ASCII characters are written consistently.
with open("summary.txt", "w", encoding="utf-8") as f:
    f.write(summary)

print(summary)
files.download("summary.txt")

 has appointed Individual
Auditor's Firm as its statutory auditor on . The auditor's registration number is . This is a (b) appointment.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
### Extract Embedded Attachments (Bonus)
import os

ATTACHMENT_DIR = "attachments"
os.makedirs(ATTACHMENT_DIR, exist_ok=True)
attachments_found = False


def _save_attachment(raw_name, payload):
    """Write one attachment's bytes under ATTACHMENT_DIR and return the path used."""
    # The name is read from inside the PDF, so treat it as untrusted: keep only
    # the final path component so it cannot point outside ATTACHMENT_DIR.
    safe_name = os.path.basename(str(raw_name).replace("\\", "/")) or "attachment.bin"
    target = os.path.join(ATTACHMENT_DIR, safe_name)
    with open(target, "wb") as f:
        f.write(payload)
    print(f"✅ Extracted: {safe_name}")
    return target


# 1) File-attachment annotations placed on individual pages.
for page in doc:
    for annot in page.annots():
        if annot.type[0] == fitz.PDF_ANNOT_FILE_ATTACHMENT:  # numeric value 17
            # Annot.get_file() returns the attached bytes (there is no file_get()).
            _save_attachment(annot.file_info["filename"], annot.get_file())
            attachments_found = True

# 2) Files embedded at document level, which are not tied to any page annotation.
for entry_name in doc.embfile_names():
    entry_info = doc.embfile_info(entry_name)
    _save_attachment(entry_info.get("filename") or entry_name, doc.embfile_get(entry_name))
    attachments_found = True

if not attachments_found:
    print("⚠️ No file attachments found in annotations or embedded files.")


⚠️ No file attachments found in annotation layer.


In [None]:
### Accuracy Test Function with Validation

from tabulate import tabulate

def test_accuracy(data):
    checks = []
    missing_fields = []

    required_fields = {
        "company_name": None,
        "cin": r"^[A-Z0-9]{21}$",
        "registered_office": None,
        "appointment_date": r"^\d{4}-\d{2}-\d{2}$",
        "auditor_name": None,
        "auditor_address": None,
        "auditor_frn_or_membership": r"^\d+$",
        "appointment_type": None
    }

    for field, pattern in required_fields.items():
        value = data.get(field, "").strip()
        if not value:
            checks.append([field, "❌ Missing", "—"])
            missing_fields.append(field)
        elif pattern and not re.match(pattern, value):
            checks.append([field, "⚠️ Format Error", f"Got: {value}"])
            missing_fields.append(field)
        else:
            checks.append([field, "✅ OK", value])

    print("\n🧪 Accuracy Report:")
    print(tabulate(checks, headers=["Field", "Status", "Value"], tablefmt="github"))

    if not missing_fields:
        print("\n✅ All fields passed basic validation.")
    else:
        print(f"\n⚠️ Issues found in fields: {', '.join(missing_fields)}")

In [None]:
# Run validation on the fields extracted from the uploaded form. The function
# prints its report; the call's return value is not used.
test_accuracy(extracted_data)


🧪 Accuracy Report:
| Field                     | Status     | Value   |
|---------------------------|------------|---------|
| company_name              | ❌ Missing | —       |
| cin                       | ❌ Missing | —       |
| registered_office         | ❌ Missing | —       |
| appointment_date          | ❌ Missing | —       |
| auditor_name              | ✅ OK      | Individual
Auditor's Firm         |
| auditor_address           | ❌ Missing | —       |
| auditor_frn_or_membership | ❌ Missing | —       |
| appointment_type          | ✅ OK      | (b)     |

⚠️ Issues found in fields: company_name, cin, registered_office, appointment_date, auditor_address, auditor_frn_or_membership


In [None]:
### List Extracted Attachments

import os

attachments_dir = "attachments"

# Report the missing-folder case first, otherwise show the folder's contents.
if not os.path.exists(attachments_dir):
    print("📂 No attachments directory found.")
else:
    print("📁 Attachments extracted:", os.listdir(attachments_dir))


📁 Attachments extracted: []


In [None]:
### View Final JSON Output Inline

# Pretty-print the extracted fields with a two-space indent, the same
# formatting used when output.json is written.
print(json.dumps(extracted_data, indent=2))

{
  "company_name": "",
  "cin": "",
  "registered_office": "",
  "appointment_date": "",
  "auditor_name": "Individual\nAuditor's Firm",
  "auditor_address": "",
  "auditor_frn_or_membership": "",
  "appointment_type": "(b)"
}


In [None]:
### Bundle the Working Folder for Download (Optional)

import os
import shutil
import tempfile
from google.colab import files # Import files here

# Build the archive in a scratch directory outside the folder being zipped.
# Writing submission_bundle.zip into "." while archiving "." would pull the
# previous bundle into every new one on re-run.
bundle_dir = tempfile.mkdtemp()
bundle_path = shutil.make_archive(os.path.join(bundle_dir, "submission_bundle"), 'zip', ".")

# The downloaded file is still named submission_bundle.zip.
files.download(bundle_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>