<a href="https://colab.research.google.com/github/rjshvjy/tds-automation/blob/main/TDS_Automation_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
# Cell 1: Install required libraries and import modules
# This cell installs necessary packages and imports all required libraries

# Install required packages (run this only once)
!pip install PyPDF2 openpyxl pandas numpy tabulate

# Import all necessary libraries
import PyPDF2
import pandas as pd
import numpy as np
import re
import os
from datetime import datetime
import openpyxl
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font, Alignment, Border, Side
import warnings
warnings.filterwarnings('ignore')

# Confirm the environment is ready and remind the user which file names
# the rest of the notebook expects to find.
print("\n".join([
    "✅ All libraries imported successfully!",
    "Ready to process TDS files...",
    "\n📋 Expected file naming conventions:",
    "   - Masters file: TDS_Masters*.xlsx",
    "   - Template file: TDS_Template*.xlsx",
    "   - Output file: TDS_[Month]_[Year].xlsx (auto-generated from payment dates)",
]))

✅ All libraries imported successfully!
Ready to process TDS files...

📋 Expected file naming conventions:
   - Masters file: TDS_Masters*.xlsx
   - Template file: TDS_Template*.xlsx
   - Output file: TDS_[Month]_[Year].xlsx (auto-generated from payment dates)


In [44]:
# Cell 2: Functions to extract data from PDF challans - FIXED AMOUNT EXTRACTION
# This cell contains all the functions needed to read and parse PDF challan files

def extract_challan_data_from_pdf(pdf_path):
    """
    Parse a single challan PDF into a dictionary of its fields.

    Only the first page of the PDF is read. Header fields and the amount
    break-up are located with regular expressions (case-insensitive). A header
    field or the tax/total amount that cannot be found is stored as "";
    surcharge, cess, interest, penalty and the 234E fee default to "0".
    Amounts are returned as digit strings with the commas removed.

    If anything raises, the error text is stored under the 'error' key and
    whatever had been collected up to that point is returned.
    """
    # Header fields: one pattern each, value is capture group 1.
    header_patterns = {
        'tan': r'TAN\s*:\s*([A-Z0-9]+)',
        'nature_of_payment': r'Nature of Payment\s*:\s*(\d+[A-Z])',
        'cin': r'CIN\s*:\s*([A-Z0-9]+)',
        'bsr_code': r'BSR code\s*:\s*([\d]+)',
        'challan_no': r'Challan No\s*:\s*([\d]+)',
        'tender_date': r'Tender Date\s*:\s*(\d{2}/\d{2}/\d{4})',
        'mode_of_payment': r'Mode of Payment\s*:\s*([^\n]+)',
    }

    # Tax line of the break-up table, tried in order from strict to loose.
    tax_patterns = [
        r'A\s+Tax\s+₹\s*([\d,]+)',
        r'A\s+Tax\s+₹\s*([\d,]+)',
        r'A\s+Tax\s+[₹]\s*([\d,]+)',
        r'A\s+Tax\s+.\s*([\d,]+)',
        r'Tax\s+₹\s*([\d,]+)',
        r'A\s+Tax[^0-9]+([\d,]+)',
    ]

    # Used only when none of the tax-line patterns produced a value.
    header_amount_patterns = [
        r'Amount \(in Rs\.\)\s*:\s*₹\s*([\d,]+)',
        r'Amount.*?₹\s*([\d,]+)',
        r'Amount.*?Rs.*?([\d,]+)',
    ]

    # Remaining rows of the break-up table; a missing row becomes "0".
    breakup_patterns = {
        'surcharge': [
            r'B\s+Surcharge\s+₹\s*([\d,]+)',
            r'B\s+Surcharge[^0-9]+([\d,]+)',
            r'Surcharge\s+₹\s*([\d,]+)'
        ],
        'cess': [
            r'C\s+Cess\s+₹\s*([\d,]+)',
            r'C\s+Cess[^0-9]+([\d,]+)',
            r'Cess\s+₹\s*([\d,]+)'
        ],
        'interest': [
            r'D\s+Interest\s+₹\s*([\d,]+)',
            r'D\s+Interest[^0-9]+([\d,]+)',
            r'Interest\s+₹\s*([\d,]+)'
        ],
        'penalty': [
            r'E\s+Penalty\s+₹\s*([\d,]+)',
            r'E\s+Penalty[^0-9]+([\d,]+)',
            r'Penalty\s+₹\s*([\d,]+)'
        ],
        'fee_234e': [
            r'F\s+Fee under section 234E\s+₹\s*([\d,]+)',
            r'Fee under section 234E\s+₹\s*([\d,]+)',
            r'234E[^0-9]+([\d,]+)'
        ]
    }

    total_patterns = [
        r'Total \(A\+B\+C\+D\+E\+F\)\s+₹\s*([\d,]+)',
        r'Total.*?₹\s*([\d,]+)',
        r'Total[^0-9]+([\d,]+)'
    ]

    def first_amount(candidates, source_text, flags=re.IGNORECASE):
        # Value captured by the first pattern that matches, commas removed.
        # The search stops at the first match even if its cleaned value is "".
        for candidate in candidates:
            hit = re.search(candidate, source_text, flags)
            if hit:
                return hit.group(1).strip().replace(',', '')
        return ""

    challan_data = {}

    try:
        # Pull the text of the first page (empty string for a page-less PDF).
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            has_pages = len(reader.pages) > 0
            page_text = reader.pages[0].extract_text() if has_pages else ""

        # Header fields.
        for field, pattern in header_patterns.items():
            hit = re.search(pattern, page_text, re.IGNORECASE)
            if not hit:
                challan_data[field] = ""
                continue
            value = hit.group(1).strip()
            if field == 'mode_of_payment':
                value = value.upper()
            elif field == 'bsr_code':
                # Kept as a string and left-padded so leading zeros survive.
                value = value.zfill(7)
            challan_data[field] = value

        # Tax amount: break-up table first, header "Amount" field as fallback.
        tax_amount = first_amount(tax_patterns, page_text)
        if not tax_amount:
            tax_amount = first_amount(
                header_amount_patterns, page_text, re.IGNORECASE | re.DOTALL
            )
        challan_data['tax_amount'] = tax_amount

        # Other break-up rows.
        for field, candidates in breakup_patterns.items():
            challan_data[field] = first_amount(candidates, page_text) or "0"

        challan_data['total_amount'] = first_amount(total_patterns, page_text)

        # Remember which file the record came from.
        source_name = os.path.basename(pdf_path)
        challan_data['file_name'] = source_name

        print(f"✅ Extracted data from: {source_name}")
        print(f"   Nature of Payment: {challan_data.get('nature_of_payment', 'Not found')}")
        print(f"   Challan No: {challan_data.get('challan_no', 'Not found')}")
        print(f"   Tax Amount: ₹{challan_data.get('tax_amount', 'Not found')}")

        # The extra amounts are only shown when a tax amount was found.
        if challan_data.get('tax_amount'):
            print(f"   Surcharge: ₹{challan_data.get('surcharge', '0')}")
            print(f"   Cess: ₹{challan_data.get('cess', '0')}")
            print(f"   Total: ₹{challan_data.get('total_amount', 'Not found')}")

    except Exception as e:
        print(f"❌ Error processing {pdf_path}: {str(e)}")
        challan_data['error'] = str(e)

    return challan_data

def extract_all_challans(pdf_folder_path):
    """
    Extract data from every PDF in a folder, keeping one record per challan number.

    Files are processed in sorted name order, so when several PDFs carry the
    same challan number the record that is kept (the first one seen) does not
    depend on the order in which the operating system lists the directory.
    A duplicate whose tax amount differs from the kept record triggers a
    warning. Files in which no challan number was found are skipped.

    After processing, a per "Nature of Payment" summary is printed. Challans
    with no nature of payment are grouped under 'Unknown' so that they still
    count towards the grand total.

    Args:
        pdf_folder_path: Directory containing the challan PDF files.

    Returns:
        list[dict]: One dictionary per unique challan, as produced by
        extract_challan_data_from_pdf. Empty when the folder cannot be
        listed or contains no PDF files.
    """
    all_challan_data = []
    challan_map = {}  # challan number -> record already kept

    try:
        folder_entries = os.listdir(pdf_folder_path)
    except OSError as e:
        print(f"❌ Cannot read PDF folder {pdf_folder_path}: {str(e)}")
        return all_challan_data

    # Sorted so that de-duplication is reproducible across runs and platforms.
    pdf_files = sorted(f for f in folder_entries if f.lower().endswith('.pdf'))

    if not pdf_files:
        print(f"❌ No PDF files found in {pdf_folder_path}")
        return all_challan_data

    print(f"\n📁 Found {len(pdf_files)} PDF files to process...")
    print("-" * 50)

    duplicate_count = 0
    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_folder_path, pdf_file)
        challan_data = extract_challan_data_from_pdf(pdf_path)
        challan_no = challan_data.get('challan_no', '')

        if not challan_no:
            print(f"⚠️  Skipping file {pdf_file} - no challan number found")
            continue

        if challan_no in challan_map:
            duplicate_count += 1
            existing_challan = challan_map[challan_no]

            # Data integrity check: the same challan must carry the same tax amount.
            if challan_data.get('tax_amount') != existing_challan.get('tax_amount'):
                print(f"⚠️  WARNING: Duplicate challan {challan_no} has different tax amounts!")
                print(f"   File 1: {existing_challan.get('file_name')} - ₹{existing_challan.get('tax_amount')}")
                print(f"   File 2: {challan_data.get('file_name')} - ₹{challan_data.get('tax_amount')}")
            continue

        challan_map[challan_no] = challan_data
        all_challan_data.append(challan_data)

    print("-" * 50)
    print(f"✅ Total PDF files processed: {len(pdf_files)}")
    print(f"✅ Unique challans found: {len(all_challan_data)}")
    if duplicate_count > 0:
        print(f"ℹ️  Duplicate challans skipped: {duplicate_count}")

    # Summary by Nature of Payment.
    summary = {}
    total_all = 0
    for challan in all_challan_data:
        # The key is always present (possibly as ""), so a .get() default
        # would never apply; `or` is what maps blanks to 'Unknown'.
        nop = challan.get('nature_of_payment') or 'Unknown'
        entry = summary.setdefault(nop, {'count': 0, 'total_tax': 0})
        entry['count'] += 1

        raw_tax = challan.get('tax_amount') or 0  # blank amount counts as zero
        try:
            tax_amt = float(raw_tax)
        except (TypeError, ValueError):
            print(f"⚠️  Could not read tax amount '{raw_tax}' in {challan.get('file_name')}; "
                  f"left out of the totals")
            continue
        entry['total_tax'] += tax_amt
        total_all += tax_amt

    print("\n📊 Summary by Nature of Payment:")
    for nop, data in sorted(summary.items()):
        print(f"   {nop}: {data['count']} challan(s), Total Tax: ₹{data['total_tax']:,.0f}")
    print(f"   GRAND TOTAL: ₹{total_all:,.0f}")

    return all_challan_data

# Test function to verify extraction is working
def test_extraction():
    """Print a banner confirming that the PDF extraction helpers are loaded."""
    banner_lines = (
        "PDF extraction functions loaded successfully!",
        "Now with:",
        "  ✅ Improved amount extraction with multiple fallback patterns",
        "  ✅ Fixed regex patterns for better ₹ symbol handling",
        "  ✅ Deduplication based on challan number",
        "  ✅ Data integrity checks for duplicates",
        "  ✅ Mode of payment converted to uppercase",
        "  ✅ BSR codes and challan numbers preserved as strings",
        "Use extract_all_challans('/path/to/pdf/folder') to process your PDF files",
    )
    for banner_line in banner_lines:
        print(banner_line)

test_extraction()

PDF extraction functions loaded successfully!
Now with:
  ✅ Improved amount extraction with multiple fallback patterns
  ✅ Fixed regex patterns for better ₹ symbol handling
  ✅ Deduplication based on challan number
  ✅ Data integrity checks for duplicates
  ✅ Mode of payment converted to uppercase
  ✅ BSR codes and challan numbers preserved as strings
Use extract_all_challans('/path/to/pdf/folder') to process your PDF files


In [45]:
# Cell 3: Functions to read and update Excel files - FIXED VERSION
# This cell handles reading TDS Masters and updating it with challan data

import math
from decimal import Decimal, ROUND_HALF_UP

def read_tds_masters(file_path):
    """
    Read the TDS Masters workbook and return its sheets with the column-code mapping.

    'TDS CODES' and 'Challan Details' are read with pandas. 'TDS PARTIES' is
    read cell by cell with openpyxl (data_only=True):

    * the row holding return column codes such as "(415)" or "-415" is looked
      for in the first 10 rows; the row above it supplies the headers and the
      rows below it supply the data;
    * codes that were not found in that row are mapped by header text instead;
    * a data row is kept only when its Name (417) or PAN (416) cell holds
      something other than blank or "0" (any non-blank, non-"0" cell counts
      when neither column is known); reading stops after 5 consecutive rows
      without such data;
    * amount columns (419, 421) are rounded half-up to whole numbers, date
      columns (418, 425F) are parsed day-first, and PANs are checked against
      the pattern AAAAA9999A.

    Args:
        file_path: Path to the TDS Masters .xlsx file.

    Returns:
        dict | None: On success a dict with keys 'tds_codes', 'tds_parties',
        'challan_details', 'file_path', 'column_code_map' (code -> 0-based
        column index), 'code_to_column_name' (code -> header text) and
        'code_row' (1-based row of the codes, or None when not found).
        None when any step raises; the traceback is printed.
    """
    try:
        # The TDS RATES sheet is intentionally not read.
        tds_codes = pd.read_excel(file_path, sheet_name='TDS CODES', keep_default_na=False)

        # data_only=True yields the values stored in the file rather than formula text.
        wb = load_workbook(file_path, data_only=True)
        ws_parties = wb['TDS PARTIES']

        # Find the row with column codes
        code_row = None
        for idx in range(1, 11):  # Check first 10 rows
            row_values = []
            for col in range(1, ws_parties.max_column + 1):
                cell_value = ws_parties.cell(row=idx, column=col).value
                if cell_value:
                    row_values.append(str(cell_value))

            code_patterns = ['(415)', '(427)', '(416)', '(415A)', '-415', '-427', '-416', '-417', '-418', '-419', '-421']
            if any(pattern in val for val in row_values for pattern in code_patterns):
                code_row = idx
                print(f"✅ Found column codes at row {idx}")
                break

        # Read headers (row before codes)
        header_row = code_row - 1 if code_row else 1
        headers = []
        for col in range(1, ws_parties.max_column + 1):
            header_val = ws_parties.cell(row=header_row, column=col).value
            headers.append(header_val if header_val else f"Column_{col}")

        # Get column codes, normalising "(415)" and "-415" to the "(415)" form
        code_to_column_name = {}
        column_code_map = {}
        if code_row:
            for col in range(1, ws_parties.max_column + 1):
                code_val = ws_parties.cell(row=code_row, column=col).value
                if code_val:
                    code_str = str(code_val).strip()
                    code_match = None
                    if '(' in code_str and ')' in code_str:
                        code_match = re.search(r'\(([0-9A-Z]+)\)', code_str)
                    elif code_str.startswith('-'):
                        code_match = re.search(r'-([0-9A-Z]+)', code_str)
                    if code_match:
                        extracted_code = code_match.group(1)
                        normalized_code = f'({extracted_code})'
                        column_code_map[normalized_code] = col - 1
                        code_to_column_name[normalized_code] = headers[col - 1]

        # Fallback mappings by column names (case-insensitive substring match,
        # first matching header wins)
        column_name_mappings = {
            '(415)': ['Deductee Code', 'Individual/Company', 'Indiv/Comp', 'Code'],
            '(415A)': ['Section Under Payment Made', 'Type of Payment', 'Nature of Payment', 'Section'],
            '(416)': ['PAN of the Deductee', 'PAN', 'PAN No', 'Deductee PAN'],
            '(417)': ['Name of the Deductee', 'Deductee Name', 'Name', 'Party Name'],
            '(418)': ['Date of Payment/credit', 'Payment Date', 'Date of Payment', 'Credit Date'],
            '(419)': ['Amount Paid /Credited', 'Amount Paid', 'Gross Amount', 'Payment Amount', 'Amount'],
            '(421)': ['TDS', 'Tax Deducted', 'TDS Amount', 'TDS               Rs.', 'TDS Rs.'],
            '(425D)': ['BSR Code', 'BSR', 'Bank BSR Code'],
            '(425E)': ['Challan Serial No', 'Challan No', 'Challan Number'],
            '(425F)': ['Date on which deposited', 'Date Deposited', 'Deposit Date', 'Challan Date'],
            '(427)': ['TDS Deducted Rates %', 'TDS Rate', 'Rate %', 'Deduction Rate', 'Rate']
        }

        for code, possible_names in column_name_mappings.items():
            if code not in code_to_column_name:
                for col_idx, col_name in enumerate(headers):
                    col_name_clean = str(col_name).strip()
                    for possible_name in possible_names:
                        if possible_name.lower() in col_name_clean.lower():
                            code_to_column_name[code] = col_name
                            column_code_map[code] = col_idx
                            print(f"   Found {code} by column name: '{col_name}'")
                            break
                    if code in code_to_column_name:
                        break

        # Smart row reading - stop at empty data rows
        data_rows = []
        # Sheet row number of each kept record. Skipped blank rows mean a
        # DataFrame position cannot be turned back into a sheet row by adding
        # an offset, so the real row is remembered here for the PAN warnings.
        excel_row_numbers = []
        data_start_row = code_row + 1 if code_row else 2

        # Find critical columns for determining real data
        name_col_idx = None
        pan_col_idx = None

        # Get column indices for Name (417) and PAN (416)
        if '(417)' in code_to_column_name:
            name_col_idx = column_code_map.get('(417)') + 1  # +1 for 1-based Excel columns
        if '(416)' in code_to_column_name:
            pan_col_idx = column_code_map.get('(416)') + 1

        # Count consecutive empty rows
        consecutive_empty = 0
        max_consecutive_empty = 5  # Stop after 5 consecutive empty rows

        for row in range(data_start_row, ws_parties.max_row + 1):
            row_data = []
            has_meaningful_data = False

            # Check if this row has meaningful data
            for col in range(1, ws_parties.max_column + 1):
                cell_value = ws_parties.cell(row=row, column=col).value
                row_data.append(cell_value)

                # Check specifically Name and PAN columns for real data
                if name_col_idx and col == name_col_idx:
                    if cell_value and str(cell_value).strip() and str(cell_value).strip() != '0':
                        has_meaningful_data = True
                elif pan_col_idx and col == pan_col_idx:
                    if cell_value and str(cell_value).strip() and str(cell_value).strip() != '0':
                        has_meaningful_data = True

            # If no Name or PAN columns found, use general check
            if not name_col_idx and not pan_col_idx:
                # Check if at least one cell has non-zero, non-empty value
                for val in row_data:
                    if val is not None and str(val).strip() and str(val).strip() != '0':
                        has_meaningful_data = True
                        break

            # Process the row decision
            if has_meaningful_data:
                data_rows.append(row_data)
                excel_row_numbers.append(row)
                consecutive_empty = 0  # Reset counter
            else:
                consecutive_empty += 1
                # Stop if we've seen enough consecutive empty rows
                if consecutive_empty >= max_consecutive_empty:
                    print(f"   Stopped reading at row {row} (found {consecutive_empty} consecutive empty rows)")
                    break

        print(f"   Read {len(data_rows)} rows with actual data (ignoring formula-only rows)")

        tds_parties = pd.DataFrame(data_rows, columns=headers)

        # Convert amount columns with ROUND_HALF_UP to whole numbers.
        # (427) is deliberately excluded so decimal rates are not rounded.
        numeric_codes = ['(419)', '(421)']
        for code in numeric_codes:
            col_name = code_to_column_name.get(code)
            if col_name and col_name in tds_parties.columns:
                tds_parties[col_name] = pd.to_numeric(
                    tds_parties[col_name].astype(str).str.replace(',', '').str.replace('₹', '').str.strip(),
                    errors='coerce'
                ).apply(lambda x: Decimal(str(x)).quantize(Decimal('1.'), rounding=ROUND_HALF_UP) if pd.notna(x) else x)
                print(f"   ✅ Converted '{col_name}' to numeric with ROUND_HALF_UP")

        # Convert date columns
        date_codes = ['(418)', '(425F)']
        for code in date_codes:
            col_name = code_to_column_name.get(code)
            if col_name and col_name in tds_parties.columns:
                tds_parties[col_name] = pd.to_datetime(tds_parties[col_name], errors='coerce', dayfirst=True)
                print(f"   ✅ Converted '{col_name}' to datetime")

        # Validate PANs (only for rows with actual data)
        pan_col = code_to_column_name.get('(416)')
        if pan_col and pan_col in tds_parties.columns:
            invalid_pan_count = 0
            for excel_row, pan in zip(excel_row_numbers, tds_parties[pan_col]):
                # Only validate if PAN exists and is not empty/zero
                if pd.notna(pan) and str(pan).strip() and str(pan).strip() != '0':
                    if not re.match(r'^[A-Z]{5}[0-9]{4}[A-Z]$', str(pan)):
                        invalid_pan_count += 1
                        if invalid_pan_count <= 5:  # Only show first 5 warnings
                            print(f"⚠️ Invalid PAN format at row {excel_row}: {pan}")
            if invalid_pan_count > 5:
                print(f"   ... and {invalid_pan_count - 5} more invalid PANs")

        # header=1: the second sheet row is used as the column header row.
        challan_details = pd.read_excel(file_path, sheet_name='Challan Details', header=1, keep_default_na=False)
        wb.close()

        print(f"\n✅ Successfully read TDS Masters file")
        print(f"   TDS PARTIES: {len(tds_parties)} rows (actual data only)")
        print(f"   Challan Details: {len(challan_details)} rows")
        print(f"   TDS CODES: {len(tds_codes)} entries")
        print(f"   Column codes mapped: {len(code_to_column_name)}")

        tds_col = code_to_column_name.get('(421)', None)
        if tds_col and tds_col in tds_parties.columns:
            print(f"\n📊 Sample TDS amounts from first 5 rows:")
            for idx in range(min(5, len(tds_parties))):
                payment_type = tds_parties.iloc[idx].get(code_to_column_name.get('(415A)', ''), '')
                tds_amount = tds_parties.iloc[idx].get(tds_col, 0)
                name = tds_parties.iloc[idx].get(code_to_column_name.get('(417)', ''), '')
                print(f"   Row {idx}: Name={name}, Payment={payment_type}, TDS={tds_amount}")

        print("\n📊 Column Code Mapping Found:")
        for code, col_name in sorted(code_to_column_name.items())[:15]:
            print(f"   Code {code} → Column: '{col_name}'")

        return {
            'tds_codes': tds_codes,
            'tds_parties': tds_parties,
            'challan_details': challan_details,
            'file_path': file_path,
            'column_code_map': column_code_map,
            'code_to_column_name': code_to_column_name,
            'code_row': code_row
        }

    except Exception as e:
        print(f"❌ Error reading TDS Masters: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

def update_tds_masters_with_challans(tds_masters_data, challan_data_list):
    """
    Write challan details into a copy of the TDS Masters workbook.

    In 'TDS PARTIES', every data row whose payment type (415A) matches a
    challan's nature of payment (spaces ignored) receives that challan's
    number (425E) and deposit date (425F). 'Challan Details' is cleared from
    row 3 downwards and refilled with one row per challan. The result is
    saved next to the source file with '_UPDATED' inserted before the
    extension, and then re-read with read_tds_masters.

    The workbook is loaded with data_only=True, so cells that held formulas
    are saved with their stored values instead of the formulas.

    Args:
        tds_masters_data: Dict returned by read_tds_masters ('file_path',
            'code_to_column_name' and 'code_row' are used).
        challan_data_list: Challan dicts as produced by the PDF extraction.

    Returns:
        dict | None: The re-read masters data for the saved file, or None
        when any step raises (the traceback is printed).
    """
    def _whole_rupees(raw_amount):
        # Blank or missing amounts count as 0; anything else is rounded
        # half-up to a whole number. Unparseable text raises and aborts the update.
        if not raw_amount:
            return 0
        return int(Decimal(str(raw_amount)).quantize(Decimal('1.'), rounding=ROUND_HALF_UP))

    def _write_deposit_date(cell, date_str):
        # Store a real date when the text is DD/MM/YYYY, otherwise keep the raw text.
        if not date_str:
            return
        try:
            parsed_date = datetime.strptime(date_str, '%d/%m/%Y')
        except (TypeError, ValueError):
            cell.value = date_str
            return
        cell.value = parsed_date
        cell.number_format = 'DD/MM/YYYY'

    try:
        # Load workbook with data_only=True to preserve static values
        wb = load_workbook(tds_masters_data['file_path'], data_only=True)
        ws_parties = wb['TDS PARTIES']
        ws_challan = wb['Challan Details']

        # read_tds_masters stores None when no code row was found; a .get()
        # default would not apply then, so fall back to row 1 explicitly.
        code_row = tds_masters_data.get('code_row') or 1

        # Find columns by codes
        col_425E = col_425F = col_415A = None
        for col_idx in range(1, ws_parties.max_column + 1):
            cell_value = str(ws_parties.cell(row=code_row, column=col_idx).value)
            if '425E' in cell_value:
                col_425E = col_idx
                print(f"Found (425E) at column {col_idx}")
            elif '425F' in cell_value:
                col_425F = col_idx
                print(f"Found (425F) at column {col_idx}")
            elif '415A' in cell_value:
                col_415A = col_idx
                print(f"Found (415A) (Type of Payment) at column {col_idx}")

        # Nature of payment (spaces removed) -> challan. When several challans
        # share a nature of payment the last one in the list is used.
        challan_map = {}
        for challan in challan_data_list:
            nop = challan.get('nature_of_payment', '')
            if nop:
                challan_map[nop.replace(' ', '')] = challan

        print(f"\n📝 Updating TDS PARTIES sheet...")
        print(f"   Challan Serial No (425E) → Column {col_425E}")
        print(f"   Date deposited (425F) → Column {col_425F}")
        updates_made = 0

        # Update TDS PARTIES
        data_start_row = code_row + 1
        for row_idx in range(data_start_row, ws_parties.max_row + 1):
            payment_type = ws_parties.cell(row=row_idx, column=col_415A).value if col_415A else None
            if not payment_type or str(payment_type).strip() in ['', 'nan', 'None']:
                continue
            challan = challan_map.get(str(payment_type).replace(' ', '').strip())
            if challan is None:
                continue
            if col_425E:
                ws_parties.cell(row=row_idx, column=col_425E).value = challan.get('challan_no', '')
            if col_425F:
                _write_deposit_date(
                    ws_parties.cell(row=row_idx, column=col_425F),
                    challan.get('tender_date', '')
                )
            updates_made += 1

        print(f"✅ Updated {updates_made} rows in TDS PARTIES")

        # Update Challan Details: wipe everything below the two header rows first
        print("\n📝 Updating Challan Details sheet...")
        for row in ws_challan.iter_rows(min_row=3, max_row=ws_challan.max_row):
            for cell in row:
                cell.value = None

        # NOTE(review): 'fee_234e' is extracted from the PDFs but is not written
        # here, and the total formula sums columns C..G only - confirm the sheet
        # has no column for it.
        for idx, challan in enumerate(challan_data_list, start=3):
            ws_challan.cell(row=idx, column=1).value = idx - 2  # serial number
            ws_challan.cell(row=idx, column=2).value = challan.get('nature_of_payment', '')
            ws_challan.cell(row=idx, column=3).value = _whole_rupees(challan.get('tax_amount', ''))
            ws_challan.cell(row=idx, column=4).value = _whole_rupees(challan.get('surcharge', ''))
            ws_challan.cell(row=idx, column=5).value = _whole_rupees(challan.get('cess', ''))
            ws_challan.cell(row=idx, column=6).value = _whole_rupees(challan.get('interest', ''))
            ws_challan.cell(row=idx, column=7).value = _whole_rupees(challan.get('penalty', ''))
            ws_challan.cell(row=idx, column=8).value = f'=SUM(C{idx}:G{idx})'
            ws_challan.cell(row=idx, column=9).value = challan.get('mode_of_payment', '')
            # BSR code and challan number stay strings so leading zeros survive.
            ws_challan.cell(row=idx, column=10).value = challan.get('bsr_code', '')
            _write_deposit_date(ws_challan.cell(row=idx, column=11), challan.get('tender_date', ''))
            ws_challan.cell(row=idx, column=12).value = challan.get('challan_no', '')
            ws_challan.cell(row=idx, column=13).value = 'NO'

        print(f"✅ Added {len(challan_data_list)} challans to Challan Details")

        # Insert the suffix before the real extension only. A plain
        # str.replace('.xlsx', ...) leaves the name unchanged for e.g. '.XLSX'
        # (overwriting the source file) and also rewrites directory names
        # that happen to contain '.xlsx'.
        path_root, path_ext = os.path.splitext(tds_masters_data['file_path'])
        output_file = f"{path_root}_UPDATED{path_ext}"
        wb.save(output_file)
        wb.close()

        print(f"\n✅ Saved updated TDS Masters to: {output_file}")
        return read_tds_masters(output_file)

    except Exception as e:
        print(f"❌ Error updating TDS Masters: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

def validate_tds_totals(tds_masters_data, challan_data_list):
    """
    Check that party-wise TDS totals agree with the challan tax amounts.

    TDS amounts (column code 421) are summed per payment type (415A, spaces
    removed) and compared with the sum of the challan tax amounts for the
    same nature of payment. All challans sharing a nature of payment are
    added together. Amounts are rounded half-up to whole numbers, and a
    difference of at most 1 is accepted.

    Args:
        tds_masters_data: Dict returned by read_tds_masters ('tds_parties'
            and 'code_to_column_name' are used).
        challan_data_list: Challan dicts with 'nature_of_payment' and
            'tax_amount'.

    Returns:
        bool: True when every nature of payment is within tolerance. False
        on any mismatch, when the 415A or 421 column is unknown, or when an
        error occurs (the traceback is printed).
    """
    print("\n🔍 Validating TDS totals...")

    def _whole_rupees(raw_amount):
        # Round half-up to a whole number, going through str() to avoid
        # binary floating-point artefacts.
        return Decimal(str(raw_amount)).quantize(Decimal('1.'), rounding=ROUND_HALF_UP)

    try:
        tds_parties = tds_masters_data['tds_parties']
        code_to_column_name = tds_masters_data.get('code_to_column_name', {})
        payment_col = code_to_column_name.get('(415A)', None)
        tds_col = code_to_column_name.get('(421)', None)

        print(f"\n📊 Debug - Column mappings:")
        print(f"   Payment Type column (415A): {payment_col}")
        print(f"   TDS Amount column (421): {tds_col}")

        if not payment_col or not tds_col:
            print("⚠️ Missing required columns")
            return False

        print(f"\n📊 Debug - Sample data (first 5 valid rows):")
        sample_count = 0
        for idx, row in tds_parties.iterrows():
            if sample_count >= 5:
                break
            payment = row.get(payment_col, '')
            tds_amount = row.get(tds_col, 0)
            if payment and str(payment) not in ['nan', 'NaT', '']:
                print(f"   Row {idx}: Payment={payment}, TDS={tds_amount}")
                sample_count += 1

        # Sum of TDS per payment type from the party rows.
        party_totals = {}
        for _, row in tds_parties.iterrows():
            payment_type = str(row[payment_col] if payment_col in row else '').strip()
            if not payment_type or payment_type in ('nan', 'NaT'):
                continue
            tds_amount = 0
            # Read before the try so the warning below can always show the value.
            val = row[tds_col] if tds_col in row else None
            try:
                if pd.notna(val):
                    tds_amount = _whole_rupees(val)
            except Exception as e:
                print(f"   Warning: Could not convert TDS value '{val}' for payment type {payment_type}: {e}")
                tds_amount = 0
            payment_type_clean = payment_type.replace(' ', '')
            party_totals[payment_type_clean] = party_totals.get(payment_type_clean, 0) + tds_amount

        # Sum of tax per nature of payment from the challans. Accumulate
        # rather than assign: several challans may exist for one section.
        challan_totals = {}
        for challan in challan_data_list:
            nop = (challan.get('nature_of_payment') or '').replace(' ', '')
            raw_tax = challan.get('tax_amount', '')
            tax_amount = _whole_rupees(raw_tax) if raw_tax else 0
            if nop:
                challan_totals[nop] = challan_totals.get(nop, 0) + tax_amount

        print(f"\n📊 Debug - Totals found:")
        print(f"   Party totals: {party_totals}")
        print(f"   Challan totals: {challan_totals}")

        validation_passed = True
        print("\n📊 Validation Results:")
        print("-" * 60)
        print(f"{'Nature of Payment':<20} {'Party Total':<15} {'Challan Total':<15} {'Status':<10}")
        print("-" * 60)

        for nop in sorted(set(party_totals) | set(challan_totals)):
            party_total = party_totals.get(nop, 0)
            challan_total = challan_totals.get(nop, 0)
            difference = abs(party_total - challan_total)
            within_tolerance = difference <= 1  # allow a rounding difference of 1
            status = "✅ PASS" if within_tolerance else "❌ FAIL"
            if not within_tolerance:
                validation_passed = False
            print(f"{nop:<20} ₹{party_total:<14,.0f} ₹{challan_total:<14,.0f} {status}")

        print("-" * 60)
        print("\n✅ All validations passed!" if validation_passed else "\n❌ Validation failed! Please check the discrepancies above.")

        return validation_passed

    except Exception as e:
        print(f"❌ Error during validation: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Summarise what the Excel helpers defined in this cell provide.
print("\n".join([
    "✅ Excel handling functions loaded - COMPLETE FIX!",
    "   ✓ REMOVED TDS RATES reading (not used in processing)",
    "   ✓ Smart row detection - stops at empty data rows",
    "   ✓ Ignores formula-only rows with 0 or empty values",
    "   ✓ Uses Name (417) and PAN (416) to detect real data",
    "   ✓ Stops after 5 consecutive empty rows",
    "   ✓ Only validates PANs for rows with actual data",
    "   ✓ All other functionality preserved",
]))

✅ Excel handling functions loaded - COMPLETE FIX!
   ✓ REMOVED TDS RATES reading (not used in processing)
   ✓ Smart row detection - stops at empty data rows
   ✓ Ignores formula-only rows with 0 or empty values
   ✓ Uses Name (417) and PAN (416) to detect real data
   ✓ Stops after 5 consecutive empty rows
   ✓ Only validates PANs for rows with actual data
   ✓ All other functionality preserved


In [46]:
# Cell 4: Main processing function that coordinates all steps
# This cell contains the main process_tds_returns function

import math
from decimal import Decimal, ROUND_HALF_UP

def get_output_filename_from_masters(tds_masters_data):
    """
    Build the output filename ``TDS_<Month>_<Year>.xlsx``.

    The month and year come from the first non-null payment date in the
    ``tds_parties`` DataFrame. The date column is looked up through the
    '(418)' entry of ``code_to_column_name``. When that column or a usable
    date is not available, the current month and year are used instead.

    Args:
        tds_masters_data: dict holding 'tds_parties' (DataFrame) and,
            optionally, 'code_to_column_name' (code -> column name).

    Returns:
        str: a filename such as "TDS_April_2024.xlsx".
    """
    try:
        code_to_column_name = tds_masters_data.get('code_to_column_name', {})
        date_col = code_to_column_name.get('(418)')
        if date_col and date_col in tds_masters_data['tds_parties'].columns:
            dates = tds_masters_data['tds_parties'][date_col].dropna()
            if not dates.empty:
                first_date = pd.to_datetime(dates.iloc[0])
                return f"TDS_{first_date.strftime('%B')}_{first_date.strftime('%Y')}.xlsx"
    except Exception:
        # Deliberate best-effort: any problem reading or parsing the date falls
        # through to the current-date name below. Only Exception is caught so
        # KeyboardInterrupt / SystemExit still propagate.
        pass

    current_date = datetime.now()
    return f"TDS_{current_date.strftime('%B')}_{current_date.strftime('%Y')}.xlsx"

# Header codes of the DEDUCTEE BREAK-UP sheet paired with the field each maps to.
# Matching is by substring, so order matters: suffixed codes (415A, 425A-425F)
# are listed before their bare prefix (415, 425) so they are never shadowed.
_DEDUCTEE_HEADER_CODES = (
    ('414', 'sr_no'),
    ('415A', 'payment_type'),
    ('415', 'deductee_code'),
    ('416', 'pan'),
    ('417', 'name'),
    ('418', 'date_payment'),
    ('419', 'amount_paid'),
    ('420', 'book_entry'),
    ('421', 'tds'),
    ('422', 'surcharge'),
    ('423', 'cess'),
    ('424', 'total_deducted'),
    ('425A', 'interest'),
    ('425B', 'others'),
    ('425C', 'total'),
    ('425D', 'bsr_code'),
    ('425E', 'challan_no'),
    ('425F', 'date_deposited'),
    ('425', 'total_deposited'),
    ('426', 'date_deduction'),
    ('427', 'rate'),
    ('428', 'reason'),
)


def _map_deductee_columns(ws):
    """Return {field: column index} for the header codes found in row 2 of ws."""
    col_map = {}
    for col_idx in range(1, ws.max_column + 1):
        cell_value = str(ws.cell(row=2, column=col_idx).value)
        for code, field in _DEDUCTEE_HEADER_CODES:
            if code in cell_value:
                col_map[field] = col_idx
                break
    return col_map


def _has_payment_type(payment_type):
    """True when a party row carries a usable type-of-payment value."""
    return bool(payment_type) and str(payment_type).strip() not in ('', 'nan', 'None')


def _format_rate_percent(rate):
    """Render a fractional rate (0.001) as percent text with two decimals ("0.10%").

    Missing, NaN or unparseable rates are rendered as "0.00%".
    """
    try:
        fraction = float(rate)
    except (TypeError, ValueError):
        fraction = 0.0
    if math.isnan(fraction):
        fraction = 0.0
    return f"{fraction * 100:.2f}%"


def update_deductee_breakup(ws, tds_masters_data, challan_data_list):
    """
    Fill the DEDUCTEE BREAK-UP sheet from the TDS parties and their challans.

    Columns are located by the header codes in row 2 of the sheet; data rows
    start at row 4 and end just above the row whose column B reads "total"
    (row 20 is assumed when no such row exists). Existing data rows are
    cleared, extra rows are inserted above the totals row when there are more
    parties than slots, one row is written per party that has a type of
    payment, the totals row is rewritten, and every row below it is deleted.

    All SUM formulas use the column letters of the columns actually found in
    the header, so they stay correct when the template layout shifts. The
    letters I/K and M/O are only a fallback when the corresponding header
    code is missing.

    Args:
        ws: worksheet to update (modified in place).
        tds_masters_data: dict with 'tds_parties' (DataFrame) and
            'code_to_column_name' (header code such as '(419)' -> DataFrame
            column name).
        challan_data_list: list of challan dicts; each is matched to parties
            through 'nature_of_payment' with spaces removed.

    Returns:
        None. Progress is reported on stdout.
    """
    code_to_column_name = tds_masters_data.get('code_to_column_name', {})
    tds_parties = tds_masters_data['tds_parties']
    data_start = 4
    date_format = 'DD/MM/YYYY'

    col_map = _map_deductee_columns(ws)

    print("\n📝 Updating DEDUCTEE BREAK-UP...")
    print("   Column Mappings Found:")
    for key, col in col_map.items():
        print(f"   - {key}: {ws.cell(row=1, column=col).value}")

    def letter(field, fallback):
        # Column letter of a mapped field; `fallback` is the letter this
        # function historically assumed for it.
        return get_column_letter(col_map[field]) if field in col_map else fallback

    def party_value(party, code, default):
        # Value of the masters column registered for `code`, or `default`.
        column = code_to_column_name.get(code)
        return party.get(column, default) if column else default

    def put(row, field, value, number_format=None):
        # Write only into columns that exist in this template.
        if field in col_map:
            target = ws.cell(row=row, column=col_map[field])
            target.value = value
            if number_format:
                target.number_format = number_format

    # Challans keyed by nature of payment with spaces removed ("94 C" -> "94C").
    challan_map = {
        str(challan.get('nature_of_payment') or '').replace(' ', ''): challan
        for challan in challan_data_list
    }

    # Find totals row first to protect it
    totals_row = None
    for row in range(data_start, ws.max_row + 1):
        if str(ws.cell(row=row, column=2).value or '').lower() == 'total':
            totals_row = row
            print(f"   Found totals row at row {totals_row}")
            break

    if not totals_row:
        totals_row = 20  # Default to row 20 if not found (based on template)
        print(f"   No totals row found, assuming row {totals_row}")

    # Find effective data end (last row with any value, including totals)
    data_end = data_start - 1
    for row in range(data_start, ws.max_row + 1):
        if any(cell.value is not None for cell in ws[row]) or row == totals_row:
            data_end = row

    print(f"   Data end at row {data_end}, max_row: {ws.max_row}")

    # Delete extra empty rows after data_end, but protect totals row
    if ws.max_row > data_end:
        delete_amount = ws.max_row - data_end
        ws.delete_rows(data_end + 1, delete_amount)
        print(f"   Deleted {delete_amount} extra empty rows. New max_row: {ws.max_row}")

    # Clear data rows (but not totals row)
    for row_cells in ws.iter_rows(min_row=data_start, max_row=totals_row - 1):
        for cell in row_cells:
            cell.value = None

    # Parties without a type of payment are skipped entirely.
    valid_parties = [
        party for _, party in tds_parties.iterrows()
        if _has_payment_type(party_value(party, '(415A)', ''))
    ]
    print(f"   Found {len(valid_parties)} valid parties to process")

    # Insert rows if needed
    available_slots = totals_row - data_start
    if len(valid_parties) > available_slots:
        rows_to_insert = len(valid_parties) - available_slots
        ws.insert_rows(totals_row, amount=rows_to_insert)
        totals_row += rows_to_insert
        print(f"   Inserted {rows_to_insert} additional rows. New totals row: {totals_row}")

    # Write parties
    for offset, party in enumerate(valid_parties):
        row_idx = data_start + offset
        payment_type = party_value(party, '(415A)', '')
        challan = challan_map.get(str(payment_type).replace(' ', '').strip(), {})

        # The payment date is also used as the date of deduction.
        payment_date = party_value(party, '(418)', '')
        if isinstance(payment_date, pd.Timestamp):
            payment_date = payment_date.to_pydatetime()

        amount_paid = party_value(party, '(419)', 0)
        if isinstance(amount_paid, Decimal):
            amount_paid = int(amount_paid)

        tds_amount = party_value(party, '(421)', 0)
        if isinstance(tds_amount, Decimal):
            tds_amount = int(tds_amount)

        # Total deducted = TDS + surcharge + cess; the same formula is used for
        # "total deposited". Total = total deposited + interest + others.
        deducted_formula = f"=SUM({letter('tds', 'I')}{row_idx}:{letter('cess', 'K')}{row_idx})"
        total_formula = f"=SUM({letter('total_deposited', 'M')}{row_idx}:{letter('others', 'O')}{row_idx})"

        put(row_idx, 'sr_no', offset + 1)
        put(row_idx, 'deductee_code', party_value(party, '(415)', ''))
        put(row_idx, 'payment_type', payment_type)
        put(row_idx, 'pan', party_value(party, '(416)', ''))
        put(row_idx, 'name', party_value(party, '(417)', ''))
        put(row_idx, 'date_payment', payment_date, date_format)
        put(row_idx, 'amount_paid', amount_paid)
        put(row_idx, 'book_entry', '')
        put(row_idx, 'tds', tds_amount)
        put(row_idx, 'surcharge', 0)
        put(row_idx, 'cess', 0)
        put(row_idx, 'total_deducted', deducted_formula)
        put(row_idx, 'total_deposited', deducted_formula)
        put(row_idx, 'interest', 0)
        put(row_idx, 'others', 0)
        put(row_idx, 'total', total_formula)
        put(row_idx, 'bsr_code', challan.get('bsr_code', ''))
        put(row_idx, 'challan_no', challan.get('challan_no', ''))

        tender_date = challan.get('tender_date', '')
        if tender_date and 'date_deposited' in col_map:
            try:
                put(row_idx, 'date_deposited', datetime.strptime(tender_date, '%d/%m/%Y'), date_format)
            except (ValueError, TypeError):
                # Not a DD/MM/YYYY string: keep whatever the challan carried.
                put(row_idx, 'date_deposited', tender_date)

        put(row_idx, 'date_deduction', payment_date, date_format)
        # The rate is written as text ("0.10%"), not as a numeric cell.
        put(row_idx, 'rate', _format_rate_percent(party_value(party, '(427)', 0)))
        put(row_idx, 'reason', 'N.A')

    party_count = len(valid_parties)

    # Setup totals row
    print(f"   Setting up totals row at row {totals_row}")
    ws.cell(row=totals_row, column=1).value = ''  # Clear Sr.No
    ws.cell(row=totals_row, column=2).value = 'TOTAL'  # Deductee Code column as label

    # Each totals cell sums its own column over the rows just written. The
    # range never ends above data_start, so an empty party list cannot produce
    # a reversed range.
    last_data_row = max(data_start + party_count - 1, data_start)
    summed_fields = (
        'amount_paid', 'tds', 'surcharge', 'cess', 'total_deducted',
        'total_deposited', 'interest', 'others', 'total',
    )
    for field in summed_fields:
        if field in col_map:
            col_letter = get_column_letter(col_map[field])
            ws.cell(row=totals_row, column=col_map[field]).value = (
                f'=SUM({col_letter}{data_start}:{col_letter}{last_data_row})'
            )

    # Clear any rows after totals to prevent overflow
    if ws.max_row > totals_row:
        delete_amount = ws.max_row - totals_row
        ws.delete_rows(totals_row + 1, delete_amount)
        print(f"   Cleared {delete_amount} overflow rows after totals. Final max_row: {ws.max_row}")

    print(f"✅ Updated {party_count} parties in DEDUCTEE BREAK-UP")
    print(f"   Totals row is at row {totals_row}")

def _round_rupees(raw):
    """Round a challan amount to whole rupees with ROUND_HALF_UP; empty or zero values give 0."""
    if not raw:
        return 0
    return int(Decimal(raw).quantize(Decimal('1.'), rounding=ROUND_HALF_UP))


def update_challan_details(ws, challan_data_list):
    """
    Helper function to update CHALLAN DETAILS sheet.

    Data rows start at row 4 and end just above the row whose column B reads
    "total" (row 8 is assumed when no such row exists). Rows are inserted
    above the totals row when there are more challans than slots, existing
    data rows are cleared, one row is written per challan, the totals row is
    rewritten with SUM formulas for columns C-H, and every row below the
    totals row is deleted.

    Args:
        ws: worksheet to update (modified in place).
        challan_data_list: list of challan dicts. The keys read are
            'nature_of_payment', 'tax_amount', 'surcharge', 'cess', 'interest',
            'penalty', 'mode_of_payment', 'bsr_code', 'tender_date' and
            'challan_no'.

    Returns:
        None. Progress is reported on stdout.
    """
    print("\n📝 Updating CHALLAN DETAILS...")
    print(f"   Processing {len(challan_data_list)} unique challans...")

    data_start = 4

    # Find totals row
    totals_row = None
    for row in range(data_start, ws.max_row + 1):
        if str(ws.cell(row=row, column=2).value).lower() == 'total':
            totals_row = row
            break

    if totals_row:
        print(f"   Found TOTAL row at row {totals_row}")
    else:
        totals_row = 8  # Default to row 8 if not found (based on template)
        print(f"   No TOTAL row found, assuming row {totals_row}")

    available_slots = totals_row - data_start

    print(f"   Template has space for {available_slots} data rows")
    print(f"   Need {len(challan_data_list)} rows for unique challans")

    # Insert rows if needed
    if len(challan_data_list) > available_slots:
        rows_to_insert = len(challan_data_list) - available_slots
        ws.insert_rows(totals_row, amount=rows_to_insert)
        totals_row += rows_to_insert
        print(f"   Inserted {rows_to_insert} additional rows. New TOTAL row: {totals_row}")

    # Clear existing data
    for row_cells in ws.iter_rows(min_row=data_start, max_row=totals_row - 1):
        for cell in row_cells:
            cell.value = None

    # Write challans; the five amount fields fill columns C-G in this order.
    amount_fields = ('tax_amount', 'surcharge', 'cess', 'interest', 'penalty')
    for idx, challan in enumerate(challan_data_list, start=1):
        row_idx = data_start + idx - 1

        ws.cell(row=row_idx, column=1).value = idx
        ws.cell(row=row_idx, column=2).value = challan.get('nature_of_payment', '')
        for column, field in enumerate(amount_fields, start=3):
            ws.cell(row=row_idx, column=column).value = _round_rupees(challan.get(field))
        ws.cell(row=row_idx, column=8).value = f'=SUM(C{row_idx}:G{row_idx})'
        ws.cell(row=row_idx, column=9).value = challan.get('mode_of_payment', '')
        ws.cell(row=row_idx, column=10).value = challan.get('bsr_code', '')

        date_str = challan.get('tender_date', '')
        if date_str:
            date_cell = ws.cell(row=row_idx, column=11)
            try:
                date_cell.value = datetime.strptime(date_str, '%d/%m/%Y')
                date_cell.number_format = 'DD/MM/YYYY'
            except (ValueError, TypeError):
                # Not a DD/MM/YYYY string: keep whatever the challan carried.
                date_cell.value = date_str

        ws.cell(row=row_idx, column=12).value = challan.get('challan_no', '')
        ws.cell(row=row_idx, column=13).value = 'NO'

    # Setup TOTAL row: columns C-H each sum their own data range.
    ws.cell(row=totals_row, column=2).value = 'TOTAL'
    for column, col_letter in enumerate('CDEFGH', start=3):
        ws.cell(row=totals_row, column=column).value = (
            f'=SUM({col_letter}{data_start}:{col_letter}{totals_row-1})'
        )

    # Clear any rows after totals to prevent overflow
    if ws.max_row > totals_row:
        delete_amount = ws.max_row - totals_row
        ws.delete_rows(totals_row + 1, delete_amount)
        print(f"   Cleared {delete_amount} overflow rows after totals in CHALLAN DETAILS. Final max_row: {ws.max_row}")

    print(f"✅ Successfully updated CHALLAN DETAILS with {len(challan_data_list)} unique challans")
    print(f"   TOTAL row is at row {totals_row}")

def process_tds_returns(pdf_folder_path, masters_file_path, template_file_path):
    """
    Run the whole TDS return workflow, reporting each step on stdout.

    The steps are: extract challans from the PDF folder, read the TDS Masters
    file, validate party totals against challan totals, update the masters
    data with the challans, fill the CHALLAN DETAILS and DEDUCTEE BREAK-UP
    sheets of the template and save the workbook under OUTPUT_FOLDER, then
    validate again. A failed validation is reported but does not stop the run.

    Args:
        pdf_folder_path: folder handed to extract_all_challans.
        masters_file_path: path handed to read_tds_masters.
        template_file_path: workbook loaded as the output template.

    Returns:
        dict with 'updated_masters_path', 'output_path', 'pre_validation' and
        'final_validation', or None when extraction, reading or updating
        yields nothing.
    """
    print("\n============================================================\n🚀 STARTING TDS RETURN PROCESSING\n============================================================")

    # Step 1 - challans from the PDFs
    print("\n📄 STEP 1: Extracting Challan Data from PDFs...\n--------------------------------------------------")
    challans = extract_all_challans(pdf_folder_path)
    if not challans:
        print("❌ No valid challans extracted. Aborting.")
        return None
    print(f"\n✅ Successfully extracted {len(challans)} unique challans")

    # Step 2 - masters workbook
    print("\n📊 STEP 2: Reading TDS Masters File...\n--------------------------------------------------")
    masters = read_tds_masters(masters_file_path)
    if not masters:
        print("❌ Failed to read TDS Masters. Aborting.")
        return None

    # Step 3 - validation before anything is changed
    print("\n🔍 STEP 3: Pre-Update Validation...\n--------------------------------------------------")
    pre_ok = validate_tds_totals(masters, challans)
    pre_label = "PASSED" if pre_ok else "FAILED"
    print("\n✅ Pre-Update Validation: " + pre_label)

    # Step 4 - bring challan information into the masters data
    print("\n📝 STEP 4: Updating TDS Masters with Challan Information...\n--------------------------------------------------")
    masters_updated = update_tds_masters_with_challans(masters, challans)
    if not masters_updated:
        print("❌ Failed to update TDS Masters. Aborting.")
        return None

    # Step 5 - fill the template and save it
    print("\n📋 STEP 5: Generating Output File...\n--------------------------------------------------")
    filename = get_output_filename_from_masters(masters_updated)
    print(f"   Output filename: {filename}")

    workbook = load_workbook(template_file_path, data_only=False)
    print("✅ Loaded template")

    challan_sheet = workbook['CHALLAN DETAILS']
    update_challan_details(challan_sheet, challans)

    deductee_sheet = workbook['DEDUCTEE BREAK-UP']
    update_deductee_breakup(deductee_sheet, masters_updated, challans)

    destination = os.path.join(OUTPUT_FOLDER, filename)
    workbook.save(destination)
    workbook.close()
    print(f"✅ Generated output file: {destination}")

    # Step 6 - validation on the updated in-memory data
    print("\n🔍 STEP 6: Final Validation...\n--------------------------------------------------")
    final_ok = validate_tds_totals(masters_updated, challans)
    final_label = "PASSED" if final_ok else "FAILED"
    print("\n✅ Final Validation: " + final_label)

    summary = {
        'updated_masters_path': masters_updated['file_path'],
        'output_path': destination,
        'pre_validation': pre_ok,
        'final_validation': final_ok,
    }
    return summary

# Load confirmation for this cell: a headline followed by the list of fixes
# the main processing functions carry. Emitted as one print call, one message
# per line.
print(
    "✅ Main processing function loaded - FIXED WITH TDS PRESERVATION!",
    "   ✓ Rate formatting fixed to show 2 decimal places (e.g., 0.10%)",
    "   ✓ Totals row preserved by protecting it during cleanup",
    "   ✓ Overflow rows cleared after totals in both sheets",
    "   ✓ Dynamic row insertion for DEDUCTEE BREAK-UP and CHALLAN DETAILS",
    "   ✓ Proper sum formulas adjusted for all rows",
    "   ✓ Parameter name pdf_folder_path for consistency",
    "   ✓ Validation uses in-memory data",
    sep="\n",
)

# Example invocation (PDF folder, masters workbook, template workbook):
# results = process_tds_returns('/content/tds_processing/pdfs', '/content/tds_processing/TDS_Masters.xlsx', '/content/tds_processing/TDS_Template.xlsx')

✅ Main processing function loaded - FIXED WITH TDS PRESERVATION!
   ✓ Rate formatting fixed to show 2 decimal places (e.g., 0.10%)
   ✓ Totals row preserved by protecting it during cleanup
   ✓ Overflow rows cleared after totals in both sheets
   ✓ Dynamic row insertion for DEDUCTEE BREAK-UP and CHALLAN DETAILS
   ✓ Proper sum formulas adjusted for all rows
   ✓ Parameter name pdf_folder_path for consistency
   ✓ Validation uses in-memory data


In [47]:
# Cell 5: Functions to generate output file - COMPLETE FIX
# This cell handles creating the final TDS return file with proper data mapping

import math  # Import math module for ceiling function

def generate_output_file(tds_masters_data, challan_data_list, template_path, output_path=None):
    """
    Generate output file from TDS Masters data and challan information.

    Loads the template workbook, fills its CHALLAN DETAILS and DEDUCTEE
    BREAK-UP sheets, and saves the result.

    Args:
        tds_masters_data: masters dict passed on to the DEDUCTEE BREAK-UP writer.
        challan_data_list: list of challan dicts (already deduplicated by the caller).
        template_path: path of the template workbook; it must contain the
            sheets 'DEDUCTOR DETAILS', 'CHALLAN DETAILS' and 'DEDUCTEE BREAK-UP'.
        output_path: where to save the workbook. When omitted, the
            TDS_<Month>_<Year>.xlsx name from get_output_filename_from_masters
            is used, relative to the current working directory.

    Returns:
        The path the workbook was saved to, or None when anything failed
        (the error and its traceback are printed).
    """
    try:
        if output_path is None:
            # Saving to None can never succeed, so derive the documented
            # month/year filename instead of failing after all the work is done.
            output_path = get_output_filename_from_masters(tds_masters_data)

        # Load the template
        wb = load_workbook(template_path)
        try:
            # Look up all three expected sheets so a malformed template fails
            # here, before anything is written. DEDUCTOR DETAILS is not modified.
            sheets = {
                name: wb[name]
                for name in ('DEDUCTOR DETAILS', 'CHALLAN DETAILS', 'DEDUCTEE BREAK-UP')
            }

            print("✅ Loaded template")

            # Update CHALLAN DETAILS sheet with deduplicated challans
            print("\n📝 Updating CHALLAN DETAILS...")
            print(f"   Processing {len(challan_data_list)} unique challans...")
            update_challan_details_proper(sheets['CHALLAN DETAILS'], challan_data_list)

            # Update DEDUCTEE BREAK-UP sheet
            print("\n📝 Updating DEDUCTEE BREAK-UP...")
            update_deductee_breakup_sheet_dynamic(sheets['DEDUCTEE BREAK-UP'], tds_masters_data, challan_data_list)

            # Save the file
            wb.save(output_path)
        finally:
            wb.close()

        print(f"\n✅ Generated output file: {output_path}")
        return output_path

    except Exception as e:
        print(f"❌ Error generating output file: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

def find_totals_row(ws, start_row=4, end_row=None):
    """
    Locate the totals row of a sheet.

    Rows start_row..end_row (default: the sheet's last row) are scanned top
    down. A row qualifies when any of its first three cells contains the text
    "total" (case-insensitive), or when its column C holds a string starting
    with "=SUM".

    Returns:
        The 1-based row number of the first qualifying row, or None.
    """
    last_row = ws.max_row if end_row is None else end_row

    row = start_row
    while row <= last_row:
        # Label check on columns A-C
        labels = (ws.cell(row=row, column=col).value for col in (1, 2, 3))
        if any(label and 'total' in str(label).lower() for label in labels):
            return row

        # Formula check on column C
        candidate = ws.cell(row=row, column=3).value
        if isinstance(candidate, str) and candidate.startswith('=SUM'):
            return row

        row += 1

    return None

def _ceil_rupees(raw):
    """Round a challan amount up to the next whole rupee; empty or zero values give 0."""
    return math.ceil(float(raw)) if raw else 0


def update_challan_details_proper(ws, challan_data_list):
    """
    Update the CHALLAN DETAILS sheet with proper TOTAL row handling.

    Headers occupy rows 1-3 and data starts at row 4. The existing TOTAL row
    (located with find_totals_row) is kept: rows are inserted or deleted just
    above it so that exactly one data row exists per challan. When no TOTAL
    row exists, one is created directly below the data. Amounts are rounded up
    to whole rupees; BSR code and challan number are written as given so that
    leading zeros in strings survive.

    Args:
        ws: worksheet to update (modified in place).
        challan_data_list: list of challan dicts. The keys read are
            'nature_of_payment', 'tax_amount', 'surcharge', 'cess', 'interest',
            'penalty', 'mode_of_payment', 'bsr_code', 'tender_date' and
            'challan_no'.

    Returns:
        None. Progress is reported on stdout.
    """
    # CRITICAL: Headers are in rows 1-3, data starts at row 4
    DATA_START_ROW = 4

    # Step 1: Find the current TOTAL row
    total_row = find_totals_row(ws, DATA_START_ROW)

    if total_row:
        print(f"   Found TOTAL row at row {total_row}")
        current_data_rows = total_row - DATA_START_ROW
        print(f"   Template has space for {current_data_rows} data rows")
    else:
        print("   No TOTAL row found - will add at end")
        current_data_rows = 0

    # Step 2: Calculate how many rows we need (based on unique challans)
    needed_rows = len(challan_data_list)
    print(f"   Need {needed_rows} rows for unique challans")

    # Step 3: Adjust rows if needed
    if total_row and needed_rows > current_data_rows:
        # Insert the missing rows directly above the TOTAL row
        rows_to_insert = needed_rows - current_data_rows
        print(f"   Inserting {rows_to_insert} rows before TOTAL row")
        ws.insert_rows(total_row, rows_to_insert)
        total_row += rows_to_insert

    elif total_row and needed_rows < current_data_rows:
        # Remove the surplus rows directly above the TOTAL row in one call
        rows_to_delete = current_data_rows - needed_rows
        print(f"   Deleting {rows_to_delete} extra rows")
        ws.delete_rows(total_row - rows_to_delete, rows_to_delete)
        total_row -= rows_to_delete

    # Step 4: Clear ONLY data cells (not the total row)
    print(f"   Clearing data from row {DATA_START_ROW} to row {DATA_START_ROW + needed_rows - 1}")
    end_clear_row = DATA_START_ROW + needed_rows
    if total_row:
        # Make sure we don't clear the total row
        end_clear_row = min(end_clear_row, total_row)

    for row in range(DATA_START_ROW, end_clear_row):
        for col in range(1, 14):  # Columns A to M
            ws.cell(row=row, column=col).value = None

    # Step 5: Write the data (only unique challans)
    amount_fields = ('tax_amount', 'surcharge', 'cess', 'interest', 'penalty')
    for idx, challan in enumerate(challan_data_list, start=1):
        current_row = DATA_START_ROW + idx - 1

        ws.cell(row=current_row, column=1).value = idx  # (401) SR NO

        # (402) SECTION CODE - a space is inserted after the first two
        # characters when the code has none ("94C" -> "94 C")
        section_code = challan.get('nature_of_payment', '')
        if len(section_code) >= 3 and ' ' not in section_code:
            section_code = section_code[:2] + ' ' + section_code[2:]
        ws.cell(row=current_row, column=2).value = section_code

        # (403)-(407) TDS, surcharge, education cess, interest, others:
        # columns C-G, each rounded UP to the nearest rupee
        for col, field in enumerate(amount_fields, start=3):
            ws.cell(row=current_row, column=col).value = _ceil_rupees(challan.get(field))

        # (408) TOTAL TAX DEPOSITED - Formula for column H
        ws.cell(row=current_row, column=8).value = f'=SUM(C{current_row}:G{current_row})'

        # (409) CHEQUE/DD NO - mode of payment as delivered by the extraction step
        ws.cell(row=current_row, column=9).value = challan.get('mode_of_payment', '')

        # (410) BSR CODE - written as given, so string codes keep leading zeros
        ws.cell(row=current_row, column=10).value = challan.get('bsr_code', '')

        # (411) DATE ON WHICH TAX DEPOSITED - parse and format date
        date_str = challan.get('tender_date', '')
        if date_str:
            date_cell = ws.cell(row=current_row, column=11)
            try:
                date_cell.value = datetime.strptime(date_str, '%d/%m/%Y')
                date_cell.number_format = 'DD/MM/YYYY'
            except (ValueError, TypeError):
                # Not a DD/MM/YYYY string: keep whatever the challan carried.
                date_cell.value = date_str

        # (412) CHALLAN SERIAL NO - written as given, so strings keep leading zeros
        ws.cell(row=current_row, column=12).value = challan.get('challan_no', '')

        ws.cell(row=current_row, column=13).value = 'NO'  # (413) WHETHER TDS DEPOSITED BY BOOK ENTRY

    # Step 6: Create or Update TOTAL row
    if not total_row:
        # Create new total row if it doesn't exist
        total_row = DATA_START_ROW + needed_rows

    print(f"   Setting up TOTAL row at row {total_row}")

    # Ensure TOTAL text is in column A
    ws.cell(row=total_row, column=1).value = "TOTAL"
    ws.cell(row=total_row, column=1).font = Font(bold=True)

    # Update formulas to cover the correct range
    last_data_row = DATA_START_ROW + needed_rows - 1

    for col in range(3, 9):  # Columns C, D, E, F, G, H
        total_cell = ws.cell(row=total_row, column=col)
        if needed_rows:
            col_letter = get_column_letter(col)
            total_cell.value = f'=SUM({col_letter}{DATA_START_ROW}:{col_letter}{last_data_row})'
        else:
            # No data rows: the TOTAL row sits on DATA_START_ROW itself, so a
            # SUM over DATA_START_ROW..last_data_row would be a reversed range
            # that includes this very cell. Write a plain zero instead.
            total_cell.value = 0
        total_cell.font = Font(bold=True)

    print(f"✅ Successfully updated CHALLAN DETAILS with {needed_rows} unique challans")
    print(f"   TOTAL row is at row {total_row}")

def _is_blank(value):
    """Return True for None, NaN/NaT and empty or whitespace-only strings."""
    if value is None:
        return True
    if isinstance(value, str):
        return not value.strip()
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna() on a non-scalar has no single truth value; treat it as present
        return False


def _cell_text(value):
    """Render a value as cell text: '' when blank, 240020.0 -> '240020'."""
    if _is_blank(value):
        return ''
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


def _round_up_rupees(value):
    """Parse an amount and round it UP to whole rupees; blank/unparseable -> 0."""
    import math  # local import so the rounding never depends on another cell

    if _is_blank(value):
        return 0
    if isinstance(value, str):
        value = value.replace(',', '')
    try:
        return math.ceil(float(value))
    except (TypeError, ValueError, OverflowError):
        return 0


def _deductee_code(party, col_code_415, col_pan):
    """Return the (415) deductee code, derived from the PAN when the code is blank."""
    code = _cell_text(party.get(col_code_415)) if col_code_415 else ''
    if code:
        # Codes are two digits ('01' / '02'); restore a dropped leading zero
        return code.zfill(2) if code.isdigit() else code
    pan = str(party.get(col_pan, '')).strip() if col_pan else ''
    if len(pan) >= 4:
        # 4th PAN character 'P' is mapped to '01', anything else to '02'
        return '01' if pan[3].upper() == 'P' else '02'
    return '01'


def _format_tds_rate(rate_value, amount, tds):
    """Format the (427) rate as '<number>%', deriving it from tds/amount if blank."""
    if not _is_blank(rate_value):
        raw = str(rate_value).strip()
        already_percent = '%' in raw
        text = raw.replace('%', '').strip()
        try:
            rate = float(text)
        except ValueError:
            # Not numeric: keep the text as given, with % appended
            return f'{text}%'
        # A bare fraction such as 0.1 means 10%; a value written with an
        # explicit % sign (e.g. "0.5%") is already a percentage.
        if not already_percent and rate < 1:
            rate *= 100
        # round() removes float noise (0.1 * 100 -> 10.000000000000002) while
        # 'g' keeps genuine fractions such as 7.5 or 0.1
        return f'{round(rate, 4):g}%'
    if amount > 0 and tds > 0:
        return f'{(tds / amount) * 100:.0f}%'
    return '0%'


def update_deductee_breakup_sheet_dynamic(ws, tds_masters_data, challan_data_list):
    """
    Update the DEDUCTEE BREAK-UP sheet with dynamic row management.

    Data rows start at row 4. An existing totals row (column A containing
    "total", or a SUM formula in column G) is located, and rows are inserted or
    deleted above it so the sheet holds exactly one row per party whose
    payment-type column (415A) is filled in. Columns A-V of every data row are
    rewritten and the totals-row formulas are re-pointed at the new range.

    Args:
        ws: worksheet to update; used through cell(), max_row, insert_rows()
            and delete_rows().
        tds_masters_data: dict holding 'tds_parties' (DataFrame of parties)
            and, optionally, 'code_to_column_name' mapping form codes such as
            '(415A)' to column names of that DataFrame.
        challan_data_list: list of challan dicts; 'nature_of_payment',
            'bsr_code', 'challan_no' and 'tender_date' are read to fill
            columns Q-S when the party row does not carry those values.

    Handling notes:
        * Blank cells (None / NaN / NaT / '') in the party row fall back to the
          challan lookup or the PAN-based default instead of being written out
          as the text "nan".
        * Amounts and TDS are rounded UP to whole rupees; unparseable values
          become 0.
        * Rates keep their fractional part (7.5%, 0.1%).
        * With no parties at all the totals row receives 0 rather than a SUM
          over an inverted range.
    """
    # Get TDS parties data and column mappings
    tds_parties_df = tds_masters_data['tds_parties']
    code_to_column_name = tds_masters_data.get('code_to_column_name', {})

    # Find columns by their codes
    col_name = code_to_column_name.get('(417)')             # Name
    col_pan = code_to_column_name.get('(416)')              # PAN
    col_type_payment = code_to_column_name.get('(415A)')    # Type of Payment
    col_code_415 = code_to_column_name.get('(415)')         # Individual/Company Code
    col_date_payment = code_to_column_name.get('(418)')     # Date of Payment
    col_amount = code_to_column_name.get('(419)')           # Amount Paid
    col_tds = code_to_column_name.get('(421)')              # TDS Amount
    col_tds_rate = code_to_column_name.get('(427)')         # TDS Deduction Rates
    col_bsr = code_to_column_name.get('(425D)')             # BSR Code
    col_challan_no = code_to_column_name.get('(425E)')      # Challan No
    col_date_deposited = code_to_column_name.get('(425F)')  # Date deposited

    # Debug: Print column mappings
    print(f"   Column Mappings Found:")
    print(f"   - Name (417): {col_name}")
    print(f"   - PAN (416): {col_pan}")
    print(f"   - Type of Payment (415A): {col_type_payment}")
    print(f"   - Code (415): {col_code_415}")
    print(f"   - Date of Payment (418): {col_date_payment}")
    print(f"   - Amount Paid (419): {col_amount}")
    print(f"   - TDS (421): {col_tds}")
    print(f"   - TDS Rate (427): {col_tds_rate}")
    print(f"   - BSR Code (425D): {col_bsr}")
    print(f"   - Challan No (425E): {col_challan_no}")
    print(f"   - Date Deposited (425F): {col_date_deposited}")

    # Data starts after the header rows and the column-code row
    DATA_START_ROW = 4
    DATE_FORMAT = 'DD/MM/YYYY'

    # Only parties with a payment type (e.g. '94A', '94C') get a row
    if col_type_payment:
        payment_types = tds_parties_df[col_type_payment]
        valid_parties = tds_parties_df[
            payment_types.notna()
            & (payment_types.astype(str).str.strip() != '')
            & (payment_types.astype(str) != 'nan')
        ]
    else:
        valid_parties = tds_parties_df

    needed_rows = len(valid_parties)
    print(f"   Found {needed_rows} valid parties to process")

    # Find the totals row: "total" in column A or a SUM formula in column G
    total_row = None
    for row in range(DATA_START_ROW, ws.max_row + 1):
        cell_value = ws.cell(row=row, column=1).value
        if cell_value and 'total' in str(cell_value).lower():
            total_row = row
            break
        cell_g = ws.cell(row=row, column=7).value
        if isinstance(cell_g, str) and '=SUM' in cell_g:
            total_row = row
            break

    if total_row:
        print(f"   Found totals row at row {total_row}")

        # Resize the data area so it matches the number of parties exactly
        current_data_rows = total_row - DATA_START_ROW
        if needed_rows > current_data_rows:
            rows_to_insert = needed_rows - current_data_rows
            print(f"   Inserting {rows_to_insert} rows before TOTAL row")
            ws.insert_rows(total_row, rows_to_insert)
            total_row += rows_to_insert
        elif needed_rows < current_data_rows:
            rows_to_delete = current_data_rows - needed_rows
            print(f"   Deleting {rows_to_delete} excess rows")
            # Drop the surplus rows sitting directly above the totals row
            ws.delete_rows(total_row - rows_to_delete, rows_to_delete)
            total_row -= rows_to_delete

    # Clear existing data (but NOT the totals row)
    end_clear_row = DATA_START_ROW + needed_rows
    if total_row:
        end_clear_row = min(end_clear_row, total_row)
    for row in range(DATA_START_ROW, end_clear_row):
        for col in range(1, 23):  # columns A to V
            ws.cell(row=row, column=col).value = None

    # Challan lookup keyed by nature of payment with spaces removed ('94 A' -> '94A');
    # when several challans share a code, the last one in the list wins
    challan_lookup = {}
    for challan in challan_data_list:
        nop = str(challan.get('nature_of_payment') or '').replace(' ', '')
        challan_lookup[nop] = {
            'bsr_code': challan.get('bsr_code', ''),
            'challan_no': challan.get('challan_no', ''),
            'date_deposited': challan.get('tender_date', ''),
        }

    def write(row, column, value, number_format=None):
        """Set one cell's value and, optionally, its number format."""
        cell = ws.cell(row=row, column=column)
        cell.value = value
        if number_format:
            cell.number_format = number_format

    # Add party data, one worksheet row per valid party
    for sr_no, (_, party) in enumerate(valid_parties.iterrows(), start=1):
        row = DATA_START_ROW + sr_no - 1

        # (414) Sr.No
        write(row, 1, sr_no)

        # (415) Deductee Code
        write(row, 2, _deductee_code(party, col_code_415, col_pan))

        # (415A) Section Under Payment Made - written with a space ("94A" -> "94 A")
        payment_type = str(party.get(col_type_payment, '')) if col_type_payment else ''
        payment_type_clean = payment_type.replace(' ', '')  # key for challan lookup
        if payment_type and payment_type not in ['nan', 'None', '']:
            payment_type = payment_type.strip()
            if len(payment_type) >= 3 and payment_type[:2].isdigit() and payment_type[2].isalpha():
                formatted_payment = payment_type[:2] + ' ' + payment_type[2:]
            else:
                formatted_payment = payment_type
            write(row, 3, formatted_payment)

        # (416) PAN of Deductee
        pan_value = party.get(col_pan, '') if col_pan else ''
        write(row, 4, '' if _is_blank(pan_value) else pan_value)

        # (417) Name of Deductee
        name_value = party.get(col_name, '') if col_name else ''
        write(row, 5, '' if _is_blank(name_value) else name_value)

        # (418) Date of Payment/credit
        payment_date = party.get(col_date_payment) if col_date_payment else None
        if not _is_blank(payment_date):
            write(row, 6, payment_date, DATE_FORMAT)

        # (419) Amount Paid/Credited Rs. - Round UP
        amount = _round_up_rupees(party.get(col_amount)) if col_amount else 0
        write(row, 7, amount)

        # (420) Paid by Book Entry or otherwise - column H intentionally left blank

        # (421) TDS Rs. - Round UP
        tds = _round_up_rupees(party.get(col_tds)) if col_tds else 0
        write(row, 9, tds)

        write(row, 10, 0)                          # (422) Surcharge Rs.
        write(row, 11, 0)                          # (423) Educational Cess Rs.
        write(row, 12, f'=I{row}+J{row}+K{row}')   # (424) Total tax deducted
        write(row, 13, f'=L{row}')                 # (425) Total tax deposited
        write(row, 14, 0)                          # (425A) Interest
        write(row, 15, 0)                          # (425B) Others
        write(row, 16, f'=M{row}+N{row}+O{row}')   # (425C) Total (425+Interest+Others)

        # (425D, 425E, 425F) - prefer the party row, else the matching challan
        challan_info = challan_lookup.get(payment_type_clean, {})

        bsr_text = _cell_text(party.get(col_bsr)) if col_bsr else ''
        if not bsr_text:
            bsr_text = _cell_text(challan_info.get('bsr_code', ''))
        challan_no_text = _cell_text(party.get(col_challan_no)) if col_challan_no else ''
        if not challan_no_text:
            challan_no_text = _cell_text(challan_info.get('challan_no', ''))
        date_dep_value = party.get(col_date_deposited) if col_date_deposited else None
        if _is_blank(date_dep_value):
            date_dep_value = challan_info.get('date_deposited', '')

        # Written as strings so leading zeros survive
        write(row, 17, bsr_text)
        write(row, 18, challan_no_text)

        if not _is_blank(date_dep_value):
            if isinstance(date_dep_value, str):
                try:
                    parsed = datetime.strptime(date_dep_value.strip(), '%d/%m/%Y')
                except ValueError:
                    # Not DD/MM/YYYY: keep the text exactly as supplied
                    write(row, 19, date_dep_value)
                else:
                    write(row, 19, parsed, DATE_FORMAT)
            else:
                write(row, 19, date_dep_value, DATE_FORMAT)

        # (426) Date of deduction - same as payment date
        if not _is_blank(payment_date):
            write(row, 20, payment_date, DATE_FORMAT)

        # (427) Rate at which deducted
        rate_value = party.get(col_tds_rate) if col_tds_rate else None
        write(row, 21, _format_tds_rate(rate_value, amount, tds))

        # (428) Reason for non-deduction/lower deduction
        write(row, 22, 'N.A')

    # Create or update the totals row
    if not total_row:
        total_row = DATA_START_ROW + needed_rows

    print(f"   Setting up totals row at row {total_row}")

    last_data_row = DATA_START_ROW + needed_rows - 1

    # Ensure TOTAL text is in column A
    ws.cell(row=total_row, column=1).value = "TOTAL"
    ws.cell(row=total_row, column=1).font = Font(bold=True)

    # Sum formulas for numeric columns (only where totals exist in template)
    total_columns = [
        (7, 'G'),   # Amount
        (9, 'I'),   # TDS
        (10, 'J'),  # Surcharge
        (11, 'K'),  # Cess
        (12, 'L'),  # Total tax deducted
        (13, 'M'),  # Total tax deposited
        (14, 'N'),  # Interest
        (15, 'O'),  # Others
        (16, 'P'),  # Total
    ]

    for col, col_letter in total_columns:
        total_cell = ws.cell(row=total_row, column=col)
        if needed_rows > 0:
            total_cell.value = f'=SUM({col_letter}{DATA_START_ROW}:{col_letter}{last_data_row})'
        else:
            # No data rows: a SUM over rows 4..3 would include the totals row itself
            total_cell.value = 0
        total_cell.font = Font(bold=True)

    print(f"✅ Updated {needed_rows} parties in DEDUCTEE BREAK-UP")
    print(f"   Totals row is at row {total_row}")

# Confirmation banner shown once the output-generation functions are defined
for _banner_line in (
    "✅ Output file generation functions loaded - COMPLETE FIX!",
    "   ✓ TOTAL rows are preserved in both sheets",
    "   ✓ All columns properly mapped from TDS Masters",
    "   ✓ BSR Code, Challan No written as strings to preserve leading zeros",
    "   ✓ Rate formatting fixed - decimals converted to percentages (0.1 → 10%)",
    "   ✓ Dynamic row management preserves TOTAL formulas",
    "   ✓ Column H (Paid by Book Entry) left blank as requested",
    "   ✓ Proper date formatting and string conversion where appropriate",
):
    print(_banner_line)

✅ Output file generation functions loaded - COMPLETE FIX!
   ✓ TOTAL rows are preserved in both sheets
   ✓ All columns properly mapped from TDS Masters
   ✓ BSR Code, Challan No written as strings to preserve leading zeros
   ✓ Rate formatting fixed - decimals converted to percentages (0.1 → 10%)
   ✓ Dynamic row management preserves TOTAL formulas
   ✓ Column H (Paid by Book Entry) left blank as requested
   ✓ Proper date formatting and string conversion where appropriate


In [48]:
# Cell 6: Testing individual components and troubleshooting - FIXED VERSION
# Use these functions to test each step separately if you encounter issues

def test_pdf_extraction_single(pdf_path):
    """Test extraction from a single PDF file"""
    print(f"🧪 Testing PDF extraction for: {pdf_path}")
    print("-" * 50)

    data = extract_challan_data_from_pdf(pdf_path)

    print("\n📊 Extracted Data:")
    for key, value in data.items():
        if key != 'error':
            print(f"   {key}: {value}")

    return data

def test_pdf_deduplication(pdf_folder):
    """Test PDF extraction with deduplication"""
    print(f"🧪 Testing PDF extraction with deduplication for: {pdf_folder}")
    print("-" * 50)

    # Get all PDF files
    pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith('.pdf')]
    print(f"Total PDF files found: {len(pdf_files)}")

    # Extract without deduplication to show duplicates
    all_challans = []
    challan_numbers = []

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_folder, pdf_file)
        data = extract_challan_data_from_pdf(pdf_path)
        if data.get('challan_no'):
            all_challans.append(data)
            challan_numbers.append(data.get('challan_no'))

    # Count duplicates
    from collections import Counter
    challan_counts = Counter(challan_numbers)

    print(f"\n📊 Challan Analysis:")
    print(f"   Total PDFs processed: {len(pdf_files)}")
    print(f"   Total challans extracted: {len(all_challans)}")
    print(f"   Unique challan numbers: {len(set(challan_numbers))}")

    print("\n📊 Challan Number Frequency:")
    for challan_no, count in challan_counts.most_common():
        if count > 1:
            print(f"   Challan {challan_no}: appears {count} times (DUPLICATE)")
        else:
            print(f"   Challan {challan_no}: appears {count} time")

    # Now test with deduplication
    print("\n🧪 Testing with deduplication...")
    deduplicated_challans = extract_all_challans(pdf_folder)

    print(f"\n✅ After deduplication: {len(deduplicated_challans)} unique challans")

    return deduplicated_challans

def test_tds_masters_reading(file_path):
    """Test reading TDS Masters file"""
    print(f"🧪 Testing TDS Masters reading: {file_path}")
    print("-" * 50)

    data = read_tds_masters(file_path)

    if data:
        print("\n📊 Sheet Summary:")
        print(f"   TDS CODES: {len(data['tds_codes'])} entries")
        print(f"   TDS PARTIES: {len(data['tds_parties'])} parties")
        print(f"   Challan Details: {len(data['challan_details'])} challans")

        # Show column code mapping
        print("\n📊 Column Code Mapping Found:")
        code_to_column_name = data.get('code_to_column_name', {})
        for code, col_name in sorted(code_to_column_name.items())[:20]:  # Show first 20
            print(f"   {code} → {col_name}")

        # Check for critical columns
        critical_codes = ['(415)', '(415A)', '(416)', '(417)', '(418)', '(419)', '(421)', '(427)']
        missing_codes = [code for code in critical_codes if code not in code_to_column_name]

        if missing_codes:
            print("\n⚠️  Missing critical column codes:")
            for code in missing_codes:
                print(f"   - {code}")
        else:
            print("\n✅ All critical column codes found!")

        # Show sample party data
        print("\n📊 Sample TDS PARTIES data (first 3 rows):")
        print(data['tds_parties'].head(3))

    return data

def test_date_extraction(tds_masters_data):
    """Test date extraction for output filename"""
    print("🧪 Testing date extraction for output filename...")
    print("-" * 50)

    output_filename = get_output_filename_from_masters(tds_masters_data)
    print(f"\n📊 Generated output filename: {output_filename}")

    # Show the dates found
    code_to_column_name = tds_masters_data.get('code_to_column_name', {})
    date_col = code_to_column_name.get('(418)')  # Date of Payment/credit

    if date_col and date_col in tds_masters_data['tds_parties'].columns:
        dates = tds_masters_data['tds_parties'][date_col].dropna()
        if not dates.empty:
            print(f"\n📊 Payment dates found:")
            for i, date in enumerate(dates.head(5)):
                print(f"   Row {i}: {date}")

            first_date = pd.to_datetime(dates.iloc[0])
            print(f"\n📊 Using first date: {first_date.strftime('%d/%m/%Y')}")
            print(f"   Month: {first_date.strftime('%B')}")
            print(f"   Year: {first_date.strftime('%Y')}")

def create_test_summary(tds_masters_data, challan_data_list):
    """Print a pre-processing report comparing challan tax with party TDS.

    Three sections are printed: challan totals per nature of payment, party
    TDS totals per payment type, and a per-code comparison that is marked ✅
    when the two totals differ by at most 1 rupee.

    Args:
        tds_masters_data: dict with 'tds_parties' (DataFrame) and
            'code_to_column_name' (form code → column name). The (415A) and
            (421) columns are required; a warning is printed and the function
            returns early when either is unmapped.
        challan_data_list: list of challan dicts; 'nature_of_payment',
            'tax_amount' and 'challan_no' are read.

    Notes:
        * Amounts are rounded UP to whole rupees. Blank, None or unparseable
          amounts count as 0 instead of raising; comma-grouped strings such as
          "1,234" are accepted.
        * The comparison ignores spaces in section codes on BOTH sides, so a
          party row typed as "94 A" is matched against challan code "94A".
    """
    import math  # local import: rounding must not depend on another cell's imports

    def _round_up(raw):
        """Parse raw as an amount and round it UP; anything unusable counts as 0."""
        if isinstance(raw, str):
            raw = raw.replace(',', '').strip()
        try:
            number = float(raw)
            return math.ceil(number) if number else 0
        except (TypeError, ValueError, OverflowError):
            # None, '', NaN, non-numeric text, infinities
            return 0

    print("\n📊 PRE-PROCESSING SUMMARY REPORT")
    print("=" * 60)

    # Get column mappings
    code_to_column_name = tds_masters_data.get('code_to_column_name', {})

    # Find required columns by codes
    col_type_payment = code_to_column_name.get('(415A)', None)
    col_tds = code_to_column_name.get('(421)', None)

    if not col_type_payment or not col_tds:
        print("⚠️  Warning: Could not find required columns by codes")
        print(f"   Payment Type (415A): {col_type_payment}")
        print(f"   TDS Amount (421): {col_tds}")
        return

    # Challan summary (now with unique challans only)
    print("\n1️⃣ UNIQUE CHALLAN SUMMARY:")
    print("-" * 40)
    challan_summary = {}
    total_tax = 0

    for challan in challan_data_list:
        # Fall back when the key is missing OR present but empty/None
        nop = str(challan.get('nature_of_payment') or 'Unknown')
        tax = _round_up(challan.get('tax_amount', 0))

        entry = challan_summary.setdefault(
            nop, {'count': 0, 'total': 0, 'challan_numbers': []}
        )
        entry['count'] += 1
        entry['total'] += tax
        # str() so the join below also works for numeric challan numbers
        entry['challan_numbers'].append(str(challan.get('challan_no') or 'N/A'))
        total_tax += tax

    for nop, data in sorted(challan_summary.items()):
        print(f"   {nop}: {data['count']} unique challan(s), Total: ₹{data['total']:,.0f}")
        print(f"      Challan Numbers: {', '.join(data['challan_numbers'])}")
    print(f"   TOTAL TAX: ₹{total_tax:,.0f}")

    # Party summary
    print("\n2️⃣ PARTY SUMMARY:")
    print("-" * 40)
    tds_parties = tds_masters_data['tds_parties']
    party_summary = {}         # keyed by the payment type as written (for display)
    party_total_by_code = {}   # keyed by the space-free code (for matching)

    for _, party in tds_parties.iterrows():
        payment_type = str(party.get(col_type_payment, '')).strip()
        if not payment_type or payment_type == 'nan':
            continue
        tds = _round_up(party.get(col_tds))

        entry = party_summary.setdefault(payment_type, {'count': 0, 'total': 0})
        entry['count'] += 1
        entry['total'] += tds

        code = payment_type.replace(' ', '')
        party_total_by_code[code] = party_total_by_code.get(code, 0) + tds

    for payment_type, data in sorted(party_summary.items()):
        print(f"   {payment_type}: {data['count']} parties, Total TDS: ₹{data['total']:,.0f}")

    # Matching preview
    print("\n3️⃣ MATCHING PREVIEW:")
    print("-" * 40)
    for nop, data in challan_summary.items():
        challan_total = data['total']
        party_total = party_total_by_code.get(nop.replace(' ', ''), 0)

        # A 1-rupee gap is tolerated because both sides are rounded up
        status = "✅" if abs(challan_total - party_total) <= 1 else "❌"

        print(f"   {nop}: Challan ₹{challan_total:,.0f} vs Party ₹{party_total:,.0f} {status}")

    print("=" * 60)

# Troubleshooting tips - UPDATED
# Printed when the cell runs; each entry is emitted on its own line, and a
# leading "\n" in an entry produces the blank line that separates sections.
_TROUBLESHOOTING_LINES = (
    "🔧 TROUBLESHOOTING TIPS - FIXED VERSION",
    "=" * 60,
    "\n1. PDF Extraction Issues:",
    "   - Ensure PDFs are text-based (not scanned images)",
    "   - Check if all PDFs are in ITNS 281 format",
    "   - Try testing a single PDF first",
    "   - BSR codes and challan numbers are now preserved as strings ✅",

    "\n2. Deduplication Issues:",
    "   - Check if duplicate PDFs have same challan numbers",
    "   - Use test_pdf_deduplication() to see duplicate analysis",
    "   - Ensure challan numbers are being extracted correctly",
    "   - Look for file names with (1), (2) suffixes - these are duplicates",

    "\n3. Excel File Issues:",
    "   - Ensure Excel files are not password protected",
    "   - Close Excel files before running the script",
    "   - Check if sheet names match exactly",
    "   - Verify column codes are in row 2 (can be (code) or -code format)",

    "\n4. Column Code Issues FIXED:",
    "   - ✅ Now handles both (415A) and -415A formats",
    "   - ✅ Falls back to column name matching if codes not found",
    "   - ✅ Maps all critical columns including those with - prefix",
    "   - Codes are normalized to (code) format internally",

    "\n5. NaN Issue FIXED:",
    "   - ✅ Uses openpyxl to read values directly, avoiding pandas type inference",
    "   - ✅ TDS values no longer become NaN after save/reload",
    "   - ✅ All numeric values preserved correctly throughout processing",

    "\n6. Date Format Issues FIXED:",
    "   - ✅ Dates are parsed from strings to datetime objects",
    "   - ✅ Excel date format is set explicitly to DD/MM/YYYY",
    "   - ✅ Prevents data corruption during save/reload",

    "\n7. Rate Formatting FIXED:",
    "   - ✅ Decimal rates (0.1) are multiplied by 100 to get percentage (10%)",
    "   - ✅ Rates are formatted consistently with % symbol",
    "   - ✅ Handles both decimal and percentage input formats",

    "\n8. Leading Zeros FIXED:",
    "   - ✅ BSR codes padded to 7 digits (e.g., 240020 → 0240020)",
    "   - ✅ Challan numbers preserved as strings (e.g., 03636 stays 03636)",
    "   - ✅ Written as strings to Excel to prevent conversion to numbers",

    "\n9. Validation Issues:",
    "   - Check if Nature of Payment codes match (94A vs 94 A)",
    "   - Verify TDS amounts in Masters match challan amounts",
    "   - Look for rounding differences (we round UP)",
    "   - Final validation should now pass with fixed data preservation ✅",

    "\n10. Common Error Messages:",
    "   - 'Permission denied': Close the Excel file",
    "   - 'Sheet not found': Check sheet names in Excel",
    "   - 'No PDF files found': Check PDF folder path",
    "   - 'KeyError': Column code not found (now has fallbacks)",

    "\n11. Expected File Names (UPDATED):",
    "   - Masters: Should start with 'TDS_Masters'",
    "   - Template: Should start with 'TDS_Template'",
    "   - Output: Will be 'TDS_[Month]_[Year].xlsx'",
    "=" * 60,

    "\n💡 Test individual components using:",
    "   test_pdf_extraction_single('path/to/single.pdf')",
    "   test_pdf_deduplication('path/to/pdf/folder')",
    "   test_tds_masters_reading('path/to/TDS_Masters.xlsx')",
    "   test_date_extraction(tds_masters_data)",
    "   create_test_summary(tds_masters_data, challan_data_list)",

    "\n🎯 KEY FIXES IMPLEMENTED:",
    "   ✅ NaN issue resolved with openpyxl direct reading",
    "   ✅ Leading zeros preserved for BSR codes and challan numbers",
    "   ✅ Rate formatting fixed (0.1 → 10%)",
    "   ✅ Column codes with - prefix properly handled",
    "   ✅ Fallback to column name matching for all fields",
    "   ✅ Dates parsed and formatted correctly",
    "   ✅ Mode of payment converted to uppercase",
    "   ✅ Data integrity preserved throughout processing",
)

for _tip_line in _TROUBLESHOOTING_LINES:
    print(_tip_line)

🔧 TROUBLESHOOTING TIPS - FIXED VERSION

1. PDF Extraction Issues:
   - Ensure PDFs are text-based (not scanned images)
   - Check if all PDFs are in ITNS 281 format
   - Try testing a single PDF first
   - BSR codes and challan numbers are now preserved as strings ✅

2. Deduplication Issues:
   - Check if duplicate PDFs have same challan numbers
   - Use test_pdf_deduplication() to see duplicate analysis
   - Ensure challan numbers are being extracted correctly
   - Look for file names with (1), (2) suffixes - these are duplicates

3. Excel File Issues:
   - Ensure Excel files are not password protected
   - Close Excel files before running the script
   - Check if sheet names match exactly
   - Verify column codes are in row 2 (can be (code) or -code format)

4. Column Code Issues FIXED:
   - ✅ Now handles both (415A) and -415A formats
   - ✅ Falls back to column name matching if codes not found
   - ✅ Maps all critical columns including those with - prefix
   - Codes are normalized t

In [49]:
# Cell 7: Interactive file upload and processing
import os
import zipfile
import shutil
import time  # Added for cleanup countdown
import gc     # Added for memory cleanup
from google.colab import files
import pandas as pd
from datetime import datetime

# Working directories: everything lives under one base folder
BASE_DIR = '/content/tds_processing'
PDF_DIR, OUTPUT_FOLDER = (
    os.path.join(BASE_DIR, subfolder) for subfolder in ('pdfs', 'output')
)

# Create the folders up front (no error if they already exist)
for dir_path in (BASE_DIR, PDF_DIR, OUTPUT_FOLDER):
    os.makedirs(dir_path, exist_ok=True)

# Module-level registry of uploaded inputs: a list of PDF paths plus the
# paths of the masters and template workbooks (None until one is uploaded)
UPLOADED_FILES = dict(pdfs=[], tds_masters=None, template=None)

def clear_uploads():
    """Clear all uploaded files and reset directories.

    Deletes the masters/template workbooks currently tracked in
    UPLOADED_FILES, resets UPLOADED_FILES to its empty state, and empties
    PDF_DIR and OUTPUT_FOLDER. Both folders exist afterwards even if they had
    been removed beforehand.
    """
    global UPLOADED_FILES

    # The tracked workbooks live outside PDF_DIR/OUTPUT_FOLDER, so wiping those
    # two folders alone would leave them on disk after a "clear".
    for key in ('tds_masters', 'template'):
        stale_path = UPLOADED_FILES.get(key)
        if stale_path and os.path.isfile(stale_path):
            os.remove(stale_path)

    UPLOADED_FILES = {'pdfs': [], 'tds_masters': None, 'template': None}

    for dir_path in [PDF_DIR, OUTPUT_FOLDER]:
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
        # Recreate unconditionally so later uploads always have a target folder
        os.makedirs(dir_path, exist_ok=True)
    print("✅ All uploads cleared. Ready for new files.")

def upload_zip_file():
    """Upload and extract a zip file containing PDFs and Excel files.

    Each uploaded *.zip is unpacked into a private staging folder under
    BASE_DIR, and only that folder is scanned:
      * *.pdf files are moved to PDF_DIR and recorded in UPLOADED_FILES['pdfs']
        (each destination path is recorded once);
      * TDS_Masters*.xlsx / TDS_Template*.xlsx are moved to BASE_DIR and
        recorded under 'tds_masters' / 'template'.
    The staging folder, with anything left in it, is removed afterwards.
    Files already present in BASE_DIR, PDF_DIR or the output folder are never
    re-scanned, so they cannot be registered a second time.

    macOS archive artefacts (the __MACOSX folder and '._*' files) are
    ignored. An upload that is not a readable zip archive is reported and
    skipped.
    """
    import io
    import tempfile

    print("\n📦 Upload ZIP file containing PDFs and Excel files")
    uploaded = files.upload()
    for filename, data in uploaded.items():
        if not filename.lower().endswith('.zip'):
            continue

        os.makedirs(PDF_DIR, exist_ok=True)
        staging_dir = tempfile.mkdtemp(prefix='zip_', dir=BASE_DIR)
        try:
            try:
                with zipfile.ZipFile(io.BytesIO(data), 'r') as zip_ref:
                    zip_ref.extractall(staging_dir)
            except zipfile.BadZipFile:
                print(f"❌ Not a valid ZIP archive: {filename}")
                continue
            print(f"✅ Extracted: {filename}")

            # Sort the extracted files into their final locations
            for root, _, files_list in os.walk(staging_dir):
                if '__MACOSX' in root.split(os.sep):
                    continue
                for f in files_list:
                    if f.startswith('._'):
                        continue  # AppleDouble metadata, not a real document
                    file_path = os.path.join(root, f)
                    lowered = f.lower()
                    if lowered.endswith('.pdf'):
                        target = os.path.join(PDF_DIR, f)
                        shutil.move(file_path, target)
                        if target not in UPLOADED_FILES['pdfs']:
                            UPLOADED_FILES['pdfs'].append(target)
                    elif f.startswith('TDS_Masters') and lowered.endswith('.xlsx'):
                        target = os.path.join(BASE_DIR, f)
                        shutil.move(file_path, target)
                        UPLOADED_FILES['tds_masters'] = target
                    elif f.startswith('TDS_Template') and lowered.endswith('.xlsx'):
                        target = os.path.join(BASE_DIR, f)
                        shutil.move(file_path, target)
                        UPLOADED_FILES['template'] = target
        finally:
            # Runs on success, on `continue` and on errors alike
            shutil.rmtree(staging_dir, ignore_errors=True)

    print(f"\n✅ ZIP processing complete!")
    print(f"   PDFs: {len(UPLOADED_FILES['pdfs'])} files")
    print(f"   TDS Masters: {UPLOADED_FILES['tds_masters']}")
    print(f"   Template: {UPLOADED_FILES['template']}")

def process_uploaded_files():
    """Validate the uploads, run the TDS pipeline, download results and clean up.

    Steps:
      1. Abort early (with a message) if PDFs, the TDS Masters workbook or the
         template workbook are missing from ``UPLOADED_FILES``.
      2. Print a preview of the first few PDFs using
         ``extract_challan_data_from_pdf``.
      3. Call ``process_tds_returns`` on ``PDF_DIR`` with the uploaded
         masters/template paths.
      4. On success: print a summary, trigger browser downloads of the updated
         masters file and the output file, then wipe the temp folders via
         ``clear_uploads()`` after a 3-second countdown.
      5. Always finish by printing the "additional options" help text.

    Returns ``None``; all feedback is printed.
    """
    if not UPLOADED_FILES['pdfs']:
        print("❌ No PDF files uploaded. Please upload PDFs first.")
        return
    if not UPLOADED_FILES['tds_masters']:
        print("❌ No TDS Masters file uploaded. Please upload TDS_Masters*.xlsx")
        return
    if not UPLOADED_FILES['template']:
        print("❌ No Template file uploaded. Please upload TDS_Template*.xlsx")
        return

    print("\n✅ All files uploaded successfully!")
    print("\n⏳ Starting automatic processing...")
    print(f"\n============================================================\n📁 Files Ready for Processing:\n   PDFs uploaded: {len(UPLOADED_FILES['pdfs'])} files")

    # Only the first few PDFs are previewed to keep the cell output short.
    preview_limit = 4
    for pdf in UPLOADED_FILES['pdfs'][:preview_limit]:
        print(f"   - {os.path.basename(pdf)}")
        # Fall back to an empty dict so a failed extraction cannot crash the preview.
        challan_data = extract_challan_data_from_pdf(pdf) or {}
        print(f"✅ Extracted data from: {os.path.basename(pdf)}")
        print(f"   Nature of Payment: {challan_data.get('nature_of_payment', 'Not found')}")
        print(f"   Challan No: {challan_data.get('challan_no', 'Not found')}")
        print(f"   Tax Amount: ₹{challan_data.get('tax_amount', 'Not found')}")
        if challan_data.get('tax_amount'):
            print(f"   Surcharge: ₹{challan_data.get('surcharge', '0')}")
            print(f"   Cess: ₹{challan_data.get('cess', '0')}")
            print(f"   Total: ₹{challan_data.get('total_amount', 'Not found')}")
    not_previewed = len(UPLOADED_FILES['pdfs']) - preview_limit
    if not_previewed > 0:
        print(f"   ... and {not_previewed} more PDF file(s) not previewed")
    print(f"   TDS Masters: {os.path.basename(UPLOADED_FILES['tds_masters'])}")
    print(f"   Template: {os.path.basename(UPLOADED_FILES['template'])}")
    print("============================================================\n")

    print("🚀 Starting TDS processing...")
    print("   ✓ Using openpyxl to read values directly (NaN issue fixed)")
    print("   ✓ BSR codes padded to 7 digits with leading zeros")
    print("   ✓ Challan numbers preserved as strings")
    print("   ✓ Handling both (code) and -code formats")
    print("   ✓ Deduplicating challans by challan number")
    print("   ✓ Rounding amounts UP to nearest rupee")
    print("   ✓ Dates properly parsed and formatted")
    print("   ✓ Mode of payment converted to uppercase")
    print("   ✓ Rate formatting fixed (0.1 → 10%)")

    # Run the main process on PDF_DIR, the folder the upload helpers write the
    # PDFs into, rather than a separately hard-coded copy of that path.
    success = process_tds_returns(
        pdf_folder_path=PDF_DIR,
        masters_file_path=UPLOADED_FILES['tds_masters'],
        template_file_path=UPLOADED_FILES['template']
    )

    if success:
        print("\n============================================================\n📊 PROCESSING SUMMARY\n============================================================")
        # NOTE: this re-parses the PDFs purely to obtain the challan count.
        print(f"✅ Unique Challans Processed: {len(extract_all_challans(PDF_DIR))}")
        print(f"✅ TDS Masters Updated: {success['updated_masters_path']}")
        print(f"✅ Output File Generated: {success['output_path']}")
        # Show a cross instead of a tick when a validation step failed, so the
        # summary line does not contradict its own status.
        for label, key in (("Pre-Update Validation", 'pre_validation'),
                           ("Final Validation", 'final_validation')):
            passed = success[key]
            print(f"{'✅' if passed else '❌'} {label}: {'PASSED' if passed else 'FAILED'}")
        print("============================================================\n")

        print("✅ TDS Return Processing Complete!")
        print("\n📥 DOWNLOADING OUTPUT FILES...\n----------------------------------------")
        files.download(success['updated_masters_path'])
        print(f"✅ Downloaded: {os.path.basename(success['updated_masters_path'])}")
        files.download(success['output_path'])
        print(f"✅ Downloaded: {os.path.basename(success['output_path'])}")
        print("\n✅ All files processed and downloaded successfully!")
        print("📋 Check your browser's download folder for the output files.")

        # ===== SMART CLEANUP WITH VISUAL ENHANCEMENT =====
        print("\n" + "="*60)
        print("🧹 SMART CLEANUP")
        print("="*60)

        # Quick summary of what will be cleared
        pdf_count = len(UPLOADED_FILES['pdfs'])
        has_masters = "✓" if UPLOADED_FILES['tds_masters'] else "✗"
        has_template = "✓" if UPLOADED_FILES['template'] else "✗"
        output_count = len(os.listdir(OUTPUT_FOLDER)) if os.path.exists(OUTPUT_FOLDER) else 0

        print("📊 Clearing all temporary memory data in 3 seconds...")
        print(f"""
   Data to be cleared:
   • PDF Files: {pdf_count}
   • TDS Masters: {has_masters}
   • Template File: {has_template}
   • Output Files: {output_count}
   • Temp Folders: /pdfs, /output
   • Memory Cache: All dataframes
""")

        # Visual countdown with animated cleanup indicator; '\r' keeps the
        # countdown on a single, overwritten line.
        cleanup_frames = ["🧹", "🗑️", "♻️", "✨"]
        for i in range(3, 0, -1):
            frame = cleanup_frames[3-i]
            print(f"   {frame} Cleanup starting in... {i} ", end='\r')
            time.sleep(1)

        # Perform cleanup
        print("\n🔄 Executing cleanup...                    ")
        try:
            clear_uploads()
            gc.collect()  # Force garbage collection
            print("✅ All temporary data cleared!")
            print("✅ Ready for next batch processing.")
        except Exception as e:
            print(f"⚠️  Partial cleanup: {str(e)}")
            print("   You can manually run: clear_uploads()")

        print("="*60)
        # ===== END OF SMART CLEANUP =====
    else:
        # Say explicitly that the run failed instead of jumping straight to the
        # options menu; uploads are kept so the user can retry.
        print("\n❌ TDS processing did not complete. Review the messages above; uploaded files were kept.")

    print("\n============================================================\n📌 ADDITIONAL OPTIONS:\n")
    print("🔄 To try again: Run this cell again\n")
    print("📦 To upload as ZIP instead: Run this command:\n   upload_zip_file()\n")
    print("🗑️ To clear and start fresh: Run this command:\n   clear_uploads()\n   Then run this cell again\n")
    print("🧪 To test specific components:\n   test_tds_masters_reading('/content/tds_processing/[your_file].xlsx')\n   test_pdf_deduplication('/content/tds_processing/pdfs')\n============================================================\n")

def _save_matching_uploads(uploaded, dest_dir, is_expected):
    """Write every upload accepted by ``is_expected`` into ``dest_dir``.

    Args:
        uploaded: mapping of file name -> raw bytes, as returned by
            ``files.upload()``.
        dest_dir: folder the accepted files are written to (created if missing).
        is_expected: callable taking the file name and returning True when the
            file should be kept.

    Returns:
        List of saved file paths, in upload order. Rejected files are reported
        with a warning instead of being dropped silently.
    """
    os.makedirs(dest_dir, exist_ok=True)
    saved_paths = []
    for filename, data in uploaded.items():
        if not is_expected(filename):
            print(f"⚠️ Skipped (unexpected file name/type): {filename}")
            continue
        file_path = os.path.join(dest_dir, filename)
        with open(file_path, 'wb') as f:
            f.write(data)
        saved_paths.append(file_path)
        print(f"✅ Uploaded: {filename}")
    return saved_paths


def start_interactive_upload():
    """Run the three-step guided upload, then process the files.

    Clears any previous uploads, prompts in turn for the PDF challans, the
    ``TDS_Masters*.xlsx`` workbook and the ``TDS_Template*.xlsx`` workbook,
    records the saved paths in ``UPLOADED_FILES`` and finally calls
    ``process_uploaded_files()``. When several masters/template files are
    uploaded in one step, the last one is used.
    """
    clear_uploads()

    print("\n📋 STEP 1: Upload PDF Files\n----------------------------------------")
    print("Click 'Choose Files' and select your PDF challan files")
    pdf_paths = _save_matching_uploads(
        files.upload(), PDF_DIR,
        lambda name: name.lower().endswith('.pdf'))
    UPLOADED_FILES['pdfs'].extend(pdf_paths)
    print(f"\n✅ Uploaded {len(UPLOADED_FILES['pdfs'])} PDF files")

    print("\n📋 STEP 2: Upload TDS Masters File\n----------------------------------------")
    print("Click 'Choose Files' and select your 'TDS_Masters*.xlsx' file")
    masters_paths = _save_matching_uploads(
        files.upload(), BASE_DIR,
        lambda name: name.startswith('TDS_Masters') and name.endswith('.xlsx'))
    if masters_paths:
        UPLOADED_FILES['tds_masters'] = masters_paths[-1]

    print("\n📋 STEP 3: Upload Template File\n----------------------------------------")
    print("Click 'Choose Files' and select your 'TDS_Template*.xlsx' file")
    template_paths = _save_matching_uploads(
        files.upload(), BASE_DIR,
        lambda name: name.startswith('TDS_Template') and name.endswith('.xlsx'))
    if template_paths:
        UPLOADED_FILES['template'] = template_paths[-1]

    process_uploaded_files()

# Kick off the guided upload flow as soon as this cell is executed
start_interactive_upload()

# Once the flow returns, always list the manual fallback commands
print("\n".join((
    "\n📌 If upload fails, use these commands:",
    "   upload_zip_file()  # To upload a zip file",
    "   clear_uploads()    # To clear all uploads and start fresh",
)))

✅ All uploads cleared. Ready for new files.

📋 STEP 1: Upload PDF Files
----------------------------------------
Click 'Choose Files' and select your PDF challan files


Saving UFC Cont Jun 2025.pdf to UFC Cont Jun 2025 (6).pdf
Saving UFC Int Jun 2025.pdf to UFC Int Jun 2025 (6).pdf
Saving UFC Pur Jun 2025.pdf to UFC Pur Jun 2025 (6).pdf
Saving UFC Rent Jun 2025.pdf to UFC Rent Jun 2025 (6).pdf
✅ Uploaded: UFC Cont Jun 2025 (6).pdf
✅ Uploaded: UFC Int Jun 2025 (6).pdf
✅ Uploaded: UFC Pur Jun 2025 (6).pdf
✅ Uploaded: UFC Rent Jun 2025 (6).pdf

✅ Uploaded 4 PDF files

📋 STEP 2: Upload TDS Masters File
----------------------------------------
Click 'Choose Files' and select your 'TDS_Masters*.xlsx' file


Saving TDS_Masters_Elaborate.xlsx to TDS_Masters_Elaborate (6).xlsx
✅ Uploaded: TDS_Masters_Elaborate (6).xlsx

📋 STEP 3: Upload Template File
----------------------------------------
Click 'Choose Files' and select your 'TDS_Template*.xlsx' file


Saving TDS_Template.xlsx to TDS_Template (6).xlsx
✅ Uploaded: TDS_Template (6).xlsx

✅ All files uploaded successfully!

⏳ Starting automatic processing...

📁 Files Ready for Processing:
   PDFs uploaded: 4 files
   - UFC Cont Jun 2025 (6).pdf
✅ Extracted data from: UFC Cont Jun 2025 (6).pdf
   Nature of Payment: 94C
   Challan No: 06737
   Tax Amount: ₹3288
   Surcharge: ₹0
   Cess: ₹0
   Total: ₹3288
✅ Extracted data from: UFC Cont Jun 2025 (6).pdf
   Nature of Payment: 94C
   Challan No: 06737
   Tax Amount: ₹3288
   Surcharge: ₹0
   Cess: ₹0
   Total: ₹3288
   - UFC Int Jun 2025 (6).pdf
✅ Extracted data from: UFC Int Jun 2025 (6).pdf
   Nature of Payment: 94A
   Challan No: 06501
   Tax Amount: ₹3917
   Surcharge: ₹0
   Cess: ₹0
   Total: ₹3917
✅ Extracted data from: UFC Int Jun 2025 (6).pdf
   Nature of Payment: 94A
   Challan No: 06501
   Tax Amount: ₹3917
   Surcharge: ₹0
   Cess: ₹0
   Total: ₹3917
   - UFC Pur Jun 2025 (6).pdf
✅ Extracted data from: UFC Pur Jun 2025 (6).pdf
  

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Downloaded: TDS_Masters_Elaborate (6)_UPDATED.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Downloaded: TDS_June_2025.xlsx

✅ All files processed and downloaded successfully!
📋 Check your browser's download folder for the output files.

🧹 SMART CLEANUP
📊 Clearing all temporary memory data in 3 seconds...

   Data to be cleared:
   • PDF Files: 4
   • TDS Masters: ✓
   • Template File: ✓  
   • Output Files: 1
   • Temp Folders: /pdfs, /output
   • Memory Cache: All dataframes


🔄 Executing cleanup...                    
✅ All uploads cleared. Ready for new files.
✅ All temporary data cleared!
✅ Ready for next batch processing.

📌 ADDITIONAL OPTIONS:

🔄 To try again: Run this cell again

📦 To upload as ZIP instead: Run this command:
   upload_zip_file()

🗑️ To clear and start fresh: Run this command:
   clear_uploads()
   Then run this cell again

🧪 To test specific components:
   test_tds_masters_reading('/content/tds_processing/[your_file].xlsx')
   test_pdf_deduplication('/content/tds_processing/pdfs')


📌 If upload fails, use these commands:
   upload_zip_file()  # To up