In [None]:
import warnings
import requests
import re
import pdfplumber
import pandas as pd
import os
import io
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import logging
# Silence pdfminer's page-parsing noise: raise the "pdfminer.pdfpage" logger
# threshold to ERROR so lower-severity records from it are dropped.
logging.getLogger("pdfminer.pdfpage").setLevel(logging.ERROR)
# Also ignore Python warnings whose message starts with the CropBox notice.
warnings.filterwarnings("ignore", message="CropBox missing from /Page, defaulting to MediaBox")


In [None]:
# Build `df`: the approved-layout table for `year`, plus the PDF anchor tag
# that belongs to each row.
year = 2024
mainurl = f'https://www.cmdachennai.gov.in/ApprovedLayout/{year}.htm'

# Fetch the index page once and reuse the response for both the table parse
# and the link scrape, so both views come from the same document. The timeout
# keeps a stalled server from hanging the notebook indefinitely.
resp = requests.get(mainurl, timeout=60)
resp.raise_for_status()

# Hand pandas the raw bytes through a file-like object so the HTML parser
# does its own encoding detection, as it would when given the URL directly.
df = pd.read_html(io.BytesIO(resp.content))[0]
# Drop the last row and the last column of the scraped table (presumably a
# footer row and an unused trailing column -- confirm against the page).
# .copy() makes the column assignments below act on an independent frame
# rather than on a slice of the parsed table.
df = df.iloc[:-1, :-1].copy()

soup = BeautifulSoup(resp.text, 'html.parser')
# href=True skips anchors that have no href attribute; for those,
# `.get('href')` is None and calling `.endswith` on it raises AttributeError.
pdflist = [a for a in soup.find_all('a', href=True) if a['href'].endswith('.pdf')]

# Table rows and PDF links are paired purely by position, so their counts
# must agree before the tags are attached to the frame.
if len(pdflist) != len(df):
    raise ValueError(
        f"Found {len(pdflist)} PDF links but {len(df)} table rows on {mainurl}"
    )
df['bs4'] = pdflist

def extractpplfromurl(url):
    """Derive the layout identifier from a PDF URL.

    Takes the last ``/``-separated segment of ``url``, drops its final four
    characters (the ``.pdf`` suffix for the links this notebook collects) and
    turns every ``-`` into ``/``.
    """
    filename = url.rsplit('/', 1)[-1]
    stem = filename[:-4]
    return '/'.join(stem.split('-'))

# Absolute download URL for each PDF anchor tag.
df['url'] = df['bs4'].map(lambda tag: 'https://www.cmdachennai.gov.in/' + tag.get('href'))
# Layout identifier recovered from the file name at the end of each URL.
df['extracted_ppl'] = df['url'].map(extractpplfromurl)
# Relative save path "<last part>/<first part>.pdf" built from the identifier.
df['pdf_savepath'] = [
    parts[-1] + '/' + parts[0] + '.pdf'
    for parts in (ppl.split('/') for ppl in df['extracted_ppl'])
]


In [None]:
# Preview only the first rows; the full frame is long and carries raw tag objects.
df.head()

In [None]:
# Verify layout numbers match: the second table column must equal the
# identifier recovered from each PDF file name, otherwise rows and links were
# paired incorrectly. The derived column is addressed by name so the check
# does not depend on how many columns have been appended to `df`.
layout_mismatch = df.iloc[:, 1] != df['extracted_ppl']
assert not layout_mismatch.any(), (
    f"layout number mismatch in rows: {df.index[layout_mismatch].tolist()}"
)



In [None]:
# Quick check whether the first PDF of the configured year is already cached.
os.path.exists(os.path.join('data', str(year), '01.pdf'))

In [None]:
def extract_text_from_pdf(pdf_file, x_start_ratio, extent_offset=70):
    """Extract the text of the right-hand region of every page of a PDF.

    For each page the crop's left edge is placed ``extent_offset`` units to
    the left of the first word that starts with "ext" (case-insensitive).
    When no such word is found, the left edge falls back to
    ``page.width * x_start_ratio``. The crop always spans the full page height
    and runs to the right page edge.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts; callers in this
            notebook pass a file path or a ``BytesIO``.
        x_start_ratio: Fraction of the page width used as the left edge when
            the anchor word is missing.
        extent_offset: Distance subtracted from the anchor word's ``x0``.

    Returns:
        The text of all pages that yielded any, joined with newlines.
    """
    page_texts = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            width, height = page.width, page.height
            extent_x0 = _find_extent_x0(page)
            if extent_x0 is None:
                print("extent not found")
                bbox_start = width * x_start_ratio
            else:
                # Clamp at 0: an anchor word closer than `extent_offset` to the
                # left edge would otherwise give a negative x0, i.e. a bounding
                # box that lies partly outside the page.
                bbox_start = max(0, int(extent_x0 - extent_offset))
            crop = page.within_bbox((bbox_start, 0, width, height))
            text = crop.extract_text()
            if text:
                page_texts.append(text)
    return "\n".join(page_texts)


def _find_extent_x0(page):
    """Return ``x0`` of the first word starting with "ext", or None if absent."""
    try:
        words = page.extract_words()
    except Exception:
        # Best effort: a page whose words cannot be extracted is treated like
        # a page without the anchor word, so the ratio-based crop is used.
        return None
    for word in words:
        if word['text'].lower().startswith('ext'):
            return word['x0']
    return None


async def download_and_extract_pdf(session, df_row, x_start_ratio=0.4, retries=3, backoff_factor=1.0):
    """
    Return the extracted text of one PDF, downloading it first if necessary.

    If the file already exists at ``data/<pdf_savepath>`` it is parsed from
    disk. Otherwise it is downloaded from ``df_row['url']``, saved, and parsed
    from the downloaded bytes; a failed attempt is retried with exponential
    backoff (``backoff_factor * 2 ** (attempt - 1)`` seconds).

    Args:
        session: Object whose ``get(url, ssl=False)`` returns an async context
            manager yielding a response (an ``aiohttp.ClientSession`` here).
        df_row: Mapping with the keys ``'url'`` and ``'pdf_savepath'``.
        x_start_ratio: Forwarded to ``extract_text_from_pdf``.
        retries: Maximum number of download attempts.
        backoff_factor: Base delay in seconds between attempts.

    Returns:
        ``(text, saved_to_disk)``. On failure ``text`` is a string starting
        with ``"ERROR:"``; the function never raises for per-file problems and
        never returns ``None``.
    """
    pdf_url = df_row['url']
    savepath = os.path.join('data', df_row['pdf_savepath'])

    # Cached copy: parse it, but report a parse failure as an ERROR result
    # instead of raising, so one bad cached file cannot abort a whole batch.
    if os.path.exists(savepath):
        print(f"{savepath} already on disk, extracting text...")
        try:
            return extract_text_from_pdf(savepath, x_start_ratio), True
        except Exception as e:
            return f"ERROR: {e}", True

    saved_to_disk = False
    last_error = "no download attempts were made"
    for attempt in range(1, retries + 1):
        try:
            # NOTE(review): ssl=False turns off TLS certificate verification
            # for this request -- confirm the server still requires it.
            async with session.get(pdf_url, ssl=False) as response:
                response.raise_for_status()
                content = await response.read()

            # Write to a temporary name and rename into place, so an
            # interrupted write never leaves a truncated file that a later
            # run would mistake for a valid cached PDF.
            os.makedirs(os.path.dirname(os.path.abspath(savepath)), exist_ok=True)
            partial_path = savepath + '.part'
            with open(partial_path, 'wb') as fh:
                fh.write(content)
            os.replace(partial_path, savepath)
            saved_to_disk = True
            print(f"Saved PDF to disk: {savepath}")

            # Parse from memory rather than re-reading the file just written.
            all_text = extract_text_from_pdf(io.BytesIO(content), x_start_ratio)
            return all_text, saved_to_disk

        except Exception as e:
            last_error = e
            if attempt == retries:
                print(f"Attempt {attempt} failed; no more retries.")
                return f"ERROR: {e}", saved_to_disk
            wait_time = backoff_factor * (2 ** (attempt - 1))
            print(f"Attempt {attempt} failed with error: {e}. Retrying in {wait_time} seconds...")
            await asyncio.sleep(wait_time)

    # Only reached when `retries` < 1; still hand back the usual tuple shape.
    return f"ERROR: {last_error}", saved_to_disk


def extract_multiple_patterns(text: str) -> dict[str|None]:
    """
    Extract multiple patterns from PDF text
    
    Returns:
        Dictionary with pattern matches
    """
    # Define patterns with descriptive keys
    patterns = {
        'total_no_plots': [
            r'NO\.?\s*OF\.?\s*PLO(?:T|TS)[^\n]*\s*[=:]\s*(\d+)',
            r'No\.?\s*of\s*[Pp]lots\s*[=:]\s*(\d+)',
            r'TOTAL\s*PLOTS?\s*[=:]\s*(\d+)'
        ],
        'ews': [
            r'(?i)\bE\.?W\.?S\.?\s+(?:Provided|Plots)\b[^\n]*[:=]\s*(\d+)\s*Nos\.?\b'
        ],
        'regular':[
            r'(?i)REGULAR PLOTS.\s*[^\n]*?[:=]\s*(\d+)\s*No'
        ],
        'date': [
            r'DATE\s*[=:]\s*(\d{1,2}\s?[-\/]\s?\d{2}\s?[-\/]\s?\d{4})',
            r'Date\s*:\s*(\d{1,2}(?:st|nd|rd|th)?\s+(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{4})',
            r'(\d{4}-\d{2}-\d{2})'
        ],
        'reference_number': [
            r'REF(?:ERENCE)?\s*NO\.?\s*[=:]\s*([A-Z0-9-/]+)',
            r'File\s*No\.?\s*[=:]\s*([A-Z0-9-/]+)',
            r'Application\s*No\.?\s*[=:]\s*([A-Z0-9-/]+)'
        ]
    }

    try:
        
        # Dictionary to store results
        results = {}
        
        # Search for each pattern category
        for category, pattern_list in patterns.items():
            results[category] = None
            for pattern in pattern_list:
                matches = re.findall(pattern, text, re.IGNORECASE)
                if matches:
                    results[category] = matches[0]
                    break
        
        return results
        
    except Exception as e:
        print(f"Error processing URL {text}: {str(e)}")
        return {key: None for key in patterns.keys()}, text


In [None]:
# Now populate the columns from the extracted patterns
def extract_and_expand(text):
    """Parse one extracted-text cell into the pattern dictionary.

    Only real text is parsed: a ``str`` that does not start with ``"ERROR:"``
    is passed to ``extract_multiple_patterns``. Anything else (an error
    marker, ``None``, NaN, numbers, booleans, ...) yields a dictionary with
    the same keys and all values ``None``.
    """
    if isinstance(text, str) and not text.startswith("ERROR:"):
        return extract_multiple_patterns(text)
    return {key: None for key in ['total_no_plots', 'ews', 'regular', 'date', 'reference_number']}

# Usage with asyncio
async def process_df(df, max_concurrency=10):
    """Download/parse the PDF of every row and attach the results.

    Calls ``download_and_extract_pdf`` once per row of ``df`` inside a single
    ``aiohttp.ClientSession`` and collects the ``(text, saved_to_disk)`` pairs
    in row order.

    Args:
        df: Frame whose rows are passed to ``download_and_extract_pdf``.
        max_concurrency: Upper bound on rows being processed at the same
            time, so a large frame does not open every request at once.

    Returns:
        A copy of ``df`` with the added columns ``extracted_text`` and
        ``saved_to_disk``. ``df`` itself is not modified. An empty frame
        yields an empty copy with both columns present.
    """
    semaphore = asyncio.Semaphore(max_concurrency)

    async def _bounded(session, row):
        # Hold a semaphore slot for the whole download + extraction of a row.
        async with semaphore:
            return await download_and_extract_pdf(session, row)

    async with aiohttp.ClientSession() as session:
        # gather returns results in the order the awaitables were passed.
        batch_results = await asyncio.gather(
            *(_bounded(session, row) for _, row in df.iterrows())
        )

    # Split the pairs with comprehensions rather than zip(*...) unpacking,
    # which raises when there are no results at all.
    result_df = df.copy()
    result_df['extracted_text'] = [text for text, _ in batch_results]
    result_df['saved_to_disk'] = [saved for _, saved in batch_results]
    return result_df


async def main():
    """Run the full pipeline on the module-level ``df`` and return the result.

    Downloads/parses every PDF via ``process_df``, prints how many PDFs are on
    disk, then applies ``extract_and_expand`` to each extracted text and
    spreads the resulting dictionaries into one column per pattern key.

    Returns:
        The frame produced by ``process_df`` with the additional columns
        ``total_no_plots``, ``ews``, ``regular``, ``date`` and
        ``reference_number``.
    """
    # Build the frame here rather than reading a `result_df` left over in the
    # kernel from an earlier run; on a fresh kernel that name does not exist.
    result_df = await process_df(df)

    print(f"Successfully saved {result_df['saved_to_disk'].sum()} of {len(result_df)} PDFs")

    pattern_keys = ['total_no_plots', 'ews', 'regular', 'date', 'reference_number']

    # One dictionary of pattern matches per row.
    pattern_dicts = result_df['extracted_text'].apply(extract_and_expand)

    # Expand the dictionaries into separate columns.
    for key in pattern_keys:
        result_df[key] = pattern_dicts.apply(lambda found, key=key: found.get(key))

    return result_df

# Run the async pipeline and keep its frame for the cells below.
# Top-level `await` relies on the notebook kernel (IPython) running cells
# inside an event loop; a plain script would use `asyncio.run(main())`.
result_df = await main()

In [None]:
# Frame meant for saving: leave out the raw anchor-tag column and the
# intermediate layout identifier.
pqdf = result_df.drop(['bs4', 'extracted_ppl'], axis='columns')
# pqdf.to_parquet('./2024extract.parquet')
# pqdf.to_excel('./pathtoxl.xlsx')

In [None]:
# pqdf = pd.read_parquet('./2024extract.parquet')
# Rows whose download or extraction failed. na=False keeps the mask strictly
# boolean when a text cell is missing; without it such a cell becomes NaN in
# the mask and boolean indexing raises.
pqdf[pqdf['extracted_text'].str.startswith("ERROR", na=False)]

In [None]:
# Rows for which no total plot count could be extracted.
pqdf.loc[pqdf['total_no_plots'].isna()]

In [None]:
# First five rows of the cleaned frame.
pqdf.head(n=5)

In [None]:
# Re-run the pattern extraction on the text of a single row (label 204) for
# inspection. Only the text column is parsed; the row's other fields (URL,
# flags, already-extracted values) are not PDF text.
extract_and_expand(pqdf.loc[204, 'extracted_text'])