In [1]:
#pip install pytesseract pdf2image openai pandas Pillow pdfplumber

In [2]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import openai
import pdfplumber
import re
import pandas as pd

In [3]:
import os

# Never commit credentials to a notebook: read the key from the environment.
# Export OPENAI_API_KEY before starting the kernel; if it is unset the value
# is None and the OpenAI call made later will fail with an authentication error.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [4]:
# Function to extract text from an image
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file, passed to ``Image.open``.

    Returns:
        The string returned by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image as a context manager so its file handle is released as
    # soon as OCR has finished, even if pytesseract raises.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

In [5]:
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Concatenate the text layer of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The text of each page followed by a newline, joined in page order.
        A page that yields no text contributes only the newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None for a page without a text layer
        # (e.g. a scanned image); coalesce to "" so the concatenation does not
        # raise TypeError.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

In [6]:
# Function to extract key KYC details using regex
def extract_kyc_details(text):
    """Pull basic KYC fields out of free-form document text using regexes.

    Args:
        text: Text extracted from the document. ``None`` or an empty string
            is treated as a document with no recognisable fields.

    Returns:
        A dict with the keys ``"Name"``, ``"Date of Birth"``, ``"Address"``
        and ``"Document Type"``. The first three are ``None`` when no match is
        found; ``"Document Type"`` is ``"Passport"``, ``"Driver's License"``
        or ``"Unknown"``.
    """
    text = text or ""
    details = {
        "Name": None,
        "Date of Birth": None,
        "Address": None,
        "Document Type": None
    }

    # Name: capitalised words after a "Name" label. Words are separated by a
    # space or tab only, so the match cannot run across a line break into the
    # next field's label (e.g. "John Smith\nDate").
    name_match = re.search(r"Name[:\s]+([A-Z][a-z]+(?:[ \t][A-Z][a-z]+)*)", text)
    if name_match:
        details["Name"] = name_match.group(1)

    # Date of Birth: DD/MM/YYYY or DD-MM-YYYY style digits after the label.
    dob_match = re.search(r"Date of Birth[:\s]+(\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4})", text)
    if dob_match:
        details["Date of Birth"] = dob_match.group(1)

    # Address (basic heuristic): word characters, commas, spaces and tabs on
    # the line where the value starts. Line breaks are excluded so following
    # fields are not swallowed; surrounding whitespace is trimmed.
    address_match = re.search(r"Address[:\s]+([\w ,\t]+)", text)
    if address_match:
        address = address_match.group(1).strip()
        if address:
            details["Address"] = address

    # Document type by keyword; both "license" and "licence" spellings count.
    lowered = text.lower()
    if "passport" in lowered:
        details["Document Type"] = "Passport"
    elif "license" in lowered or "licence" in lowered:
        details["Document Type"] = "Driver's License"
    else:
        details["Document Type"] = "Unknown"

    return details

In [7]:
import time
import openai

def validate_kyc_details(kyc_data, max_retries=3, base_delay=5):
    """Ask the OpenAI chat API to validate extracted KYC details.

    Args:
        kyc_data: Mapping with the keys ``"Name"``, ``"Date of Birth"``,
            ``"Address"`` and ``"Document Type"``.
        max_retries: Maximum number of API attempts before giving up.
        base_delay: Seconds to wait after the first rate-limited attempt; the
            wait grows linearly (``base_delay * attempt_number``).

    Returns:
        The content of the first choice in the model's response, or a fixed
        error string when every attempt was rate limited.
    """
    prompt = f"""
    Validate the following KYC details:
    - Name: {kyc_data['Name']}
    - Date of Birth: {kyc_data['Date of Birth']}
    - Address: {kyc_data['Address']}
    - Document Type: {kyc_data['Document Type']}
    """

    for attempt in range(max_retries):
        try:
            response = openai.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=200
            )
            return response.choices[0].message.content

        except openai.RateLimitError:
            # No retry follows the last attempt, so do not announce one or
            # make the caller wait before reporting the failure.
            if attempt == max_retries - 1:
                break
            delay = base_delay * (attempt + 1)
            print(f"Rate limit hit! Retrying in {delay} seconds...")
            time.sleep(delay)  # Linear backoff: 5s, 10s, ... with the defaults

    return "Error: OpenAI API rate limit exceeded. Try again later."


In [8]:
# Main function to process a KYC document
def process_kyc_document(file_path):
    """Extract, parse and validate a single KYC document, printing each stage.

    Image files (.png/.jpg/.jpeg) go through ``extract_text_from_image`` and
    .pdf files through ``extract_text_from_pdf``; the extension check is
    case-insensitive. Any other extension prints a message and returns early.
    Nothing is returned; all results are printed.
    """
    lowered_path = file_path.lower()
    image_suffixes = (".png", ".jpg", ".jpeg")
    pdf_suffix = ".pdf"

    # Guard clause: bail out for anything that is neither an image nor a PDF.
    if not lowered_path.endswith(image_suffixes + (pdf_suffix,)):
        print("Unsupported file format!")
        return

    if lowered_path.endswith(pdf_suffix):
        extractor = extract_text_from_pdf
    else:
        extractor = extract_text_from_image
    text = extractor(file_path)
    print("\nExtracted Text:\n", text)

    kyc_details = extract_kyc_details(text)
    print("\nExtracted KYC Details:", kyc_details)

    validation_result = validate_kyc_details(kyc_details)
    print("\nValidation & Risk Assessment:\n", validation_result)

# Example usage
if __name__ == "__main__":
    import os

    # The document path can be supplied through the KYC_DOCUMENT_PATH
    # environment variable so the notebook runs on other machines without
    # editing this cell; the original local path stays as the default.
    file_path = os.environ.get(
        "KYC_DOCUMENT_PATH",
        "/Users/rupeshshivsharan/Downloads/Yashvi Passport.pdf",  # Replace with your actual file
    )
    process_kyc_document(file_path)


Extracted Text:
 Scanned with CamScanner
Scanned with CamScanner


Extracted KYC Details: {'Name': None, 'Date of Birth': None, 'Address': None, 'Document Type': 'Unknown'}

Validation & Risk Assessment:
 The KYC details provided are invalid. Name, Date of Birth, Address, and Document Type are all mandatory fields and should not be None or Unknown. Please provide valid information.
