In [94]:
import re
import json
from PyPDF2 import PdfReader

def extract_text_from_invoice(PDF_File):
    """Return the text of every page of a PDF as one string.

    Args:
        PDF_File: Handed unchanged to ``PdfReader``.

    Returns:
        The per-page results of ``extract_text()`` concatenated in page
        order with no separator between pages. Pages whose extraction
        result is falsy (``None`` or an empty string) contribute nothing.
    """
    reader = PdfReader(PDF_File)
    page_texts = (page.extract_text() for page in reader.pages)
    return ''.join(chunk for chunk in page_texts if chunk)


# Locates the VIN after the "VIN/Serial No" label, skipping the optional
# "Plate:" / "Out:" labels that may sit between the label and the value.
_VIN_RE = re.compile(r'VIN/Serial No:?\s*(?:Plate:?\s*(?:Out:)?)?\s*(\w+)')

# One candidate parts row: a token, whitespace, a decimal number (presumably
# the quantity -- confirm against the invoice layout), then free text running
# up to a dollar amount or the end of the line. The amount accepts thousands
# separators so that "$1,234.56" ends the description instead of leaking
# into it.
_PART_ROW_RE = re.compile(
    r'(\w+)\s+(\d+\.\d+)(.*?)(?:\$[\d,]+\.\d+|\n)', re.DOTALL
)


def _build_vehicle_pattern(multi_word_brands):
    """Build the regex source that captures (year, make, model)."""
    # Blank entries are dropped: an empty alternative would let the make
    # group match nothing at all.
    brands = [
        brand for brand in (multi_word_brands or ()) if brand and brand.strip()
    ]
    # Longest names first, so a longer brand is tried before any shorter
    # brand that is a prefix of it.
    brands.sort(key=len, reverse=True)
    # Known multi-word brands are tried first; a single word is the fallback.
    make_alternatives = '|'.join([re.escape(brand) for brand in brands] + [r'\w+'])
    return (
        r'Job Job Total Unit Approve Decline.*?'
        # (?<!\d) keeps the year from being cut out of the tail of a longer
        # digit run (e.g. an order number) that precedes the real year.
        r'(?<!\d)(\d{4})\s+'
        rf'((?:{make_alternatives}))\s+'
        # Lazy match that stops at the first newline (or end of text), so
        # the model never spans more than the rest of that line.
        r'(.*?)(?=\n|$)'
    )


def parse_invoice_data(text, multi_word_brands):
    """Parse vehicle details and part rows out of extracted invoice text.

    Args:
        text: Invoice text to search.
        multi_word_brands: Iterable of make names that contain spaces. They
            are matched case-insensitively ahead of the single-word
            fallback; empty or blank entries are ignored.

    Returns:
        A dict with two keys:

        * ``"Vehicle"``: ``"Year"``, ``"Make"``, ``"Model"`` (each
          ``"Not found"`` when the vehicle line is not located) and
          ``"VIN"`` (``"VIN not found"`` when absent).
        * ``"Parts"``: a list of ``{"Part Number", "Description"}`` dicts,
          one per matched row that has a non-empty description and whose
          leading token is not purely alphabetic.
    """
    vehicle_match = re.search(
        _build_vehicle_pattern(multi_word_brands),
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if vehicle_match:
        year, make, model = (group.strip() for group in vehicle_match.groups())
    else:
        year = make = model = "Not found"

    vin_match = _VIN_RE.search(text)
    vin = vin_match.group(1) if vin_match else "VIN not found"

    parts = []
    for part_number, _number, description in _PART_ROW_RE.findall(text):
        description = description.strip()
        # Purely alphabetic leading tokens are ordinary words, not part
        # numbers; rows without any description text are skipped as well.
        if description and not part_number.isalpha():
            parts.append({
                "Part Number": part_number,
                "Description": description,
            })

    return {
        "Vehicle": {
            "Year": year,
            "Make": make,
            "Model": model,
            "VIN": vin,
        },
        "Parts": parts,
    }

# Sample repair-order PDFs, given as absolute paths on the author's machine.
# Only file_path3 is read further down in this section; the other three are
# defined but not used here.
file_path = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/repair_orders/RO312890_Hayward_780361697751871381_1722973591799.pdf"
file_path1 = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/repair_orders/RO309938_Six_750730961510460578_1722973478612.pdf"
file_path2 = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/repair_orders/RO314016_Lea_790682494578296497_1722973660438.pdf"
file_path3 = "/Users/skylerwilson/Desktop/PartsWise/co-pilot-v1/data/repair_orders/RO314164_Campbell_792060373979552623_1722973702846.pdf"


# Vehicle makes whose names span more than one word. parse_invoice_data tries
# these names before falling back to a single-word make, so that a make such
# as "Land Rover" is not split between the make and model fields.
multi_word_brands = [
    "Aston Martin",
    "Alfa Romeo",
    "Alpine Renault",
    "American Motors",
    "Ariel Motor",
    "Bac Mono",
    "BMW Motorrad",
    "Bugatti Automobiles",
    "Can Am",
    "Caterham Cars",
    "CFMoto Motorcycles",
    "Dacia Automobiles",
    "David Brown",
    "De Tomaso",
    "Drako Motors",
    "DS Automobiles",
    "Harley Davidson",
    "Honda Motorcycles",
    "Hudson Motor",
    "Indian Motorcycle",
    "Isdera Automobile",
    "Iso Rivolta",
    "Karma Automotive",
    "KTM Motorcycles",
    "Land Rover",
    "Lehman Trikes",
    "Lincoln Motor",
    "Lucid Motors",
    "Morgan Motor",
    "MV Agusta",
    "Piaggio Motorcycles",
    "Pininfarina Automobili",
    "Polaris Industries",
    "Rezvani Motors",
    "Rimac Automobili",
    "Rolls Royce",
    "Royal Enfield",
    "Scuderia Cameron",
    "SSC North",
    "Suzuki Motorcycles",
    "Triumph Motorcycles",
    "TVR Automotive",
    "Vanderhall Motor",
    "Victory Motorcycles",
    "Yamaha Motorcycles",
    "Zero Motorcycles"
]
# Extract the text of one repair order and parse it into structured data.
text_with_pyPDF = extract_text_from_invoice(file_path3)
parsed_data = parse_invoice_data(text_with_pyPDF, multi_word_brands)

# Pretty-print the parsed result as indented JSON.
print(json.dumps(parsed_data, indent=2))

{
  "Vehicle": {
    "Year": "2009",
    "Make": "BMW",
    "Model": "F 800 GS",
    "VIN": "WB10219049ZT77875"
  },
  "Parts": [
    {
      "Part Number": "61317727067",
      "Description": "HANDLE LEFT Parts"
    },
    {
      "Part Number": "34217722884",
      "Description": "BRAKE PADS KIT, REAR Parts"
    }
  ]
}
