In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the invoice CSV read below
# lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

# Table of invoice images and their text (columns: filename, extracted_text).
csv_path = "/content/drive/MyDrive/NLP_driven_Invoice_Management_System/invoice_texts.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Sanity check: show the first two rows as plain text, without the index column.
preview_rows = df.head(n=2)
print(preview_rows.to_string(index=False))

                 filename                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                extracted_text
Template5_Instance101.jpg                                                       

In [3]:
# Install/upgrade spaCy and the MySQL connector, then download the
# transformer-based English pipeline (en_core_web_trf) that is loaded below.
# The installer output advises restarting the runtime afterwards so the newly
# installed dependencies are picked up.
!pip install -U spacy mysql-connector-python
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4/457.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import spacy

# Name of the pipeline package downloaded in the install cell.
SPACY_MODEL_NAME = "en_core_web_trf"

# GPU allocation is requested before the pipeline is loaded; require_gpu()
# raises instead of silently falling back to CPU when no GPU is available.
spacy.require_gpu()
nlp = spacy.load(SPACY_MODEL_NAME)

In [5]:
# Regular expressions tried, in this order, when searching invoice text for an
# invoice number. The first three are anchored on a label and capture the
# identifier in group 1; the last one is an unlabeled fallback with no
# capturing group, so its whole match is the identifier.
inv_patterns = [
    # "Invoice No" / "Invoice No." -- the optional dot covers the abbreviated
    # label, which the separator class [:\s] alone cannot skip over.
    r"Invoice\s*No\.?[:\s]*([\w\-]+)",
    # "Invoice #"
    r"Invoice\s*#[:\s]*([\w\-]+)",
    # "Invoice number"
    r"Invoice\s*number[:\s]*([\w\-]+)",
    # Fallback: bare upper-case/digit codes such as "INV-2024". The scoped
    # (?-i:...) keeps this pattern case-sensitive even when it is searched
    # with re.IGNORECASE, so ordinary hyphenated words ("non-refundable")
    # are not mistaken for invoice numbers. The group is non-capturing.
    r"(?-i:\b[A-Z0-9]{2,}-[A-Z0-9]{2,}\b)"
]

In [9]:
import re
from dateutil import parser
def extract_invoice_data(doc, filename):
    """Pull invoice number, date, total and vendor out of one parsed invoice.

    Date, total and vendor each come from the first usable named entity with
    the matching label (DATE, MONEY, ORG); once a field holds a truthy value,
    later entities with that label are ignored. The invoice number comes from
    the first entry of the module-level ``inv_patterns`` list that matches
    ``doc.text`` (searched case-insensitively).

    Args:
        doc: Parsed document exposing ``ents`` (entities with ``label_`` and
            ``text`` attributes) and ``text``.
        filename: Identifier of the source file; copied into the result as-is.

    Returns:
        dict with keys ``invoice_no``, ``date``, ``total``, ``vendor`` and
        ``filename``. Fields that could not be extracted stay ``None``.
        ``date`` is formatted ``YYYY-MM-DD`` when the entity text parses and
        is the raw entity text otherwise. ``total`` is a float.
    """
    result = {"invoice_no": None, "date": None, "total": None, "vendor": None}

    # Extract from NER
    for ent in doc.ents:
        if ent.label_ == "DATE" and not result["date"]:
            try:
                result["date"] = parser.parse(ent.text).strftime("%Y-%m-%d")
            except (ValueError, OverflowError):
                # Unparseable date text: keep the raw entity text (best effort).
                # Only the parser's own errors are caught, so a kernel
                # interrupt or a missing import is not silently swallowed.
                result["date"] = ent.text
        elif ent.label_ == "MONEY" and not result["total"]:
            # Drop thousands separators, then require an amount with exactly
            # two decimals; entities without one are skipped and the next
            # MONEY entity gets a chance.
            m = re.search(r"\d+\.\d{2}", ent.text.replace(",", ""))
            if m:
                result["total"] = float(m.group())
        elif ent.label_ == "ORG" and not result["vendor"]:
            result["vendor"] = ent.text.strip()

    # Regex for Invoice No
    for pat in inv_patterns:
        m = re.search(pat, doc.text, flags=re.IGNORECASE)
        if not m:
            continue
        # Patterns with a capturing group hold the number in group 1;
        # group-less patterns use the whole match.
        value = m.group(1) if m.re.groups else m.group(0)
        if value is None:
            # The group exists but did not take part in the match; try the
            # next pattern instead of failing on None.strip().
            continue
        result["invoice_no"] = value.strip()
        break  # Stop after the first successful match

    result["filename"] = filename
    return result

In [10]:
def process_chunk(chunk_df, batch_size=4):
    """Run the NLP pipeline over one slice of the invoice table.

    Args:
        chunk_df: DataFrame with ``extracted_text`` and ``filename`` columns.
        batch_size: Number of texts handed to ``nlp.pipe`` per batch.

    Returns:
        DataFrame built from the dicts returned by ``extract_invoice_data``,
        one row per document yielded by the pipeline.
    """
    # Blank CSV cells are read by pandas as NaN (a float), which is not valid
    # pipeline input; treat them as empty text so one bad row cannot abort
    # the whole chunk.
    texts = chunk_df["extracted_text"].fillna("").astype(str).tolist()
    data = list(zip(texts, chunk_df["filename"].tolist()))

    # as_tuples=True makes the pipeline yield (doc, filename) pairs.
    extracted = [
        extract_invoice_data(doc, filename)
        for doc, filename in nlp.pipe(data, batch_size=batch_size, as_tuples=True)
    ]

    return pd.DataFrame(extracted)


In [11]:
# Process the invoice table in fixed-size slices, collect the per-slice
# results, and write the combined table to a CSV in the working directory.
final_dfs = []
chunk_size = 1000
total_rows = len(df)

for i in range(0, total_rows, chunk_size):
    # Clamp the end so the progress message is accurate for a short last slice.
    chunk_end = min(i + chunk_size, total_rows)
    print(f"Processing chunk {i} to {chunk_end}")
    chunk_df = df.iloc[i:chunk_end]
    chunk_result = process_chunk(chunk_df)
    final_dfs.append(chunk_result)
    print(f"Done chunk {i} to {chunk_end}")

if final_dfs:
    extracted_df = pd.concat(final_dfs, ignore_index=True)
else:
    # pd.concat raises on an empty list; an empty input table produces an
    # empty result carrying the columns extract_invoice_data would emit.
    extracted_df = pd.DataFrame(
        columns=["invoice_no", "date", "total", "vendor", "filename"]
    )
extracted_df.to_csv("extracted_invoice_data.csv", index=False)
print("All chunks processed and CSV saved.")


Processing chunk 0 to 1000
Done chunk 0 to 1000
Processing chunk 1000 to 2000
Done chunk 1000 to 2000
Processing chunk 2000 to 3000
Done chunk 2000 to 3000
Processing chunk 3000 to 4000
Done chunk 3000 to 4000
Processing chunk 4000 to 5000
Done chunk 4000 to 5000
Processing chunk 5000 to 6000
Done chunk 5000 to 6000
Processing chunk 6000 to 7000
Done chunk 6000 to 7000
Processing chunk 7000 to 8000
Done chunk 7000 to 8000
Processing chunk 8000 to 9000
Done chunk 8000 to 9000
Processing chunk 9000 to 10000
Done chunk 9000 to 10000
All chunks processed and CSV saved.


In [12]:
from google.colab import files

# Offer the CSV of extracted invoice fields as a download.
OUTPUT_CSV = "extracted_invoice_data.csv"
files.download(OUTPUT_CSV)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>