In [None]:
!pip install easyocr
!pip install img2table[easyocr]



In [None]:
import easyocr
import re
import datetime
import cv2
import matplotlib.pyplot as plt
import pandas as pd

from IPython.display import display_html
from PIL import Image as PILImage
from img2table.document import Image
from img2table.ocr import EasyOCR
from google.colab import files

In [None]:
class DateExtraction:
    """Find dates of the form ``DD <month> YYYY`` in OCR text.

    The month may be written as a two-digit number or as a month name; only
    the first three-character window that matches ``self.months`` is used.
    """

    def __init__(self) -> None:
        # Three-character tokens that identify a month. Numeric months carry a
        # trailing "-" because the text is normalised to dash-separated tokens
        # before lookup (see find_date_string).
        self.months = {
            "JAN": "01",
            "01-": "01",
            "FEB": "02",
            "02-": "02",
            "MAR": "03",
            "03-": "03",
            "APR": "04",
            "04-": "04",
            "MAY": "05",
            "MEI": "05",  # Indonesian spelling of May
            "05-": "05",
            "JUN": "06",
            "06-": "06",
            "JUL": "07",
            "07-": "07",
            "AGU": "08",
            "AUG": "08",  # English spelling, alongside Indonesian "AGU"
            "08-": "08",
            "SEP": "09",
            "09-": "09",
            "OKT": "10",
            "OCT": "10",  # English spelling, alongside Indonesian "OKT"
            "10-": "10",
            "NOV": "11",
            "11-": "11",
            "DES": "12",
            "DEC": "12",  # English spelling, alongside Indonesian "DES"
            "12-": "12"
        }
        self.reader = easyocr.Reader(["id"])

    def month_to_num(self, s: str) -> str:
        """Return the two-digit month for the first 3-char window of ``s``
        found in ``self.months`` (case-insensitive), or ``""`` if none match.
        """
        for start in range(len(s) - 2):
            window = s[start:start + 3].upper()
            if window in self.months:
                return self.months[window]
        return ""

    def get_date_object(self, date_type_1_list: list) -> list:
        """Convert matched date strings into ``datetime`` objects.

        Each item looks like ``DD-<month text>-YYYY``, optionally wrapped in a
        leading/trailing ``-``. Items whose month cannot be identified, or that
        do not form a real calendar date, are skipped, so the result is always
        a list (possibly empty).
        """
        dates = []
        for date_str in date_type_1_list:
            # Drop the optional delimiter dashes so the slices below do not
            # depend on which form the caller passed in.
            core = date_str.strip("-")
            day_str = core[:2]
            month_str = core[2:-4]  # keeps the surrounding "-" needed for keys like "07-"
            year_str = core[-4:]

            month_number = self.month_to_num(month_str)
            if month_number == "":
                # Unknown month: skip this candidate but keep the others.
                continue

            result_date_str = f"{day_str}-{month_number}-{year_str}"
            try:
                date_object = datetime.datetime.strptime(result_date_str, "%d-%m-%Y")
            except ValueError:
                # e.g. 31-02-2023 or day "99": looks like a date but is not one.
                continue
            dates.append(date_object)

        return dates

    def find_date_string(self, s: str) -> list:
        """Return every date found in ``s`` as a list of ``datetime`` objects."""
        # Put a space at every letter/digit boundary so glued OCR output such as
        # "29JUL2023" becomes "29 JUL 2023". Zero-width matches keep every
        # character intact.
        spaced = re.sub(r"(?<=[a-zA-Z])(?=[0-9])|(?<=[0-9])(?=[a-zA-Z])", " ", s)
        # Normalise all separators to single dashes and fence the text with them.
        text = "-" + "-".join(re.split(r"[-;,.\s]\s*", spaced)) + "-"
        # Two digits, anything (lazily), four digits - each bounded by a dash.
        # Lookarounds leave the bounding dashes unconsumed so that two dates
        # sharing one delimiter ("-29-07-2023-30-08-2023-") are both found.
        candidates = re.findall(r"(?<=-)[0-9]{2}-.*?-[0-9]{4}(?=-)", text)
        return self.get_date_object(candidates)

    def get_date_from_img(self, img_path: str):
        """Run OCR on ``img_path`` and return one list of dates per text line
        that contained at least one date (lines without dates are omitted).
        """
        result = []

        for line in self.reader.readtext(img_path, detail=0):
            found = self.find_date_string(line)
            if found:
                result.append(found)

        return result

In [None]:
class CompanyNameExtraction:
    """Pick company names out of the OCR text of an image.

    A text line counts as a company name when it starts with 'CV' or 'PT',
    except for lines starting with 'CV PRIMA PRINT', which are ignored.
    """

    def __init__(self) -> None:
        self.reader = easyocr.Reader(["id"])

    def is_start_of_company_name(self, item):
        """Return True when ``item`` starts with 'CV' or 'PT' and is not the
        excluded 'CV PRIMA PRINT' prefix.
        """
        if not item.startswith(('CV', 'PT')):
            return False
        return not item.startswith('CV PRIMA PRINT')

    def get_company_name(self, s):
        """Return ``[s]`` if ``s`` looks like a company name, else ``[]``."""
        return [s] if self.is_start_of_company_name(s) else []

    def get_from_image(self, img_path):
        """OCR ``img_path`` and return one single-item list per text line that
        was accepted as a company name, in reading order.
        """
        ocr_lines = self.reader.readtext(img_path, detail=0)
        per_line = (self.get_company_name(line) for line in ocr_lines)
        return [names for names in per_line if len(names) > 0]

In [None]:
class ProductName:
    """Turn an OCR'd product description into a short product code.

    Example: ``"ART PAPER PD 65XI00 120 GR ..."`` -> ``"AP 120 gr 65x100"``.
    """

    def __init__(self):
        # First three letters of the description -> short product name.
        self.name = {
            "ART": "AP",
            "DUP": "Duplex",
            "FOI": "Foil",
            "IVO": "Ivory"
        }
        # OCR misreadings of the widths 100 and 109 -> the intended digits.
        # No method of this class reads this table; it is kept as an
        # attribute for any outside code that may use it.
        self.width = {
            "100": "100",
            "I00": "100",
            "1O0": "100",
            "10O": "100",
            "IO0": "100",
            "1OO": "100",
            "I0O": "100",
            "IOO": "100",
            "109": "109",
            "I09": "109",
            "1O9": "109",
            "10g": "109",
            "IO9": "109",
            "1Og": "109",
            "I0g": "109",
            "IOg": "109"
        }

    def get_product_name(self, s):
        """Return the short name for the first three characters of ``s``
        (case-insensitive), or ``""`` when they are not a known prefix.

        Only the prefix is examined; a known code appearing later in the
        string is not matched.
        """
        return self.name.get(s[:3].upper(), "")

    def get_product_code(self, s):
        """Return ``"<weight> gr <length>x<width>"`` extracted from ``s``.

        The size is two digits, ``X`` or ``x``, then three characters in which
        the OCR confusions ``I`` (for 1) and ``O`` (for 0) are tolerated. The
        weight is a run of digits, one whitespace character and ``GR`` (any
        case).

        Raises:
            IndexError: if ``s`` has no size or no weight. The exception type
                is the one the previous implementation raised implicitly; it
                now carries a message naming the offending description.
        """
        size = re.search(r"([0-9]{2})[xX]([0-9I][0-9O][0-9O])", s)
        if size is None:
            raise IndexError(f"no '<length>X<width>' size found in product description: {s!r}")

        # Require at least one digit so that a bare " GR" inside another word
        # (e.g. "PAPER GRADE") is not mistaken for the weight.
        weight = re.search(r"\d+\s[Gg][Rr]", s)
        if weight is None:
            raise IndexError(f"no '<number> GR' weight found in product description: {s!r}")

        length, raw_width = size.groups()
        # Undo the letter-for-digit OCR confusions in the width.
        width = raw_width.replace('I', '1').replace('O', '0')

        fixed_size = f"{length}x{width}"
        thickness = weight.group(0).lower()

        return f"{thickness} {fixed_size}"

    def get_final_codename(self, s):
        """Return ``"<short name> <weight> gr <length>x<width>"`` for ``s``."""
        code1 = self.get_product_name(s)
        code2 = self.get_product_code(s)

        return f"{code1} {code2}"

In [None]:
class QuantityAndPrice:
    """Parse OCR'd quantity and price cells and compute line totals."""

    def __init__(self, number=0, units=0):
        self.number = number
        self.units = units

    def get_quantity_number(self, lst):
        """Split a quantity such as ``"100 RIM"`` into ``(100.0, "RIM")``.

        Args:
            lst: either the quantity string itself, or a list whose first
                element is the quantity string (the original calling style).

        Returns:
            ``(number, units)``; a comma in the number is read as a decimal
            point. ``units`` is ``""`` when the string has no second token.

        Raises:
            ValueError: if the first token is missing or not a number.
        """
        text = lst if isinstance(lst, str) else lst[0]
        # str.split() tolerates leading, trailing and repeated whitespace.
        parts = text.split()
        if not parts:
            raise ValueError(f"empty quantity string: {text!r}")

        number = parts[0].replace(',', '.')
        units = parts[1] if len(parts) > 1 else ""

        return float(number), units

    def get_price(self, s):
        """Parse a price cell such as ``"530\\n541"`` or ``"Rp. 53.054.054"``
        into an ``int``.

        Whitespace (including the line breaks OCR inserts inside a cell),
        ``,`` and ``.`` grouping separators and the ``Rp`` currency marker
        are removed before conversion.

        Raises:
            ValueError: if what remains is not an integer.
        """
        cleaned = "".join(s.split())
        cleaned = cleaned.replace(',', '').replace('.', '')
        cleaned = cleaned.replace('Rp', '')

        return int(cleaned)

    def get_total_price(self, p, n):
        """Return unit price ``p`` times quantity ``n``.

        ``p`` is a raw price cell (see get_price); ``n`` is a quantity string
        or a list holding one (see get_quantity_number). The result is a float
        because the quantity is parsed as a float.
        """
        price = self.get_price(p)
        qty, _ = self.get_quantity_number(n)

        return price * qty

In [None]:
# print(img_path.shape)

In [None]:
# Build the date extractor. Its constructor creates an easyocr.Reader(["id"]).
date_extraction = DateExtraction()



In [None]:
# Build the company-name extractor. Its constructor creates its own
# easyocr.Reader(["id"]), separate from the one held by DateExtraction.
company_name_extraction = CompanyNameExtraction()



In [None]:
# Helper that converts a product description into a short product code.
product_name_extraction = ProductName()

In [None]:
# Helper that parses quantity / price cells and computes line totals.
quantity_price_extraction = QuantityAndPrice()

In [None]:
# Ask for the invoice image through the Colab upload widget. The keys of
# `uploaded` are the names the files were saved under, which can differ from
# the original name when a file of that name already exists.
uploaded = files.upload()
print(uploaded.keys())

Saving cahaya.jpg to cahaya (1).jpg
dict_keys(['cahaya (1).jpg'])


In [None]:
# Read the file that was just uploaded instead of a hard-coded name: Colab
# saves a re-uploaded file under a new name (e.g. "cahaya (1).jpg"), so a fixed
# 'cahaya.jpg' would silently load an older copy or nothing at all.
uploaded_name = next(iter(uploaded))
# NOTE: despite its name, img_path holds the decoded pixel array, not a path.
img_path = cv2.imread(uploaded_name)
# cv2.imread returns None rather than raising when the file cannot be read.
if img_path is None:
    raise FileNotFoundError(f"could not read image: {uploaded_name!r}")

In [None]:
# Resize to a fixed 3808x2960 (cv2.resize takes (width, height)); the input's
# aspect ratio is not preserved.
# NOTE(review): presumably enlarged to help OCR of small text - confirm.
img_path_resized = cv2.resize(img_path, (3808, 2960))

In [None]:
# Take the first date of the first OCR line that contained one.
# Raises IndexError when no date is found in the image.
invoice_date = date_extraction.get_date_from_img(img_path_resized)[0][0]
invoice_date.strftime("%d-%m-%Y")

'29-07-2023'

In [None]:
# Take the first accepted company name. Note this runs on the original image,
# not the resized one used for the date. Raises IndexError when none is found.
company_name = company_name_extraction.get_from_image(img_path)[0][0]
company_name

'CV CAHAYA MAJU'

In [None]:
# Shrink the enlarged image to 1000x750 (width, height) for table detection.
img_cv = cv2.resize(img_path_resized, (1000, 750))

In [None]:
# Keep rows 0-549 across the full 1000-pixel width, i.e. drop the bottom strip.
# Re-running this cell alone is harmless: the slice of an already-cropped
# image is the same image.
img_cv = img_cv[0:550, 0:1000]

In [None]:
# plt.imshow(cv2.cvtColor(img_cv, cv2.COLOR_BGR2RGB))
# plt.axis('off')
# plt.show()

In [None]:
# Collapse to a single gray channel ...
bgr2gray_img = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
# ... then expand back to three identical channels. The source here is a
# one-channel image, so the conversion code must be GRAY2BGR; COLOR_BGR2RGB
# expects a 3- or 4-channel input.
gray_img = cv2.cvtColor(bgr2gray_img, cv2.COLOR_GRAY2BGR)

In [None]:
# plt.imshow(gray_img)
# plt.axis('off')
# plt.show()

In [None]:
# Save the grayscale crop to disk; the table extractor below loads it by path.
# cv2.imwrite returns True on success.
cv2.imwrite('gray.jpg', gray_img)

True

In [None]:
# `Image` here is img2table.document.Image (PIL's Image is imported as PILImage).
img_table = Image(src="gray.jpg")

In [None]:
# NOTE(review): this rebinds the name `easyocr`, shadowing the imported easyocr
# module. After this cell runs, re-running DateExtraction() or
# CompanyNameExtraction() (which call easyocr.Reader) will fail until the import
# cell is re-run. Consider renaming this variable together with its uses in the
# extract_tables and to_xlsx cells.
easyocr = EasyOCR(lang=["id"])

In [None]:
# Detect tables in the image and OCR their cells, with implicit_rows enabled.
extracted_tables = img_table.extract_tables(ocr=easyocr, implicit_rows=True)

  .rename({col: f"{col}_" for col in df_h_lines.columns})
  .rename({col: f"{col}_" for col in df_cells.columns})


In [None]:
# table_img = cv2.imread("gray.jpg")

# for table in extracted_tables:
#     for row in table.content.values():
#         for cell in row:
#             cv2.rectangle(table_img, (cell.bbox.x1, cell.bbox.y1), (cell.bbox.x2, cell.bbox.y2), (255, 0, 0), 2)

# PILImage.fromarray(table_img)

In [None]:
# Take the last detected table. pop() also removes it from extracted_tables,
# so re-running this cell without re-extracting yields a different table or
# fails once nothing is left.
table = extracted_tables.pop()

In [None]:
# Render the extracted table as HTML for a visual check of the OCR result.
display_html(table.html_repr(title="Title"), raw=True)

Unnamed: 0,0,1,2,3,4
0,,Deskripsi Barang,Quantity,Harga Satuan,Julah
1,,ART PAPER PD 65XI00 120 GR GOLDEN COIN NEWO220,100 RIM,530\n541,054\n054
2,Terbilang\nRatus Sembilan Puluh Ribu Rupiah\nLina Puluh Delapan\nJuta Delapan,Terbilang\nRatus Sembilan Puluh Ribu Rupiah\nLina Puluh Delapan\nJuta Delapan,Terbilang\nRatus Sembilan Puluh Ribu Rupiah\nLina Puluh Delapan\nJuta Delapan,"DPP\nPotongan (0""):\nPPN :\nToual","53.054.054\nRp.\nRp.\n5.835.946\nRp\n58,890\n800\nRp."
3,CAIATAN\nKeterangan: Transaksi dianggap lunas jika pembayaran masuk kc no. rckening\nperusahaan\nBukúi pembayaran harap diinfokan via WhatsapplSMSICall di 0822-3388-1655,CAIATAN\nKeterangan: Transaksi dianggap lunas jika pembayaran masuk kc no. rckening\nperusahaan\nBukúi pembayaran harap diinfokan via WhatsapplSMSICall di 0822-3388-1655,CAIATAN\nKeterangan: Transaksi dianggap lunas jika pembayaran masuk kc no. rckening\nperusahaan\nBukúi pembayaran harap diinfokan via WhatsapplSMSICall di 0822-3388-1655,"DPP\nPotongan (0""):\nPPN :\nToual","53.054.054\nRp.\nRp.\n5.835.946\nRp\n58,890\n800\nRp."


In [None]:
# Extract the tables again and write them to an Excel workbook.
# NOTE(review): implicit_rows is False here but True in the extract_tables
# call above, so the workbook may not match the table displayed earlier -
# confirm this difference is intended.
img_table.to_xlsx(
    'tables.xlsx',
    ocr=easyocr,
    implicit_rows=False,
)

  .rename({col: f"{col}_" for col in df_h_lines.columns})
  .rename({col: f"{col}_" for col in df_cells.columns})


In [None]:
# Load the first sheet of the workbook written above; its first row becomes
# the column header.
df = pd.read_excel('tables.xlsx')
df

Unnamed: 0.1,Unnamed: 0,Deskripsi Barang,Quantity,Harga Satuan,Julah
0,,ART PAPER PD 65XI00 120 GR GOLDEN COIN NEWO220,100 RIM,530\n541,054\n054
1,Terbilang\nRatus Sembilan Puluh Ribu Rupiah\nL...,,,"DPP\nPotongan (0""):\nPPN :\nToual","53.054.054\nRp.\nRp.\n5.835.946\nRp\n58,890\n8..."
2,CAIATAN\nKeterangan: Transaksi dianggap lunas ...,,,,


In [None]:
# Force the expected names onto columns 1-3 by position, so that OCR errors in
# the header row do not break the lookups below. A fresh list is assigned to
# df.columns rather than writing into df.columns.values, because an Index is
# meant to be immutable and in-place edits of its backing array are not
# guaranteed to be reflected by label lookups.
column_names = df.columns.tolist()
column_names[1:4] = ["Deskripsi Barang", "Quantity", "Harga Satuan"]
df.columns = column_names

In [None]:
# df.columns.values[0] = "No"

# df

In [None]:
# Drop the last column (the OCR'd total column, "Julah" in the output above).
# Not idempotent: every re-run of this cell removes one more column.
df = df.iloc[:, :-1]
df

Unnamed: 0.1,Unnamed: 0,Deskripsi Barang,Quantity,Harga Satuan
0,,ART PAPER PD 65XI00 120 GR GOLDEN COIN NEWO220,100 RIM,530\n541
1,Terbilang\nRatus Sembilan Puluh Ribu Rupiah\nL...,,,"DPP\nPotongan (0""):\nPPN :\nToual"
2,CAIATAN\nKeterangan: Transaksi dianggap lunas ...,,,


In [None]:
# Drop the last two rows (the totals and notes rows in the output above).
# Not idempotent: every re-run of this cell removes two more rows.
df = df.iloc[:-2]
df

Unnamed: 0.1,Unnamed: 0,Deskripsi Barang,Quantity,Harga Satuan
0,,ART PAPER PD 65XI00 120 GR GOLDEN COIN NEWO220,100 RIM,530\n541


In [None]:
# Product descriptions in row order. tolist() is positional, so it does not
# depend on the frame having a 0..n-1 index the way df[col][i] does.
products_list = df['Deskripsi Barang'].tolist()

print(products_list)

['ART PAPER PD 65XI00 120 GR GOLDEN COIN NEWO220']


In [None]:
# Build one code per product. The previous loop overwrote `x` on each pass, so
# only the last product's code survived, and `x` was undefined for an empty list.
kode_list = [
    product_name_extraction.get_final_codename(product)
    for product in products_list
]
# `x` is still bound to the last code for the cells below that refer to it.
x = kode_list[-1] if kode_list else ""

print(x)

AP 120 gr 65x100


In [None]:
# Raw quantity cells ("<number> <unit>") in row order, taken positionally.
qty_list = df['Quantity'].tolist()

print(qty_list)

['100 RIM']


In [None]:
# Raw unit-price cells in row order, taken positionally. OCR may have split
# one number across lines, hence values such as '530\n541'.
price_list = df['Harga Satuan'].tolist()

print(price_list)

['530\n541']


In [None]:
# Quick look at the first raw quantity string.
qty_list[0]

'100 RIM'

In [None]:
# Line total per row = unit price x that row's quantity. Each quantity is
# wrapped in its own one-element list because get_quantity_number reads
# element 0 of its argument; passing the whole qty_list (as before) applied
# the first row's quantity to every row.
total_price = [
    quantity_price_extraction.get_total_price(price, [qty])
    for price, qty in zip(price_list, qty_list)
]

print(total_price)

[53054100.0]


In [None]:
# Unit prices as integers, one per row of the 'Harga Satuan' column.
row_count = len(df['Harga Satuan'])
price_int = [
    quantity_price_extraction.get_price(price_list[row])
    for row in range(row_count)
]

print(price_int)

[530541]


In [None]:
# Keep only the two raw columns that go into the final table; the cleaned
# price and total columns are added back in the next cell.
df = df[['Deskripsi Barang', 'Quantity']]
df

Unnamed: 0,Deskripsi Barang,Quantity
0,ART PAPER PD 65XI00 120 GR GOLDEN COIN NEWO220,100 RIM


In [None]:
# Invoice-level values (company name, date) are passed as scalars so that
# DataFrame.assign broadcasts them to every row; wrapping them in one-element
# lists only works for single-row tables. 'Kode' is computed for every product
# rather than reusing the single leftover value `x`.
add_data = {
    'Company Name': company_name,
    'Tanggal': invoice_date,
    'Kode': [
        product_name_extraction.get_final_codename(product)
        for product in products_list
    ],
    'Harga Satuan': price_int,
    'Total Harga': total_price
}

df = df.assign(**add_data)
df

Unnamed: 0,Deskripsi Barang,Quantity,Company Name,Tanggal,Kode,Harga Satuan,Total Harga
0,ART PAPER PD 65XI00 120 GR GOLDEN COIN NEWO220,100 RIM,CV CAHAYA MAJU,2023-07-29,AP 120 gr 65x100,530541,53054100.0


In [None]:
# Final column order, selected by name. Unlike a positional iloc permutation,
# this gives the same result however many times the cell is run.
df = df[[
    'Tanggal',
    'Company Name',
    'Deskripsi Barang',
    'Kode',
    'Quantity',
    'Harga Satuan',
    'Total Harga',
]]
df

Unnamed: 0,Tanggal,Company Name,Deskripsi Barang,Kode,Quantity,Harga Satuan,Total Harga
0,2023-07-29,CV CAHAYA MAJU,ART PAPER PD 65XI00 120 GR GOLDEN COIN NEWO220,AP 120 gr 65x100,100 RIM,530541,53054100.0
