# From unstructured invoice documents to business insights in minutes

In [None]:
# Simple setup for dataset
# To create a new dataset, uncomment the two lines below and make `%%bigquery`
# the very first line of the cell (a cell magic is only recognised there).
# NOTE(review): BigQuery DDL creates datasets with CREATE SCHEMA; a hyphenated
# project ID is backtick-quoted. Confirm the dataset name before running.

# %%bigquery
# CREATE SCHEMA IF NOT EXISTS `lohk-da-coaching.my_new_dataset`


## Access unstructured data in BigQuery

Create a new object table that points to the invoice PDF files in a Cloud Storage bucket.

In [None]:
%%bigquery
# `%%bigquery` must stay on the first line of the cell; comments go below it
# as SQL comments (GoogleSQL accepts `#`).
#
# Invoice PDFs are stored in gs://lohk-da-demo/signify/invoicepdf.
# Create (or replace) an object table over those files through the
# `us.vertex_llm` connection. The table is project-qualified so it is the same
# object the later cells read as
# `lohk-da-coaching.document_ai.invoice_object_table`.
CREATE OR REPLACE EXTERNAL TABLE `lohk-da-coaching.document_ai.invoice_object_table`
WITH CONNECTION `us.vertex_llm`
OPTIONS(
  object_metadata = 'SIMPLE',
  uris = ['gs://lohk-da-demo/signify/invoicepdf/*']
);

# Preview the table content (the last statement's result is what the cell displays).
SELECT * FROM `lohk-da-coaching.document_ai.invoice_object_table` LIMIT 10;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uri,generation,content_type,size,md5_hash,updated,metadata
0,gs://lohk-da-demo/signify/invoicepdf/invoice1.pdf,1714744362765785,application/pdf,421093,265c3602eb283b855e45665217e70758,2024-05-03 13:52:42.804000+00:00,[]
1,gs://lohk-da-demo/signify/invoicepdf/invoice2.pdf,1714744362874062,application/pdf,619796,658f619232f9e78399b9adb4b583fb46,2024-05-03 13:52:42.912000+00:00,[]
2,gs://lohk-da-demo/signify/invoicepdf/invoice3.pdf,1714744362371628,application/pdf,285251,0ef5e7bf15063bd6b00ae9c6fa37f4a0,2024-05-03 13:52:42.410000+00:00,[]
3,gs://lohk-da-demo/signify/invoicepdf/invoice4.pdf,1714744364135107,application/pdf,554785,6c71702b0ddaf8e0f5254aa2b560c528,2024-05-03 13:52:44.180000+00:00,[]
4,gs://lohk-da-demo/signify/invoicepdf/invoice5.pdf,1714744363693909,application/pdf,274957,86c65fb7650cd1cfaadc980a1e9656ba,2024-05-03 13:52:43.732000+00:00,[]
5,gs://lohk-da-demo/signify/invoicepdf/invoice6.pdf,1714744364248134,application/pdf,611885,dd33f66f7be62964ff196f842b84bb4c,2024-05-03 13:52:44.294000+00:00,[]
6,gs://lohk-da-demo/signify/invoicepdf/invoice7.pdf,1714744365215769,application/pdf,993543,12a774e6112a4311ec9ea270f15d6eba,2024-05-03 13:52:45.253000+00:00,[]


## We are all set to process all invoices with Google's Document AI

Simply link BigQuery to the Document AI model, and then point it at the object table.

In [None]:
%%bigquery
# `%%bigquery` must stay on the first line of the cell; comments go below it
# as SQL comments.
#
# Create (or replace) a BigQuery remote model backed by a Document AI processor.
# The processor path pins a specific processor version
# (pretrained-invoice-v2.0-2023-12-06).
CREATE OR REPLACE MODEL `lohk-da-coaching.document_ai.lohk_invoice_processor`
REMOTE WITH CONNECTION `us.vertex_llm`
OPTIONS (
  REMOTE_SERVICE_TYPE = 'CLOUD_AI_DOCUMENT_V1',
  DOCUMENT_PROCESSOR = 'projects/458398081798/locations/us/processors/bea2aeb9d9f216d3/processorVersions/pretrained-invoice-v2.0-2023-12-06'
);

Query is running:   0%|          |

In [None]:
%%bigquery
# `%%bigquery` must stay on the first line of the cell; comments go below it
# as SQL comments.
#
# Run every PDF of the object table through the Document AI remote model and
# store the extracted fields in `document_processed`.
# The raw result and status columns are dropped from the stored table.
# NOTE(review): dropping ml_process_document_status also hides per-document
# processing errors; inspect it separately if rows come back mostly empty.
CREATE OR REPLACE TABLE `lohk-da-coaching.document_ai.document_processed`
AS
SELECT * EXCEPT (ml_process_document_result, ml_process_document_status)
FROM
  ML.PROCESS_DOCUMENT(
    MODEL `lohk-da-coaching.document_ai.lohk_invoice_processor`,
    TABLE `lohk-da-coaching.document_ai.invoice_object_table`);

# Preview the table content.
SELECT * FROM `lohk-da-coaching.document_ai.document_processed` LIMIT 10;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,invoice_type,currency,due_date,invoice_date,invoice_id,net_amount,purchase_order,receiver_name,receiver_tax_id,supplier_iban,...,supplier_email,supplier_phone,supplier_website,uri,generation,content_type,size,md5_hash,updated,metadata
0,,,,09-06-2022,en73h2319825,,169769050229.0,Trendy Outfits,,,...,,,,gs://lohk-da-demo/signify/invoicepdf/invoice5.pdf,1714744363693909,application/pdf,274957,86c65fb7650cd1cfaadc980a1e9656ba,2024-05-03 13:52:43.732000+00:00,[]
1,,$,,15/08/2028,2000-15,,,Olivia Wilson,,,...,hello@reallygreatsite.com,,www.reallygreatsite.com,gs://lohk-da-demo/signify/invoicepdf/invoice3.pdf,1714744362371628,application/pdf,285251,0ef5e7bf15063bd6b00ae9c6fa37f4a0,2024-05-03 13:52:42.410000+00:00,[]
2,,$,12/11/2023,12/10/2023,,,,Jane Doe,,,...,johnsmith@example.com,,,gs://lohk-da-demo/signify/invoicepdf/invoice6.pdf,1714744364248134,application/pdf,611885,dd33f66f7be62964ff196f842b84bb4c,2024-05-03 13:52:44.294000+00:00,[]
3,,,,19-06-2022,FKAR211234250348,,,,,,...,customercare@ajio.com,1800-889-9991,www.relianceretail.com,gs://lohk-da-demo/signify/invoicepdf/invoice4.pdf,1714744364135107,application/pdf,554785,6c71702b0ddaf8e0f5254aa2b560c528,2024-05-03 13:52:44.180000+00:00,[]
4,,*,,"25 Jul, 2021",1627147360-22,78.57,,Ayushi Saini,,,...,hello@myglamm.com,,myglamm.com,gs://lohk-da-demo/signify/invoicepdf/invoice7.pdf,1714744365215769,application/pdf,993543,12a774e6112a4311ec9ea270f15d6eba,2024-05-03 13:52:45.253000+00:00,[]
5,,$,,12/06/2023,1000-15088,895.0,,,,0123 0000 1111 2323,...,cleaningservices@email.com,(000) 123 456 7890,TemplateLab.com,gs://lohk-da-demo/signify/invoicepdf/invoice2.pdf,1714744362874062,application/pdf,619796,658f619232f9e78399b9adb4b583fb46,2024-05-03 13:52:42.912000+00:00,[]
6,,$,12/01/18,11/11/18,#INV02081,2590.0,,Allen Smith,,,...,(bob@stanfordplumbing.com,990-120-4560,www.plumbingstanford.com,gs://lohk-da-demo/signify/invoicepdf/invoice1.pdf,1714744362765785,application/pdf,421093,265c3602eb283b855e45665217e70758,2024-05-03 13:52:42.804000+00:00,[]


## Let's go beyond and use GenAI to unlock insights

We can use GenAI to unlock more insights, detect anomalies and curate actions to take.

First, we combine all the collected information into a single JSON column.

In [None]:
%%bigquery
# `%%bigquery` must stay on the first line of the cell; comments go below it
# as SQL comments.
#
# Add a `json_info` column that stores all invoice information of a row as a
# JSON string. IF NOT EXISTS keeps the cell re-runnable: without it a second
# run fails because the column is already there.
ALTER TABLE `lohk-da-coaching.document_ai.document_processed`
ADD COLUMN IF NOT EXISTS json_info STRING;

# Serialize every column of the row into one JSON string.
# `json_info` itself is excluded so that re-running the cell does not embed the
# previous JSON string inside the new one.
# `WHERE TRUE` is required because BigQuery UPDATE needs a WHERE clause.
UPDATE `lohk-da-coaching.document_ai.document_processed` t
SET json_info = TO_JSON_STRING((SELECT AS STRUCT t.* EXCEPT (json_info)))
WHERE TRUE;

# Preview the table content.
SELECT uri, json_info FROM `lohk-da-coaching.document_ai.document_processed` LIMIT 10;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uri,json_info
0,gs://lohk-da-demo/signify/invoicepdf/invoice2.pdf,"{""invoice_type"":"""",""currency"":""$"",""due_date"":n..."
1,gs://lohk-da-demo/signify/invoicepdf/invoice3.pdf,"{""invoice_type"":"""",""currency"":""$"",""due_date"":n..."
2,gs://lohk-da-demo/signify/invoicepdf/invoice6.pdf,"{""invoice_type"":"""",""currency"":""$"",""due_date"":""..."
3,gs://lohk-da-demo/signify/invoicepdf/invoice1.pdf,"{""invoice_type"":"""",""currency"":""$"",""due_date"":""..."
4,gs://lohk-da-demo/signify/invoicepdf/invoice4.pdf,"{""invoice_type"":"""",""currency"":null,""due_date"":..."
5,gs://lohk-da-demo/signify/invoicepdf/invoice5.pdf,"{""invoice_type"":"""",""currency"":null,""due_date"":..."
6,gs://lohk-da-demo/signify/invoicepdf/invoice7.pdf,"{""invoice_type"":"""",""currency"":""*"",""due_date"":n..."


In [None]:
%%bigquery
# `%%bigquery` must stay on the first line of the cell; comments go below it
# as SQL comments.
#
# Ask the LLM for a description, missing information, anomalies and suggested
# actions for a small sample of invoices. The prompt is the instruction text
# followed by the row's `json_info` string.
SELECT uri, ml_generate_text_llm_result AS generated_text
FROM
  ML.GENERATE_TEXT(
    MODEL `bqml_llm.llm_model`,
    (
      SELECT uri, json_info,
        CONCAT('From the provided json, please explain what this invoice is about, what is missing as information, any anomaly, and suggest actions to take: ', json_info) AS prompt
      FROM `lohk-da-coaching.document_ai.document_processed`
      # ORDER BY makes the 3-row sample the same invoices on every run;
      # LIMIT alone returns an arbitrary subset.
      ORDER BY uri
      LIMIT 3
    ),
    STRUCT(
       1 AS temperature,
       1000 AS max_output_tokens,
       1 AS top_p,
       30 AS top_k,
       TRUE AS flatten_json_output));



Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uri,generated_text
0,gs://lohk-da-demo/signify/invoicepdf/invoice2.pdf,**Invoice Overview:**\n\nThis invoice appears...
1,gs://lohk-da-demo/signify/invoicepdf/invoice3.pdf,### Invoice Information\n- Invoice Type: Miss...
2,gs://lohk-da-demo/signify/invoicepdf/invoice6.pdf,**Invoice Summary:**\n\nThis invoice appears ...


## Now let's unlock semantic search on our documents in just 2 commands

AI semantic search is essential for getting insights from documents because it goes beyond simple keyword matching and understands the meaning and context behind words and phrases. This enables it to:

1. Uncover hidden relationships: Semantic search can identify connections between concepts that might not be explicitly stated, revealing deeper insights and patterns within the data.

2. Improve relevance: By understanding the intent behind a query, semantic search delivers more relevant results, even if the exact keywords are not present in the document.

3. Handle ambiguity: Semantic search can disambiguate words with multiple meanings, ensuring that the results align with the intended context.

4. Support complex queries: Semantic search can process complex questions and provide accurate answers, even when the information is scattered across multiple documents.

5. Enhance personalization: By analyzing user behavior and preferences, semantic search can tailor results to individual needs, making the search experience more efficient and effective.

6. Enable knowledge discovery: Semantic search can help users discover new information and connections they might not have been aware of, leading to new insights and ideas.

In [None]:
%%bigquery
# `%%bigquery` must stay on the first line of the cell; comments go below it
# as SQL comments.
#
# Generate one embedding per invoice from its `json_info` string and store it
# next to the uri. ML.GENERATE_EMBEDDING is used here because the search cell
# embeds the query text with the same function and model, so both vectors come
# from the same code path. Its output column is aliased to `text_embedding`
# to keep the table schema (uri, json_info, text_embedding) unchanged.
CREATE OR REPLACE TABLE `lohk-da-coaching.document_ai.document_processed_embedded`
AS SELECT uri, json_info, ml_generate_embedding_result AS text_embedding
FROM
  ML.GENERATE_EMBEDDING(
    MODEL `embeddings.llm_embedding_model`,
    # The function embeds the column named `content`.
    # NOTE(review): LIMIT 10 caps how many invoices get indexed; raise or drop
    # it when the table holds more documents.
    (SELECT uri, json_info, json_info AS content
      FROM `lohk-da-coaching.document_ai.document_processed` LIMIT 10
    ),
    STRUCT(TRUE AS flatten_json_output)
  );

Query is running:   0%|          |

In [None]:
%%bigquery
# `%%bigquery` must stay on the first line of the cell; comments go below it
# as SQL comments.
#
# Search invoices using natural language.
# Example 1: Which invoice has cost associated with logo design?
# Example 2: invoice that could be related to cleaning services
#
# The question is embedded with the same model as the stored invoices, then the
# 5 nearest invoices are returned, closest first.
SELECT
  base.uri, distance
FROM
  VECTOR_SEARCH( TABLE `lohk-da-coaching.document_ai.document_processed_embedded`, 'text_embedding', (
    SELECT
      ml_generate_embedding_result,
      content AS query
    FROM
      ML.GENERATE_EMBEDDING( MODEL `embeddings.llm_embedding_model`,
        (
        SELECT 'Which invoice has cost associated with logo design?' AS content
        ))
    ),
    # The query embedding column is named differently from the base table's
    # `text_embedding` column, so name it explicitly instead of relying on the
    # default column resolution.
    query_column_to_search => 'ml_generate_embedding_result',
    top_k => 5)
ORDER BY distance;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uri,distance
0,gs://lohk-da-demo/signify/invoicepdf/invoice3.pdf,0.713207
1,gs://lohk-da-demo/signify/invoicepdf/invoice6.pdf,0.755833
2,gs://lohk-da-demo/signify/invoicepdf/invoice2.pdf,0.782949
3,gs://lohk-da-demo/signify/invoicepdf/invoice4.pdf,0.786253
4,gs://lohk-da-demo/signify/invoicepdf/invoice5.pdf,0.805808
