### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

### Use conda environment `dataexpl_p37_cpu_v3`

### Ensure correct package versions

In [None]:
!pip show oci

In [None]:
!pip install oci -U

In [None]:
!pip show oci-cli

In [None]:
!pip install oci-cli -U

In [None]:
!pip instsall oracle-ads -U

In [None]:
import os
import ads

In [None]:
ads.hello()

### Set parameters for API call

In [None]:
# Parameters consumed by the processor-job request and the download step.
# The assignments that have no right-hand side are placeholders: give each one
# a value before running, otherwise this cell fails with a syntax error.
input_object_storage_namespace_name=
output_object_storage_namespace_name=
input_bucket_name=
# Results go to the same bucket the input document is read from.
output_bucket_name=input_bucket_name
# Prefix given to the job's output location; the download step reads
# "<prefix>/<job id>" back from the output bucket.
output_object_storage_prefix="Document_Analysis"
object_name=
# Read from the NB_SESSION_COMPARTMENT_OCID environment variable; requires
# `os` to have been imported by an earlier cell.
compartment_id=os.environ["NB_SESSION_COMPARTMENT_OCID"]
display_name="document_analysis_processor_job"
language="en"
# Optional request headers; None leaves them unset in the call.
opc_retry_token=None
opc_request_id=None
document_type="BANK_STATEMENT"

### API call for generating document analysis

In [None]:
import oci

# Authenticate with a resource principal, so no API-key configuration file is
# read (hence config={} below). `ads` must already be imported by an earlier
# cell. For the file-based alternative see
# https://docs.cloud.oracle.com/en-us/iaas/Content/API/Concepts/sdkconfig.htm#SDK_and_CLI_Configuration_File
ads.set_auth(auth='resource_principal')
rps = oci.auth.signers.get_resource_principals_signer()

# Initialize service client with resource principal authentication
ai_document_client = oci.ai_document.AIServiceDocumentClient(config={}, signer=rps)


# Submit one processor job for a single input object. The job reads
# `object_name` from the input bucket and writes under
# `output_object_storage_prefix` in the output bucket. Some parameters are
# optional; see the API documentation for details.
# NOTE(review): every entry in `features` is built with
# DocumentTextExtractionFeature and only `feature_type` differs. The SDK may
# provide a dedicated model class per feature type - confirm against the
# oci.ai_document.models reference.
create_processor_job_response = ai_document_client.create_processor_job(
    create_processor_job_details=oci.ai_document.models.CreateProcessorJobDetails(
        input_location=oci.ai_document.models.ObjectStorageLocations(
            source_type="OBJECT_STORAGE_LOCATIONS",
            object_locations=[
                oci.ai_document.models.ObjectLocation(
                    namespace_name=input_object_storage_namespace_name,
                    bucket_name=input_bucket_name,
                    object_name=object_name)]),
        output_location=oci.ai_document.models.OutputLocation(
            namespace_name=output_object_storage_namespace_name,
            bucket_name=output_bucket_name,
            prefix=output_object_storage_prefix),
        compartment_id=compartment_id,
        processor_config=oci.ai_document.models.GeneralProcessorConfig(
            processor_type="GENERAL",
            features=[oci.ai_document.models.DocumentTextExtractionFeature(
                    feature_type="LANGUAGE_CLASSIFICATION",generate_searchable_pdf=True),
                     oci.ai_document.models.DocumentTextExtractionFeature(
                    feature_type="TABLE_EXTRACTION",generate_searchable_pdf=True),
                     oci.ai_document.models.DocumentTextExtractionFeature(
                    feature_type="TEXT_EXTRACTION",generate_searchable_pdf=True),
                     oci.ai_document.models.DocumentTextExtractionFeature(
                    feature_type="KEY_VALUE_EXTRACTION",generate_searchable_pdf=True),
                     oci.ai_document.models.DocumentTextExtractionFeature(
                    feature_type="DOCUMENT_CLASSIFICATION",generate_searchable_pdf=True)],
            document_type=document_type,
            is_zip_output_enabled=True,
            language=language),
        display_name=display_name),
    opc_retry_token=opc_retry_token,
    opc_request_id=opc_request_id)

# Print the response payload; the next cell reads the job id from `.data.id`.
print(create_processor_job_response.data)

In [None]:
create_processor_job_id = create_processor_job_response.data.id

### Start here if document analysis has already completed and variables are no longer stored in memory

In [None]:
create_processor_job_id = # use create_processor_job_response.data.id from output of API call

In [None]:
download_dir_list=!pwd
download_dir=download_dir_list[0]

In [None]:
download_dir

### OCI-CLI call for downloading the document analysis

In [None]:
!oci --auth resource_principal os object bulk-download -bn {output_bucket_name} --overwrite --namespace {output_object_storage_namespace_name} --prefix {output_object_storage_prefix}/{create_processor_job_id} --download-dir {download_dir}


### Locate the downloaded JSON analysis file

In [None]:
import glob

# Directory the bulk download created for this job's results.
results_dir = (
    f"{download_dir}/{output_object_storage_prefix}/{create_processor_job_id}/"
    f"{output_object_storage_namespace_name}_{output_bucket_name}/results"
)

# Use glob rather than `!ls`: when nothing matches, `ls` prints an error line
# that would silently be stored as the path. The directory part is escaped so
# only the trailing `*.json` acts as a pattern.
json_path_list = sorted(glob.glob(f"{glob.escape(results_dir)}/*.json"))
if not json_path_list:
    raise FileNotFoundError(f"No JSON analysis file found in {results_dir}")
json_path = json_path_list[0]

In [None]:
json_path

### Define function
#### Add empty elements to preserve relational structure according to analysis

In [None]:
# add empty elements where there are gaps before, between, and after columnIndex values in list_column_index
def add_empty_elements(list_column_index, list_text_sorted, list_column_index_header=None):
    """Pad a row's cell texts so that each text sits at its column index.

    Args:
        list_column_index: Column indexes of the row's cells, sorted ascending.
        list_text_sorted: Cell texts in the same order as ``list_column_index``.
        list_column_index_header: Optional column indexes of the header row.
            When given and non-empty, the row is also padded at the end up to
            the largest header column index.

    Returns:
        A new list with ``None`` inserted before the first cell, between
        cells, and (when a header is supplied) after the last cell, wherever
        a column index has no cell. An empty row yields an empty list.

    Raises:
        ValueError: If the two input lists differ in length.
    """
    if len(list_column_index) != len(list_text_sorted):
        raise ValueError(
            "list_column_index and list_text_sorted must have the same length"
        )

    empty_value = None
    list_text_sorted_with_empties = []

    # Index of the next column that still has to be filled.
    next_column = 0
    for column_index, text in zip(list_column_index, list_text_sorted):
        # Fill the gap up to this cell; a repeated index gives a non-positive
        # gap, which multiplies to an empty list and adds nothing.
        list_text_sorted_with_empties.extend([empty_value] * (column_index - next_column))
        list_text_sorted_with_empties.append(text)
        next_column = column_index + 1

    # Conform the number of body cells to the number of header cells.
    if list_column_index and list_column_index_header:
        trailing = max(list_column_index_header) + 1 - next_column
        list_text_sorted_with_empties.extend([empty_value] * trailing)

    return list_text_sorted_with_empties

### Parse JSON analysis to CSV

In [None]:
# set parameters
output_file_prefix_list=!pwd
output_file_prefix=output_file_prefix_list[0]
output_file_dir_name="CSV_output"
output_file_full_path=f"{output_file_prefix}/{output_file_dir_name}"

In [None]:
!mkdir -p {output_file_full_path}

In [None]:
import csv
import json
from operator import itemgetter

# Load the analysis located earlier; the loops below read everything from it.
with open(json_path, encoding="utf-8") as json_file:
    data = json.load(json_file)


def _sorted_columns(cells):
    """Return (column indexes, texts) of a row's cells, ordered by columnIndex."""
    ordered = sorted(
        ((cell["columnIndex"], cell["text"]) for cell in cells),
        key=itemgetter(0),  # sort on the index only, never compare cell texts
    )
    return [index for index, _ in ordered], [text for _, text in ordered]


# One CSV per table, named p_<page number>_t_<table number>.csv. enumerate()
# numbers pages and tables by position; list.index() would give equal-valued
# pages or tables the same number and merge them into one file.
for page_number, page in enumerate(data["pages"]):
    # A page without tables may carry no "tables" key or a null value.
    for table_number, table in enumerate(page.get("tables") or []):
        rows = []

        # Reset per table so a table without header rows never reuses the
        # header of the previous table.
        header_column_index = None
        for header_row in table.get("headerRows") or []:
            header_column_index, header_text = _sorted_columns(header_row["cells"])
            rows.append(add_empty_elements(header_column_index, header_text))

        for body_row in table.get("bodyRows") or []:
            body_column_index, body_text = _sorted_columns(body_row["cells"])
            # Pad body rows out to the width of the (last) header row.
            rows.append(
                add_empty_elements(body_column_index, body_text, header_column_index)
            )

        if not rows:
            continue

        output_file = f"{output_file_full_path}/p_{page_number}_t_{table_number}.csv"

        # "w" so that re-running the cell replaces the file instead of
        # appending duplicate rows; newline="" is what the csv module requires.
        with open(output_file, "w", encoding="utf-8", newline="") as f:
            csv.writer(f).writerows(rows)

### Remove CSV files

In [None]:
!rm -rf {output_file_full_path}/*.csv