<a href="https://colab.research.google.com/github/rendalamili/ml-for-table-extraction/blob/main/Final_Metrics_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas numpy



In [211]:
import os
from google.colab import files
import zipfile
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Upload zip file containing both ground truth and output csvs - metrics.zip
uploaded = files.upload()

Saving metrics.zip to metrics.zip


In [4]:
# Extract zip
with zipfile.ZipFile('metrics.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

In [9]:
# Ref: https://stackoverflow.com/questions/4623446/how-do-you-sort-files-numerically/4623518#4623518
import re

def tryint(s):
    try:
        return int(s)
    except:
        return s

def alphanum_key(s):
    """ Turn a string into a list of string and number chunks.
        "z23a" -> ["z", 23, "a"]
    """
    return [ tryint(c) for c in re.split('([0-9]+)', s) ]

def sort_nicely(l):
    """ Sort the given list in the way that humans expect.
    """
    l.sort(key=alphanum_key)

In [131]:
def match_files(n):
    output_dir = '/content/output_csvs/'
    ground_truth_dir = '/content/ground_truth_csvs/'
    # Get sorted list of files
    output_files = os.listdir(output_dir)
    sort_nicely(output_files)

    ground_truth_files = os.listdir(ground_truth_dir)
    sort_nicely(ground_truth_files)

    # match files based on first 10 characters
    matched_files = []
    for output_file in output_files:
      output_name = output_file[:10]
      ground_truth_file = None
      for ground_truth_file in ground_truth_files:
        if ground_truth_file[:10] == output_name:
          matched_files.append((output_file, ground_truth_file))
    return(matched_files[n-1])

In [427]:
def format_output_df(output_df):
    ## format output_csv df for accuracy comparison
    # header should be first row, not row[0]
    headers = pd.DataFrame([output_df.columns])
    output_df.columns = range(len(output_df.columns))
    output_df = pd.concat([headers, output_df])
    output_df = output_df.astype('object')
    output_df = output_df.reset_index(drop=True)
    return output_df

In [465]:
def load_output_dfs(file):
    output_dir = '/content/output_csvs/'
    output_csv = pd.read_csv(output_dir + file)
    output_df = pd.DataFrame(output_csv)
    output_df = format_output_df(output_df)
    return output_df
df = load_output_dfs(match_files(1)[0])
df

Unnamed: 0,0,1,2,3,4,5,6
0,Form,Average,retail price,Preparation yield factor,Size ec,of a cup equivalent,Average price per cup equivalent
1,Fresh1,$1.85,per pound,90.00%,24.30%,Pounds,$0.50
2,Applesauce?,$1.17,per pound,100.00%,54.00%,Pounds,$0.63
3,Juice,,,,,,
4,Ready to drink,$0.87,per pint,100.00%,800.00%,Fluid ounces,$0.43
5,Frozen 4,$0.61,per pint,100.00%,800.00%,Fluid ounces,$0.30


In [418]:
def format_ground_truth_df(ground_truth_df):
    ## format ground_truth_csv df for accuracy comparison
    # first row should be header, not row[0]
    # ground_truth_df.columns = ground_truth_df.iloc[0]
    # ground_truth_df = ground_truth_df[1:]
    # ground_truth_df

    # Continuously remove rows where all values from column 2 onward are NaN
    while not ground_truth_df.empty:
        if ground_truth_df.iloc[-1, 1:].isna().all():
            # Drop the last row
            ground_truth_df = ground_truth_df.iloc[:-1]
        else:
            break  # Stop when a valid row is found
    ground_truth_df.columns = range(len(ground_truth_df.columns))
    return ground_truth_df


In [419]:
def load_ground_truth_dfs(file):
    # Read ground_truth_csv 1 as df
    ground_truth_dir = '/content/ground_truth_csvs/'
    ground_truth_csv =pd.read_csv(ground_truth_dir + file, encoding='unicode_escape')
    ground_truth_df = pd.DataFrame(ground_truth_csv)
    ground_truth_df = format_ground_truth_df(ground_truth_df)
    return ground_truth_df
df2 = load_ground_truth_dfs(match_files(1)[1])
df2


Unnamed: 0,0,1,2,3,4,5,6
0,Form,Average retail price,,Preparation yield factor,Size of a cup equivalent,,Average price per cup equivalent
1,Fresh1,$1.85,per pound,0.9,0.243,Pounds,$0.50
2,Applesauce2,$1.17,per pound,1,0.540,Pounds,$0.63
3,Juice,,,,,,
4,Ready to drink3,$0.87,per pint,1,8,Fluid ounces,$0.43
5,Frozen4,$0.61,per pint,1,8,Fluid ounces,$0.30


In [429]:
def structure_accuracy(output_df, ground_truth_df):
    # Structure metrics - no. of rows and columns match
    structure_metrics = {
        "row_count_match": output_df.shape[0] == ground_truth_df.shape[0],
        "column_count_match": output_df.shape[1] == ground_truth_df.shape[1]}
    structure_accuracy = sum(structure_metrics.values()) / len(structure_metrics)
    return structure_accuracy

In [438]:
def row_accuracy(output_df, ground_truth_df):
    ot_row_number = len(output_df)
    gt_row_number = len(ground_truth_df)
    min_row = min(ot_row_number, gt_row_number)
    matching_rows = sum(ground_truth_df.iloc[i].equals(output_df.iloc[i]) for i in range(min_row))
    row_accuracy = matching_rows / len(ground_truth_df)
    return row_accuracy

In [447]:
def cell_accuracy(output_df, ground_truth_df):
    y_true = ground_truth_df.values.flatten()
    y_pred = output_df.values.flatten()
    cell_found = 0
    total_cells = len(y_true)
    if output_df.shape[0] == ground_truth_df.shape[0]:  #number of rows equal
        for n in range(len(y_pred)):
            if y_pred[n] in y_true:
                cell_found += 1
    else:
        cell_found = 0
    cell_accuracy = cell_found / total_cells
    return cell_accuracy

In [221]:
def detailed_results(n):
    output_file = match_files(n)[0]
    ground_truth_file = match_files(n)[1]
    output_df = load_output_dfs(output_file)
    ground_truth_df = load_ground_truth_dfs(ground_truth_file)
    print("OUTPUT DF", n, output_df)
    print("GROUND TRUTH DF", n, ground_truth_df)
    print(f"STRUCTURE ACCURACY FOR FILE {n}: {structure_accuracy(output_df, ground_truth_df):.2%}")

In [224]:
def all_detailed_results():
    for n in range(1,46):
        detailed_results(n)
        print("\n" + "-"*50 + "\n")

In [454]:
def results_df():
    results = []
    for n in range(1,46):
        output_file = match_files(n)[0]
        ground_truth_file = match_files(n)[1]
        output_df = load_output_dfs(output_file)
        ground_truth_df = load_ground_truth_dfs(ground_truth_file)
        results.append({
            'Output File': output_file,
            'Ground Truth File': ground_truth_file,
            'Structure Accuracy': structure_accuracy(output_df, ground_truth_df),
            'Row Accuracy': row_accuracy(output_df, ground_truth_df),
            'Cell Accuracy': cell_accuracy(output_df, ground_truth_df)
        })
    pd.options.display.float_format = '{:.2%}'.format
    return pd.DataFrame(results)

results_df()

Unnamed: 0,Output File,Ground Truth File,Structure Accuracy,Row Accuracy,Cell Accuracy
0,table_1_Apples-2022_page_1_table_1.csv,table_1_Apples-2022.csv,100.00%,16.67%,40.48%
1,table_2_Apricots-2022_page_1_table_1.csv,table_2_Apricots-2022.csv,100.00%,0.00%,35.71%
2,table_3_Artichoke-2022_page_1_table_1.csv,table_3_Artichoke-2022.csv,100.00%,0.00%,47.62%
3,table_4_Beets-2022_page_1_table_1.csv,table_4_Beets-2022.csv,100.00%,0.00%,42.86%
4,table_5_Black-beans-2022_page_1_table_1.csv,table_5_Black-beans-2022.csv,100.00%,0.00%,38.10%
5,table_6_Blackberries-2022_page_1_table_1.csv,table_6_Blackberries-2022.csv,100.00%,0.00%,42.86%
6,table_7_Blueberries-2022_page_1_table_1.csv,table_7_Blueberries-2022.csv,100.00%,0.00%,42.86%
7,table_8_Cabbage-2022_page_1_table_1.csv,table_8_Cabbage-2022.csv,100.00%,0.00%,42.86%
8,table_9_Cantaloupe-2022_page_1_table_1.csv,table_9_Cantaloupe-2022.csv,0.00%,0.00%,0.00%
9,table_10_Carrots-2022_page_1_table_1.csv,table_10_Carrots-2022.csv,100.00%,14.29%,38.78%


In [463]:
result_df = results_df()
result_df.to_csv('results.csv', index=False)

In [468]:
y_pred = df.values.flatten()

In [469]:
y_true = df2.values.flatten()

In [470]:
print(y_pred)
print(y_true)

['Form' 'Average' 'retail price' 'Preparation yield factor' 'Size ec'
 'of a cup equivalent' 'Average price per cup equivalent' 'Fresh1' '$1.85'
 'per pound' 0.9 0.243 'Pounds' '$0.50' 'Applesauce?' '$1.17' 'per pound'
 1.0 0.54 'Pounds' '$0.63' 'Juice' nan nan nan nan nan nan
 'Ready to drink' '$0.87' 'per pint' 1.0 8.0 'Fluid ounces' '$0.43'
 'Frozen 4' '$0.61' 'per pint' 1.0 8.0 'Fluid ounces' '$0.30']
['Form' 'Average retail price ' nan 'Preparation yield factor'
 'Size of a cup equivalent ' nan 'Average price per cup equivalent'
 'Fresh1' '$1.85' ' per pound' '0.9' '0.243' 'Pounds' '$0.50'
 'Applesauce2' '$1.17' ' per pound' '1' '0.540' 'Pounds' '$0.63' 'Juice'
 nan nan nan nan nan nan 'Ready to drink3' '$0.87' ' per pint' '1' '8'
 'Fluid ounces' '$0.43' 'Frozen4' '$0.61' ' per pint' '1' '8'
 'Fluid ounces' '$0.30']


In [457]:
def metrics(output_df, ground_truth_df):
    y_true = ground_truth_df.values.flatten()
    y_pred = output_df.values.flatten()

    cell_precision = precision_score(y_true, y_pred)
    cell_recall = recall_score(y_true, y_pred)
    cell_f1 = f1_score(y_true, y_pred)
    return {
        "Cell Precision": cell_precision,
        "Cell Recall": cell_recall,
        "Cell F1 Score": cell_f1}


In [458]:
def test(n):
    output_file = match_files(n)[0]
    ground_truth_file = match_files(n)[1]
    output_df = load_output_dfs(output_file)
    ground_truth_df = load_ground_truth_dfs(ground_truth_file)
    results = metrics(output_df, ground_truth_df)
    print(results)

In [459]:
def test_all():
    for n in range(1,46):
        test(n)
        print("\n" + "-"*50 + "\n")

In [460]:
test_all()

TypeError: '<' not supported between instances of 'float' and 'str'