# Schema Linking Accuracy Metric (SLAM):

### 1. Table-Level Accuracy:
For each question:

At: Number of tables correctly identified by the system.

Mt: Number of tables missed by the system (present in gold label but not identified).

Et: Number of erroneous tables identified by the system (not present in the gold label).


TA: Table Accuracy for the question

ATA: Aggregate Table-Level Accuracy across all questions 

TA = At/(At+Mt+Et)

ATA = sum(TA)/len(TA)


### 2. Column-Level Accuracy (within correctly identified tables):
For each correctly identified table:

Ac: Number of columns correctly identified by the system.

Mc: Number of columns missed by the system (present in gold label but not identified).

Ec: Number of erroneous columns identified by the system (not present in the gold label).


CA: Column Accuracy for the table

ACA: Aggregate Column-Level Accuracy across all correctly identified tables

CA = Ac/(Ac+Mc+Ec)

ACA = sum(CA)/len(CA)



### 3. Overall Schema Linking Accuracy Metric (SLAM):
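SLAM = a * ATA + (1-a) * ACA, where a is a weight between 0 and 1 that sets the relative importance of table-level accuracy (ATA) versus column-level accuracy (ACA).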

In [6]:
import json

def read_json_file(file_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (a list or a dict, depending on
        what the file contains).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8; decode explicitly instead of relying on the
    # platform's locale encoding, which can mangle or reject non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Path of the dev-set JSON file, relative to the notebook's working directory.
json_file_path = 'dev/dev.json'  # Replace with the actual path to your JSON file

# Parse the file once; the next cell iterates over the resulting records.
json_data_list = read_json_file(json_file_path)


In [7]:
# Walk every record of the dev set and print its 'columns' field.
for json_data in json_data_list:
    # Unpack every field of the record; a record missing any of these keys
    # raises KeyError here, even though only `columns` is printed below.
    question_id = json_data['question_id']
    db_id = json_data['db_id']
    question = json_data['question']
    evidence = json_data['evidence']
    sql_query = json_data['SQL']
    difficulty = json_data['difficulty']
    tables = json_data['tables']
    columns = json_data['columns']

    # Only the column list is printed; uncomment a line to inspect that field.
    #print("Question ID:", question_id)
    #print("Database ID:", db_id)
    #print("Question:", question)
    #print("Evidence:", evidence)
    #print("SQL Query:", sql_query)
    #print("Difficulty:", difficulty)
    #print("Tables:", tables)
    print("Columns:", columns)

Columns: ['"county name"', '"enrollment (k-12)"', '"free meal count (k-12)"']
Columns: ['"educational option type"', '"enrollment (ages 5-17)"', '"free meal count (ages 5-17)"']
Columns: ['"charter school (y/n)"', '"district name"', 'cdscode', 'zip']
Columns: ['"frpm count (k-12)"', 'cdscode', 'mailstreet']
Columns: ['"charter funding type"', '"charter school (y/n)"', 'cdscode', 'opendate', 'phone']
Columns: ['avgscrmath', 'cds', 'cdscode', 'school', 'virtual']
Columns: ['cds', 'cdscode', 'magnet', 'numtsttakr', 'school']
Columns: ['cds', 'cdscode', 'numge1500', 'phone']
Columns: ['"frpm count (k-12)"', 'cds', 'cdscode', 'numtsttakr']
Columns: ['"charter funding type"', '"school code"', 'avgscrmath', 'cds', 'cdscode']
Columns: ['"frpm count (ages 5-17)"', 'avgscrread', 'cds', 'cdscode']
Columns: ['"enrollment (ages 5-17)"', '"enrollment (k-12)"', 'cdscode']
Columns: ['"enrollment (ages 5-17)"', '"free meal count (ages 5-17)"', 'cds', 'cdscode', 'numge1500', 'numtsttakr']
Columns: ['cds

# DFs

In [11]:
import pandas as pd
import json

# Load the dev-set JSON (the same file the earlier cells read).
with open('dev/dev.json', 'r') as json_file:
    json_data = json.load(json_file)

# NOTE: no per-field extraction is needed here; the `columns=` argument below
# selects the fields that end up in the DataFrame.
# presumably json_data is a list of per-question records - verify against the file.

# Build the gold-label frame, keeping only question_id, db_id and tables.
df_dev_set = pd.DataFrame(json_data, columns=["question_id", "db_id", "tables"])

# Show the first rows as a quick sanity check.
df_dev_set.head()

Unnamed: 0,question_id,db_id,tables
0,0,california_schools,[frpm]
1,1,california_schools,[frpm]
2,2,california_schools,"[frpm, schools]"
3,3,california_schools,"[frpm, schools]"
4,4,california_schools,"[frpm, schools]"


In [32]:
import pandas as pd
import ast

def import_csv_file(file_path):
    """Read a CSV of gold/generated table lists into a pandas DataFrame.

    The ``tables`` and ``gen_tables`` columns are expected to hold the string
    form of a Python list (e.g. ``"['frpm', 'schools']"``) and are decoded
    into real lists. ``question_id`` and ``total_tables`` are read as ints.

    Args:
        file_path: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed file, with list-valued ``tables`` and
        ``gen_tables`` columns.

    Raises:
        ValueError: If ``question_id`` or ``total_tables`` contains a value
            that cannot be converted to ``int``.
    """
    def parse_list(cell):
        """Decode one cell into a list; malformed or non-list cells give []."""
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            # Best effort: an empty or unparseable cell means "no tables".
            return []
        # literal_eval also accepts scalars and dicts; callers build sets from
        # these cells, so anything that is not a plain sequence becomes [].
        if isinstance(parsed, (list, tuple, set)):
            return list(parsed)
        return []

    # Only the scalar columns get a dtype. Giving a column both a dtype and a
    # converter makes pandas emit a ParserWarning and ignore the dtype.
    dtype_dict = {
        'question_id': int,
        'total_tables': int,
    }

    # The list-valued columns are decoded cell by cell.
    converters = {
        'tables': parse_list,
        'gen_tables': parse_list,
    }

    return pd.read_csv(file_path, dtype=dtype_dict, converters=converters)

In [33]:
# Load the generated-tables CSV and preview its first rows.
df2 = import_csv_file('gen_tables.csv')
df2.head()

Unnamed: 0,question_id,tables,gen_tables,total_tables;
0,1,"['Banana','Apple','Cherry']","['Orange','Grapes','Lemon']",12;
1,2,"['Orange','Mango','Strawberry']","['Apple','Blueberry','Pineapple']",17;
2,3,"['Grapes','Kiwi','Peach']","['Lemon','Banana','Cherry']",25;
3,4,"['Blueberry','Pear','Watermelon']","['Mango','Strawberry','Apple']",8;
4,5,"['Pineapple','Lime','Raspberry']","['Kiwi','Grapes','Peach']",36;


In [13]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

def compare_dataframes(df1, df2):
    """Compare two DataFrames row by row and print running evaluation metrics.

    Rows are paired positionally; iteration stops at the shorter frame. Each
    pair is classified from two checks: whether ``df1['tables']`` and
    ``df2['gen_tables']`` hold the same set of names, and whether the two
    ``total_tables`` values are equal. After every pair the cumulative
    precision, recall and F1 score are printed.

    NOTE(review): the counts depend on exact set equality and on
    ``total_tables`` agreement, not on per-table overlap - confirm this is
    the intended metric.

    Args:
        df1: Frame providing the ``tables`` and ``total_tables`` columns.
        df2: Frame providing the ``gen_tables`` and ``total_tables`` columns.

    Returns:
        None. The metrics are written to standard output.

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    def _ratio(numerator, denominator):
        # A metric with an empty denominator is reported as 0.0 instead of
        # raising ZeroDivisionError (e.g. before any positive has been seen).
        return numerator / denominator if denominator else 0.0

    # Running counts, accumulated across all rows seen so far.
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    true_negatives = 0  # counted for completeness; not used by the metrics

    paired_rows = zip(df1.iterrows(), df2.iterrows())
    for row_number, ((_, row1), (_, row2)) in enumerate(paired_rows, start=1):
        same_tables = set(row1['tables']) == set(row2['gen_tables'])
        same_total = row1['total_tables'] == row2['total_tables']

        if same_tables and same_total:
            true_positives += 1
        elif same_tables:
            false_positives += 1
        elif same_total:
            true_negatives += 1
        else:
            false_negatives += 1

        # Cumulative metrics over rows 1..row_number.
        precision = _ratio(true_positives, true_positives + false_positives)
        recall = _ratio(true_positives, true_positives + false_negatives)
        f1 = _ratio(2 * (precision * recall), precision + recall)

        print(f"Row {row_number}:")
        print(f"Precision: {precision:.2f}")
        print(f"Recall: {recall:.2f}")
        print(f"F1 Score: {f1:.2f}")
        print("")

# Example: Compare two DataFrames and calculate evaluation metrics
# NOTE(review): df_dev_set is built with only question_id, db_id and tables,
# while compare_dataframes reads 'total_tables' from both frames - confirm the
# gold frame carries that column before running this cell.
df1 = df_dev_set  # Replace with your DataFrames
df2 = import_csv_file('gen_tables.csv')

compare_dataframes(df1, df2)

ValueError: invalid literal for int() with base 10: '["Orange"'