In [2]:
import pandas as pd
import sqlite3
import json

# Connection to the output database that the rest of the notebook writes to.
write_conn = sqlite3.connect("../data/automated_metrics.db")
cur = write_conn.cursor()

# SQLite does not enforce foreign keys unless this pragma is set on the
# connection, so turn enforcement on before any table is touched.
cur.execute("PRAGMA foreign_keys = 1");

In [None]:
# Create every table of the metrics schema (each statement is a no-op when the
# table already exists). Statements run in the order listed below.
for ddl_statement in (
    # context: one row per context; rows are removed together with their
    # transcript (ON DELETE CASCADE). SQLite allows declaring this foreign key
    # before the referenced `transcript` table has been created.
    "CREATE TABLE IF NOT EXISTS context(context_id TEXT PRIMARY KEY, "
    "context TEXT, "
    "justice TEXT, "
    "turn_text TEXT, "
    "transcript_id TEXT, "
    "FOREIGN KEY (transcript_id) REFERENCES transcript(transcript_id) ON DELETE CASCADE);",

    # transcript: parent table referenced by context.transcript_id.
    "CREATE TABLE IF NOT EXISTS transcript(transcript_id TEXT PRIMARY KEY, "
    "case_facts TEXT, "
    "legal_question TEXT, "
    "conclusion TEXT);",

    # remark: references the context it belongs to (no cascade on delete).
    "CREATE TABLE IF NOT EXISTS remark ( "
    "remark_id TEXT PRIMARY KEY, "
    "model TEXT, "
    "prompting_strategy TEXT,"
    "justice TEXT, "
    "remark_text TEXT, "
    "log_id TEXT, "
    "context_id TEXT, "
    "FOREIGN KEY (context_id) REFERENCES context(context_id));",

    # distributional_metrics: one classification attached to a single remark.
    "CREATE TABLE IF NOT EXISTS distributional_metrics ( "
    "distributional_metric_id TEXT PRIMARY KEY, "
    "classification_model TEXT, "
    "metric_name TEXT, "
    "classification TEXT, "
    "remark_id TEXT, "
    "log_id TEXT, "
    "FOREIGN KEY (remark_id) REFERENCES remark(remark_id));",

    # comparative_metrics: one classification attached to a pair of remarks.
    "CREATE TABLE IF NOT EXISTS comparative_metrics ( "
    "comparative_metric_id TEXT PRIMARY KEY, "
    "classification_model TEXT, "
    "metric_name TEXT, "
    "classification TEXT, "
    "remark1_id TEXT REFERENCES remark(remark_id), "
    "remark2_id TEXT REFERENCES remark(remark_id), "
    "log_id TEXT);",
):
    cur.execute(ddl_statement)

In [None]:
def restructure_context(context_as_string):
    """Turn a JSON-encoded list of turns into a stringified list of messages.

    Args:
        context_as_string: JSON text encoding a list of turn objects. Each
            turn must provide a "text" entry and a "speaker" object that has
            a "role" entry; any other fields are ignored.

    Returns:
        str: ``str()`` of a list holding one ``{"content": ..., "role": ...}``
        dict per turn, in the original order. Note that this is a Python
        repr (single-quoted), not JSON.

    Raises:
        json.JSONDecodeError: if ``context_as_string`` is not valid JSON.
        KeyError: if a turn lacks "text", "speaker", or the speaker's "role".
    """
    turns = json.loads(context_as_string)
    messages = [
        {"content": turn["text"], "role": turn["speaker"]["role"]}
        for turn in turns
    ]
    return str(messages)

In [3]:
def _append_new_rows(frame, table_name, key_column, conn):
    """Append to ``table_name`` only the rows of ``frame`` not stored there yet.

    The output database outlives the kernel, so a plain append on a second run
    would insert the same keys again (an IntegrityError when the table has a
    PRIMARY KEY, silent duplicates otherwise). Filtering on the key column
    first makes the write safe to repeat.

    Args:
        frame: DataFrame to store; must contain ``key_column``.
        table_name: Destination table. Interpolated into SQL, so pass only
            trusted, hard-coded identifiers.
        key_column: Column that identifies a row in both ``frame`` and the
            table. Same trust requirement as ``table_name``.
        conn: Open sqlite3 connection to the destination database.

    Returns:
        Whatever ``DataFrame.to_sql`` returns for the rows actually written.
    """
    table_exists = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?;",
        (table_name,),
    ).fetchone() is not None
    if table_exists:
        stored_keys = pd.read_sql_query(
            f"SELECT {key_column} FROM {table_name};", conn
        )[key_column]
        frame = frame[~frame[key_column].isin(stored_keys)]
    # When the table is missing, to_sql creates it, exactly as a plain append would.
    return frame.to_sql(table_name, conn, if_exists="append", index=False)


# Pull the raw rows whose id starts with a four-digit number greater than 2023.
# Contexts are further restricted to turn_text matching '% % % %', i.e. text
# containing at least three spaces.
read_conn = sqlite3.connect("../data/raw_data/transcripts_raw.db")
transcripts_df = pd.read_sql_query("SELECT * from transcript WHERE CAST(SUBSTR(transcript_id, 1, 4) AS INTEGER) > 2023;", read_conn)
contexts_df = pd.read_sql_query("SELECT * from context WHERE CAST(SUBSTR(context_id, 1, 4) AS INTEGER) > 2023 AND turn_text LIKE '% % % %';", read_conn)

# reformat the context column
contexts_df["context"] = contexts_df["context"].apply(restructure_context)

# Transcripts go first: context.transcript_id references them and foreign-key
# enforcement is switched on for write_conn.
_append_new_rows(transcripts_df, "transcript", "transcript_id", write_conn)
_append_new_rows(contexts_df, "context", "context_id", write_conn)