In [None]:

import csv
import json
import os
import sys
from pathlib import Path

import django
import pandas as pd
from django.conf import settings
from django.db import connection, transaction

from hawc.apps.vocab.models import GuidelineProfile

## Export spreadsheet with corresponding HAWC vocab ids

In [None]:
# Reference SQL kept as a bare string literal; nothing in this cell executes it.
# It selects every vocab_term row with type = 4 and namespace = 2 together with
# the names of its parent (endpoint_type) and grandparent (endpoint_category),
# resolved through two LEFT JOINs on parent_id.
# NOTE(review): presumably this is the query used to produce
# hawc_toxref_export.csv, since the selected columns match those read from that
# file in the next cell — confirm.
"""
SELECT
    endpoint_target.id,
    endpoint_category.name AS endpoint_category,
	endpoint_type.name AS endpoint_type,
    endpoint_target.name AS endpoint_target
FROM
    public.vocab_term AS endpoint_target
LEFT JOIN
    public.vocab_term AS endpoint_type ON endpoint_target.parent_id = endpoint_type.id
LEFT JOIN
    public.vocab_term AS endpoint_category ON endpoint_type.parent_id = endpoint_category.id
WHERE
    endpoint_target.type = 4 AND endpoint_target.namespace=2;
"""

In [4]:
# Combine the guideline profile spreadsheet with the HAWC ToxRef vocab export,
# replacing each guideline row's endpoint_id with the id of the matching HAWC
# vocab term, then write the result to guideline_updated.csv.
#
# The input directory defaults to the original local download folder and can
# be overridden with the HAWC_DATA_DIR environment variable.
data_dir = Path(os.environ.get("HAWC_DATA_DIR", "C:/Users/63080/Downloads/hawc"))
term_columns = ["endpoint_category", "endpoint_type", "endpoint_target"]

hawc_df = pd.read_csv(data_dir / "hawc_toxref_export.csv")
guideline_df = pd.read_csv(data_dir / "guideline_profile.csv")

# Strip stray whitespace from the join keys so equal terms compare equal.
for frame in (hawc_df, guideline_df):
    for column in term_columns:
        frame[column] = frame[column].str.strip()

# Merge on the vocab term columns. validate="many_to_one" raises a MergeError
# if the HAWC export holds the same term triple twice, which would otherwise
# silently duplicate guideline rows.
merged = guideline_df.merge(
    hawc_df[["id", *term_columns]],
    on=term_columns,
    how="left",
    validate="many_to_one",
)

# A left merge leaves NaN ids for unmatched terms. That turns the id column
# into floats ("123.0" / blank in the CSV), which the import step cannot parse.
# Fail here, with the offending terms listed, rather than later.
unmatched = merged.loc[merged["id"].isna(), term_columns].drop_duplicates()
if not unmatched.empty:
    raise ValueError(
        f"{len(unmatched)} guideline term(s) have no matching HAWC vocab term:\n"
        f"{unmatched.to_string(index=False)}"
    )

# Replace endpoint_id with the HAWC term id, drop the helper/term columns that
# are no longer needed, and expose guideline_profile_id as the primary key.
df = (
    merged.assign(endpoint_id=merged["id"].astype("int64"))
    .drop(columns=["id", *term_columns])
    .rename(columns={"guideline_profile_id": "id"})
)

# Clean up free-text columns.
df["obs_status"] = df["obs_status"].str.strip()
df["description"] = df["description"].str.strip()

# Save the updated DataFrame to a CSV file (read back by the import step).
df.to_csv("guideline_updated.csv", index=False)

ModuleNotFoundError: No module named 'pandas'

## Read the CSV file and create guideline objects

In [None]:
## Create new GuidelineProfile objects from guideline_updated.csv.
def _parse_endpoint_id(raw: str, row_number: int) -> int:
    """Convert a CSV ``endpoint_id`` cell to an int.

    Accepts plain integers as well as integral floats such as ``"123.0"``
    (the form pandas writes when a column was stored as float).

    Raises:
        ValueError: if the cell is blank or not an integral number; the
            message includes ``row_number`` to help locate the bad row.
    """
    text = raw.strip()
    try:
        return int(text)
    except ValueError:
        pass
    try:
        number = float(text)
    except ValueError:
        number = None
    if number is None or not number.is_integer():
        raise ValueError(f"data row {row_number}: invalid endpoint_id {raw!r}")
    return int(number)


# Read and validate the whole file BEFORE touching the database, so a bad CSV
# cannot leave the table empty. newline="" is what the csv module expects, and
# UTF-8 matches what pandas' to_csv writes by default.
with open("guideline_updated.csv", newline="", encoding="utf-8") as csv_file:
    rows = list(csv.DictReader(csv_file))

for row_number, row in enumerate(rows, start=1):
    row["endpoint_id"] = _parse_endpoint_id(row["endpoint_id"], row_number)

# Replace the table contents atomically: if any create() fails, the delete is
# rolled back and the previous data survives.
with transaction.atomic():
    GuidelineProfile.objects.all().delete()
    for row in rows:
        # Every CSV column is passed through as a model field value.
        GuidelineProfile.objects.create(**row)

print(f"Created {len(rows)} GuidelineProfile objects")

In [None]:
# Generate the data fixture in JSONL instead of CSV.
# `file` is the destination path of the fixture, built by joining
# settings.PROJECT_PATH with the vocab app's fixtures directory.
file = settings.PROJECT_PATH / "apps/vocab/fixtures/guideline.jsonl"

def _get_headers(cursor) -> list[str]:
    cursor.execute("Select * FROM vocab_guidelineprofile LIMIT 0")
    return [desc[0] for desc in cursor.description]


# Read every vocab_guidelineprofile row. ORDER BY id makes the row order (and
# therefore the generated fixture) deterministic, so re-running the export
# yields stable diffs. connection.cursor() is the direct form of the cursor
# previously reached through connection.client.connection.
with connection.cursor() as cursor:
    headers = _get_headers(cursor)
    cursor.execute("SELECT * FROM vocab_guidelineprofile ORDER BY id")
    data = cursor.fetchall()

# Build one fixture record per row: the "id" column becomes the record's pk
# and every other column goes into "fields" under its database column name.
jsonl_data = []
for row in data:
    fields = dict(zip(headers, row))
    pk = fields.pop("id")  # id should not be in fields
    json_obj = {"model": "vocab.GuidelineProfile", "pk": pk, "fields": fields}
    jsonl_data.append(json_obj)

# Write one JSON document per line (JSONL). Encoding and newline are pinned so
# the fixture is byte-identical regardless of the platform it is generated on.
# NOTE(review): json.dumps raises TypeError for values it cannot serialize
# (e.g. datetime or Decimal); confirm the table has no such columns.
with open(file, "w", encoding="utf-8", newline="\n") as jsonl_file:
    for json_obj in jsonl_data:
        jsonl_file.write(json.dumps(json_obj) + "\n")