# Quality assurance / sanity checks

In [None]:
from pyprojroot import here
import pandas as pd

### Checking that the ground truth contains no pairs other than drug-disease pairs

In [5]:
# Project root folder, as located by pyprojroot
ROOT_FOLDER = here()

# Load the node table and the ground-truth edge list
nodes_df = pd.read_csv(f'{ROOT_FOLDER}/data/original/Nodes.csv')
ground_truth_df = pd.read_csv(f'{ROOT_FOLDER}/data/original/Ground Truth.csv')


def _annotate_endpoint(edges, endpoint):
    """Attach the node name and category for one endpoint of every edge.

    `endpoint` is the column of `edges` that holds node ids ('source' or
    'target'). The matching rows of `nodes_df` are left-joined on that
    column, so every edge row is kept even when its id has no match. The
    joined 'name' and 'category' columns are renamed to '<endpoint>_name'
    and '<endpoint>_category', and the helper 'id' column is dropped.
    """
    node_attrs = nodes_df[['id', 'name', 'category']]  # only the needed columns
    joined = edges.merge(
        node_attrs,
        how='left',
        left_on=endpoint,
        right_on='id'
    )
    joined = joined.rename(
        columns={
            'name': f'{endpoint}_name',
            'category': f'{endpoint}_category'
        }
    )
    return joined.drop(columns=['id'])


# Annotate the source side first, then the target side
source_merge = _annotate_endpoint(ground_truth_df, 'source')
final_df = _annotate_endpoint(source_merge, 'target')

# Categories an edge endpoint is allowed to have
# NOTE(review): both endpoints are tested against this single combined set,
# so a pair whose two endpoints are both drug-like (or both disease-like)
# still counts as valid -- confirm that this is the intended check.
allowed_categories = {
    'biolink:Disease',
    'biolink:Drug',
    'biolink:DiseaseOrPhenotypicFeature',
    'biolink:SmallMolecule',
    'biolink:PhenotypicFeature'
}

# A row is valid only when BOTH endpoint categories are allowed; a missing
# category (no matching node) is not in the set and therefore is invalid.
source_ok = final_df['source_category'].isin(allowed_categories)
target_ok = final_df['target_category'].isin(allowed_categories)
invalid_mask = ~(source_ok & target_ok)

# Split into the rows that pass the check and the rows that fail it
df_valid = final_df[~invalid_mask]
df_invalid = final_df[invalid_mask]

# Report how many rows failed the check
print(f"Number of invalid rows: {df_invalid.shape[0]}")

# Write both partitions to CSV files (paths are relative to the working directory)
df_valid.to_csv("filtered_valid.csv", index=False)
df_invalid.to_csv("filtered_invalid.csv", index=False)

Number of invalid rows: 0
