In [1]:
import os
import pandas as pd
import yaml
#%pip install tqdm
from tqdm import tqdm_notebook
import altair as alt

# Move into the folder that holds the KG2.9.0c entity-analysis files; the YAML
# files below are opened by relative name. Replace the placeholder with your own path.
working_dir = '/your/working/directory/path/Entity_analysis/KG2.9.0c'
os.chdir(working_dir)

### Node ID match

This section matches DrugMechDB nodes to KG2 using node IDs only.

In [2]:
# Read the node-ID match results from YAML
file_path = 'DrugDB_nodeID_match.yaml'

# safe_load parses plain YAML data without constructing arbitrary Python objects
with open(file_path, mode='r') as yaml_stream:
    node_id = yaml.safe_load(yaml_stream)

# Uncomment to inspect the parsed structure
# node_id


In [3]:
# Gather 'pct_matched' from every graph entry that carries the field
graph_entries = [entry['graph'] for entry in node_id]
pct_matched_values = [info['pct_matched'] for info in graph_entries if 'pct_matched' in info]

# Arithmetic mean of the collected percentages
total_pct = sum(pct_matched_values)
average_pct_matched = total_pct / len(pct_matched_values)
print("Average percent match based on node ID's:", average_pct_matched)


Average percent match based on node ID's: 89.33674740471444


In [4]:
# Keep only the graph records that report a 'pct_matched' value, then split
# them into the two parallel columns used for plotting.
matched_graphs = [entry['graph'] for entry in node_id if 'pct_matched' in entry['graph']]
graph_id = [record['_id'] for record in matched_graphs]
pct_matched = [record['pct_matched'] for record in matched_graphs]

# One row per graph: its ID and its node-ID match percentage
df_graphs = pd.DataFrame({'graph_id': graph_id, 'pct_matched': pct_matched})

# Uncomment to inspect the frame
# df_graphs


In [5]:
# Histogram of per-graph match percentage (node-ID matching)

# X: match percentage in up to 20 bins, drawn in descending order
x_bins = alt.X('pct_matched:Q', bin=alt.Bin(maxbins=20), title='Percentage Matched', sort='descending')
# Y: number of graphs per bin, on a fixed 0-4000 scale
y_counts = alt.Y('count()', title='Count of Graph IDs', scale=alt.Scale(domain=(0, 4000)))

chart = (
    alt.Chart(df_graphs)
    .mark_bar(color='#FF6F61')
    .encode(x=x_bins, y=y_counts)
    .properties(
        width=800,
        height=400,
        title='Histogram of Percentage Matched by ID: KG2.9.0c',
    )
)

chart.display()


In [6]:
# Scatter plot: one point per graph ID against its node-ID match percentage

# Hover text shows the graph ID and its match percentage
point_tooltips = [
    alt.Tooltip('graph_id:N', title='Graph ID'),
    alt.Tooltip('pct_matched:Q', title='Percentage Matched'),
]

scatter = alt.Chart(df_graphs).mark_point().encode(
    x=alt.X('graph_id:N', title='Graph ID', axis=alt.Axis(tickCount=10)),
    y=alt.Y('pct_matched:Q', title='Percentage Matched'),
    tooltip=point_tooltips,
)

chart = (
    scatter
    .properties(
        width=800,
        height=400,
        title='Scatter Plot of Percentage Matched by ID:KG2.9.0c',
    )
    .configure_mark(color='#45B8AC')  # mark colour as a hex code
    .interactive()  # enable pan/zoom
)

chart.display()

In [7]:
# Order the graphs from highest to lowest match percentage
df_graphs_sorted = df_graphs.sort_values('pct_matched', ascending=False)
# df_graphs_sorted

In [8]:
# Plot against a 1-based sequential rank (position in the sorted frame) rather
# than the real graph ID, which keeps the x-axis readable.
n_graphs = len(df_graphs_sorted)
df_graphs_sorted['Rank'] = range(1, n_graphs + 1)

# Explicit tick positions every 50 ranks
rank_ticks = list(range(0, 4850, 50))
rank_tooltips = [
    alt.Tooltip('graph_id:N', title='Graph ID'),
    alt.Tooltip('pct_matched:Q', title='Percentage Matched'),
]

ranked_points = alt.Chart(df_graphs_sorted).mark_point(size=30).encode(
    x=alt.X('Rank:Q', title='Graph ID', axis=alt.Axis(tickCount=50, values=rank_ticks)),
    y=alt.Y('pct_matched:Q', title='Percentage Matched'),
    tooltip=rank_tooltips,
)

chart = (
    ranked_points
    .properties(
        width=800,
        height=400,
        title='Scatter Plot of Percentage Matched by ID:KG2.9.0c',
    )
    .configure_mark(color='#9B2335')  # mark colour as a hex code
    .interactive()  # enable pan/zoom
)

chart.display()


## Node Name Match

Nodes that did not match by node ID are then screened using the node name.

In [9]:
# Read the node-name match results from YAML
file_path = 'DrugDB_nodeName_match.yaml'

# safe_load parses plain YAML data without constructing arbitrary Python objects
with open(file_path, mode='r') as yaml_stream:
    node_name = yaml.safe_load(yaml_stream)

# Uncomment to inspect the parsed structure
# node_name


In [10]:
# Gather 'pct_matched' from every graph entry that carries the field
name_entries = [entry['graph'] for entry in node_name]
pct_matched_values = [info['pct_matched'] for info in name_entries if 'pct_matched' in info]

# Arithmetic mean of the collected percentages
total_pct = sum(pct_matched_values)
average_pct_matched = total_pct / len(pct_matched_values)
print("Average percent match based on node Name:", average_pct_matched)


Average percent match based on node Name: 96.94337800298689


In [11]:
# Graph IDs and match percentages from the node-name results, restricted to
# the graphs that report a 'pct_matched' value.
graph_name = [graph['graph']['_id'] for graph in node_name if 'pct_matched' in graph['graph']]
pct_matched = [graph['graph']['pct_matched'] for graph in node_name if 'pct_matched' in graph['graph']]

# Constructing the DataFrame.
# The IDs must come from graph_name (read from node_name in this cell), not from
# the graph_id list left over from the node-ID cell: that list is built from a
# different file, so it may differ in length or order and would pair
# percentages with the wrong graph.
df_graph_names = pd.DataFrame({
    'graph_id': graph_name,
    'pct_matched': pct_matched
})

# Display dataframe
#df_graph_names

In [12]:
# Histogram of per-graph match percentage after node-name matching.
# The title says "by Name" so this chart is distinguishable from the node-ID
# histogram and agrees with the scatter-plot titles in this section.

chart = alt.Chart(df_graph_names).mark_bar(color='#0071BB').encode(
    # Match percentage in up to 20 bins, drawn in descending order
    x=alt.X('pct_matched:Q', bin=alt.Bin(maxbins=20), title='Percentage Matched', sort='descending'),
    # Number of graphs per bin, on a fixed 0-4100 scale
    y=alt.Y('count()', title='Count of Graph IDs',scale=alt.Scale(domain=(0, 4100)))
).properties(
    width=800,
    height=400,
    title='Histogram of Percentage Matched by Name:KG2.9.0c'
)

chart.display()

In [13]:
# Scatter plot: one point per graph ID against its node-name match percentage

# Hover text shows the graph ID and its match percentage
point_tooltips = [
    alt.Tooltip('graph_id:N', title='Graph ID'),
    alt.Tooltip('pct_matched:Q', title='Percentage Matched'),
]

scatter = alt.Chart(df_graph_names).mark_point().encode(
    x=alt.X('graph_id:N', title='Graph ID', axis=alt.Axis(tickCount=10)),
    y=alt.Y('pct_matched:Q', title='Percentage Matched'),
    tooltip=point_tooltips,
)

chart = (
    scatter
    .properties(
        width=800,
        height=400,
        title='Scatter Plot of Percentage Matched by Name:KG2.9.0c',
    )
    .configure_mark(color='#FD8D3C')  # mark colour as a hex code
    .interactive()  # enable pan/zoom
)

chart.display()

In [14]:
# Order the graphs from highest to lowest node-name match percentage
df_names_sorted = df_graph_names.sort_values('pct_matched', ascending=False)
# df_names_sorted

# Plot against a 1-based sequential rank (position in the sorted frame) rather
# than the real graph ID, which keeps the x-axis readable.
n_graphs = len(df_names_sorted)
df_names_sorted['Rank'] = range(1, n_graphs + 1)

# Explicit tick positions every 50 ranks
rank_ticks = list(range(0, 4850, 50))
rank_tooltips = [
    alt.Tooltip('graph_id:N', title='Graph ID'),
    alt.Tooltip('pct_matched:Q', title='Percentage Matched'),
]

ranked_points = alt.Chart(df_names_sorted).mark_point(size=30).encode(
    x=alt.X('Rank:Q', title='Graph ID', axis=alt.Axis(tickCount=50, values=rank_ticks)),
    y=alt.Y('pct_matched:Q', title='Percentage Matched'),
    tooltip=rank_tooltips,
)

chart = (
    ranked_points
    .properties(
        width=800,
        height=400,
        title='Scatter Plot of Percentage Matched by Name:KG2.9.0c',
    )
    .configure_mark(color='#A6761D')  # mark colour as a hex code
    .interactive()  # enable pan/zoom
)

chart.display()

#### Analysis Summary:

With ID alignment only, `KG2.9.0c` showed an 89.34% match; adding the name-query match raised it to 96.94%, a gain of 7.61 percentage points (about an 8.51% relative increase). For `KG2.8.4`, the match was 89.62% with node ID only and 96.97% with the name-query match, a gain of 7.35 percentage points (about an 8.20% relative increase).