Install Required Packages

In [9]:
!pip install pandas numpy<2 scikit-learn

/bin/bash: line 1: 2: No such file or directory


In [5]:
!pip install rdkit-pypi



In [6]:
!pip install torch torchvision torchaudio torch-geometric



In [7]:
!pip install tqdm matplotlib seaborn



Import Libraries

In [10]:
# ===== Data handling =====
import pandas as pd
import numpy as np
import pickle

# ===== Machine learning / preprocessing =====
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, f1_score

# ===== Chemistry / molecular graph processing =====
from rdkit import Chem
from rdkit.Chem import AllChem

# ===== Deep learning (PyTorch + PyG) =====
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, SAGEConv, GATConv

# ===== Utilities =====
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
import pandas as pd
import pickle

# ===== Load side-effect relationships (numpy array) =====
drug_se_links = pd.read_pickle("drug_side_effect_links.pkl")
print("Side-effect links shape:", drug_se_links.shape)
print(drug_se_links[:5])  # preview first 5 rows

# ===== Load list of valid drug CIDs =====
drug_list = pd.read_pickle("drugs.pkl")
print("\nNumber of drugs:", len(drug_list))
print("First 5 drug IDs:", drug_list[:5])

# ===== Load list of side effects =====
side_effect_list = pd.read_pickle("side_effects.pkl")
print("\nNumber of side effects:", len(side_effect_list))
print("First 5 side effect codes:", side_effect_list[:5])

# ===== Load drug metadata (from PubChem) =====
pubchem_df = pd.read_csv("pubchem_output.csv")
print("\nPubChem dataframe shape:", pubchem_df.shape)
print(pubchem_df.head())

# ===== Optional: ensure CID format matches drug_list =====
# If your drug_list has 'CID100000085' and pubchem_df has '85', we need to align them
pubchem_df['cid'] = pubchem_df['cid'].apply(lambda x: f"CID{int(x):09d}")

Side-effect links shape: (96454, 6)
[['CID100000085' 'CID000010917' 'C0000729' 'PT' 'C0000737'
  'Abdominal pain']
 ['CID100000085' 'CID000010917' 'C0000737' 'PT' 'C0687713'
  'Gastrointestinal pain']
 ['CID100000085' 'CID000010917' 'C0002418' 'PT' 'C0002418' 'Amblyopia']
 ['CID100000085' 'CID000010917' 'C0002871' 'PT' 'C0002871' 'Anaemia']
 ['CID100000085' 'CID000010917' 'C0003123' 'PT' 'C0232462'
  'Decreased appetite']]

Number of drugs: 1339
First 5 drug IDs: ['CID100000085', 'CID100000119', 'CID100000137', 'CID100000158', 'CID100000159']

Number of side effects: 360
First 5 side effect codes: ['C0000731', 'C0000737', 'C0000833', 'C0001807', 'C0001824']

PubChem dataframe shape: (1343, 24)
   cid                                           cmpdname  \
0   85                                        Carnitinium   
1  119                            gamma-Aminobutyric acid   
2  137                                Aminolevulinic acid   
3  158                             8-Iso PROSTAGLANDI

In [12]:
import pandas as pd
import pickle

# Load files
drug_se_links = pd.read_pickle("drug_side_effect_links.pkl")
drug_list = pd.read_pickle("drugs.pkl")
pubchem_df = pd.read_csv("pubchem_output.csv")

# Print first 5 drug IDs from each source
print("From drug_side_effect_links.pkl (drug1_id, drug2_id):")
for row in drug_se_links[:5]:
    print(row[0], row[1])

print("\nFrom drugs.pkl:")
print(drug_list[:5])

print("\nFrom pubchem_output.csv (raw cid values):")
print(pubchem_df['cid'].head())

# Optional: format pubchem IDs to match 'CID#########'
pubchem_df['cid_formatted'] = pubchem_df['cid'].apply(lambda x: f"CID{int(x):09d}")
print("\nFormatted pubchem IDs:")
print(pubchem_df['cid_formatted'].head())

From drug_side_effect_links.pkl (drug1_id, drug2_id):
CID100000085 CID000010917
CID100000085 CID000010917
CID100000085 CID000010917
CID100000085 CID000010917
CID100000085 CID000010917

From drugs.pkl:
['CID100000085', 'CID100000119', 'CID100000137', 'CID100000158', 'CID100000159']

From pubchem_output.csv (raw cid values):
0     85
1    119
2    137
3    158
4    159
Name: cid, dtype: int64

Formatted pubchem IDs:
0    CID000000085
1    CID000000119
2    CID000000137
3    CID000000158
4    CID000000159
Name: cid_formatted, dtype: object


In [13]:
import pandas as pd

# Load the file
drug_se_links = pd.read_pickle("drug_side_effect_links.pkl")

# Extract and print the first 5 original drug1_id values
for row in drug_se_links[:5]:
    print(row[0])

CID100000085
CID100000085
CID100000085
CID100000085
CID100000085


In [14]:
import pandas as pd

# Load side-effect relationships
drug_se_links = pd.read_pickle("drug_side_effect_links.pkl")

# Clean only drug1_id by removing "CID1"
drug1_ids_clean = [row[0].replace("CID1", "") for row in drug_se_links]

# Preview first 5 cleaned IDs
print(drug1_ids_clean[:5])

['00000085', '00000085', '00000085', '00000085', '00000085']


In [15]:
import pandas as pd

# Load side-effect relationships
drug_se_links = pd.read_pickle("drug_side_effect_links.pkl")

# Clean drug1_id: remove "CID1", then strip leading zeros
drug1_ids_clean = [row[0].replace("CID1", "").lstrip("0") for row in drug_se_links]

# Preview first 5 cleaned IDs
print(drug1_ids_clean[:5])

['85', '85', '85', '85', '85']


In [16]:
import pandas as pd
import random

# Load side-effect relationships
drug_se_links = pd.read_pickle("drug_side_effect_links.pkl")

# Clean drug1_id: remove "CID1", then strip leading zeros
drug1_ids_clean = [row[0].replace("CID1", "").lstrip("0") for row in drug_se_links]

# Pick 20 random IDs
sample_ids = random.sample(drug1_ids_clean, 20)

# Print them
for cid in sample_ids:
    print(cid)


60183
2622
5544
3878
37392
3373
2828
3108
54840
216210
27993
60183
3827
2344
5672
5665
52421
16129629
71436
3475


In [17]:
import pandas as pd

# Load side-effect relationships
drug_se_links = pd.read_pickle("drug_side_effect_links.pkl")

# Replace drug1_id with cleaned version
cleaned_links = []
for row in drug_se_links:
    drug1_clean = row[0].replace("CID1", "").lstrip("0")
    cleaned_links.append((drug1_clean, row[1]) + tuple(row[2:]))

# Convert to DataFrame
drug_se_df_cleaned = pd.DataFrame(
    cleaned_links,
    columns=["drug1_id", "drug2_id", "side_effect_code_1", "type", "side_effect_code_2", "side_effect_name"]
)

# Save to pickle
drug_se_df_cleaned.to_pickle("drug_side_effect_links_drug1.pkl")

print("✅ Saved cleaned file as drug_side_effect_links_drug1.pkl")

✅ Saved cleaned file as drug_side_effect_links_drug1.pkl


In [18]:
import pandas as pd

# Load file where only drug1_id is cleaned
drug_se_df = pd.read_pickle("drug_side_effect_links_drug1.pkl")

# Clean drug2_id: remove "CID" and strip leading zeros
drug_se_df["drug2_id"] = drug_se_df["drug2_id"].str.replace("CID", "", regex=False).str.lstrip("0")

# Print 10 random drug1 IDs
print("Random drug1 IDs:")
print(drug_se_df["drug1_id"].sample(10).to_list())

# Print 10 random drug2 IDs
print("\nRandom drug2 IDs:")
print(drug_se_df["drug2_id"].sample(10).to_list())

Random drug1 IDs:
['5210', '4485', '4679', '3291', '3086686', '3363', '2369', '25077405', '132999', '27661']

Random drug2 IDs:
['2725', '477468', '338', '6726', '33741', '213039', '4178', '5735', '392622', '119569']


In [19]:
import pandas as pd

# Load file where only drug1_id is cleaned
drug_se_df = pd.read_pickle("drug_side_effect_links_drug1.pkl")

# Clean drug2_id: remove "CID" and strip leading zeros
drug_se_df["drug2_id"] = drug_se_df["drug2_id"].str.replace("CID", "", regex=False).str.lstrip("0")

# Print 10 random drug1 IDs
print("Random drug1 IDs:")
print(drug_se_df["drug1_id"].sample(10).to_list())

# Print 10 random drug2 IDs
print("\nRandom drug2 IDs:")
print(drug_se_df["drug2_id"].sample(10).to_list())

# Save the fully cleaned file
drug_se_df.to_pickle("drug_se_links_cleaned.pkl")
print("\n✅ Saved as drug_se_links_cleaned.pkl")

Random drug1 IDs:
['3690', '208898', '1065', '70695640', '5064', '5487', '125889', '4048', '3222', '9831414']

Random drug2 IDs:
['5362118', '10631', '20469', '83898', '55480', '63009', '3446', '62956', '5070', '753']

✅ Saved as drug_se_links_cleaned.pkl


In [20]:
import pickle
import random

# Load drugs.pkl
with open("drugs.pkl", "rb") as f:
    drugs_list = pickle.load(f)

# Pick 10 random drug IDs
sample_drugs = random.sample(drugs_list, 10)

# Display
for drug in sample_drugs:
    print(drug)

CID100003392
CID111531537
CID100003793
CID100002350
CID100007699
CID100005311
CID116065945
CID100004095
CID100004946
CID100005515


In [21]:
import pickle

# Load drugs.pkl
with open("drugs.pkl", "rb") as f:
    drugs_list = pickle.load(f)

# Clean IDs: remove "CID1", then strip leading zeros
drugs_cleaned = [drug.replace("CID1", "").lstrip("0") for drug in drugs_list]

# Preview 10 random cleaned IDs
import random
print(random.sample(drugs_cleaned, 10))

# Save as new file
with open("drugs_cleaned.pkl", "wb") as f:
    pickle.dump(drugs_cleaned, f)

print("✅ Saved as drugs_cleaned.pkl")

['53477714', '4456', '1875', '3143', '656892', '71360', '3081361', '10660', '5206', '16213095']
✅ Saved as drugs_cleaned.pkl


In [22]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('pubchem_output.csv')

# Print column names
print(df.columns.tolist())

['cid', 'cmpdname', 'cmpdsynonym', 'mw', 'mf', 'polararea', 'complexity', 'xlogp', 'heavycnt', 'hbonddonor', 'hbondacc', 'rotbonds', 'inchi', 'isosmiles', 'inchikey', 'iupacname', 'meshheadings', 'annothits', 'annothitcnt', 'aids', 'cidcdate', 'sidsrcname', 'depcatg', 'annotation']


In [23]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('pubchem_output.csv')

# Print the first 5 rows
print(df.head(5))

   cid                                           cmpdname  \
0   85                                        Carnitinium   
1  119                            gamma-Aminobutyric acid   
2  137                                Aminolevulinic acid   
3  158                             8-Iso PROSTAGLANDIN E2   
4  159  5-[5-Hydroxy-4-(3-hydroxyoct-1-enyl)-3,3a,4,5,...   

                                         cmpdsynonym      mw         mf  \
0  carnitinium|Carnitine|461-05-2|CHEBI:3424|3-ca...  162.21  C7H16NO3+   
1  4-aminobutyric acid|4-Aminobutanoic acid|gamma...  103.12    C4H9NO2   
2  5-Aminolevulinic acid|Aminolevulinic acid|106-...  131.13    C5H9NO3   
3  8-iso PROSTAGLANDIN E2|ent-Prostaglandin E2|15...  352.50   C20H32O5   
4  61849-14-7|5-[5-hydroxy-4-(3-hydroxyoct-1-enyl...  352.50   C20H32O5   

   polararea  complexity  xlogp  heavycnt  hbonddonor  ...  \
0       57.5       139.0   -0.8        11           2  ...   
1       63.3        62.7   -3.2         7           2  ...

In [24]:
import pandas as pd

df = pd.read_csv('pubchem_output.csv')

print(df['cid'].head(5))

0     85
1    119
2    137
3    158
4    159
Name: cid, dtype: int64


In [25]:
import pandas as pd

drug_se_links = pd.read_pickle('drug_se_links_cleaned.pkl')

print("Checking drug_se_links_cleaned.pkl:")

if isinstance(drug_se_links, pd.DataFrame):
    if 'drug1_id' in drug_se_links.columns:
        print("drug1_id dtype:", drug_se_links['drug1_id'].dtype)
    else:
        print("No 'drug1_id' column found.")

    if 'drug2_id' in drug_se_links.columns:
        print("drug2_id dtype:", drug_se_links['drug2_id'].dtype)
    else:
        print("No 'drug2_id' column found.")
else:
    print("Data is not a DataFrame.")


Checking drug_se_links_cleaned.pkl:
drug1_id dtype: object
drug2_id dtype: object


In [26]:
import pandas as pd

drugs_cleaned = pd.read_pickle('drugs_cleaned.pkl')

print("Checking drugs_cleaned.pkl:")

if isinstance(drugs_cleaned, pd.DataFrame):
    if 'drug_id' in drugs_cleaned.columns:
        print("drug_id dtype:", drugs_cleaned['drug_id'].dtype)
    elif 'cid' in drugs_cleaned.columns:
        print("cid dtype:", drugs_cleaned['cid'].dtype)
    else:
        print("No 'drug_id' or 'cid' column found.")
else:
    print("Data is not a DataFrame.")


Checking drugs_cleaned.pkl:
Data is not a DataFrame.


In [27]:
import pandas as pd

pubchem_df = pd.read_csv('pubchem_output.csv')

print("Checking pubchem_output.csv:")

if 'cid' in pubchem_df.columns:
    print("cid dtype:", pubchem_df['cid'].dtype)
else:
    print("No 'cid' column found.")


Checking pubchem_output.csv:
cid dtype: int64


In [28]:
import pandas as pd

# Load the pickle file
drug_se_links = pd.read_pickle('drug_se_links_cleaned.pkl')

# Convert drug1_id and drug2_id columns to string type
drug_se_links['drug1_id'] = drug_se_links['drug1_id'].astype(str)
drug_se_links['drug2_id'] = drug_se_links['drug2_id'].astype(str)

# Print first 5 rows of drug1_id and drug2_id
print("First 5 drug1_id values:")
print(drug_se_links['drug1_id'].head(5))

print("\nFirst 5 drug2_id values:")
print(drug_se_links['drug2_id'].head(5))

# Print datatypes of the columns
print("\nDatatypes after conversion:")
print(drug_se_links[['drug1_id', 'drug2_id']].dtypes)

First 5 drug1_id values:
0    85
1    85
2    85
3    85
4    85
Name: drug1_id, dtype: object

First 5 drug2_id values:
0    10917
1    10917
2    10917
3    10917
4    10917
Name: drug2_id, dtype: object

Datatypes after conversion:
drug1_id    object
drug2_id    object
dtype: object


In [29]:
# Remove 'CID1' prefix from drug2_id if it exists
drug_se_links['drug2_id'] = drug_se_links['drug2_id'].str.replace(r'^CID1', '', regex=True)

# Print first 5 cleaned drug2_id values to confirm
print(drug_se_links['drug2_id'].head(5))


0    10917
1    10917
2    10917
3    10917
4    10917
Name: drug2_id, dtype: object


In [30]:
# Remove leading '1' if present at the start
drug_se_links['drug2_id'] = drug_se_links['drug2_id'].str.replace(r'^1', '', regex=True)

print(drug_se_links['drug2_id'].head(5))


0    0917
1    0917
2    0917
3    0917
4    0917
Name: drug2_id, dtype: object


In [31]:
# Remove all leading zeros from drug2_id
drug_se_links['drug2_id'] = drug_se_links['drug2_id'].str.replace(r'^0+', '', regex=True)

print(drug_se_links['drug2_id'].head(5))


0    917
1    917
2    917
3    917
4    917
Name: drug2_id, dtype: object


In [32]:
print(drug_se_links.head(5))


  drug1_id drug2_id side_effect_code_1 type side_effect_code_2  \
0       85      917           C0000729   PT           C0000737   
1       85      917           C0000737   PT           C0687713   
2       85      917           C0002418   PT           C0002418   
3       85      917           C0002871   PT           C0002871   
4       85      917           C0003123   PT           C0232462   

        side_effect_name  
0         Abdominal pain  
1  Gastrointestinal pain  
2              Amblyopia  
3                Anaemia  
4     Decreased appetite  


In [33]:
drug_se_links.to_pickle('drug_se_links_cleaned_v2.pkl')

In [34]:
import pandas as pd

data = pd.read_pickle('drugs_cleaned.pkl')

print("Type of data:", type(data))

# If it’s a list, check first element type and keys (if dict)
if isinstance(data, list) and len(data) > 0:
    print("Type of first element:", type(data[0]))
    if isinstance(data[0], dict):
        print("Keys in first element:", data[0].keys())

Type of data: <class 'list'>
Type of first element: <class 'str'>


In [35]:
data = pd.read_pickle('drugs_cleaned.pkl')

print("First 5 drug IDs:")
print(data[:5])

print("\nType of first 5 elements:")
print([type(x) for x in data[:5]])

First 5 drug IDs:
['85', '119', '137', '158', '159']

Type of first 5 elements:
[<class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>, <class 'str'>]


In [36]:
import pandas as pd

data = pd.read_pickle('drugs_cleaned.pkl')

# Save the list of strings as a new pickle file
pd.to_pickle(data, 'drugs_cleaned_v2.pkl')

In [37]:
import pandas as pd

# Load CSV
pubchem_df = pd.read_csv('pubchem_output.csv')

# Convert 'cid' to string
pubchem_df['cid'] = pubchem_df['cid'].astype(str)

# Print first 5 'cid' values
print(pubchem_df['cid'].head(5))

# Print dtype of 'cid' column
print(pubchem_df['cid'].dtype)

0     85
1    119
2    137
3    158
4    159
Name: cid, dtype: object
object


In [38]:
pubchem_df.to_pickle('pubchem_output_v2.pkl')

In [39]:
import pandas as pd

# Load the pickle file
side_effects = pd.read_pickle('side_effects.pkl')

# Check its type
print(type(side_effects))

# If it's a DataFrame, print columns and first few rows
if isinstance(side_effects, pd.DataFrame):
    print(side_effects.columns.tolist())
    print(side_effects.head())
elif isinstance(side_effects, list):
    print(f"Loaded a list with {len(side_effects)} items.")
    # Optionally print the first item to see its structure
    print(side_effects[0])
else:
    print("Unknown data structure loaded.")

<class 'list'>
Loaded a list with 360 items.
C0000731


In [40]:
side_effects = pd.read_pickle('side_effects.pkl')

print("First 10 items:")
for item in side_effects[:10]:
    print(item)

First 10 items:
C0000731
C0000737
C0000833
C0001807
C0001824
C0001925
C0002170
C0002418
C0002453
C0002622


In [41]:
import pandas as pd

# Load the side effects list
side_effects = pd.read_pickle('side_effects.pkl')

# Clean IDs: remove leading 'C' and preceding zeros
cleaned_ids = [id_str.lstrip('C').lstrip('0') for id_str in side_effects]

# Print first 10 cleaned IDs to verify
print(cleaned_ids[:10])

# Save cleaned IDs to new pickle file
pd.to_pickle(cleaned_ids, 'side_effects_cleaned.pkl')

['731', '737', '833', '1807', '1824', '1925', '2170', '2418', '2453', '2622']


In [42]:
import pickle

with open('side_effects_cleaned.pkl', 'rb') as f:
    data = pickle.load(f)

# If it's a list
if isinstance(data, list):
    for row in data[:5]:
        print(row)

# If it's a dictionary
elif isinstance(data, dict):
    for i, (k, v) in enumerate(data.items()):
        if i >= 5:
            break
        print(k, '=>', v)

731
737
833
1807
1824


In [43]:
import pickle

with open('drug_se_links_cleaned_v2.pkl', 'rb') as f:
    data = pickle.load(f)

# Print first 5 rows/items
if isinstance(data, list):
    for row in data[:5]:
        print(row)
elif isinstance(data, dict):
    for i, (k, v) in enumerate(data.items()):
        if i >= 5:
            break
        print(k, '=>', v)
else:
    print(type(data))
    print(data)

<class 'pandas.core.frame.DataFrame'>
       drug1_id  drug2_id side_effect_code_1 type side_effect_code_2  \
0            85       917           C0000729   PT           C0000737   
1            85       917           C0000737   PT           C0687713   
2            85       917           C0002418   PT           C0002418   
3            85       917           C0002871   PT           C0002871   
4            85       917           C0003123   PT           C0232462   
...         ...       ...                ...  ...                ...   
96449  71306834  71306834           C1527344   PT           C1527344   
96450  71306834  71306834           C1565489   PT           C1565489   
96451  71306834  71306834           C2364111   PT           C2364111   
96452  71306834  71306834           C2830004   PT           C2830004   
96453  71306834  71306834           C2979982   PT           C2979982   

            side_effect_name  
0             Abdominal pain  
1      Gastrointestinal pain  
2   

In [44]:
import pandas as pd

# Load the data
df = pd.read_pickle('drug_se_links_cleaned_v2.pkl')

# Remove rows with null SE codes in code_1
df = df.dropna(subset=['side_effect_code_1'])

# Clean `side_effect_code_1`
df['side_effect_code_1_cleaned'] = (
    df['side_effect_code_1']
    .str.replace('^C0*', '', regex=True)  # Remove 'C' and leading 0s
)

# Optional: convert to int (if you want to work with integers)
# df['side_effect_code_1_cleaned'] = df['side_effect_code_1_cleaned'].astype(int)

# Preview cleaned codes
print(df[['side_effect_code_1', 'side_effect_code_1_cleaned']].head())

  side_effect_code_1 side_effect_code_1_cleaned
0           C0000729                        729
1           C0000737                        737
2           C0002418                       2418
3           C0002871                       2871
4           C0003123                       3123


In [45]:
# Make sure nulls are handled
df = df.dropna(subset=['side_effect_code_2'])

# Clean `side_effect_code_2`
df['side_effect_code_2_cleaned'] = (
    df['side_effect_code_2']
    .str.replace('^C0*', '', regex=True)  # Remove 'C' and leading zeros
)

# Optional: convert to int if needed
# df['side_effect_code_2_cleaned'] = df['side_effect_code_2_cleaned'].astype(int)

# Preview both cleaned columns
print(df[['side_effect_code_1', 'side_effect_code_1_cleaned',
          'side_effect_code_2', 'side_effect_code_2_cleaned']].head())

  side_effect_code_1 side_effect_code_1_cleaned side_effect_code_2  \
0           C0000729                        729           C0000737   
1           C0000737                        737           C0687713   
2           C0002418                       2418           C0002418   
3           C0002871                       2871           C0002871   
4           C0003123                       3123           C0232462   

  side_effect_code_2_cleaned  
0                        737  
1                     687713  
2                       2418  
3                       2871  
4                     232462  


In [46]:
# Drop the original uncleaned SE code columns
df = df.drop(columns=['side_effect_code_1', 'side_effect_code_2'])

# Rename cleaned columns to the original names (optional, for consistency)
df = df.rename(columns={
    'side_effect_code_1_cleaned': 'side_effect_code_1',
    'side_effect_code_2_cleaned': 'side_effect_code_2'
})

# Preview result
print(df.head())

  drug1_id drug2_id type       side_effect_name side_effect_code_1  \
0       85      917   PT         Abdominal pain                729   
1       85      917   PT  Gastrointestinal pain                737   
2       85      917   PT              Amblyopia               2418   
3       85      917   PT                Anaemia               2871   
4       85      917   PT     Decreased appetite               3123   

  side_effect_code_2  
0                737  
1             687713  
2               2418  
3               2871  
4             232462  


In [47]:
df.to_pickle('drug_se_links_cleaned_v2_final.pkl')

In [48]:
# 1. Load the original data (the one that had uncleaned codes and names)
original_df = pd.read_pickle('drug_se_links_cleaned_v2.pkl')

# 2. Build mapping from original SE codes to names (remove 'C' and leading 0s in keys)
original_df['cleaned_code'] = (
    original_df['side_effect_code_1'].str.replace('^C0*', '', regex=True)
)

# 3. Build dictionary
se_code_to_name = dict(zip(original_df['cleaned_code'], original_df['side_effect_name']))

# 4. Map to current cleaned DataFrame (df)
df['side_effect_name_1'] = df['side_effect_code_1'].map(se_code_to_name)
df['side_effect_name_2'] = df['side_effect_code_2'].map(se_code_to_name)

# 5. Preview
print(df[['side_effect_code_1', 'side_effect_name_1',
          'side_effect_code_2', 'side_effect_name_2']].head())


  side_effect_code_1  side_effect_name_1 side_effect_code_2  \
0                729      Abdominal pain                737   
1                737      Abdominal pain             687713   
2               2418           Amblyopia               2418   
3               2871             Anaemia               2871   
4               3123  Decreased appetite             232462   

      side_effect_name_2  
0         Abdominal pain  
1  Gastrointestinal pain  
2              Amblyopia  
3                Anaemia  
4     Decreased appetite  


In [49]:
from collections import defaultdict
import math

drug_to_side_effects = defaultdict(set)

for _, row in df.iterrows():
    se1 = row['side_effect_name_1']
    se2 = row['side_effect_name_2']

    d1 = row['drug1_id']
    d2 = row['drug2_id']

    # Only add if se name is a valid string (not NaN)
    if isinstance(se1, str) and se1.strip() != '':
        drug_to_side_effects[d1].add(se1)
        drug_to_side_effects[d2].add(se1)
    if isinstance(se2, str) and se2.strip() != '':
        drug_to_side_effects[d1].add(se2)
        drug_to_side_effects[d2].add(se2)

# Now print safely
for drug, se_names in list(drug_to_side_effects.items())[:5]:
    print(f"Drug {drug} → {sorted(se_names)}")

Drug 85 → ['Abdominal pain', 'Amblyopia', 'Anaemia', 'Angiopathy', 'Anxiety', 'Arrhythmia', 'Asthenia', 'Atrial fibrillation', 'Back pain', 'Body temperature increased', 'Bronchitis', 'Cardiovascular disorder', 'Chest pain', 'Constipation', 'Convulsion', 'Cough', 'Decreased appetite', 'Depression', 'Dermatitis', 'Diarrhoea', 'Dizziness', 'Dysgeusia', 'Dyspepsia', 'Dyspnoea', 'Eye disorder', 'Gastritis', 'Gastrointestinal disorder', 'Gastrointestinal pain', 'Haemoglobin', 'Headache', 'Hypercalcaemia', 'Hyperkalaemia', 'Hypersensitivity', 'Hypertension', 'Hypertonia', 'Hypotension', 'Infection', 'Influenza', 'Injection site reaction', 'Injury', 'Insomnia', 'Melaena', 'Muscle spasms', 'Muscular weakness', 'Musculoskeletal discomfort', 'Myalgia', 'Nausea', 'Oedema peripheral', 'Pain', 'Palpitations', 'Paraesthesia', 'Pharyngitis', 'Pruritus', 'Rash', 'Renal failure', 'Rhinitis', 'Sinusitis', 'Tachycardia', 'Urinary tract infection', 'Vertigo', 'Vomiting', 'Weight decreased', 'Weight increa

In [51]:
# Load pubchem DataFrame if not loaded yet
pubchem_df = pd.read_pickle('pubchem_output_v2.pkl')

# Build a dictionary mapping cid → cmpdname
drug_id_to_name = dict(zip(pubchem_df['cid'], pubchem_df['cmpdname']))

# Example: map drug1_id and drug2_id to names in your main df
df['drug1_name'] = df['drug1_id'].map(drug_id_to_name)
df['drug2_name'] = df['drug2_id'].map(drug_id_to_name)

# Preview result
print(df[['drug1_id', 'drug1_name', 'drug2_id', 'drug2_name']].head())

  drug1_id   drug1_name drug2_id drug2_name
0       85  Carnitinium      917        NaN
1       85  Carnitinium      917        NaN
2       85  Carnitinium      917        NaN
3       85  Carnitinium      917        NaN
4       85  Carnitinium      917        NaN


In [52]:
# Assuming df already has 'drug1_name' and 'drug2_name' columns

# Drop rows where either drug1_name or drug2_name is missing (NaN)
df_clean = df.dropna(subset=['drug1_name', 'drug2_name']).copy()

print(f"Rows before cleaning: {len(df)}")
print(f"Rows after dropping missing drug names: {len(df_clean)}")

# Get unique named drugs from both columns
unique_drugs = set(df_clean['drug1_name']).union(set(df_clean['drug2_name']))

print(f"Number of unique named drugs: {len(unique_drugs)}")


Rows before cleaning: 96454
Rows after dropping missing drug names: 52294
Number of unique named drugs: 749


In [53]:
import pickle

# Filter out drug IDs without names from the mapping
clean_drug_id_to_name = {k: v for k, v in drug_id_to_name.items() if isinstance(v, str) and v.strip() != ''}

# Save to a pickle file
with open('drug_id_to_name.pkl', 'wb') as f:
    pickle.dump(clean_drug_id_to_name, f)

print(f"Saved {len(clean_drug_id_to_name)} drug ID → name mappings to 'drug_id_to_name.pkl'")

Saved 1343 drug ID → name mappings to 'drug_id_to_name.pkl'


In [54]:
import pickle

# Load the mapping
with open('drug_id_to_name.pkl', 'rb') as f:
    drug_id_to_name = pickle.load(f)

# Print first 10 items
for drug_id, name in list(drug_id_to_name.items())[:10]:
    print(f"Drug ID: {drug_id} → Name: {name}")

Drug ID: 85 → Name: Carnitinium
Drug ID: 119 → Name: gamma-Aminobutyric acid
Drug ID: 137 → Name: Aminolevulinic acid
Drug ID: 158 → Name: 8-Iso PROSTAGLANDIN E2
Drug ID: 159 → Name: 5-[5-Hydroxy-4-(3-hydroxyoct-1-enyl)-3,3a,4,5,6,6a-hexahydrocyclopenta[b]furan-2-ylidene]pentanoic acid
Drug ID: 160 → Name: 7-[3,5-Dihydroxy-2-(3-hydroxyoct-1-enyl)cyclopentyl]hept-5-enoic acid
Drug ID: 175 → Name: Acetate
Drug ID: 187 → Name: Acetylcholine
Drug ID: 191 → Name: 2-(6-Aminopurin-9-yl)-5-(hydroxymethyl)oxolane-3,4-diol
Drug ID: 206 → Name: Hexose


In [55]:
import pickle
from collections import defaultdict

# Load the drug ID → name mapping
with open('drug_id_to_name.pkl', 'rb') as f:
    drug_id_to_name = pickle.load(f)

# Assuming df_clean has drug1_id, drug2_id, side_effect_name_1, side_effect_name_2

drug_name_to_side_effects = defaultdict(set)

for _, row in df_clean.iterrows():
    d1_id = row['drug1_id']
    d2_id = row['drug2_id']
    se1 = row['side_effect_name_1']
    se2 = row['side_effect_name_2']

    d1_name = drug_id_to_name.get(d1_id)
    d2_name = drug_id_to_name.get(d2_id)

    # Only map if drug names exist
    if d1_name:
        if isinstance(se1, str) and se1.strip():
            drug_name_to_side_effects[d1_name].add(se1)
        if isinstance(se2, str) and se2.strip():
            drug_name_to_side_effects[d1_name].add(se2)
    if d2_name:
        if isinstance(se1, str) and se1.strip():
            drug_name_to_side_effects[d2_name].add(se1)
        if isinstance(se2, str) and se2.strip():
            drug_name_to_side_effects[d2_name].add(se2)

# Convert to dict for ease
drug_name_to_side_effects = dict(drug_name_to_side_effects)

# Preview
for drug, ses in list(drug_name_to_side_effects.items())[:5]:
    print(f"{drug} → {sorted(ses)}")

Hexose → ['Abscess', 'Agitation', 'Asthenia', 'Body temperature increased', 'Cardiac failure congestive', 'Coma', 'Convulsion', 'Dehydration', 'Dizziness', 'Extravasation', 'Feeling abnormal', 'Fluid retention', 'Headache', 'Hypertension', 'Hypokalaemia', 'Irritability', 'Lacrimation increased', 'Loss of consciousness', 'Nervous system disorder', 'Oedema', 'Pain', 'Phlebitis', 'Pulmonary embolism', 'Respiratory failure', 'Salivary hypersecretion', 'Shock', 'Somnolence', 'Tachycardia', 'Tenderness', 'Thirst', 'Thrombophlebitis']
Ammonia → ['Body temperature increased', 'Extravasation', 'Infection', 'Pain', 'Phlebitis']
Benzyl alcohol → ['Dermatitis', 'Erythema', 'Eye irritation', 'Hypoaesthesia', 'Oedema', 'Pain', 'Paraesthesia', 'Pruritus', 'Rash', 'Skin exfoliation']
Betaine → ['Abdominal discomfort', 'Agitation', 'Alopecia', 'Decreased appetite', 'Diarrhoea', 'Dysgeusia', 'Gastrointestinal disorder', 'Glossitis', 'Headache', 'Irritability', 'Malnutrition', 'Mental disorder', 'Nausea'

In [56]:
import pickle

with open('drug_name_to_side_effects.pkl', 'wb') as f:
    pickle.dump(drug_name_to_side_effects, f)

print(f"Saved {len(drug_name_to_side_effects)} drug name → side effects mappings to 'drug_name_to_side_effects.pkl'")

Saved 749 drug name → side effects mappings to 'drug_name_to_side_effects.pkl'


In [57]:
import pandas as pd

# Load the file
df = pd.read_pickle('pubchem_output_v2.pkl')

# Show the columns
print("Columns in the dataset:")
print(df.columns)

# Display the first 5 rows
print("\nFirst 5 rows:")
print(df.head())

Columns in the dataset:
Index(['cid', 'cmpdname', 'cmpdsynonym', 'mw', 'mf', 'polararea', 'complexity',
       'xlogp', 'heavycnt', 'hbonddonor', 'hbondacc', 'rotbonds', 'inchi',
       'isosmiles', 'inchikey', 'iupacname', 'meshheadings', 'annothits',
       'annothitcnt', 'aids', 'cidcdate', 'sidsrcname', 'depcatg',
       'annotation'],
      dtype='object')

First 5 rows:
   cid                                           cmpdname  \
0   85                                        Carnitinium   
1  119                            gamma-Aminobutyric acid   
2  137                                Aminolevulinic acid   
3  158                             8-Iso PROSTAGLANDIN E2   
4  159  5-[5-Hydroxy-4-(3-hydroxyoct-1-enyl)-3,3a,4,5,...   

                                         cmpdsynonym      mw         mf  \
0  carnitinium|Carnitine|461-05-2|CHEBI:3424|3-ca...  162.21  C7H16NO3+   
1  4-aminobutyric acid|4-Aminobutanoic acid|gamma...  103.12    C4H9NO2   
2  5-Aminolevulinic acid|Amin

Convert SMILES to Molecular Graphs with RDKit

In [58]:
import pickle

with open("drug_name_to_side_effects.pkl", "rb") as f:
    drug_to_se = pickle.load(f)

In [59]:
# Get unique side effects
all_side_effects = sorted(set(se for se_list in drug_to_se.values() for se in se_list))
se_to_index = {se: idx for idx, se in enumerate(all_side_effects)}

In [60]:
import torch

def get_label_tensor(se_list, se_to_index):
    y = torch.zeros(len(se_to_index))
    for se in se_list:
        if se in se_to_index:
            y[se_to_index[se]] = 1
    return y

In [61]:
print("Aspirin" in drug_to_se)  # Should print: True

True


In [62]:
label_tensor = get_label_tensor(drug_to_se["Aspirin"], se_to_index)
print(label_tensor)

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1.,
        0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        1., 0., 0., 0., 0., 1., 0., 0., 

Map Drug Names → SMILES

In [63]:
import pandas as pd

# Load drug metadata
df_pubchem = pd.read_pickle("pubchem_output_v2.pkl")

# Create mapping: drug name → SMILES
drug_to_smiles = dict(zip(df_pubchem["cmpdname"], df_pubchem["isosmiles"]))

# Preview
for name, smiles in list(drug_to_smiles.items())[:5]:
    print(f"{name} → {smiles}")

Carnitinium → C[N+](C)(C)CC(CC(=O)O)O
gamma-Aminobutyric acid → C(CC(=O)O)CN
Aminolevulinic acid → C(CC(=O)O)C(=O)CN
8-Iso PROSTAGLANDIN E2 → CCCCCC(C=CC1C(CC(=O)C1CC=CCCCC(=O)O)O)O
5-[5-Hydroxy-4-(3-hydroxyoct-1-enyl)-3,3a,4,5,6,6a-hexahydrocyclopenta[b]furan-2-ylidene]pentanoic acid → CCCCCC(C=CC1C(CC2C1CC(=CCCCC(=O)O)O2)O)O


In [64]:
# Load your existing drug → side effects mapping
with open("drug_name_to_side_effects.pkl", "rb") as f:
    drug_to_se = pickle.load(f)

# Get intersection of available drug names
common_drugs = set(drug_to_smiles).intersection(drug_to_se)

print(f"✅ Number of drugs with both SMILES and side effects: {len(common_drugs)}")

✅ Number of drugs with both SMILES and side effects: 749


Convert SMILES → Graphs

In [65]:
from rdkit import Chem
from torch_geometric.data import Data
import torch

def smiles_to_graph(smiles, label_tensor):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # skip if RDKit can't parse

    # Node features: just atom number for now
    atom_features = []
    for atom in mol.GetAtoms():
        atom_features.append([atom.GetAtomicNum()])
    x = torch.tensor(atom_features, dtype=torch.float)

    # Edge index
    edge_index = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_index += [[i, j], [j, i]]  # Undirected edges

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    return Data(x=x, edge_index=edge_index, y=label_tensor.unsqueeze(0))

In [66]:
required_vars = {
    "common_drugs": "set of drug names",
    "drug_to_smiles": "drug name → SMILES mapping",
    "drug_to_se": "drug name → list of side effects",
    "se_to_index": "side effect → index mapping"
}

for var, desc in required_vars.items():
    if var in globals():
        value = eval(var)
        print(f"✅ {var} ({desc}) found. Type: {type(value)}, Length: {len(value)}")
    else:
        print(f"❌ {var} ({desc}) is missing!")

✅ common_drugs (set of drug names) found. Type: <class 'set'>, Length: 749
✅ drug_to_smiles (drug name → SMILES mapping) found. Type: <class 'dict'>, Length: 1343
✅ drug_to_se (drug name → list of side effects) found. Type: <class 'dict'>, Length: 749
✅ se_to_index (side effect → index mapping) found. Type: <class 'dict'>, Length: 356


In [67]:
# Pick a sample drug
sample_drug = next(iter(common_drugs))
print(f"Sample drug: {sample_drug}")

# Get SMILES and side effect list
sample_smiles = drug_to_smiles[sample_drug]
sample_se_list = drug_to_se[sample_drug]

# Convert side effect list to label tensor
sample_label_tensor = get_label_tensor(sample_se_list, se_to_index)

# Convert to graph data
sample_graph = smiles_to_graph(sample_smiles, sample_label_tensor)

# Check the graph object
print(sample_graph)
print("Node features shape:", sample_graph.x.shape)
print("Edge index shape:", sample_graph.edge_index.shape)
print("Label tensor shape:", sample_graph.y.shape)

Sample drug: Thioguanine
Data(x=[11, 1], edge_index=[2, 24], y=[1, 356])
Node features shape: torch.Size([11, 1])
Edge index shape: torch.Size([2, 24])
Label tensor shape: torch.Size([1, 356])


In [68]:
from tqdm import tqdm

dataset = []

for drug in tqdm(common_drugs):
    smiles = drug_to_smiles.get(drug, None)
    se_list = drug_to_se.get(drug, [])
    if smiles is None or not se_list:
        continue
    label_tensor = get_label_tensor(se_list, se_to_index)
    graph = smiles_to_graph(smiles, label_tensor)
    if graph is not None:
        dataset.append(graph)

print(f"✅ Created dataset with {len(dataset)} graph objects.")

100%|██████████| 749/749 [00:00<00:00, 942.97it/s]

✅ Created dataset with 749 graph objects.





In [69]:
import torch
torch.save(dataset, "drug_side_effect_graphs.pt")
print("Dataset saved as drug_side_effect_graphs.pt")

Dataset saved as drug_side_effect_graphs.pt


In [70]:
from torch_geometric.data import DataLoader

dataloader = DataLoader(dataset, batch_size=32, shuffle=True)



In [71]:
for data in dataloader:
    print(data.x.shape)          # Node features for all nodes in the batch
    print(data.edge_index.shape) # Edge connections for all edges in the batch
    print(data.y.shape)          # Labels for all graphs in the batch
    print(data.batch.shape)      # Maps each node to its graph in the batch
    break  # To print just the first batch info

torch.Size([687, 1])
torch.Size([2, 1438])
torch.Size([32, 356])
torch.Size([687])


In [72]:
print(dataset[0].y.shape)  # should be [num_side_effects]
print(dataset[0].y)

torch.Size([1, 356])
tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       

In [73]:
from torch_geometric.data import Batch

batch = Batch.from_data_list(dataset[:2])
print(batch.y.shape)

torch.Size([2, 356])


Define the GNN model architecture

In [74]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class GNN(torch.nn.Module):
    def __init__(self, num_node_features, hidden_dim, num_side_effects):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(num_node_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.lin = torch.nn.Linear(hidden_dim, num_side_effects)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)  # pooling over nodes in each graph
        out = self.lin(x)
        return out

In [75]:
print(dataset[0].x.shape[1])

1


In [76]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GNN(num_node_features=1, hidden_dim=64, num_side_effects=len(se_to_index)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCEWithLogitsLoss()

def train():
    model.train()
    total_loss = 0
    for data in dataloader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
    return total_loss / len(dataloader.dataset)

for epoch in range(1, 51):
    loss = train()
    print(f"Epoch {epoch}, Loss: {loss:.4f}")

Epoch 1, Loss: 0.6237
Epoch 2, Loss: 0.4748
Epoch 3, Loss: 0.4365
Epoch 4, Loss: 0.4319
Epoch 5, Loss: 0.4308
Epoch 6, Loss: 0.4303
Epoch 7, Loss: 0.4300
Epoch 8, Loss: 0.4299
Epoch 9, Loss: 0.4294
Epoch 10, Loss: 0.4304
Epoch 11, Loss: 0.4295
Epoch 12, Loss: 0.4293
Epoch 13, Loss: 0.4291
Epoch 14, Loss: 0.4293
Epoch 15, Loss: 0.4289
Epoch 16, Loss: 0.4285
Epoch 17, Loss: 0.4290
Epoch 18, Loss: 0.4280
Epoch 19, Loss: 0.4283
Epoch 20, Loss: 0.4278
Epoch 21, Loss: 0.4278
Epoch 22, Loss: 0.4277
Epoch 23, Loss: 0.4275
Epoch 24, Loss: 0.4276
Epoch 25, Loss: 0.4272
Epoch 26, Loss: 0.4272
Epoch 27, Loss: 0.4267
Epoch 28, Loss: 0.4267
Epoch 29, Loss: 0.4267
Epoch 30, Loss: 0.4264
Epoch 31, Loss: 0.4264
Epoch 32, Loss: 0.4274
Epoch 33, Loss: 0.4260
Epoch 34, Loss: 0.4254
Epoch 35, Loss: 0.4251
Epoch 36, Loss: 0.4252
Epoch 37, Loss: 0.4248
Epoch 38, Loss: 0.4249
Epoch 39, Loss: 0.4240
Epoch 40, Loss: 0.4244
Epoch 41, Loss: 0.4250
Epoch 42, Loss: 0.4237
Epoch 43, Loss: 0.4230
Epoch 44, Loss: 0.42

In [77]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

print(f"Train size: {len(train_data)}, Test size: {len(test_data)}")

Train size: 599, Test size: 150


In [78]:
from torch_geometric.data import DataLoader

batch_size = 32

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

print(f"Train batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")

Train batches: 19
Test batches: 5


In [79]:
def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data)
            loss = criterion(out, data.y)
            total_loss += loss.item() * data.num_graphs

            # For binary multi-label, threshold outputs at 0 (logits)
            preds = (torch.sigmoid(out) > 0.5).float()
            correct = (preds == data.y).sum().item()
            total_correct += correct
            total_samples += data.y.numel()

    avg_loss = total_loss / len(loader.dataset)
    accuracy = total_correct / total_samples
    return avg_loss, accuracy

In [80]:
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

Test Loss: 0.4199, Test Accuracy: 0.8232


In [81]:
import pickle
import random

# Load the pickle file
with open('drug_name_to_side_effects.pkl', 'rb') as file:
    data = pickle.load(file)

# Check the type of data
print(f"Data type: {type(data)}")

# If it's a dictionary, sample random keys and print their values
if isinstance(data, dict):
    sample_keys = random.sample(list(data.keys()), 5)  # get 5 random keys
    for key in sample_keys:
        print(f"Drug: {key}")
        print(f"Side Effects: {data[key]}")
        print("-" * 40)
else:
    # If it's a list or DataFrame-like, print some random rows differently
    print(data[:5])  # just print first 5 if list

Data type: <class 'dict'>
Drug: Troglitazone
Side Effects: {'Weight increased', 'Shock', 'Anaemia', 'Abdominal pain', 'Liver function test abnormal', 'Hepatitis', 'Alanine aminotransferase increased', 'Decreased appetite', 'Dizziness', 'Loss of consciousness', 'Asthenia', 'Hepatic function abnormal', 'Gastrointestinal pain', 'Fatigue', 'Discomfort', 'Oedema', 'Vomiting', 'Cardiac failure congestive', 'Hyperglycaemia', 'Feeling abnormal', 'Nausea', 'Body temperature increased', 'Jaundice', 'Syncope', 'Ill-defined disorder'}
----------------------------------------
Drug: Tinidazole
Side Effects: {'Coma', 'Dermatitis', 'Urinary tract infection', 'Musculoskeletal discomfort', 'Dyspepsia', 'Pruritus', 'Rash', 'Sensory loss', 'Dysuria', 'Haemoglobin decreased', 'Urticaria', 'Myalgia', 'Palpitations', 'Angiopathy', 'Neutropenia', 'Insomnia', 'Hypersensitivity', 'Headache', 'Hypoaesthesia', 'Malnutrition', 'Dyspnoea', 'Abdominal pain', 'Candida infection', 'Menorrhagia', 'Thrombocytopenia', 'P

In [82]:
import pickle

# Load the pickle file
with open('pubchem_output_v2.pkl', 'rb') as f:
    data = pickle.load(f)

# Check what data contains
print(type(data))
print(data)

<class 'pandas.core.frame.DataFrame'>
           cid                                           cmpdname  \
0           85                                        Carnitinium   
1          119                            gamma-Aminobutyric acid   
2          137                                Aminolevulinic acid   
3          158                             8-Iso PROSTAGLANDIN E2   
4          159  5-[5-Hydroxy-4-(3-hydroxyoct-1-enyl)-3,3a,4,5,...   
...        ...                                                ...   
1338  56603655                                         Pegaptanib   
1339  56842239                                omega-3 Fatty Acids   
1340  70683024                                    Lipegfilgrastim   
1341  70695640                                      Colestyramine   
1342  71306834                                 Interferon alfa-2B   

                                            cmpdsynonym      mw  \
0     carnitinium|Carnitine|461-05-2|CHEBI:3424|3-ca...  162.21   

In [83]:
import pandas as pd
import pickle
import random

# Load your data
with open('drug_name_to_side_effects.pkl', 'rb') as f:
    drug_to_se = pickle.load(f)  # dict: drug name -> set of side effects

# Assuming your DataFrame is loaded as `df`
# For example: df = pd.read_pickle('pubchem_output_v2.pkl')

# Map side effects using compound name (cmpdname)
def get_side_effects(drug_name):
    # Side effects keys may be case sensitive; normalize
    # Also strip and handle possible missing values
    if not isinstance(drug_name, str):
        return None
    key = drug_name.strip()
    return drug_to_se.get(key)

# Create new column for side effects
df['side_effects'] = df['cmpdname'].apply(get_side_effects)

# Filter to rows where side effects exist
df_with_se = df[df['side_effects'].notnull()]

# Show some random rows with mf and side effects only
print(df_with_se[['mf', 'side_effects']].sample(5))


                 mf                                       side_effects
15             Ca+2  {Coma, Arrhythmia, Pruritus, Hypersensitivity,...
917  C53H74N10O14S2  {Dermatitis, Hepatobiliary disease, Anaphylact...
876       C18H23NO3  {Headache, Ventricular extrasystoles, Dyspnoea...
541      C21H26N2O7  {Dermatitis, Hepatobiliary disease, Musculoske...
736      C12H7Cl3O2                                 {Hypersensitivity}


In [84]:
# Save the filtered DataFrame with mf and side_effects as a pickle file
df_with_se[['mf', 'side_effects']].to_pickle('mf_with_side_effects.pkl')
print("Saved filtered data with molecular formulas and side effects as 'mf_with_side_effects.pkl'")

Saved filtered data with molecular formulas and side effects as 'mf_with_side_effects.pkl'


In [85]:
import pandas as pd
import pickle

# Load pubchem dataframe
df_pubchem = pd.read_pickle('pubchem_output_v2.pkl')

# Load drug_name_to_side_effects dictionary
with open('drug_name_to_side_effects.pkl', 'rb') as f:
    drug_to_se = pickle.load(f)

In [86]:
# Convert dict to DataFrame: columns = drug_name, side_effects (set)
df_side_effects = pd.DataFrame(list(drug_to_se.items()), columns=['drug_name', 'side_effects'])

In [87]:
# Merge on drug name and compound name
merged_df = pd.merge(df_side_effects, df_pubchem, left_on='drug_name', right_on='cmpdname', how='inner')

In [88]:
result_df = merged_df[['mf', 'drug_name', 'side_effects']]

In [89]:
print(result_df.sample(5))

               mf        drug_name  \
211    C13H6Cl6O2  Hexachlorophene   
613    C36H41N3O6    Lercanidipine   
126       C20H21N  Cyclobenzaprine   
498            Sm         Samarium   
346  C21H26ClN3OS     Perphenazine   

                                          side_effects  
211  {Dermatitis, Erythema, Photosensitivity reaction}  
613  {Dermatitis, Feeling hot, Abdominal pain upper...  
126  {Dermatitis, Tremor, Extrapyramidal disorder, ...  
498  {Dermatitis, Anaphylactic shock, Arrhythmia, H...  
346  {Extrapyramidal disorder, Hypoglycaemia, Pain,...  


In [90]:
result_df.to_pickle('mf_drugname_sideeffects.pkl')

In [91]:
import re
from collections import defaultdict

def parse_mf(mf):
    # Parse molecular formula like C4H9NO2 -> {'C':4, 'H':9, 'N':1, 'O':2}
    elements = re.findall(r'([A-Z][a-z]*)(\d*)', mf)
    counts = defaultdict(int)
    for (elem, count) in elements:
        counts[elem] += int(count) if count else 1
    return counts

# Example usage
mf = 'C4H9NO2'
counts = parse_mf(mf)
print(counts)  # {'C': 4, 'H': 9, 'N': 1, 'O': 2}

defaultdict(<class 'int'>, {'C': 4, 'H': 9, 'N': 1, 'O': 2})


In [92]:
import pandas as pd

# Let's define a common list of elements to consider (based on your dataset)
element_list = ['C', 'H', 'N', 'O', 'S', 'Cl', 'P', 'F', 'Br', 'I', 'Na', 'K', 'Ca', 'Mg']

def mf_to_vector(mf, element_list):
    counts = parse_mf(mf)
    return [counts.get(elem, 0) for elem in element_list]

# Example usage
mf = 'C4H9NO2'
vector = mf_to_vector(mf, element_list)
print(vector)

[4, 9, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [93]:
print(result_df.head())

         mf       drug_name                                       side_effects
0   C6H12O6          Hexose  {Coma, Pulmonary embolism, Dehydration, Shock,...
1       H3N         Ammonia  {Extravasation, Infection, Body temperature in...
2     C7H8O  Benzyl alcohol  {Dermatitis, Hypoaesthesia, Eye irritation, Er...
3  C5H11NO2         Betaine  {Urticaria, Headache, Malnutrition, Mental dis...
4      Ca+2     Calcium ion  {Coma, Arrhythmia, Pruritus, Hypersensitivity,...


In [94]:
result_df.to_pickle('mf_drugname_sideeffects.pkl')

In [95]:
# Step 1: Get unique elements across all MFs
from collections import defaultdict
import pandas as pd

def parse_mf(mf):
    import re
    elements = re.findall(r'([A-Z][a-z]*)(\d*)', mf)
    counts = defaultdict(int)
    for (elem, count) in elements:
        counts[elem] += int(count) if count else 1
    return counts

# Load your dataset
df = pd.read_pickle("mf_drugname_sideeffects.pkl")

# Extract unique elements
all_elements = set()
df['parsed_mf'] = df['mf'].apply(parse_mf)
df['parsed_mf'].apply(lambda d: all_elements.update(d.keys()))
element_list = sorted(list(all_elements))

# Convert to fixed-length vector
def mf_to_vector(mf_dict, element_list):
    return [mf_dict.get(el, 0) for el in element_list]

df['features'] = df['parsed_mf'].apply(lambda x: mf_to_vector(x, element_list))

In [96]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Features
X = list(df['features'])

# Labels (side effects → multi-label binarization)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['side_effects'])

# Store drug names for later use
drug_names = list(df['drug_name'])

# Split
X_train, X_test, y_train, y_test, drug_train, drug_test = train_test_split(
    X, y, drug_names, test_size=0.2, random_state=42
)

In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

In [99]:
import numpy as np

# Example prediction
idx = 0  # or any index in your test set
predicted_labels = mlb.inverse_transform(np.array([y_pred[idx]]))[0]

print("Drug:", drug_test[idx])
print("Predicted Side Effects:", predicted_labels)

Drug: (4R,5S,6S)-3-(((3S,5S)-5-(Dimethylcarbamoyl)-3-pyrrolidinyl)thio)-6-((1R)-1-hydroxyethyl)-4-methyl-7-oxo-1-azabicyclo(3.2.0)hept-2-ene-2-carboxylic acid, trihydrate
Predicted Side Effects: ('Abdominal pain', 'Angioedema', 'Asthenia', 'Dermatitis', 'Diarrhoea', 'Dizziness', 'Dyspepsia', 'Gastrointestinal disorder', 'Headache', 'Hyperhidrosis', 'Hypersensitivity', 'Hypertension', 'Leukopenia', 'Musculoskeletal discomfort', 'Nausea', 'Nervous system disorder', 'Pain', 'Pruritus', 'Rash', 'Thrombocytopenia', 'Urticaria', 'Vomiting')


In [101]:
def predict_side_effects(mf, model, mlb, element_list, df):
    import numpy as np

    # Convert MF to feature vector
    def parse_mf(mf):
        import re
        from collections import defaultdict
        elements = re.findall(r'([A-Z][a-z]*)(\d*)', mf)
        counts = defaultdict(int)
        for (elem, count) in elements:
            counts[elem] += int(count) if count else 1
        return counts

    def mf_to_vector(mf, element_list):
        counts = parse_mf(mf)
        return [counts[elem] for elem in element_list]

    # Create vector from input mf
    vector = mf_to_vector(mf, element_list)
    vector = np.array(vector).reshape(1, -1)

    # Predict side effects
    y_pred = model.predict(vector)
    predicted_labels = mlb.inverse_transform(y_pred)[0]

    # Lookup drug name from dataframe
    drug_name = df[df['mf'] == mf]['drug_name'].values
    drug_name = drug_name[0] if len(drug_name) > 0 else "Unknown Drug"

    return drug_name, predicted_labels

In [106]:
def predict_side_effects(mf, model, mlb, element_list, df):
    import numpy as np
    import re
    from collections import defaultdict

    # Parse molecular formula
    def parse_mf(mf):
        elements = re.findall(r'([A-Z][a-z]*)(\d*)', mf)
        counts = defaultdict(int)
        for (elem, count) in elements:
            counts[elem] += int(count) if count else 1
        return counts

    # Convert mf to vector
    def mf_to_vector(mf, element_list):
        counts = parse_mf(mf)
        return [counts[elem] for elem in element_list]

    # Check if exact MF exists in dataframe
    match = df[df['mf'].str.strip() == mf.strip()]
    if match.empty:
        return None, None  # No exact match found

    # Create feature vector
    vector = mf_to_vector(mf, element_list)
    vector = np.array(vector).reshape(1, -1)

    # Predict side effects
    y_pred = model.predict(vector)
    predicted_labels = mlb.inverse_transform(y_pred)[0]

    # Get drug name from dataframe
    drug_name = match.iloc[0]['drug_name']

    return drug_name, predicted_labels

In [107]:
while True:
    # Take input from user
    user_mf = input("Enter a molecular formula (e.g., C4H9NO2) or type 'exit' to quit: ").strip()

    if user_mf.lower() == 'exit':
        print("Exiting...")
        break

    try:
        drug_name, predicted_se = predict_side_effects(user_mf, model, mlb, element_list, result_df)

        if drug_name is None:
            print("\n⚠️ No match found for the molecular formula:", user_mf)
        else:
            print("\n📌 Drug Name:", drug_name)
            print("💊 Predicted Side Effects:")
            for se in predicted_se:
                print("  -", se)
        print("\n" + "-"*40 + "\n")

    except Exception as e:
        print("⚠️ Error:", e)
        print("Please enter a valid molecular formula.\n")

Enter a molecular formula (e.g., C4H9NO2) or type 'exit' to quit: h3n

⚠️ No match found for the molecular formula: h3n

----------------------------------------

Enter a molecular formula (e.g., C4H9NO2) or type 'exit' to quit: H3N

📌 Drug Name: Ammonia
💊 Predicted Side Effects:
  - Body temperature increased
  - Extravasation
  - Infection
  - Pain
  - Phlebitis

----------------------------------------

Enter a molecular formula (e.g., C4H9NO2) or type 'exit' to quit: C6H12O6

📌 Drug Name: Hexose
💊 Predicted Side Effects:
  - Abscess
  - Agitation
  - Asthenia
  - Body temperature increased
  - Cardiac failure congestive
  - Coma
  - Convulsion
  - Dehydration
  - Dizziness
  - Extravasation
  - Feeling abnormal
  - Fluid retention
  - Headache
  - Hypertension
  - Hypokalaemia
  - Irritability
  - Lacrimation increased
  - Loss of consciousness
  - Nervous system disorder
  - Oedema
  - Pain
  - Phlebitis
  - Pulmonary embolism
  - Respiratory failure
  - Salivary hypersecretion
  

In [108]:
import joblib

joblib.dump(model, 'model.joblib')

['model.joblib']

In [109]:
import joblib

# Save your model
joblib.dump(model, 'model.joblib')

# Save multilabel binarizer
joblib.dump(mlb, 'mlb.joblib')

# Save element list (if it's a Python list)
joblib.dump(element_list, 'element_list.joblib')

# Save your DataFrame too if needed
joblib.dump(result_df, 'result_df.joblib')

['result_df.joblib']

In [113]:
import zipfile
import os

zip_filename = "all_files.zip"
folder_to_zip = '/content/'  # current directory in Colab

with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(folder_to_zip):
        for file in files:
            # Skip the zip file itself if it exists in the folder to avoid recursion
            if file == zip_filename:
                continue
            file_path = os.path.join(root, file)
            zipf.write(file_path, arcname=os.path.relpath(file_path, folder_to_zip))

print(f"Created {zip_filename}")

Created all_files.zip


In [115]:
from google.colab import files
files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>