In [None]:
from google.colab import files

# Colab-only step: opens the browser upload widget so the input CSVs can be
# placed in the runtime's working directory. upload() returns a dict mapping
# each uploaded filename to its raw bytes; binding it to a name keeps those
# raw contents from being echoed as the cell's output.
uploaded = files.upload()

# Show only names and sizes so it is easy to confirm the expected CSVs arrived
# under the expected names before the later read_csv calls run.
for uploaded_name, uploaded_bytes in uploaded.items():
    print(f"Uploaded {uploaded_name} ({len(uploaded_bytes)} bytes)")


In [None]:
import pandas as pd

# File paths
drugbank_file = 'drug_target_uniprot_links.csv'   # DrugBank CSV
protac_file = 'protac.csv'     # Protac CSV
output_file = 'merged.csv'


def _normalize_uniprot_key(frame, column):
    """Return a copy of ``frame`` with a clean UniProt join key in ``column``.

    Rows whose key is missing or blank are removed first. Without this,
    ``astype(str)`` turns a missing value into the literal string 'nan'
    (upper-cased to 'NAN'), and the later inner merge would pair every
    key-less DrugBank row with every key-less PROTAC row.

    Args:
        frame: DataFrame read from one of the input CSVs.
        column: Name of the column holding the UniProt accession.

    Returns:
        A new DataFrame (fresh 0..n-1 index) whose ``column`` values are
        whitespace-stripped, upper-cased, non-empty strings.

    Raises:
        KeyError: If ``column`` is not present in ``frame``.
    """
    # .copy() so the assignment below never writes into a view of ``frame``.
    keyed = frame.dropna(subset=[column]).copy()
    keyed[column] = keyed[column].astype(str).str.strip().str.upper()
    # Whitespace-only cells survive dropna but are empty after strip().
    return keyed[keyed[column] != ''].reset_index(drop=True)


# Read CSVs and normalize UniProt IDs (remove spaces, uppercase, drop missing)
df_drugbank = _normalize_uniprot_key(pd.read_csv(drugbank_file), 'UniProt ID')
df_protac = _normalize_uniprot_key(pd.read_csv(protac_file), 'Uniprot')

# protac.csv column -> ProtacRow key. Only the columns listed here are kept,
# and the dict order determines the column order of the selected frame.
protac_to_row_key = {
    'Compound ID': 'cid',
    'Uniprot': 'uniprot',
    'Target': 'poi',
    'E3 ligase': 'ligase',
    'Smiles': 'smiles',
    'DC50 (nM)': 'dc50',
    'Dmax (%)': 'dmax',
    'Molecular Weight': 'molecular_weight',
    'Exact Mass': 'exact_mass',
    'Heavy Atom Count': 'heavy_atom_count',
    'Ring Count': 'ring_count',
    'Hydrogen Bond Donor Count': 'hbond_donor_count',
    'Hydrogen Bond Acceptor Count': 'hbond_acceptor_count',
    'Molecular Formula': 'molecular_formula',
    'InChI': 'inchi',
    'InChI Key': 'inchikey',
}

# Source-side column names, in the same order as the mapping above.
protac_columns = list(protac_to_row_key)

# Project down to the mapped columns, then switch to the ProtacRow names.
df_protac_selected = df_protac[protac_columns].rename(columns=protac_to_row_key)

# Columns that should hold numbers. Any value that cannot be parsed becomes
# NaN (errors='coerce') instead of raising.
numeric_cols = [
    'cid',
    'molecular_weight',
    'exact_mass',
    'heavy_atom_count',
    'ring_count',
    'hbond_donor_count',
    'hbond_acceptor_count',
]
df_protac_selected = df_protac_selected.assign(**{
    name: pd.to_numeric(df_protac_selected[name], errors='coerce')
    for name in numeric_cols
})



In [None]:
# Inner join: keep only the DrugBank rows whose 'UniProt ID' equals the
# 'uniprot' value of at least one selected PROTAC row (one output row per
# matching pair).
merged_df = df_drugbank.merge(
    df_protac_selected,
    how='inner',
    left_on='UniProt ID',
    right_on='uniprot',
)

In [None]:
# After the inner merge 'UniProt ID' carries the same values as 'uniprot',
# so only one of the two key columns is kept.
# errors="ignore" keeps this cell re-runnable: merged_df is rebound here, so
# a second run would otherwise raise KeyError on the already-dropped column.
merged_df = merged_df.drop(columns=["UniProt ID"], errors="ignore")

# Rename the remaining DrugBank columns to snake_case keys. rename() skips
# labels that are absent, so this step is also safe to repeat.
merged_df = merged_df.rename(columns={
    "DrugBank ID": "dbid",
    "Name": "name",
    "Type": "type",
    "UniProt Name": "uniprot_name",
})

# Persist without the positional index so the CSV holds data columns only.
merged_df.to_csv(output_file, index=False)
print(f"Merged CSV saved to {output_file}")

In [None]:

# Write a size-capped head of the merged table to merged_small.csv.
df = pd.read_csv('merged.csv')

max_bytes = 1 * 1024 * 1024  # 1 MB

# Estimate the average row size from the in-memory footprint (deep=True also
# counts the contents of object/string columns). This is only a proxy for the
# on-disk CSV size; the written file is not measured here.
if len(df) > 0:
    avg_row_size = df.memory_usage(deep=True).sum() / len(df)  # in bytes
    # Never report more rows than actually exist.
    rows_to_keep = min(len(df), int(max_bytes / avg_row_size))
else:
    # Empty input: nothing to estimate, and avoids dividing by zero rows.
    avg_row_size = 0
    rows_to_keep = 0

# Report the budget from max_bytes so the message cannot drift from the cap.
budget_mb = max_bytes / (1024 * 1024)
print(f"Saving first {rows_to_keep} of {len(df)} rows (~{budget_mb:g} MB in-memory budget)")

# Slice the dataframe
df_small = df.iloc[:rows_to_keep]

# Save to CSV
df_small.to_csv('merged_small.csv', index=False)