# Parsing SwissLipids into a network for LipiNet

In [1]:
import lipinet.databases  # Import the module
import importlib

# Reload the module to ensure changes are picked up
importlib.reload(lipinet)

# Now can use the functions after reloading the module
from lipinet.databases import get_prior_knowledge
from lipinet.utils import split_and_expand_large, create_nodedf_from_edgedf, check_for_split_characters

import pandas as pd

## Parsing the manual way

LipiNet offers conventient functions to parse prior knowledge resources straight into networks. But to show what is happening behind the scenes, this notebook goes through the data and each of the steps. Which may also be particularly helpful to you if you need to customise the networks in a way that is not yet supported by LipiNet directly. 

In [2]:
df_swisslipids = get_prior_knowledge('swisslipids', verbose=True)
df_swisslipids

File found locally at /Users/agjanyunlu/Documents/Metabolomics/lipinet/lipinet/.data/downloaded/swisslipids_lipids.tsv. Loading data...
Before cleaning, number of values in lipid class column with trailing space: Lipid class*
False    779171
True         76
Name: count, dtype: int64
After cleaning, number of values in lipid class column with trailing space: Lipid class*
False    779247
Name: count, dtype: int64


Unnamed: 0,Lipid ID,Level,Name,Abbreviation*,Synonyms*,Lipid class*,Parent,Components*,SMILES (pH7.3),InChI (pH7.3),...,Exact m/z of [M+Li]+,Exact m/z of [M+NH4]+,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID
0,SLM:000000002,Class,Ceramide (iso-d17:1(4E)),Cer(iso-d17:1(4E)),N-acyl-15-methylhexadecasphing-4-enine,SLM:000399814,,,CC(C)CCCCCCCCC\C=C\[C@@H](O)[C@H](CO)NC([*])=O,InChI=none,...,,,,,,70846,,,MNXM97012,| 11443131 | 14685263 | 18390550 | 21325339 |...
1,SLM:000000003,Isomeric subspecies,15-methylhexadecasphing-4-enine,,,SLM:000390097,,,CC(C)CCCCCCCCC\C=C\[C@@H](O)[C@@H]([NH3+])CO,InChI=1S/C17H35NO2/c1-15(2)12-10-8-6-4-3-5-7-9...,...,292.282235,303.300605,284.259503,320.236181,344.280632,70771,,,MNXM57784,19372430
2,SLM:000000006,Isomeric subspecies,15-methylhexadecasphinganine,,,SLM:000390097,,,CC(C)CCCCCCCCCCC[C@@H](O)[C@@H]([NH3+])CO,InChI=1S/C17H37NO2/c1-15(2)12-10-8-6-4-3-5-7-9...,...,294.297885,305.316255,286.275153,322.251831,346.296282,70829,,,MNXM97029,19372430
3,SLM:000000007,Class,Sphingomyelin (iso-d17:1(4E)),SM(iso-d17:1(4E)),N-acyl-15-methylhexadecasphing-4-enine-1-phosp...,SLM:000001000,,,CC(C)CCCCCCCCC\C=C\[C@@H](O)[C@H](COP([O-])(=O...,InChI=none,...,,,,,,70775,,,MNXM97113,14685263 | 21926990 | 9603947
4,SLM:000000035,Isomeric subspecies,sphinganine,,,SLM:000390097,,,CCCCCCCCCCCCCCC[C@@H](O)[C@@H]([NH3+])CO,InChI=1S/C18H39NO2/c1-2-3-4-5-6-7-8-9-10-11-12...,...,308.313535,319.331905,300.290803,336.267481,360.311932,57817,LMSP01020001,HMDB00269,MNXM302,10652340 | 10702247 | 10751414 | 10802064 | 10...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
779244,SLM:000782324,,apo carotenoid,,,SLM:000508864,,,,,...,,,,,,53183,,,,
779245,SLM:000782325,,terpenoid,,,SLM:000508864,,,,,...,,,,,,26873,,,,
779246,SLM:000782326,,C-45 isoprenoid,,,SLM:000508864,,,,,...,,,,,,87168,,,,
779247,SLM:000782327,,gamma-lactone,,,SLM:000782238,,,O1C(C(C(C1=O)*)*)*,,...,,,,,,37581,,,,


If we take a closer look into the data, especially the `Lipid class*` column, we will see that some of the values have multiple entries. For example Ceramide phosphoinositol is a Class level entry that itself belongs to both the SLM:000000834 and SLM:000399815 classes.

In [3]:
df_swisslipids.dropna(subset='Lipid class*')[df_swisslipids['Lipid class*'].dropna().str.contains('|', regex=False)]

Unnamed: 0,Lipid ID,Level,Name,Abbreviation*,Synonyms*,Lipid class*,Parent,Components*,SMILES (pH7.3),InChI (pH7.3),...,Exact m/z of [M+Li]+,Exact m/z of [M+NH4]+,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID
142,SLM:000000392,Class,Ceramide phosphoinositol,IPC,Inositol-1-phosphoceramide,SLM:000000834 | SLM:000399815,,,O[C@H]([*])[C@H](COP([O-])(=O)O[C@H]1[C@H](O)[...,InChI=none,...,,,,,,64916,,,,10888667 | 20727985
234,SLM:000000509,Isomeric subspecies,All-trans-retinyl hexadecanoate,,all-trans-retinyl palmitate,SLM:000000982 | SLM:000508854,,,CCCCCCCCCCCCCCCC(=O)OCC=C(C)C=CC=C(C)C=CC1=C(C...,InChI=1S/C36H60O2/c1-7-8-9-10-11-12-13-14-15-1...,...,,,,,,17616,,HMDB03648,,10769148 | 10819989 | 12230550 | 15550674 | 15...
315,SLM:000000612,,tetracosenoyl-CoA,,,SLM:000390051 | SLM:000782334,,,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...,,...,,,,,,74146,,,,18541923 | 20110363 | 20937905
317,SLM:000000614,,hexacosenoyl-CoA,,,SLM:000390051 | SLM:000782334,,,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...,,...,,,,,,74161,,,,18165233
319,SLM:000000621,,2-hydroxy-tetracosenoyl-CoA,,,SLM:000390051 | SLM:000782334,,,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...,,...,,,,,,74215,,,,18541923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755324,SLM:000758294,Class,Globoside,Globo,Globo-series,SLM:000000834 | SLM:000399813,,,,,...,,,,,,61360,,,,
755325,SLM:000758295,Class,Isogloboside,Isoglobo,Isoglobo-series,SLM:000000834 | SLM:000399813,,,,,...,,,,,,78257,,,,
779141,SLM:000782221,,Resolvin E,RvE,,SLM:000501332 | SLM:000508853,,,,InChI=none,...,,,,,,,LMFA0314,,,
779142,SLM:000782222,,Resolvin D,RvD,,SLM:000501331 | SLM:000508853,,,,InChI=none,...,,,,,,,LMFA0403,,,


What about other IDs?

In [4]:
cols_with_split_chars = check_for_split_characters(df_swisslipids, delimiter='|')

Checking split characters (|) in Lipid ID
No rows found

Checking split characters (|) in Level
No rows found

Checking split characters (|) in Name
No rows found

Checking split characters (|) in Abbreviation*
Found 9768 rows with split characters


Unnamed: 0,Lipid ID,Level,Name,Abbreviation*,Synonyms*,Lipid class*,Parent,Components*,SMILES (pH7.3),InChI (pH7.3),...,Exact m/z of [M+Li]+,Exact m/z of [M+NH4]+,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID
56,SLM:000000262,Class,"1,2-diacyl-sn-glycerol","1,2-sn-DAG | DAG | DG",Diacylglycerol,SLM:000000423,,,OC[C@@H](COC([*])=O)OC([*])=O,InChI=none,...,,,,,,17815,,,MNXM59,10336610 | 10685032 | 10888667 | 10931938 | 11...
114,SLM:000000341,Class,1-acyl-sn-glycerol,MAG | MG,Monoacylglycerol,SLM:000117130,,,OC[C@H](O)COC([*])=O,InChI=none,...,,,,,,64683,,,MNXM2963,10685032 | 15939762 | 18037386 | 8663293 | 960...
122,SLM:000000355,Class,2-acylglycerol,MAG | MG,Monoacylglycerol,SLM:000000403,,,OCC(CO)OC([*])=O,InChI=none,...,,,,,,17389,,,MNXM335,
146,SLM:000000400,Class,Triacylglycerol,TAG | TG,,SLM:000117141,,,[*]C(=O)OCC(COC([*])=O)OC([*])=O,InChI=none,...,,,,,,17855,,,MNXM248,12682047 | 16135509 | 16150821 | 21704635 | 27...
147,SLM:000000401,Class,Diacylglycerol,DAG | DG,,SLM:000117140,,,[*]OCC(CO[*])O[*],InChI=none,...,,,,,,18035,,,MNXM59,12682047 | 16135509 | 16150821 | 27247428 | 29...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505694,SLM:000508489,Molecular subspecies,Phosphatidylglycerol (O-17:1_0:0),LPG(O-17:1_0:0) | PG(O-17:1_0:0),Lysophosphatidylglycerol (O-17:1_0:0),SLM:000508807,SLM:000508779,SLM:000001333 (sn1 or sn2 or sn3),OCC(O)COP([O-])(=O)OCC(CO[*])O[*],InChI=none,...,489.316311,500.334681,481.293579,517.270257,541.314708,,,,MNXM629334,
505695,SLM:000508490,Molecular subspecies,Phosphatidylglycerol (O-15:1_0:0),LPG(O-15:1_0:0) | PG(O-15:1_0:0),Lysophosphatidylglycerol (O-15:1_0:0),SLM:000508807,SLM:000508775,SLM:000001331 (sn1 or sn2 or sn3),OCC(O)COP([O-])(=O)OCC(CO[*])O[*],InChI=none,...,461.285011,472.303381,453.262279,489.238957,513.283408,,,,MNXM628940,
505696,SLM:000508491,Molecular subspecies,Phosphatidylglycerol (O-13:1_0:0),LPG(O-13:1_0:0) | PG(O-13:1_0:0),Lysophosphatidylglycerol (O-13:1_0:0),SLM:000508807,SLM:000508771,SLM:000001329 (sn1 or sn2 or sn3),OCC(O)COP([O-])(=O)OCC(CO[*])O[*],InChI=none,...,433.253711,444.272081,425.230979,461.207657,485.252108,,,,MNXM628548,
595061,SLM:000597889,Isomeric subspecies,7-oxoresolvin D2,7-oxo-RvD2| 7-keto-RvD2,"(16R,17S)-dihydroxy-7-oxo-(4Z,8E,10Z,12E,14E,1...",SLM:000508853 | SLM:000782222,,,C(C/C=C\CC(/C=C/C=C\C=C\C=C\[C@H]([C@H](C/C=C\...,InChI=1S/C22H30O5/c1-2-3-9-16-20(24)21(25)17-1...,...,381.224780,392.243150,373.202048,409.178725,433.223177,137497,,,,22844113


Checking split characters (|) in Synonyms*
Found 19853 rows with split characters


Unnamed: 0,Lipid ID,Level,Name,Abbreviation*,Synonyms*,Lipid class*,Parent,Components*,SMILES (pH7.3),InChI (pH7.3),...,Exact m/z of [M+Li]+,Exact m/z of [M+NH4]+,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID
11,SLM:000000101,Class,"1,2-diacyl-sn-glycero-3-phospho-(1'-sn-glycero...",PA,"1,2-diacyl-sn-glycero-3-phospho-(1'-sn-glycero...",SLM:000477285,,,O[C@@H](COP([O-])([O-])=O)COP([O-])(=O)OC[C@@H...,InChI=none,...,,,,,,60110,,,MNXM871,20485265 | 9880566
17,SLM:000000147,Isomeric subspecies,N-(9Z-octadecenoyl)-ethanolamine,NAE (18:1(9Z)),(9Z-octadecenoyl)-ethanolamide | N-(9Z-octadec...,SLM:000000378,,,CCCCCCCC\C=C/CCCCCCCC(=O)NCCO,InChI=1S/C20H39NO2/c1-2-3-4-5-6-7-8-9-10-11-12...,...,332.313535,343.331905,324.290803,360.267481,384.311932,71466,,HMDB02088,MNXM107386,14634025 | 16527816 | 17015445 | 17626977 | 17...
18,SLM:000000149,Isomeric subspecies,N-hexadecanoyl-ethanolamine,NAE (16:0),hexadecanoyl-ethanolamide | N-hexadecanoyl eth...,SLM:000000378,,,CCCCCCCCCCCCCCCC(=O)NCCO,InChI=1S/C18H37NO2/c1-2-3-4-5-6-7-8-9-10-11-12...,...,306.297885,317.316255,298.275153,334.251831,358.296282,71464,,HMDB02100,MNXM107548,12824167 | 14634025 | 15655246 | 15760304 | 16...
19,SLM:000000178,Isomeric subspecies,N-(docosanoyl)-15-methylhexadecasphing-4-enine,Cer(iso-d17:1(4E)/22:0),Ceramide (iso-d17:1(4E)/22:0) | N-docosanoyl-1...,SLM:000000002,SLM:000392021,SLM:000000827 (n-acyl),CCCCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](CO)[C@H](O)\...,InChI=1S/C39H77NO3/c1-4-5-6-7-8-9-10-11-12-13-...,...,614.605801,625.624171,606.583069,642.559747,666.604198,71377,,,MNXM107026,19372430
20,SLM:000000179,Isomeric subspecies,N-(heneicosanoyl)-15-methylhexadecasphing-4-enine,Cer(iso-d17:1(4E)/21:0),Ceramide (iso-d17:1(4E)/21:0) | N-henicosanoyl...,SLM:000000002,SLM:000392020,SLM:000001207 (n-acyl),CCCCCCCCCCCCCCCCCCCCC(=O)N[C@@H](CO)[C@H](O)\C...,InChI=1S/C38H75NO3/c1-4-5-6-7-8-9-10-11-12-13-...,...,600.590151,611.608521,592.567419,628.544097,652.588548,71375,,,MNXM107036,19372430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745092,SLM:000747954,Isomeric subspecies,"CDP-1,2-di-(13-methyltetradecanoyl)-sn-glycerol",CDP-DAG (iso15:0/iso15:0),"1,2-di-(13-methyltetradecanoyl)-sn-glycero-3-c...",SLM:000000084,,SLM:000000047 (sn1 or sn2),[H]Nc1ccn([C@@H]2O[C@H](COP([O-])(=O)OP([O-])(...,InChI=1S/C42H77N3O15P2/c1-32(2)23-19-15-11-7-5...,...,932.498448,943.516818,924.475716,960.452394,984.496846,,,HMDB0116214,,
745093,SLM:000747955,Isomeric subspecies,CDP-1-(13-methyltetradecanoyl)-2-(15-methylhex...,CDP-DAG (iso15:0/iso17:0),1-(13-methyltetradecanoyl)-2-(15-methylhexadec...,SLM:000000084,,SLM:000000047 (sn1) / SLM:000000048 (sn2),[H]Nc1ccn([C@@H]2O[C@H](COP([O-])(=O)OP([O-])(...,InChI=1S/C44H81N3O15P2/c1-34(2)25-21-17-13-9-6...,...,960.529748,971.548118,952.507016,988.483694,1012.528146,,,HMDB0116216,,
745175,SLM:000748037,Isomeric subspecies,CDP-1-(15-methylhexadecanoyl)-2-(11-methyldode...,CDP-DAG (iso17:0/iso13:0),1-(15-methylhexadecanoyl)-2-(11-methyldodecano...,SLM:000000084,,SLM:000000048 (sn1) / SLM:000001197 (sn2),[H]Nc1ccn([C@@H]2O[C@H](COP([O-])(=O)OP([O-])(...,InChI=1S/C42H77N3O15P2/c1-32(2)23-19-15-11-8-6...,...,932.498448,943.516818,924.475716,960.452394,984.496846,,,HMDB0116248,,
745176,SLM:000748038,Isomeric subspecies,CDP-1-(15-methylhexadecanoyl)-2-(13-methyltetr...,CDP-DAG (iso17:0/iso15:0),1-(15-methylhexadecanoyl)-2-(13-methyltetradec...,SLM:000000084,,SLM:000000047 (sn2) / SLM:000000048 (sn1),[H]Nc1ccn([C@@H]2O[C@H](COP([O-])(=O)OP([O-])(...,InChI=1S/C44H81N3O15P2/c1-34(2)25-21-17-13-9-6...,...,960.529748,971.548118,952.507016,988.483694,1012.528146,,,HMDB0116250,,


Checking split characters (|) in Lipid class*
Found 119 rows with split characters


Unnamed: 0,Lipid ID,Level,Name,Abbreviation*,Synonyms*,Lipid class*,Parent,Components*,SMILES (pH7.3),InChI (pH7.3),...,Exact m/z of [M+Li]+,Exact m/z of [M+NH4]+,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID
142,SLM:000000392,Class,Ceramide phosphoinositol,IPC,Inositol-1-phosphoceramide,SLM:000000834 | SLM:000399815,,,O[C@H]([*])[C@H](COP([O-])(=O)O[C@H]1[C@H](O)[...,InChI=none,...,,,,,,64916,,,,10888667 | 20727985
234,SLM:000000509,Isomeric subspecies,All-trans-retinyl hexadecanoate,,all-trans-retinyl palmitate,SLM:000000982 | SLM:000508854,,,CCCCCCCCCCCCCCCC(=O)OCC=C(C)C=CC=C(C)C=CC1=C(C...,InChI=1S/C36H60O2/c1-7-8-9-10-11-12-13-14-15-1...,...,,,,,,17616,,HMDB03648,,10769148 | 10819989 | 12230550 | 15550674 | 15...
315,SLM:000000612,,tetracosenoyl-CoA,,,SLM:000390051 | SLM:000782334,,,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...,,...,,,,,,74146,,,,18541923 | 20110363 | 20937905
317,SLM:000000614,,hexacosenoyl-CoA,,,SLM:000390051 | SLM:000782334,,,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...,,...,,,,,,74161,,,,18165233
319,SLM:000000621,,2-hydroxy-tetracosenoyl-CoA,,,SLM:000390051 | SLM:000782334,,,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...,,...,,,,,,74215,,,,18541923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755324,SLM:000758294,Class,Globoside,Globo,Globo-series,SLM:000000834 | SLM:000399813,,,,,...,,,,,,61360,,,,
755325,SLM:000758295,Class,Isogloboside,Isoglobo,Isoglobo-series,SLM:000000834 | SLM:000399813,,,,,...,,,,,,78257,,,,
779141,SLM:000782221,,Resolvin E,RvE,,SLM:000501332 | SLM:000508853,,,,InChI=none,...,,,,,,,LMFA0314,,,
779142,SLM:000782222,,Resolvin D,RvD,,SLM:000501331 | SLM:000508853,,,,InChI=none,...,,,,,,,LMFA0403,,,


Checking split characters (|) in Parent
No rows found

Checking split characters (|) in Components*
No rows found

Checking split characters (|) in SMILES (pH7.3)
No rows found

Checking split characters (|) in InChI (pH7.3)
No rows found

Checking split characters (|) in InChI key (pH7.3)
No rows found

Checking split characters (|) in Formula (pH7.3)
No rows found

Checking split characters (|) in Charge (pH7.3)
Not a string column

Checking split characters (|) in Mass (pH7.3)
Not a string column

Checking split characters (|) in Exact Mass (neutral form)
Not a string column

Checking split characters (|) in Exact m/z of [M.]+
Not a string column

Checking split characters (|) in Exact m/z of [M+H]+
Not a string column

Checking split characters (|) in Exact m/z of [M+K]+ 
Not a string column

Checking split characters (|) in Exact m/z of [M+Na]+
Not a string column

Checking split characters (|) in Exact m/z of [M+Li]+
Not a string column

Checking split characters (|) in Exact m/z

Unnamed: 0,Lipid ID,Level,Name,Abbreviation*,Synonyms*,Lipid class*,Parent,Components*,SMILES (pH7.3),InChI (pH7.3),...,Exact m/z of [M+Li]+,Exact m/z of [M+NH4]+,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID
465,SLM:000000784,Isomeric subspecies,"1,2-di-(9Z-octadecenoyl)-sn-glycero-3-phosphate",PA(18:1(9Z)/18:1(9Z)),Phosphatidate (18:1(9Z)/18:1(9Z)),SLM:000000329,SLM:000082169,SLM:000000418 (sn1 or sn2),CCCCCCCC\C=C/CCCCCCCC(=O)OC[C@H](COP([O-])([O-...,InChI=1S/C39H73O8P/c1-3-5-7-9-11-13-15-17-19-2...,...,707.519775,718.538147,699.497009,735.473694,759.518188,74546 | 82922,LMGP10010962,HMDB07865,MNXM51075,11309392 | 14634025 | 14665624 | 15164764 | 15...
387185,SLM:000389154,,"(14Z,17Z,20Z,23Z,26Z)-dotriacontapentaenoate",,"Fatty acid 32:5(14Z,17Z,20Z,23Z,26Z)",SLM:000389801,,,CCCCC\C=C/C\C=C/C\C=C/C\C=C/C\C=C/CCCCCCCCCCCC...,InChI=1S/C32H54O2/c1-2-3-4-5-6-7-8-9-10-11-12-...,...,477.427836,488.446207,469.405105,505.381782,529.426234,82731 | CHEBI:82731,LMFA01030848,,,
595221,SLM:000598072,,all-trans-retinol--[retinol-binding protein],,,SLM:000000982,,,[*][C@H](N-*)C(-*)=O,InChI=none,...,,,,,,17336 | 83228,,,,20628054 | 28758396


Checking split characters (|) in LIPID MAPS
No rows found

Checking split characters (|) in HMDB
No rows found

Checking split characters (|) in MetaNetX
No rows found

Checking split characters (|) in PMID
Found 1318 rows with split characters


Unnamed: 0,Lipid ID,Level,Name,Abbreviation*,Synonyms*,Lipid class*,Parent,Components*,SMILES (pH7.3),InChI (pH7.3),...,Exact m/z of [M+Li]+,Exact m/z of [M+NH4]+,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID
0,SLM:000000002,Class,Ceramide (iso-d17:1(4E)),Cer(iso-d17:1(4E)),N-acyl-15-methylhexadecasphing-4-enine,SLM:000399814,,,CC(C)CCCCCCCCC\C=C\[C@@H](O)[C@H](CO)NC([*])=O,InChI=none,...,,,,,,70846,,,MNXM97012,| 11443131 | 14685263 | 18390550 | 21325339 |...
3,SLM:000000007,Class,Sphingomyelin (iso-d17:1(4E)),SM(iso-d17:1(4E)),N-acyl-15-methylhexadecasphing-4-enine-1-phosp...,SLM:000001000,,,CC(C)CCCCCCCCC\C=C\[C@@H](O)[C@H](COP([O-])(=O...,InChI=none,...,,,,,,70775,,,MNXM97113,14685263 | 21926990 | 9603947
4,SLM:000000035,Isomeric subspecies,sphinganine,,,SLM:000390097,,,CCCCCCCCCCCCCCC[C@@H](O)[C@@H]([NH3+])CO,InChI=1S/C18H39NO2/c1-2-3-4-5-6-7-8-9-10-11-12...,...,308.313535,319.331905,300.290803,336.267481,360.311932,57817,LMSP01020001,HMDB00269,MNXM302,10652340 | 10702247 | 10751414 | 10802064 | 10...
5,SLM:000000042,Isomeric subspecies,"cholesta-5,7-dien-3beta-ol",,,SLM:000501263,,,[H][C@@]1(CC[C@@]2([H])C3=CC=C4C[C@@H](O)CC[C@...,InChI=1S/C27H44O/c1-18(2)7-6-8-19(3)23-11-12-2...,...,391.354671,402.373042,383.331940,419.308617,443.353069,17759,LMST01010069,HMDB00032,MNXM710,10329655 | 10344195 | 10786622 | 11230174 | 16...
6,SLM:000000043,Isomeric subspecies,lathosterone,,,SLM:000501263,,,[H][C@@]12CC=C3[C@]4([H])CC[C@]([H])([C@H](C)C...,InChI=1S/C27H44O/c1-18(2)7-6-8-19(3)23-11-12-2...,...,391.354671,402.373042,383.331940,419.308617,443.353069,71550,,,MNXM97065,19531354 | 22505847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595221,SLM:000598072,,all-trans-retinol--[retinol-binding protein],,,SLM:000000982,,,[*][C@H](N-*)C(-*)=O,InChI=none,...,,,,,,17336 | 83228,,,,20628054 | 28758396
595222,SLM:000598073,,all-trans-retinyl heptanoate,,,SLM:000000982,,,C1(C)(C)C(\C=C\C(=C\C=C\C(=C\COC(CCCCCC)=O)\C)...,InChI=1S/C27H42O2/c1-7-8-9-10-16-26(28)29-21-1...,...,,,,,,138724,,,,20628054 | 28758396
595223,SLM:000598074,,2-heptanoyl-sn-glycero-3-phosphocholine,,,SLM:000000724,,,P(OC[C@@H](CO)OC(=O)CCCCCC)(=O)(OCC[N+](C)(C)C...,InChI=1S/C15H32NO7P/c1-5-6-7-8-9-15(18)23-14(1...,...,,,,,,138266,,,,20628054 | 22605381 | 28758396
595230,SLM:000598083,,12-hydroxy-(9Z)-octadecenoyl-CoA,,,SLM:000389958 | SLM:000390051,,,S(C(CCCCCCC/C=C\C[C@@H](CCCCCC)O)=O)CCNC(CCNC(...,InChI=1S/C39H68N7O18P3S/c1-4-5-6-13-16-27(47)1...,...,,,,,,139559,,,,17084870 | 27758859


Okay wow! So these are all the columns we have found with split characters...

In [5]:
cols_with_split_chars

['Abbreviation*', 'Synonyms*', 'Lipid class*', 'CHEBI', 'PMID']

We can also check for different types of characters if we know that they will be present. For instance SL uses the `/` character for Components*, but this is also used by another of columns like the lipid names themselves or smiles and inchi.

In [6]:
check_for_split_characters(df_swisslipids.drop(columns=['Name','Abbreviation*','Synonyms*','SMILES (pH7.3)','InChI (pH7.3)']), delimiter='/')

Checking split characters (/) in Lipid ID
No rows found

Checking split characters (/) in Level
No rows found

Checking split characters (/) in Lipid class*
No rows found

Checking split characters (/) in Parent
No rows found

Checking split characters (/) in Components*
Found 708725 rows with split characters


Unnamed: 0,Lipid ID,Level,Lipid class*,Parent,Components*,InChI key (pH7.3),Formula (pH7.3),Charge (pH7.3),Mass (pH7.3),Exact Mass (neutral form),...,Exact m/z of [M+Li]+,Exact m/z of [M+NH4]+,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID
164,SLM:000000422,Isomeric subspecies,SLM:000000329,SLM:000081844,SLM:000000418 (sn2) / SLM:000000510 (sn1),InChIKey=OPVZUEPSMJNLOM-QEJMHMKOSA-L,C37H69O8P,-2.0,672.913818,674.488647,...,681.504089,692.522461,673.481384,709.458069,733.502502,64839,LMGP10010032,HMDB07859,MNXM66476,10359651 | 11788596 | 12963729 | 16620771 | 17...
229,SLM:000000498,Isomeric subspecies,SLM:000000324,SLM:000105249,SLM:000000296 (sn2) / SLM:000000826 (sn1),InChIKey=KRTOMQDUKGRFDJ-ZAHDIIMDSA-M,C47H82O13P,-1.0,886.120483,886.557129,...,893.572571,904.590942,885.549866,921.526550,945.570984,133606,LMGP06010010,HMDB09815,MNXM75683,22942276 | 23097495 | 23472195 | 8300559
269,SLM:000000557,Isomeric subspecies,SLM:000000261,SLM:000088147,SLM:000000510 (sn1) / SLM:000000826 (sn2),InChIKey=PZNPLUBHRSSFHT-RRHRGVEJSA-N,C42H84NO8P,0.0,762.091980,761.593445,...,768.608887,779.627258,,796.562866,820.607300,73000,LMGP01010573,HMDB07970,MNXM69304,18195019 | 19416660 | 22923616 | 27399000
332,SLM:000000636,Isomeric subspecies,SLM:000000329,SLM:000082164,SLM:000000418 (sn1) / SLM:000000510 (sn2),InChIKey=ZSXHMDPHNCOWSV-QEJMHMKOSA-L,C37H69O8P,-2.0,672.913818,674.488647,...,681.504089,692.522461,673.481384,709.458069,733.502502,74551,LMGP10010964,,MNXM66662,16620771 | 18606822 | 19318427 | 19801371 | 20...
333,SLM:000000637,Isomeric subspecies,SLM:000000329,SLM:000082168,SLM:000000418 (sn1) / SLM:000000826 (sn2),InChIKey=XIERONXOJKEALF-PXYGFXEISA-L,C39H73O8P,-2.0,700.966980,702.519958,...,709.535400,720.553772,701.512695,737.489380,761.533813,74552,LMGP10010963,,MNXM66667,16620771 | 18606822 | 19318427 | 19801371 | 21...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745172,SLM:000748034,Isomeric subspecies,SLM:000000084,,SLM:000000048 (sn1) / SLM:000001195 (sn2),InChIKey=LJSBNBPNSBKZCI-JNOBRDIFSA-L,C33H57N3O15P2,-2.0,,799.342142,...,806.357598,817.375968,798.334866,834.311543,858.355995,,,,,
745173,SLM:000748035,Isomeric subspecies,SLM:000000084,,SLM:000000048 (sn1) / SLM:000001196 (sn2),InChIKey=ODNYDZLXLRZPCJ-GPTQCAHZSA-L,C35H61N3O15P2,-2.0,,827.373442,...,834.388898,845.407268,826.366166,862.342844,886.387295,,,,,
745174,SLM:000748036,Isomeric subspecies,SLM:000000084,,SLM:000000048 (sn1) / SLM:000000853 (sn2),InChIKey=FJIBTCUXUBRYKG-QOTCTSOZSA-L,C37H65N3O15P2,-2.0,,855.404743,...,862.420198,873.438568,854.397466,890.374144,914.418595,,,,,
745175,SLM:000748037,Isomeric subspecies,SLM:000000084,,SLM:000000048 (sn1) / SLM:000001197 (sn2),InChIKey=AIBKQADSQWEVSS-HUKRWTLJSA-L,C42H75N3O15P2,-2.0,,925.482993,...,932.498448,943.516818,924.475716,960.452394,984.496846,,,HMDB0116248,,


Checking split characters (/) in InChI key (pH7.3)
No rows found

Checking split characters (/) in Formula (pH7.3)
No rows found

Checking split characters (/) in Charge (pH7.3)
Not a string column

Checking split characters (/) in Mass (pH7.3)
Not a string column

Checking split characters (/) in Exact Mass (neutral form)
Not a string column

Checking split characters (/) in Exact m/z of [M.]+
Not a string column

Checking split characters (/) in Exact m/z of [M+H]+
Not a string column

Checking split characters (/) in Exact m/z of [M+K]+ 
Not a string column

Checking split characters (/) in Exact m/z of [M+Na]+
Not a string column

Checking split characters (/) in Exact m/z of [M+Li]+
Not a string column

Checking split characters (/) in Exact m/z of [M+NH4]+
Not a string column

Checking split characters (/) in Exact m/z of [M-H]-
Not a string column

Checking split characters (/) in Exact m/z of [M+Cl]-
Not a string column

Checking split characters (/) in Exact m/z of [M+OAc]- 
N

['Components*']

These double entries for the classes will be important to take into account for our class hierarchy, because if we don't many of these Class level entries will become disjointed in the ontology. 

To help us handle this connection we will split it into two using the `split_and_expand_large` utility function, but we will come back to this a bit later...

For now we will also add another column for components, so that later we can have both the actual component with location (e.g. sn) and a parsed version where we just have the SL 

In [7]:
df_swisslipids['Components_parsed'] = df_swisslipids['Components*']

Now we can melt to start creating the edges df

### Building the edges df

In [8]:
# # Split the 'Lipid class*' column into multiple rows
# df_swisslipids_splitexp = split_and_expand_large(
#     df_swisslipids, #.assign(from_layer_col='swisslipids')
#     split_col='Lipid class*', 
#     expand_cols=['Lipid ID', 'Level', 'Name', 'Abbreviation*',
#                     'CHEBI', 'LIPID MAPS', 'HMDB', 'MetaNetX', 'PMID','Synonyms*','Parent','Components*','Components_parsed'], #'from_layer_col'
#     delimiter='|'
# )

In [9]:
df_swisslipids_edges = pd.melt(df_swisslipids,  #df_swisslipids_splitexp
                id_vars=['Lipid ID'], 
                value_vars=['CHEBI','LIPID MAPS','HMDB','MetaNetX','PMID','Lipid class*','Abbreviation*','Synonyms*','Parent','Components*','Components_parsed'], 
                var_name='melted_column', value_name='value')
df_swisslipids_edges

Unnamed: 0,Lipid ID,melted_column,value
0,SLM:000000002,CHEBI,70846
1,SLM:000000003,CHEBI,70771
2,SLM:000000006,CHEBI,70829
3,SLM:000000007,CHEBI,70775
4,SLM:000000035,CHEBI,57817
...,...,...,...
8571734,SLM:000782324,Components_parsed,
8571735,SLM:000782325,Components_parsed,
8571736,SLM:000782326,Components_parsed,
8571737,SLM:000782327,Components_parsed,


Because this melt operation also resulted in a large number of null values, which probably mean nothing to us in this case, we will drop instances where the value is null

In [10]:
df_swisslipids_edges = df_swisslipids_edges.dropna(subset='value')
df_swisslipids_edges

Unnamed: 0,Lipid ID,melted_column,value
0,SLM:000000002,CHEBI,70846
1,SLM:000000003,CHEBI,70771
2,SLM:000000006,CHEBI,70829
3,SLM:000000007,CHEBI,70775
4,SLM:000000035,CHEBI,57817
...,...,...,...
8571494,SLM:000781997,Components_parsed,SLM:000000856 (n-acyl)
8571495,SLM:000781998,Components_parsed,SLM:000389154 (n-acyl)
8571496,SLM:000781999,Components_parsed,SLM:000485643 (n-acyl)
8571497,SLM:000782000,Components_parsed,SLM:000485644 (n-acyl)


There are still some things we need to tidy up so that it is in a suitable format for OnionNet

In [11]:
df_swisslipids_edges = df_swisslipids_edges.copy()
df_swisslipids_edges['source_layer'] = 'swisslipids'
df_swisslipids_edges.rename(columns={'Lipid ID':'source_id', 'melted_column':'target_layer', 'value':'target_id'}, inplace=True)
df_swisslipids_edges = df_swisslipids_edges[['source_layer','source_id','target_layer','target_id']]
df_swisslipids_edges['target_layer'] = df_swisslipids_edges['target_layer'].map(lambda x: 'swisslipids' if x=='Lipid class*' else f"sl_{str(x).replace(' ','').strip('*').lower()}")
#df_swisslipids_edges['target_layer'] = df_swisslipids_edges['target_layer'].map(lambda x: )
df_swisslipids_edges

Unnamed: 0,source_layer,source_id,target_layer,target_id
0,swisslipids,SLM:000000002,sl_chebi,70846
1,swisslipids,SLM:000000003,sl_chebi,70771
2,swisslipids,SLM:000000006,sl_chebi,70829
3,swisslipids,SLM:000000007,sl_chebi,70775
4,swisslipids,SLM:000000035,sl_chebi,57817
...,...,...,...,...
8571494,swisslipids,SLM:000781997,sl_components_parsed,SLM:000000856 (n-acyl)
8571495,swisslipids,SLM:000781998,sl_components_parsed,SLM:000389154 (n-acyl)
8571496,swisslipids,SLM:000781999,sl_components_parsed,SLM:000485643 (n-acyl)
8571497,swisslipids,SLM:000782000,sl_components_parsed,SLM:000485644 (n-acyl)


For rows where it is swisslipids to swisslipids, we actually want to correct this from target_layer to source_layer, because currently the target_layer in this case is actually the parent class, and ideally it would be better to have the parent point towards the children, so that way the root node should be the one with multiple outgoing edges and no incoming edges...

Be sure to only run this once, otherwise it will switch back again...

In [12]:
# Identify rows where both source_layer and target_layer are 'swisslipids'
condition = (df_swisslipids_edges["source_layer"] == "swisslipids") & (df_swisslipids_edges["target_layer"] == "swisslipids")

# Swap the columns for rows satisfying the condition
df_swisslipids_edges.loc[condition, ["source_layer", "source_id", "target_layer", "target_id"]] = df_swisslipids_edges.loc[condition, ["target_layer", "target_id", "source_layer", "source_id"]].values

# Output the modified DataFrame
df_swisslipids_edges

Unnamed: 0,source_layer,source_id,target_layer,target_id
0,swisslipids,SLM:000000002,sl_chebi,70846
1,swisslipids,SLM:000000003,sl_chebi,70771
2,swisslipids,SLM:000000006,sl_chebi,70829
3,swisslipids,SLM:000000007,sl_chebi,70775
4,swisslipids,SLM:000000035,sl_chebi,57817
...,...,...,...,...
8571494,swisslipids,SLM:000781997,sl_components_parsed,SLM:000000856 (n-acyl)
8571495,swisslipids,SLM:000781998,sl_components_parsed,SLM:000389154 (n-acyl)
8571496,swisslipids,SLM:000781999,sl_components_parsed,SLM:000485643 (n-acyl)
8571497,swisslipids,SLM:000782000,sl_components_parsed,SLM:000485644 (n-acyl)


In [13]:
df_swisslipids_edges['target_layer'].value_counts()

target_layer
swisslipids             779247
sl_abbreviation         776464
sl_components           765323
sl_components_parsed    765323
sl_synonyms             548163
sl_metanetx             505003
sl_parent               493491
sl_hmdb                  26026
sl_lipidmaps             12117
sl_chebi                  4276
sl_pmid                   3066
Name: count, dtype: int64

Now let's return to two items on our todo list:
1. splitting values that have multi-identifiers
2. trimming/parsing the components col

In [14]:
edges_with_multilinks = df_swisslipids_edges[df_swisslipids_edges['target_id'].str.contains('|', regex=False, na=False)]
edges_with_multilinks

Unnamed: 0,source_layer,source_id,target_layer,target_id
465,swisslipids,SLM:000000784,sl_chebi,74546 | 82922
387185,swisslipids,SLM:000389154,sl_chebi,82731 | CHEBI:82731
595221,swisslipids,SLM:000598072,sl_chebi,17336 | 83228
3116996,swisslipids,SLM:000000002,sl_pmid,| 11443131 | 14685263 | 18390550 | 21325339 |...
3116999,swisslipids,SLM:000000007,sl_pmid,14685263 | 21926990 | 9603947
...,...,...,...,...
6199835,swisslipids,SLM:000747954,sl_synonyms,"1,2-di-(13-methyltetradecanoyl)-sn-glycero-3-c..."
6199836,swisslipids,SLM:000747955,sl_synonyms,1-(13-methyltetradecanoyl)-2-(15-methylhexadec...
6199918,swisslipids,SLM:000748037,sl_synonyms,1-(15-methylhexadecanoyl)-2-(11-methyldodecano...
6199919,swisslipids,SLM:000748038,sl_synonyms,1-(15-methylhexadecanoyl)-2-(13-methyltetradec...


In [15]:
edges_with_multilinks.value_counts('target_layer')

target_layer
sl_synonyms        19853
sl_abbreviation     9768
sl_pmid             1318
sl_chebi               3
Name: count, dtype: int64

In [16]:
edges_with_multilinks_split = split_and_expand_large(edges_with_multilinks, 
                       split_col='target_id', 
                       expand_cols=['source_layer','source_id','target_layer'],
                       delimiter='|').drop_duplicates()
edges_with_multilinks_split

Unnamed: 0,source_layer,source_id,target_layer,target_id
0,swisslipids,SLM:000000784,sl_chebi,74546
1,swisslipids,SLM:000000784,sl_chebi,82922
2,swisslipids,SLM:000389154,sl_chebi,82731
3,swisslipids,SLM:000389154,sl_chebi,CHEBI:82731
4,swisslipids,SLM:000598072,sl_chebi,17336
...,...,...,...,...
68383,swisslipids,SLM:000748037,sl_synonyms,"CDP-DG(22:6(4Z,7Z,10Z,13Z,16Z,19Z)/18:1(11Z))"
68384,swisslipids,SLM:000748038,sl_synonyms,1-(15-methylhexadecanoyl)-2-(13-methyltetradec...
68385,swisslipids,SLM:000748038,sl_synonyms,"CDP-DG(22:6(4Z,7Z,10Z,13Z,16Z,19Z)/18:1(9Z))"
68386,swisslipids,SLM:000748039,sl_synonyms,"1,2-di-(15-methylhexadecanoyl)-sn-glycero-3-cy..."


This is good, but we also need to remember the separators in the components column

In [17]:
edges_with_multilinks2 = df_swisslipids_edges[df_swisslipids_edges['target_id'].str.contains('/', regex=False, na=False) &
                     df_swisslipids_edges['target_layer'].str.contains('sl_components', regex=False, na=False)]
edges_with_multilinks2

Unnamed: 0,source_layer,source_id,target_layer,target_id
7013405,swisslipids,SLM:000000422,sl_components,SLM:000000418 (sn2) / SLM:000000510 (sn1)
7013470,swisslipids,SLM:000000498,sl_components,SLM:000000296 (sn2) / SLM:000000826 (sn1)
7013510,swisslipids,SLM:000000557,sl_components,SLM:000000510 (sn1) / SLM:000000826 (sn2)
7013573,swisslipids,SLM:000000636,sl_components,SLM:000000418 (sn1) / SLM:000000510 (sn2)
7013574,swisslipids,SLM:000000637,sl_components,SLM:000000418 (sn1) / SLM:000000826 (sn2)
...,...,...,...,...
8537662,swisslipids,SLM:000748034,sl_components_parsed,SLM:000000048 (sn1) / SLM:000001195 (sn2)
8537663,swisslipids,SLM:000748035,sl_components_parsed,SLM:000000048 (sn1) / SLM:000001196 (sn2)
8537664,swisslipids,SLM:000748036,sl_components_parsed,SLM:000000048 (sn1) / SLM:000000853 (sn2)
8537665,swisslipids,SLM:000748037,sl_components_parsed,SLM:000000048 (sn1) / SLM:000001197 (sn2)


In [18]:
edges_with_multilinks2_split = split_and_expand_large(edges_with_multilinks2, 
                       split_col='target_id', 
                       expand_cols=['source_layer','source_id','target_layer'],
                       delimiter='/').drop_duplicates()
edges_with_multilinks2_split

Unnamed: 0,source_layer,source_id,target_layer,target_id
0,swisslipids,SLM:000000422,sl_components,SLM:000000418 (sn2)
1,swisslipids,SLM:000000422,sl_components,SLM:000000510 (sn1)
2,swisslipids,SLM:000000498,sl_components,SLM:000000296 (sn2)
3,swisslipids,SLM:000000498,sl_components,SLM:000000826 (sn1)
4,swisslipids,SLM:000000557,sl_components,SLM:000000510 (sn1)
...,...,...,...,...
3592487,swisslipids,SLM:000748036,sl_components_parsed,SLM:000000853 (sn2)
3592488,swisslipids,SLM:000748037,sl_components_parsed,SLM:000000048 (sn1)
3592489,swisslipids,SLM:000748037,sl_components_parsed,SLM:000001197 (sn2)
3592490,swisslipids,SLM:000748038,sl_components_parsed,SLM:000000047 (sn2)


Now let's also parse the brackets from the parsed components so that these can be linked directly to the other SLMs if needed

In [19]:
# Apply transformation only for rows where target_layer equals 'sl_components_parsed'
mask = edges_with_multilinks2_split['target_layer'] == 'sl_components_parsed'
edges_with_multilinks2_split.loc[mask, 'target_id'] = edges_with_multilinks2_split.loc[mask, 'target_id'].str.split('(').str[0].str.strip()
edges_with_multilinks2_split

Unnamed: 0,source_layer,source_id,target_layer,target_id
0,swisslipids,SLM:000000422,sl_components,SLM:000000418 (sn2)
1,swisslipids,SLM:000000422,sl_components,SLM:000000510 (sn1)
2,swisslipids,SLM:000000498,sl_components,SLM:000000296 (sn2)
3,swisslipids,SLM:000000498,sl_components,SLM:000000826 (sn1)
4,swisslipids,SLM:000000557,sl_components,SLM:000000510 (sn1)
...,...,...,...,...
3592487,swisslipids,SLM:000748036,sl_components_parsed,SLM:000000853
3592488,swisslipids,SLM:000748037,sl_components_parsed,SLM:000000048
3592489,swisslipids,SLM:000748037,sl_components_parsed,SLM:000001197
3592490,swisslipids,SLM:000748038,sl_components_parsed,SLM:000000047


Now we need a way to change these original rows where they had multilinks and add back the corrected ones.

In [20]:
# Identify rows with multilinks (either '|' or '/' with the specific target_layer condition)
mask_pipe = df_swisslipids_edges['target_id'].str.contains('|', regex=False, na=False)
mask_slash = (
    df_swisslipids_edges['target_id'].str.contains('/', regex=False, na=False) &
    df_swisslipids_edges['target_layer'].str.contains('sl_components', regex=False, na=False)
)
mask_problem = mask_pipe | mask_slash

# Remove these rows from the original df
df_clean = df_swisslipids_edges[~mask_problem].copy()

# Now, combine the cleaned df with the corrected edges dataframes.
# These corrected dataframes are assumed to be: 
#   - edges_with_multilinks_split
#   - edges_with_multilinks2_split
df_swisslipids_edges = pd.concat([df_clean, edges_with_multilinks_split, edges_with_multilinks2_split], ignore_index=True)

# (Optional) Drop any duplicate rows that might arise
df_swisslipids_edges = df_swisslipids_edges.drop_duplicates()

# df_final now contains the original "good" rows plus the corrected edges.
df_swisslipids_edges

Unnamed: 0,source_layer,source_id,target_layer,target_id
0,swisslipids,SLM:000000002,sl_chebi,70846
1,swisslipids,SLM:000000003,sl_chebi,70771
2,swisslipids,SLM:000000006,sl_chebi,70829
3,swisslipids,SLM:000000007,sl_chebi,70775
4,swisslipids,SLM:000000035,sl_chebi,57817
...,...,...,...,...
6890974,swisslipids,SLM:000748036,sl_components_parsed,SLM:000000853
6890975,swisslipids,SLM:000748037,sl_components_parsed,SLM:000000048
6890976,swisslipids,SLM:000748037,sl_components_parsed,SLM:000001197
6890977,swisslipids,SLM:000748038,sl_components_parsed,SLM:000000047


Now we will determine whether the edge is within the same layer (intralayer) or between different layers (interlayer)

In [21]:
def assess_edge_layertype(df):
    interlayer = df['source_layer']!=df['target_layer']
    df['interlayer'] = interlayer
    return df 

df_swisslipids_edges = assess_edge_layertype(df_swisslipids_edges)
df_swisslipids_edges

Unnamed: 0,source_layer,source_id,target_layer,target_id,interlayer
0,swisslipids,SLM:000000002,sl_chebi,70846,True
1,swisslipids,SLM:000000003,sl_chebi,70771,True
2,swisslipids,SLM:000000006,sl_chebi,70829,True
3,swisslipids,SLM:000000007,sl_chebi,70775,True
4,swisslipids,SLM:000000035,sl_chebi,57817,True
...,...,...,...,...,...
6890974,swisslipids,SLM:000748036,sl_components_parsed,SLM:000000853,True
6890975,swisslipids,SLM:000748037,sl_components_parsed,SLM:000000048,True
6890976,swisslipids,SLM:000748037,sl_components_parsed,SLM:000001197,True
6890977,swisslipids,SLM:000748038,sl_components_parsed,SLM:000000047,True


Now we will build the node df

### Building the node df

In [22]:
df_swisslipids_nodes = create_nodedf_from_edgedf(edge_df=df_swisslipids_edges, props=['layer', 'id'], cols=['layer', 'node_id'])
df_swisslipids_nodes

Unnamed: 0,layer,node_id
0,swisslipids,SLM:000000002
1,swisslipids,SLM:000000003
2,swisslipids,SLM:000000006
3,swisslipids,SLM:000000007
4,swisslipids,SLM:000000035
...,...,...
13781953,sl_components_parsed,SLM:000000853
13781954,sl_components_parsed,SLM:000000048
13781955,sl_components_parsed,SLM:000001197
13781956,sl_components_parsed,SLM:000000047


Let's also see how many are duplicates

In [23]:
df_swisslipids_nodes.value_counts(dropna=True)

layer        node_id      
swisslipids  SLM:000000353    132660
             SLM:000000377     98800
             SLM:000000102     80218
             SLM:000117148     46826
             SLM:000000400     38525
                               ...  
sl_metanetx  MNXM311776            1
             MNXM311777            1
             MNXM311778            1
             MNXM311779            1
swisslipids  SLM:000782332         1
Name: count, Length: 2783345, dtype: int64

Now let's merge the nodes with the information from earlier to create richer node attributes

In [24]:
df_swisslipids_nodes = pd.merge(df_swisslipids_nodes, df_swisslipids.assign(from_layer_col='swisslipids'),
                                left_on=['layer','node_id'], right_on=['from_layer_col','Lipid ID'],
                                how='outer')
df_swisslipids_nodes

Unnamed: 0,layer,node_id,Lipid ID,Level,Name,Abbreviation*,Synonyms*,Lipid class*,Parent,Components*,...,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID,Components_parsed,from_layer_col
0,sl_abbreviation,(5S)-HpHEPE,,,,,,,,,...,,,,,,,,,,
1,sl_abbreviation,15-KETE,,,,,,,,,...,,,,,,,,,,
2,sl_abbreviation,"(10,11S,12R)-TriHETrE",,,,,,,,,...,,,,,,,,,,
3,sl_abbreviation,"(10R)-H-(11S,12S)-EpETrE",,,,,,,,,...,,,,,,,,,,
4,sl_abbreviation,"(10R)-H-(8S,9S)-EpETrE",,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13781953,swisslipids,SLM:000782330,,,,,,,,,...,,,,,,,,,,
13781954,swisslipids,SLM:000782331,,,,,,,,,...,,,,,,,,,,
13781955,swisslipids,SLM:000782331,,,,,,,,,...,,,,,,,,,,
13781956,swisslipids,SLM:000782331,,,,,,,,,...,,,,,,,,,,


This has a lot of duplicates in it, so lets remove them, along with the from_layer_col which means nothing in this case and is just a relic of our join back with the initial df we used to create the edges (which could probably be tidied up)

In [25]:
df_swisslipids_nodes = df_swisslipids_nodes.drop_duplicates()
df_swisslipids_nodes = df_swisslipids_nodes.drop(columns='from_layer_col')
df_swisslipids_nodes

Unnamed: 0,layer,node_id,Lipid ID,Level,Name,Abbreviation*,Synonyms*,Lipid class*,Parent,Components*,...,Exact m/z of [M+NH4]+,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID,Components_parsed
0,sl_abbreviation,(5S)-HpHEPE,,,,,,,,,...,,,,,,,,,,
1,sl_abbreviation,15-KETE,,,,,,,,,...,,,,,,,,,,
2,sl_abbreviation,"(10,11S,12R)-TriHETrE",,,,,,,,,...,,,,,,,,,,
3,sl_abbreviation,"(10R)-H-(11S,12S)-EpETrE",,,,,,,,,...,,,,,,,,,,
4,sl_abbreviation,"(10R)-H-(8S,9S)-EpETrE",,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13781947,swisslipids,SLM:000782328,SLM:000782328,,oxidized 2-acylglycerol,,,SLM:000000355,,,...,,,,,167117,,,,,
13781950,swisslipids,SLM:000782329,,,,,,,,,...,,,,,,,,,,
13781953,swisslipids,SLM:000782330,,,,,,,,,...,,,,,,,,,,
13781954,swisslipids,SLM:000782331,,,,,,,,,...,,,,,,,,,,


Now we have the nodes and edges dfs for swisslipids and understand how we have arrived at them. In reality you don't have to go through this process every time, LipiNet offers a convenient function to do just this if you are interested in this same network setup.

## Using the LipiNet `parse_swisslipids` function

The LipiNet `parse_swisslipids` function automatically runs through all of the same steps as we have just covered.

In [26]:
from lipinet.parse_swisslipids import parse_swisslipids_data

sl_results = parse_swisslipids_data(verbose=False)
df_sl_nodes = sl_results['df_nodes']
df_sl_edges = sl_results['df_edges']

We can also check to make sure these are equal here for an individual entry

In [27]:
df_swisslipids_nodes.iloc[0]

layer                        sl_abbreviation
node_id                          (5S)-HpHEPE
Lipid ID                                 NaN
Level                                    NaN
Name                                     NaN
Abbreviation*                            NaN
Synonyms*                                NaN
Lipid class*                             NaN
Parent                                   NaN
Components*                              NaN
SMILES (pH7.3)                           NaN
InChI (pH7.3)                            NaN
InChI key (pH7.3)                        NaN
Formula (pH7.3)                          NaN
Charge (pH7.3)                           NaN
Mass (pH7.3)                             NaN
Exact Mass (neutral form)                NaN
Exact m/z of [M.]+                       NaN
Exact m/z of [M+H]+                      NaN
Exact m/z of [M+K]+                      NaN
Exact m/z of [M+Na]+                     NaN
Exact m/z of [M+Li]+                     NaN
Exact m/z 

In [28]:
df_sl_nodes.iloc[0]

layer                        sl_abbreviation
node_id                          (5S)-HpHEPE
Lipid ID                                 NaN
Level                                    NaN
Name                                     NaN
Abbreviation*                            NaN
Synonyms*                                NaN
Lipid class*                             NaN
Parent                                   NaN
Components*                              NaN
SMILES (pH7.3)                           NaN
InChI (pH7.3)                            NaN
InChI key (pH7.3)                        NaN
Formula (pH7.3)                          NaN
Charge (pH7.3)                           NaN
Mass (pH7.3)                             NaN
Exact Mass (neutral form)                NaN
Exact m/z of [M.]+                       NaN
Exact m/z of [M+H]+                      NaN
Exact m/z of [M+K]+                      NaN
Exact m/z of [M+Na]+                     NaN
Exact m/z of [M+Li]+                     NaN
Exact m/z 

For the first entry it looks good, what about for the entire df? We can use the `pd.testing.assert_frame_equal` function to do this. 

First we will use a null test to test equality between `df_swisslipids_nodes` and `df_swisslipids_edges`, which should obviously be `False`.

In [29]:
try:
    pd.testing.assert_frame_equal(df_swisslipids_nodes, df_swisslipids_edges)
    print('DataFrames are equal')
except AssertionError as e:
    print(e)

DataFrame are different

DataFrame shape mismatch
[left]:  (2783345, 32)
[right]: (6890979, 5)


Now let's test between `df_swisslipids_nodes` and `df_sl_nodes`, which should hopefully be `True` and not throw an error. We will also test the edges df while we're at it too.

In [30]:
try:
    pd.testing.assert_frame_equal(df_swisslipids_nodes, df_sl_nodes)
    print('DataFrames for nodes are equal')
except AssertionError as e:
    print(e)

DataFrames for nodes are equal


In [31]:
try:
    pd.testing.assert_frame_equal(df_swisslipids_edges, df_sl_edges)
    print('DataFrames for edges are equal')
except AssertionError as e:
    print(e)

DataFrames for edges are equal


Great! It looks like both approaches achieve the same df. We will use these dfs in other parts of the package.

If they are different, we can inspect the exact rows here

In [32]:
diff = df_sl_edges.merge(df_swisslipids_edges, how='outer', indicator=True)
diff_rows_edges = diff[diff['_merge'] != 'both']
diff_rows_edges

Unnamed: 0,source_layer,source_id,target_layer,target_id,interlayer,_merge


In [33]:
diff = df_sl_nodes.merge(df_swisslipids_nodes, how='outer', indicator=True)
diff_rows_nodes = diff[diff['_merge'] != 'both']
diff_rows_nodes

Unnamed: 0,layer,node_id,Lipid ID,Level,Name,Abbreviation*,Synonyms*,Lipid class*,Parent,Components*,...,Exact m/z of [M-H]-,Exact m/z of [M+Cl]-,Exact m/z of [M+OAc]-,CHEBI,LIPID MAPS,HMDB,MetaNetX,PMID,Components_parsed,_merge


These should also be the same

In [38]:
df_sl_edges[df_sl_edges['source_id']=='SLM:000389145']

Unnamed: 0,source_layer,source_id,target_layer,target_id,interlayer
1640,swisslipids,SLM:000389145,sl_chebi,18059,True
429400,swisslipids,SLM:000389145,sl_metanetx,MNXM12117,True
549344,swisslipids,SLM:000389145,swisslipids,SLM:000000436,False
549407,swisslipids,SLM:000389145,swisslipids,SLM:000000525,False
549887,swisslipids,SLM:000389145,swisslipids,SLM:000001193,False
665828,swisslipids,SLM:000389145,swisslipids,SLM:000117142,False
936914,swisslipids,SLM:000389145,swisslipids,SLM:000390054,False
1046948,swisslipids,SLM:000389145,swisslipids,SLM:000500463,False
1055230,swisslipids,SLM:000389145,swisslipids,SLM:000508860,False
1328368,swisslipids,SLM:000389145,swisslipids,SLM:000782283,False


In [39]:
df_swisslipids_edges[df_swisslipids_edges['source_id']=='SLM:000389145']

Unnamed: 0,source_layer,source_id,target_layer,target_id,interlayer
1640,swisslipids,SLM:000389145,sl_chebi,18059,True
429400,swisslipids,SLM:000389145,sl_metanetx,MNXM12117,True
549344,swisslipids,SLM:000389145,swisslipids,SLM:000000436,False
549407,swisslipids,SLM:000389145,swisslipids,SLM:000000525,False
549887,swisslipids,SLM:000389145,swisslipids,SLM:000001193,False
665828,swisslipids,SLM:000389145,swisslipids,SLM:000117142,False
936914,swisslipids,SLM:000389145,swisslipids,SLM:000390054,False
1046948,swisslipids,SLM:000389145,swisslipids,SLM:000500463,False
1055230,swisslipids,SLM:000389145,swisslipids,SLM:000508860,False
1328368,swisslipids,SLM:000389145,swisslipids,SLM:000782283,False
