In [1]:
import re

import pandas as pd

In [2]:
# Load the consolidated CCS table; the path is relative to the notebook's
# working directory.
raw_table_path = "data/CCS_consolidated_SMILES.csv"
data = pd.read_csv(raw_table_path)

In [3]:
# Columns of interest (COI): the SMILES string and per-row metadata first,
# followed by one "CCS (<adduct>)" column per adduct.
columns_of_interest = [
    # identifier and metadata
    "SMILES",
    "outlier",
    "charge",
    "monoisotopic mass",
    "Instrument",
    # one CCS column per adduct
    "CCS (+H)",
    "CCS (+Na)",
    "CCS (+K)",
    "CCS (+NH4)",
    "CCS (+TMT)",
    "CCS (+GirardP)",
    "CCS (-H)",
    "CCS (+HCOOH-H)",
    "CCS (+Li)",
    "CCS (+Cl)",
    "CCS (-CH3COO)",
    "CCS (-CO+H)",
    "CCS (-e)",
    "CCS (-Cl)",
    "CCS (-HO)",
]
data_COI = data.loc[:, columns_of_interest]

In [4]:
# Reshape the wide table (one "CCS (<adduct>)" column per adduct) into long
# format: one output row per (input row, adduct) pair that has a CCS value.
# Done with a single vectorized melt instead of a Python loop over rows; this
# also yields a frame with the expected columns when the input is empty.
id_columns = ["SMILES", "outlier", "charge", "monoisotopic mass", "Instrument"]
ccs_columns = [col for col in data_COI.columns if col.startswith("CCS ")]

processed_data = (
    data_COI.melt(
        id_vars=id_columns,
        value_vars=ccs_columns,
        var_name="Adduct",
        value_name="CCS",
        # Keep the source row label so the original row order can be restored.
        ignore_index=False,
    )
    # Only (row, adduct) pairs with an actual CCS value become output rows.
    .dropna(subset=["CCS"])
    # melt stacks the table column by column; a stable sort on the source row
    # label gives "all adducts of row 0, then row 1, ..." with the adducts of
    # each row kept in column order.
    .sort_index(kind="mergesort")
    .reset_index(drop=True)
    # "CCS (+H)" -> "(+H)": keep the token after the first space.
    .assign(Adduct=lambda frame: frame["Adduct"].str.split(" ").str[1])
)

In [5]:
# Persist the long-format table; the DataFrame index is not written.
processed_data.to_csv(
    path_or_buf="data/CCS_consolidated_SMILES_processed.csv",
    index=False,
)

In [6]:
# Number of rows per adduct, most frequent first (shown as the cell output).
adduct_labels = processed_data["Adduct"]
adduct_labels.value_counts()

Adduct
(+H)          3349
(-H)          1726
(+Na)         1172
(+HCOOH-H)     277
(+K)            44
(+Li)           21
(+GirardP)      20
(-e)            19
(-HO)           13
(+TMT)          13
(+NH4)           7
(+Cl)            6
(-Cl)            4
(-CH3COO)        3
(-CO+H)          1
Name: count, dtype: int64

In [7]:
# Keep only rows with charge == 1, then remove rows whose adduct has fewer
# than a minimum number of rows. The charge filter runs first so that the
# per-adduct row counts are computed on the rows that are actually kept;
# counting before the charge filter could leave adducts with fewer rows than
# the threshold in the result.
MIN_ROWS_PER_ADDUCT = 3
MIN_ROWS_PER_ADDUCT_STRINGENT = 25

singly_charged = processed_data[processed_data["charge"] == 1]

# Size of each row's adduct group, aligned with the rows of `singly_charged`.
rows_per_adduct = singly_charged.groupby("Adduct")["Adduct"].transform("size")

processed_data_filtered = singly_charged[rows_per_adduct >= MIN_ROWS_PER_ADDUCT]
processed_data_filtered_stringent = singly_charged[
    rows_per_adduct >= MIN_ROWS_PER_ADDUCT_STRINGENT
]

In [8]:
# Show the filtered table as the cell output.
processed_data_filtered

Unnamed: 0,SMILES,outlier,charge,monoisotopic mass,Instrument,Adduct,CCS
0,NC1=CC=CC2=C1C=CC=C2N,0,1,158.0844,Vion IMS Q-Tof,(+H),127.9
1,CC(C)(C)C1=CC(C(C)(C)C)=C(O)C=C1,0,1,206.1671,Vion IMS Q-Tof,(-H),158.9
2,NC1=CC=CC=C1C1=CC=CC=C1,0,1,169.0891,Vion IMS Q-Tof,(+H),138.1
3,NC1=CC=CC=C1C1=CC=CC=C1,0,1,169.0891,Vion IMS Q-Tof,(-e),132.4
4,CCCCCCCCOC1=CC(O)=C(C(=O)C2=CC=CC=C2)C=C1,0,1,326.1882,Vion IMS Q-Tof,(+H),195.0
...,...,...,...,...,...,...,...
6670,NS(=O)(=O)CC1=NOC2=CC=CC=C21,0,1,212.0256,Synapt G2-Si,(+Na),147.9
6671,NS(=O)(=O)CC1=NOC2=CC=CC=C21,0,1,212.0256,Synapt G2-Si,(-H),140.3
6672,[H]C1(N2CCN(C[C@@]([H])(O)COC3=CC=CC4=C3C=CC=N...,0,1,527.2384,Synapt G2-Si,(+H),218.3
6673,NC1=NC2=C(C=CC(Cl)=C2)O1,0,1,168.0090,Synapt G2-Si,(+H),127.7


In [9]:
# Count, per element, how many rows have a SMILES string that contains at
# least one atom of that element.
#
# Plain substring checks are not reliable on SMILES: "C" also matches "Cl",
# "S" matches "Si"/"Se"/"Sn", "N" matches "Na", and aromatic atoms are written
# in lowercase ("c", "n", ...). Each SMILES is therefore split into atom
# tokens first.

# One atom token. A bracket atom ("[13C@@H]", "[Na+]", "[nH]") is consumed as
# a whole so the hydrogens and charges inside it are not rescanned; the
# optional leading digits are the isotope. Outside brackets only the organic
# subset can occur, and the two-letter symbols are tried before the one-letter
# ones so that "Cl" is never read as carbon.
SMILES_ATOM_PATTERN = re.compile(
    r"\[\d*(?P<bracket>[A-Z][a-z]?|se|as|[bcnops])[^\]]*\]"
    r"|(?P<bare>Br|Cl|[BCNOPSFI]|[bcnops])"
)

ELEMENTS_TO_COUNT = ["C", "N", "O", "S", "P", "F", "Cl", "Br", "I"]


def elements_in_smiles(smiles):
    """Return the set of element symbols that occur as atoms in ``smiles``.

    Aromatic (lowercase) atoms are reported under their element symbol,
    e.g. "c" -> "C" and "se" -> "Se".
    """
    return {
        (match.group("bracket") or match.group("bare")).capitalize()
        for match in SMILES_ATOM_PATTERN.finditer(smiles)
    }


# Missing SMILES values contribute to no element count.
elements_per_row = processed_data_filtered["SMILES"].dropna().map(elements_in_smiles)

for element in ELEMENTS_TO_COUNT:
    print(f"{element}:")
    print(sum(element in found for found in elements_per_row))

C:
6667
N:
5130
O:
6266
S:
1303
P:
513
F:
712
Cl:
927
Br:
85
I:
50


In [10]:
# Write the filtered table to disk without the DataFrame index.
filtered_csv_path = "data/CCS_consolidated_SMILES_processed_filtered.csv"
processed_data_filtered.to_csv(filtered_csv_path, index=False)

In [11]:
# Write the stringently filtered table to disk without the DataFrame index.
stringent_csv_path = "data/CCS_consolidated_SMILES_processed_filtered_stringent.csv"
processed_data_filtered_stringent.to_csv(stringent_csv_path, index=False)