In [17]:
# Numerical / tabular stack
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Levenshtein edit distance, aliased for use as a distance function
from Levenshtein import distance as lev_dist

from pandarallel import pandarallel

# Initialise pandarallel once, with its progress bar enabled
pandarallel.initialize(progress_bar=True)

# Root directory that the pickle input and CSV outputs are addressed relative to
folder = "/data/AIpep-clean/"

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Filtering

### new filtering with correct column

In [18]:
# Load the full sequence table.
# NOTE(review): read_pickle can execute arbitrary code — only load trusted files.
all_sequences_pkl = folder + "pickles/all_sequences_with_NN_prop_helicity-hem.pkl"
df_all = pd.read_pickle(all_sequences_pkl)

# One frame per generated set, restricted to rows flagged isPredActive and isPredNotHemolytic
gram_neg = df_all.query(
    "Set == 'generated-TL-GN-hem' and isPredActive == True and isPredNotHemolytic == True"
)
gram_pos = df_all.query(
    "Set == 'generated-TL-GP-hem' and isPredActive == True and isPredNotHemolytic == True"
)

In [54]:
# Number of gram_neg rows passing the fraction_PredHelical and HydroMoment thresholds
gram_neg.query("fraction_PredHelical > 0.8 and HydroMoment > 0.3").shape[0]

1182

In [55]:
# Number of gram_pos rows passing the fraction_PredHelical and HydroMoment thresholds
gram_pos.query("fraction_PredHelical > 0.8 and HydroMoment > 0.3").shape[0]

930

In [47]:
# Rows of the GN generated set that are flagged isPredActive
df_all.query("Set == 'generated-TL-GN-hem' and isPredActive == True").shape[0]

11247

In [48]:
# Rows of the GP generated set that are flagged isPredActive
df_all.query("Set == 'generated-TL-GP-hem' and isPredActive == True").shape[0]

11149

In [50]:
# Rows of the GN generated set flagged both isPredActive and isPredNotHemolytic
df_all.query(
    "Set == 'generated-TL-GN-hem' and isPredActive == True and isPredNotHemolytic == True"
).shape[0]

3046

In [49]:
# Rows of the GP generated set flagged both isPredActive and isPredNotHemolytic
df_all.query(
    "Set == 'generated-TL-GP-hem' and isPredActive == True and isPredNotHemolytic == True"
).shape[0]

2717

In [None]:
# The query expression must start with a valid term: a stray leading "nd " fragment made
# the whole string unparsable, so this cell raised instead of filtering.
# NOTE(review): this variant has no fraction_PredHelical criterion, and gram_neg_filtered
# is reassigned by the following cell — confirm whether this cell is still needed.
gram_neg_filtered = gram_neg.query("HydroMoment > 0.3 and (dist_Training_hem > 5 and dist_Training_hem <= 7) and dist_Test_hem > 4 and length <= 15 and D_AA == False and dist_Test_act != 0 and dist_Training_act != 0").reset_index(drop=True)

In [20]:
# Apply every selection criterion in one query, then renumber rows from 0 so that
# positional and label indexing agree on the filtered frame.
gram_neg_filtered = (
    gram_neg
    .query("fraction_PredHelical > 0.8 and HydroMoment > 0.3 and (dist_Training_hem > 5 and dist_Training_hem <= 7) and dist_Test_hem > 4 and length <= 15 and D_AA == False and dist_Test_act != 0 and dist_Training_act != 0")
    .reset_index(drop=True)
)

In [21]:
# Apply every selection criterion in one query, then renumber rows from 0 so that
# positional and label indexing agree on the filtered frame.
gram_pos_filtered = (
    gram_pos
    .query("fraction_PredHelical > 0.8 and HydroMoment > 0.3 and (dist_Training_hem > 5 and dist_Training_hem <= 7) and dist_Test_hem > 4 and length <= 15 and D_AA == False and dist_Test_act != 0 and dist_Training_act != 0")
    .reset_index(drop=True)
)

In [56]:
# Row count for gram_neg under the distance/length/D_AA criteria only
# (no fraction_PredHelical or HydroMoment terms in this query)
(
    gram_neg
    .query("(dist_Training_hem > 5 and dist_Training_hem <= 7) and dist_Test_hem > 4 and length <= 15 and D_AA == False and dist_Test_act != 0 and dist_Training_act != 0")
    .reset_index(drop=True)
    .shape[0]
)

148

In [57]:
# Row count for gram_pos under the distance/length/D_AA criteria only
# (no fraction_PredHelical or HydroMoment terms in this query)
(
    gram_pos
    .query("(dist_Training_hem > 5 and dist_Training_hem <= 7) and dist_Test_hem > 4 and length <= 15 and D_AA == False and dist_Test_act != 0 and dist_Training_act != 0")
    .reset_index(drop=True)
    .shape[0]
)

160

In [23]:
# Number of rows remaining in gram_pos_filtered
gram_pos_filtered.shape[0]

78

In [24]:
# List the available column names (used to choose the export columns)
gram_pos_filtered.columns

Index(['ID', 'Name', 'N terminus', 'Sequence', 'C terminus', 'targetSpecies',
       'baumannii', 'aureus', 'aeruginosa', 'activity', 'Set', 'hemolysis',
       'len_hemolysis', 'isNotHemolytic', 'Hemolysis_pred', 'Repetition',
       'inTraining', 'prediction', 'isPredActive', 'prediction_hem', 'SMILES',
       'MAP4', 'map_dist_Training', 'map_NN_Training', 'map_dist_Test',
       'map_NN_Test', 'length', 'D_AA', 'A_fract', 'C_fract', 'D_fract',
       'E_fract', 'F_fract', 'G_fract', 'H_fract', 'I_fract', 'L_fract',
       'M_fract', 'N_fract', 'P_fract', 'K_fract', 'Q_fract', 'R_fract',
       'S_fract', 'T_fract', 'V_fract', 'W_fract', 'Y_fract', 'positive',
       'negative', 'HAC', 'HBA', 'HBD', 'hydrophobic', 'hydrophobic_patches',
       'hydrophobic_patches_num', 'hydrophobic_patches_len', 'hydro_res_fract',
       'pos_res_fract', 'HydroMoment', 'charge', 'hydrophobicity',
       'av_hydrophobicity', 'discrimination', 'fasta', 'SpiderFilename',
       'SpiderFileloc', 'SS', 

In [28]:
# Columns written to every exported CSV, in output order
cols = [
    "ID",
    "Sequence",
    "prediction",
    "prediction_hem",
    "length",
    "dist_Training_hem",
    "NN_Training_hem",
    "dist_Test_hem",
    "NN_Test_hem",
    "dist_Training_act",
    "NN_Training_act",
    "dist_Test_act",
    "NN_Test_act",
    "fraction_PredHelical",
    "HydroMoment",
    "SMILES",
    "MAP4",
]

In [29]:
# Write both filtered tables twice: first under `folder`, then under the relative data/ directory
for out_root in (folder, ""):
    gram_neg_filtered[cols].to_csv(out_root + "data/gram_neg_filtered.csv", index=False)
    gram_pos_filtered[cols].to_csv(out_root + "data/gram_pos_filtered.csv", index=False)

# Clustering

In [30]:
from rdkit.ML.Cluster.Butina import ClusterData

# Cluster the filtered sequences with lev_dist as the distance function and a threshold of 8.
# The clusters hold positions into the sequence lists, i.e. row positions of the filtered frames.
neg_sequences = gram_neg_filtered.Sequence.to_list()
pos_sequences = gram_pos_filtered.Sequence.to_list()

clusters_neg = ClusterData(neg_sequences, len(gram_neg_filtered), 8, distFunc=lev_dist)
clusters_pos = ClusterData(pos_sequences, len(gram_pos_filtered), 8, distFunc=lev_dist)

In [31]:
# Number of clusters found for the gram_neg_filtered sequences
len(clusters_neg)

11

In [32]:
# Number of clusters found for the gram_pos_filtered sequences
len(clusters_pos)

18

### checking and cleaning clusters

In [33]:
# Within each cluster keep only members whose sequence contains no "M" and whose
# dist_Training_act and dist_Test_act are both >= 5. Clusters that end up empty are
# kept as empty lists, so clusters_neg_new stays aligned with clusters_neg.
clusters_neg_new = []
for members in clusters_neg:
    kept = []
    for pos in members:
        row = gram_neg_filtered.iloc[pos]
        if "M" in row.Sequence:
            continue
        if row.dist_Training_act >= 5 and row.dist_Test_act >= 5:
            kept.append(pos)
    clusters_neg_new.append(kept)

In [43]:
# Within each cluster keep only members whose sequence contains no "M" and whose
# dist_Training_act and dist_Test_act are both >= 5. Clusters that end up empty are
# kept as empty lists, so clusters_pos_new stays aligned with clusters_pos.
clusters_pos_new = []
for members in clusters_pos:
    kept = []
    for pos in members:
        row = gram_pos_filtered.iloc[pos]
        if "M" in row.Sequence:
            continue
        if row.dist_Training_act >= 5 and row.dist_Test_act >= 5:
            kept.append(pos)
    clusters_pos_new.append(kept)

In [44]:
def _pick_cluster_representatives(clusters, extra_pick_min_size=6):
    """Choose representative member positions from each non-empty cluster.

    For every non-empty cluster the first member is taken. A cluster with more
    than ``extra_pick_min_size`` members additionally contributes one other
    member drawn with ``np.random.choice`` (global NumPy RNG state).

    Parameters
    ----------
    clusters : iterable of sequences of int
        Member positions per cluster; empty clusters are skipped.
    extra_pick_min_size : int, default 6
        A second member is drawn only when ``len(cluster)`` exceeds this value.

    Returns
    -------
    list of int
        Flat list of selected positions, suitable for ``DataFrame.iloc``.
    """
    picked = []
    for members in clusters:
        if len(members) == 0:
            continue
        first = members[0]
        picked.append(first)
        if len(members) > extra_pick_min_size:
            others = list(members)
            others.remove(first)
            # choice(..., 1) yields a length-1 array; unwrap it so the result stays a
            # flat list of ints rather than a ragged mix of ints and arrays.
            picked.append(int(np.random.choice(np.array(others), 1)[0]))
    return picked


# Seed the global NumPy RNG so the extra picks are reproducible.
np.random.seed(787)

indices = []  # NOTE(review): not used in this cell — confirm whether anything else reads it

# Positive set is sampled first, then negative, so the order of RNG draws is fixed.
idx_pos = _pick_cluster_representatives(clusters_pos_new)
idx_neg = _pick_cluster_representatives(clusters_neg_new)

In [45]:
# Export the selected cluster representatives: first under `folder`, then under the relative data/ directory
for out_root in (folder, ""):
    gram_pos_filtered.iloc[idx_pos][cols].to_csv(out_root + "data/gram_pos_filtered_clustered.csv", index=False)
    gram_neg_filtered.iloc[idx_neg][cols].to_csv(out_root + "data/gram_neg_filtered_clustered.csv", index=False)

### "Random" selection from the predicted-active sequences (no helicity or HydroMoment criteria)

In [37]:
# Draw 10 rows with a fixed random_state from gram_neg filtered on the
# distance/length/D_AA criteria only (no fraction_PredHelical or HydroMoment terms).
gram_neg_filtered_rnd = (
    gram_neg
    .query("(dist_Training_hem > 5 and dist_Training_hem <= 7) and dist_Test_hem > 4 and length <= 15 and D_AA == False and dist_Test_act != 0 and dist_Training_act != 0")
    .reset_index(drop=True)
    .sample(10, random_state=0)
)

In [38]:
# Draw 10 rows with a fixed random_state from gram_pos filtered on the
# distance/length/D_AA criteria only (no fraction_PredHelical or HydroMoment terms).
gram_pos_filtered_rnd = (
    gram_pos
    .query("(dist_Training_hem > 5 and dist_Training_hem <= 7) and dist_Test_hem > 4 and length <= 15 and D_AA == False and dist_Test_act != 0 and dist_Training_act != 0")
    .reset_index(drop=True)
    .sample(10, random_state=0)
)

In [39]:
# Export the random samples: first under `folder`, then under the relative data/ directory
for out_root in (folder, ""):
    gram_pos_filtered_rnd[cols].to_csv(out_root + "data/gram_pos_rnd.csv", index=False)
    gram_neg_filtered_rnd[cols].to_csv(out_root + "data/gram_neg_rnd.csv", index=False)

In [40]:
# Stack the random sample on top of the cluster representatives so that
# overlap between the two selections can be checked.
neg_cluster_picks = gram_neg_filtered.iloc[idx_neg]
checkunique = pd.concat([gram_neg_filtered_rnd, neg_cluster_picks])

In [41]:
# Total number of rows across both selections
len(checkunique)

20

In [42]:
# Row count after removing repeated sequences; equal to the total when the selections do not overlap
checkunique.drop_duplicates(subset="Sequence").shape[0]

20