In [1]:
import random
import torch.nn as nn
import torch
import time
import math
import pickle
import pandas as pd
from pandas import Series, DataFrame
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, matthews_corrcoef, f1_score, precision_score, recall_score
import random
import pickle
from rdkit.Chem import rdchem, Lipinski
from rdkit import Chem
from rdkit.Chem.rdmolfiles import MolFromFASTA, MolToSmiles, MolFromSmiles
from sklearn.model_selection import GridSearchCV
import numpy as np
import torch.optim as optim
folder2 = "/data/AIpep/"
query = "Val-Asn-Trp-Lys-Lys-Ile-Leu-Gly-Lys-Ile-Ile-Lys-Val-Val-Lys-NH2"
folder = "/../models/"
import matplotlib.pyplot as plt
from Levenshtein import distance as lev_dist
from models import Classifier
import tmap as tm
from map4 import MAP4Calculator
import os

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


# Load Classifiers

In [2]:
# Hyperparameters that identify the saved activity-classifier checkpoint.
n_embedding, n_hidden, n_layers, epoch = 100, 400, 2, 38

# The checkpoint path is derived from the hyperparameters above.
filename = folder2 + "models/RNN-classifier/em{}_hi{}_la{}_ep{}".format(n_embedding, n_hidden, n_layers, epoch)

model_activity = Classifier.load_from_file(filename)

# Run on the GPU when torch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_activity.to(device)

ClassifierRNN(
  (embedding): Embedding(42, 100)
  (rnn): GRU(100, 400, num_layers=2, batch_first=True)
  (output_layer): Linear(in_features=400, out_features=2, bias=True)
  (softmax): LogSoftmax()
)

In [3]:
# Hyperparameters that identify the saved hemolysis-classifier checkpoint.
n_embedding, n_hidden, n_layers, epoch = 100, 400, 1, 95

# The checkpoint path is derived from the hyperparameters above.
filename = folder2 + "models/RNN-classifier-hem/em{}_hi{}_la{}_ep{}".format(n_embedding, n_hidden, n_layers, epoch)

model_hemolysis = Classifier.load_from_file(filename)

# Run on the GPU when torch can see one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_hemolysis.to(device)

ClassifierRNN(
  (embedding): Embedding(42, 100)
  (rnn): GRU(100, 400, batch_first=True)
  (output_layer): Linear(in_features=400, out_features=2, bias=True)
  (softmax): LogSoftmax()
)

# Load data

In [4]:
# Build the combined table df_all (training/test rows followed by the generated
# analogs of `query`) or, when the cached pickle already exists, just reload it.
if not os.path.exists(folder + "pickles/all_sequences-hem.pkl"):
    df_training_test = pd.read_pickle(folder2+"pickles/DAASP_RNN_dataset_with_hem_and_prediction_hem.plk")
    df_generated_pdga = pd.read_pickle(f"{folder}/analogs_{query}.pkl")
    
    # Module-level counter that make_id increments to number the generated rows.
    count = 0
    def make_id(row):
        # Returns "pdga_<n>"; the row itself is ignored, n comes from the global counter.
        # NOTE(review): correct numbering relies on apply() invoking this exactly
        # once per row, in order — confirm for the pandas version in use.
        global count
        count += 1
        return f"pdga_{count}"
        

    # Score each generated sequence with both classifiers; element [:,1][0] of the
    # model output is kept as the score.
    # presumably column 1 is the positive-class probability — TODO confirm in models.Classifier
    df_generated_pdga["prediction"] = df_generated_pdga.Sequence.map(lambda x: model_activity.predict_peptide_sequence(x)[:,1][0])
    # NOTE(review): hard-coded decision threshold; its origin is not shown in this notebook.
    df_generated_pdga["isPredActive"] = df_generated_pdga["prediction"] > 0.99205756
    df_generated_pdga["prediction_hem"] = df_generated_pdga.Sequence.map(lambda x: model_hemolysis.predict_peptide_sequence(x)[:,1][0])
    # NOTE(review): hard-coded decision threshold; its origin is not shown in this notebook.
    df_generated_pdga["isPredNotHemolytic"] = df_generated_pdga["prediction_hem"] > 0.99981695
    new_ids = df_generated_pdga.apply(make_id, axis=1)
    df_generated_pdga["ID"] = new_ids
    df_all = pd.concat([df_training_test, df_generated_pdga])
    
    # Renumber rows 0..n-1 so the concatenated frame has a unique positional index.
    df_all = df_all.reset_index(drop=True)

    df_all.to_pickle(folder+"pickles/all_sequences-hem.pkl")
else:
    # Cache hit: df_training_test and df_generated_pdga are NOT defined on this path.
    df_all = pd.read_pickle(folder+"pickles/all_sequences-hem.pkl")

In [3]:
# Load the DAASP RNN dataset pickle (with hemolysis data and predictions) into df_training_test.
df_training_test = pd.read_pickle(folder2+"pickles/DAASP_RNN_dataset_with_hem_and_prediction_hem.plk")

In [4]:
# Number of rows in df_training_test.
len(df_training_test)

2262

# Find nearest neighbors (NN)

In [2]:
def find_seqNN(seq, dataframe):
    """Find the sequence in ``dataframe`` with the smallest edit distance to ``seq``.

    The distance is computed with ``lev_dist`` (Levenshtein distance) between
    ``seq`` and every entry of ``dataframe["Sequence"]``.

    Arguments:
      seq {str} -- Query sequence.
      dataframe {pandas.DataFrame} -- Frame with a "Sequence" column; its index
        may be arbitrary (e.g. a filtered subset of a larger frame).

    Returns:
      tuple -- (smallest distance, sequence at that distance). On ties the
        first row in frame order wins.

    Raises:
      ValueError -- If ``dataframe`` has no rows (argmin of an empty array).
    """
    dists = dataframe["Sequence"].map(lambda seq2 : lev_dist(seq,seq2))
    # Take the argmin of the raw values so the result is always a *position*.
    # np.argmin(Series) defers to Series.argmin, which older pandas releases
    # returned as an index label; feeding that to .iloc on a filtered frame
    # selects the wrong row or raises IndexError.
    NNi = int(np.argmin(dists.values))
    best_dist = dists.iloc[NNi]
    NN = dataframe["Sequence"].iloc[NNi]
    return best_dist, NN

In [6]:
def seq_to_smiles(seq):
    """Convert a peptide sequence string into an isomeric SMILES string.

    The sequence is parsed with RDKit's ``MolFromFASTA`` (``flavor=True``,
    sanitization enabled) and written back out with ``MolToSmiles``.
    NOTE(review): ``flavor=True`` is passed where RDKit documents an integer
    flavor — presumably flavor 1; confirm the intended handling of lowercase
    (D) residues.
    """
    return MolToSmiles(MolFromFASTA(seq, flavor=True, sanitize = True), isomericSmiles=True)

# Shared MAP4 fingerprint calculator (1024 dimensions), used by calc_map4.
MAP4 = MAP4Calculator(dimensions=1024)
def calc_map4(smiles):
    """Return the MAP4 fingerprint of a SMILES string as a numpy array.

    The SMILES is parsed with RDKit and passed to the module-level ``MAP4``
    calculator.
    """
    return np.array(MAP4.calculate(Chem.MolFromSmiles(smiles)))

In [7]:
def distance(a, b):
    """Estimates the Jaccard distance of two binary arrays based on their hashes.

    Arguments:
      a {numpy.ndarray} -- An array containing hash values.
      b {numpy.ndarray} -- An array containing hash values.

    Returns:
      float -- The estimated Jaccard distance.

    Raises:
      ZeroDivisionError -- If ``a`` is empty.
    """

    # The Jaccard distance of Minhashed values is estimated by the fraction of
    # positions at which the two hash arrays differ.
    # The builtin float is used because the np.float alias (which was the
    # builtin float) was removed in NumPy 1.24.
    return 1.0 - float(np.count_nonzero(a == b)) / float(len(a))

def find_map_seqNN(fp, dataframe):
    """Find the sequence in ``dataframe`` whose MAP4 fingerprint is closest to ``fp``.

    Closeness is measured with ``distance`` between ``fp`` and every entry of
    ``dataframe["MAP4"]``.

    Arguments:
      fp {numpy.ndarray} -- Query fingerprint.
      dataframe {pandas.DataFrame} -- Frame with "MAP4" and "Sequence" columns;
        its index may be arbitrary (e.g. a filtered subset of a larger frame).

    Returns:
      tuple -- (smallest distance, sequence at that distance). On ties the
        first row in frame order wins.

    Raises:
      ValueError -- If ``dataframe`` has no rows (argmin of an empty array).
    """
    dists = dataframe["MAP4"].map(lambda fp2 : distance(fp,fp2))
    # Take the argmin of the raw values so the result is always a *position*.
    # np.argmin(Series) defers to Series.argmin, which older pandas releases
    # returned as an index label; that is wrong for the .iloc lookups below.
    NNi = int(np.argmin(dists.values))
    best_dist = dists.iloc[NNi]
    NN = dataframe["Sequence"].iloc[NNi]
    return best_dist, NN

In [8]:
# Annotate every row of df_all with its nearest neighbour in the "training" and
# "test" subsets, first by sequence edit distance and then by MAP4 fingerprint
# distance. The result is cached on disk and reloaded when already present.
if not os.path.exists(folder+"pickles/all_sequences_with_NN_hem.pkl"):
    # Select the reference subsets once. Filtering df_all inside the lambda
    # repeated the same boolean mask for every single row.
    _nn_training_ref = df_all[df_all["Set"]=="training"]
    _nn_test_ref = df_all[df_all["Set"]=="test"]
    df_all["dist-NN-Training"] = df_all["Sequence"].parallel_map(lambda x: find_seqNN(x, _nn_training_ref))
    df_all["dist-NN-Test"] = df_all["Sequence"].parallel_map(lambda x: find_seqNN(x, _nn_test_ref))
    # find_seqNN returns (distance, sequence); split the tuples into two columns.
    df_all["dist_Training"] = df_all["dist-NN-Training"].map(lambda x: x[0])
    df_all["NN_Training"] = df_all["dist-NN-Training"].map(lambda x: x[1])
    df_all["dist_Test"] = df_all["dist-NN-Test"].map(lambda x: x[0])
    df_all["NN_Test"] = df_all["dist-NN-Test"].map(lambda x: x[1])
    del df_all["dist-NN-Training"]
    del df_all["dist-NN-Test"]
    
    df_all["SMILES"] = df_all.Sequence.parallel_map(seq_to_smiles)
    df_all["MAP4"] = df_all.SMILES.parallel_map(calc_map4)
    
    # Re-select the subsets now that the MAP4 column exists; the earlier
    # selections were taken before it was added.
    _nn_training_ref = df_all[df_all["Set"]=="training"]
    _nn_test_ref = df_all[df_all["Set"]=="test"]
    df_all["map-dist-NN-Training"] = df_all["MAP4"].parallel_map(lambda x: find_map_seqNN(x, _nn_training_ref))
    df_all["map-dist-NN-Test"] = df_all["MAP4"].parallel_map(lambda x: find_map_seqNN(x, _nn_test_ref))
    # find_map_seqNN returns (distance, sequence); split the tuples into two columns.
    df_all["map_dist_Training"] = df_all["map-dist-NN-Training"].map(lambda x: x[0])
    df_all["map_NN_Training"] = df_all["map-dist-NN-Training"].map(lambda x: x[1])
    df_all["map_dist_Test"] = df_all["map-dist-NN-Test"].map(lambda x: x[0])
    df_all["map_NN_Test"] = df_all["map-dist-NN-Test"].map(lambda x: x[1])
    del df_all["map-dist-NN-Training"]
    del df_all["map-dist-NN-Test"]
    
    df_all.to_pickle(folder+"pickles/all_sequences_with_NN_hem.pkl")
else:
    df_all = pd.read_pickle(folder+"pickles/all_sequences_with_NN_hem.pkl")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=90751), Label(value='0 / 90751')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=90751), Label(value='0 / 90751')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=90751), Label(value='0 / 90751')))…

In [4]:
# NOTE(review): this pickle is only written by the final to_pickle call further
# down, so this cell cannot run on a fresh top-to-bottom execution.
df_all = pd.read_pickle(folder+"pickles/all_sequences_with_NN_prop_helicity-hem.pkl")
# NOTE(review): a different file from the "..._with_hem_and_prediction_hem.plk"
# pickle loaded into df_training_test earlier — confirm which one is intended.
df_training_test = pd.read_pickle(folder2+"pickles/DAASP_RNN_dataset_with_prediction.plk")

In [5]:
# Recompute the sequence nearest neighbours against df_training_test (rather
# than against subsets of df_all) and store them in "_"-suffixed columns.
# The reference subsets are selected once; filtering inside the lambda repeated
# the same boolean mask for every row of df_all.
_ref_training = df_training_test[df_training_test["Set"]=="training"]
_ref_test = df_training_test[df_training_test["Set"]=="test"]
df_all["dist-NN-Training_"] = df_all["Sequence"].parallel_map(lambda x: find_seqNN(x, _ref_training))
df_all["dist-NN-Test_"] = df_all["Sequence"].parallel_map(lambda x: find_seqNN(x, _ref_test))
# find_seqNN returns (distance, sequence); split the tuples into two columns.
df_all["dist_Training_"] = df_all["dist-NN-Training_"].map(lambda x: x[0])
df_all["NN_Training_"] = df_all["dist-NN-Training_"].map(lambda x: x[1])
df_all["dist_Test_"] = df_all["dist-NN-Test_"].map(lambda x: x[0])
df_all["NN_Test_"] = df_all["dist-NN-Test_"].map(lambda x: x[1])
del df_all["dist-NN-Training_"]
del df_all["dist-NN-Test_"]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=90751), Label(value='0 / 90751')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=90751), Label(value='0 / 90751')))…

In [6]:
# Persist df_all, including the recomputed "_"-suffixed distance columns, under a separate file name.
df_all.to_pickle(folder+"pickles/all_sequences_with_NN_prop_helicity-hem_correct_dist.pkl")

# Calculate properties

In [9]:
def calc_neg(seq):
    """Count the 'D' and 'E' residues in ``seq``, ignoring letter case."""
    upper_seq = seq.upper()
    return sum(upper_seq.count(code) for code in ("D", "E"))

def calc_pos(seq):
    """Count the 'K' and 'R' residues in ``seq``, ignoring letter case."""
    upper_seq = seq.upper()
    return sum(upper_seq.count(code) for code in ("K", "R"))

def calc_aa(seq, aa):
    """Return the fraction of residues in ``seq`` that are ``aa``.

    The comparison ignores letter case on both sides. Previously only ``seq``
    was upper-cased, so a lowercase ``aa`` could never match.

    Arguments:
      seq {str} -- Peptide sequence.
      aa {str} -- Residue letter to count.

    Returns:
      float -- Occurrences of ``aa`` divided by ``len(seq)``; 0.0 for an empty
        sequence (instead of raising ZeroDivisionError).
    """
    if not seq:
        return 0.0
    return seq.upper().count(aa.upper()) / len(seq)

def calc_hac(smiles):
    """Return RDKit's Lipinski heavy-atom count for the molecule parsed from ``smiles``."""
    return Lipinski.HeavyAtomCount(MolFromSmiles(smiles))

def calc_hydr(seq):
    """Count the hydrophobic residues (A, L, I, V, M, F, C) in ``seq``.

    Each residue type is counted exactly once; the previous expression counted
    'L' twice, inflating the total for leucine-containing sequences. The
    sequence is upper-cased first, consistent with calc_pos, calc_neg and
    hydropatch, so lowercase residues are counted too.

    Arguments:
      seq {str} -- Peptide sequence.

    Returns:
      int -- Number of residues belonging to the hydrophobic set.
    """
    seq = seq.upper()
    # Same residue set as the `hydro` list used by hydropatch.
    return sum(seq.count(aa) for aa in "ALIVMFC")

def hydropatch(seq):
    """Return the lengths of the consecutive hydrophobic stretches in ``seq``.

    The sequence is upper-cased, then scanned left to right; every maximal run
    of residues from {A, L, I, V, M, F, C} contributes its length.

    Returns:
      numpy.ndarray -- Run lengths in order of appearance (empty if none).
    """
    hydrophobic = ["A", "L", "I", "V", "M", "F", "C"]
    run_lengths = []
    current_run = 0
    for residue in seq.upper():
        if residue in hydrophobic:
            current_run += 1
        elif current_run:
            # A non-hydrophobic residue closes the run in progress.
            run_lengths.append(current_run)
            current_run = 0
    # A run that reaches the end of the sequence is still open here.
    if current_run:
        run_lengths.append(current_run)
    return np.array(run_lengths)


def calc_hba(smiles):
    """Return RDKit's Lipinski H-bond acceptor count for the molecule parsed from ``smiles``."""
    return Lipinski.NumHAcceptors(MolFromSmiles(smiles))

def calc_hbd(smiles):
    """Return RDKit's Lipinski H-bond donor count for the molecule parsed from ``smiles``."""
    return Lipinski.NumHDonors(MolFromSmiles(smiles))

def mean(patches):
    """Return the mean of ``patches`` rounded to two decimals, or 0 when it is empty.

    ``patches`` must expose ``.mean()`` (e.g. a numpy array).
    """
    if not len(patches):
        return 0
    average = patches.mean()
    return round(average, 2)

# Lowercase one-letter codes; d_aa treats any of them as a marker of a D-amino acid.
d_aminoacids = ["a","c","d","e","f","g","h","i","l","m","n","p","k","q","r","s","t","v","w","y"]
def d_aa(seq):
    """Return True if ``seq`` contains at least one letter listed in ``d_aminoacids``."""
    return any(letter in seq for letter in d_aminoacids)

In [10]:
#!/usr/bin/env python

"""
Calculates a set of properties from a protein sequence:
    - hydrophobicity (according to a particular scale)
    - mean hydrophobic dipole moment assuming it is an alpha-helix.
    - total charge (at pH 7.4)
    - amino acid composition
    - discrimination factor according to Rob Keller (IJMS, 2011)
Essentially the same as HeliQuest (reproduces the same values).
Author:
  Joao Rodrigues
  j.p.g.l.m.rodrigues@gmail.com
"""

from __future__ import print_function

import argparse
import csv
import math
import os
import time

#
# Definitions
#
# Per-residue hydrophobicity values, keyed first by scale name and then by
# uppercase one-letter amino-acid code. assign_hydrophobicity looks residues up
# here and raises KeyError for any letter missing from the chosen scale.
scales = {'Fauchere-Pliska': {'A':  0.31, 'R': -1.01, 'N': -0.60,
                              'D': -0.77, 'C':  1.54, 'Q': -0.22,
                              'E': -0.64, 'G':  0.00, 'H':  0.13,
                              'I':  1.80, 'L':  1.70, 'K': -0.99,
                              'M':  1.23, 'F':  1.79, 'P':  0.72,
                              'S': -0.04, 'T':  0.26, 'W':  2.25,
                              'Y':  0.96, 'V':  1.22},

          'Eisenberg': {'A':  0.25, 'R': -1.80, 'N': -0.64,
                        'D': -0.72, 'C':  0.04, 'Q': -0.69,
                        'E': -0.62, 'G':  0.16, 'H': -0.40,
                        'I':  0.73, 'L':  0.53, 'K': -1.10,
                        'M':  0.26, 'F':  0.61, 'P': -0.07,
                        'S': -0.26, 'T': -0.18, 'W':  0.37,
                        'Y':  0.02, 'V':  0.54},
          }
# Names of the available scales.
_supported_scales = list(scales.keys())

# Integer side-chain charge per residue; calculate_charge treats every residue
# not listed here as neutral (0).
aa_charge = {'E': -1, 'D': -1, 'K': 1, 'R': 1}

#
# Functions
#
def assign_hydrophobicity(sequence, scale='Fauchere-Pliska'):  # noqa: E302
    """Map every residue of ``sequence`` to its value in the named scale.

    Arguments:
      sequence -- Iterable of one-letter residue codes (uppercase, as keyed in ``scales``).
      scale {str} -- Key into the module-level ``scales`` table.

    Returns:
      list -- One hydrophobicity value per residue, in sequence order.

    Raises:
      KeyError -- If the scale is unknown, or a residue is missing from it.
    """
    table = scales.get(scale, None)
    if not table:
        raise KeyError('{} is not a supported scale. '.format(scale))

    values = []
    for residue in sequence:
        # Membership test rather than truthiness: a value of 0.0 is legitimate.
        if residue not in table:
            raise KeyError('Amino acid not defined in scale: {}'.format(residue))
        values.append(table[residue])
    return values


def calculate_moment(array, angle=100):
    """Calculates the hydrophobic dipole moment from an array of hydrophobicity
    values. Formula defined by Eisenberg, 1982 (Nature). Returns the average
    moment (normalized by sequence length)
    uH = sqrt(sum(Hi cos(i*d))**2 + sum(Hi sin(i*d))**2),
    where i is the amino acid index and d (delta) is an angular value in
    degrees (100 for alpha-helix, 180 for beta-sheet).

    Arguments:
      array -- Sequence of per-residue hydrophobicity values.
      angle -- Angular increment per residue, in degrees.

    Returns:
      float -- The length-normalized moment, or 0 for an empty ``array``.
    """
    n_values = len(array)
    if n_values == 0:
        # Nothing to average over. The stray debug print of the empty array
        # that used to sit on this path has been removed.
        return 0

    sum_cos, sum_sin = 0.0, 0.0
    for i, hv in enumerate(array):
        # Residue i sits at i*angle degrees around the axis; convert to radians.
        rad_inc = ((i*angle)*math.pi)/180.0
        sum_cos += hv * math.cos(rad_inc)
        sum_sin += hv * math.sin(rad_inc)
    return math.sqrt(sum_cos**2 + sum_sin**2) / n_values


def calculate_charge(sequence, charge_dict=aa_charge):
    """Sum the per-residue charges of ``sequence``.

    Residues missing from ``charge_dict`` contribute 0. The original docstring
    describes the result as the peptide charge at pH 7.4.
    """
    return sum(charge_dict.get(residue, 0) for residue in sequence)


def calculate_discrimination(mean_uH, total_charge):
    """Return the discrimination factor D = 0.944*<uH> + 0.33*z.

    Per the original note (Rob Keller, IJMS 2011), a sequence with D > 0.68 can
    be considered a potential lipid-binding region.
    """
    return 0.944*mean_uH + 0.33*total_charge


def calculate_composition(sequence):
    """Count the residues of ``sequence`` per residue class.

    Returns a dictionary with the keys 'polar', 'special', 'apolar', 'charged'
    and 'aromatic', each mapped to the number of residues in that class.
    Residues belonging to no class are ignored.
    """
    # Residue character table: class label -> member residues (classes are disjoint).
    class_members = (
        ('polar', ('S', 'T', 'N', 'H', 'Q', 'G')),
        ('special', ('P', 'C')),
        ('apolar', ('A', 'L', 'V', 'I', 'M')),
        ('charged', ('E', 'D', 'K', 'R')),
        ('aromatic', ('W', 'Y', 'F')),
    )
    label_of = {}
    for label, members in class_members:
        for member in members:
            label_of[member] = label

    tally = {label: 0 for label, _ in class_members}
    for aa in sequence:
        label = label_of.get(aa)
        if label is not None:
            tally[label] += 1
    return tally


def analyze_sequence(name=None, sequence=None, window=18, verbose=False):
    """Runs all the above on a sequence. Pretty prints the results

    Slides a window of ``window`` residues over ``sequence`` one position at a
    time and, for each window, computes the charge, the mean hydrophobicity
    (default scale of assign_hydrophobicity), the hydrophobic moment, the
    discrimination factor and the residue-class counts.

    Arguments:
      name -- Label used in the printed header and stored in each output row.
      sequence -- Sequence to analyse; must support len() and slicing, so the
        default of None raises TypeError.
      window -- Window length in residues.
      verbose -- When true, print a per-window report.

    Returns:
      list -- One row per window: [name, sequence, 1-based window start,
        window size, window sequence, charge, mean hydrophobicity, moment,
        discrimination, polar+charged count, apolar+aromatic+special count,
        charged count, aromatic count].
    """



    w = window

    outdata = []  # for csv writing

    # Processing...
    seq_len = len(sequence)
    # These two header lines are printed regardless of `verbose`.
    print('[+] Analysing sequence {} ({} aa.)'.format(name, seq_len))
    print('[+] Using a window of {} aa.'.format(w))
    for seq_range in range(0, seq_len):

        seq_w = sequence[seq_range:seq_range+w]
        # Stop at the first incomplete window, except at position 0: a sequence
        # shorter than the window is still analysed once, as a short window.
        if seq_range and len(seq_w) < w:
            break

        # Numerical values
        z = calculate_charge(seq_w)
        seq_h = assign_hydrophobicity(seq_w)
        av_h = sum(seq_h)/len(seq_h)
        av_uH = calculate_moment(seq_h)
        d = calculate_discrimination(av_uH, z)

        # AA composition
        aa_comp = calculate_composition(seq_w)
        n_tot_pol = aa_comp['polar'] + aa_comp['charged']
        n_tot_apol = aa_comp['apolar'] + aa_comp['aromatic'] + aa_comp['special']  # noqa: E501
        n_charged = aa_comp['charged']  # noqa: E501
        n_aromatic = aa_comp['aromatic']  # noqa: E501

        _t = [name, sequence, seq_range+1, w, seq_w, z, av_h, av_uH, d,
              n_tot_pol, n_tot_apol, n_charged, n_aromatic]
        outdata.append(_t)

        if verbose:
            # NOTE(review): the percentages below divide by the nominal window
            # size w, not len(seq_w), so they are understated for a short
            # first window — confirm this is intended.
            print('  Window {}: {}-{}-{}'.format(seq_range+1, seq_range,
                                                 seq_w, seq_range+w))
            print('    z={:<3d} <H>={:4.3f} <uH>={:4.3f} D={:4.3f}'.format(z, av_h,  # noqa: E501
                                                                           av_uH, d))  # noqa: E501
            print('    Amino acid composition')
            print('      Polar    : {:3d} / {:3.2f}%'.format(n_tot_pol, n_tot_pol*100/w))  # noqa: E501
            print('      Non-Polar: {:3d} / {:3.2f}%'.format(n_tot_apol, n_tot_apol*100/w))  # noqa: E501
            print('      Charged  : {:3d} / {:3.2f}%'.format(n_charged, n_charged*100/w))  # noqa: E501
            print('      Aromatic : {:3d} / {:3.2f}%'.format(n_aromatic, n_aromatic*100/w))  # noqa: E501
            print()

    return outdata


def read_fasta_file(afile):
    """Parses a file with FASTA formatted sequences

    Arguments:
      afile {str} -- Path of the FASTA file.

    Returns:
      list -- (name, sequence) tuples in file order. The name is the header
        line without its leading '>'; sequence lines belonging to one record
        are concatenated. A file without any record yields an empty list.

    Raises:
      IOError -- If ``afile`` is not an existing file.
      ValueError -- If sequence data appears before the first '>' header.
    """

    if not os.path.isfile(afile):
        raise IOError('File not found/readable: {}'.format(afile))

    sequences = []
    seq_name, cur_seq = None, None
    with open(afile) as handle:
        for line in handle:
            line = line.strip()
            if line.startswith('>'):
                # A new header closes the previous record (kept only if it has
                # sequence data, as before).
                if cur_seq:
                    sequences.append((seq_name, ''.join(cur_seq)))
                seq_name = line[1:]
                cur_seq = []
            elif line:
                if cur_seq is None:
                    # Previously this surfaced as an opaque AttributeError on None.
                    raise ValueError('Sequence data before first FASTA header in: {}'.format(afile))
                cur_seq.append(line)
    # Flush the last record. With no header at all (e.g. an empty file) cur_seq
    # is still None and there is nothing to add; joining None used to raise
    # TypeError here.
    if cur_seq is not None:
        sequences.append((seq_name, ''.join(cur_seq)))  # last seq

    return sequences

def hydr_moment(seq):
    """Return the hydrophobic moment of ``seq`` on the Eisenberg scale.

    The sequence is upper-cased and mapped to hydrophobicity values, which are
    passed to calculate_moment with its default angle.
    """
    return calculate_moment(assign_hydrophobicity(seq.upper(), "Eisenberg"))

In [11]:
# Compute per-sequence descriptors (composition, charge counts, RDKit counts,
# hydrophobic patches, hydrophobic moment, discrimination factor) and cache the
# result on disk; reload the cached table when it already exists.
if not os.path.exists(folder+"pickles/all_sequences_with_NN_prop-hem.pkl"):
    df_all["length"] = df_all.Sequence.map(len)
    # Keep sequences longer than one residue. The explicit .copy() makes the
    # filtered frame independent, so the column assignments below are not
    # chained assignments on a slice (SettingWithCopyWarning).
    df_all = df_all.query("length>1").copy()

    df_all["D_AA"] = df_all.Sequence.map(d_aa)

    # One "<letter>_fract" column per standard amino acid.
    aminoacids = ["A","C","D","E","F","G","H","I","L","M","N","P","K","Q","R","S","T","V","W","Y"]
    for aa in aminoacids:
        df_all[f"{aa}_fract"] = df_all.Sequence.map(lambda x: calc_aa(x, aa))

    df_all["positive"] = df_all.Sequence.parallel_map(calc_pos)
    df_all["negative"] = df_all.Sequence.parallel_map(calc_neg)
    df_all["HAC"] = df_all.SMILES.parallel_map(calc_hac)
    df_all["HBA"] = df_all.SMILES.parallel_map(calc_hba)
    df_all["HBD"] = df_all.SMILES.parallel_map(calc_hbd)
    df_all["hydrophobic"] = df_all.Sequence.parallel_map(calc_hydr)
    df_all["hydrophobic_patches"] = df_all.Sequence.parallel_map(hydropatch)
    df_all["hydrophobic_patches_num"] = df_all.hydrophobic_patches.map(len)
    df_all["hydrophobic_patches_len"] = df_all.hydrophobic_patches.map(mean)
    # Column arithmetic instead of a row-wise apply: same values, without a
    # Python call per row. length > 1 is guaranteed by the filter above.
    df_all["hydro_res_fract"] = df_all["hydrophobic"] / df_all["length"]
    df_all["pos_res_fract"] = df_all["positive"] / df_all["length"]


    df_all["HydroMoment"] = df_all.Sequence.map(hydr_moment)
    df_all["charge"] = df_all["Sequence"].map(lambda x: calculate_charge(x.upper()))
    df_all["hydrophobicity"] = df_all["Sequence"].map(lambda x: assign_hydrophobicity(x.upper()))
    df_all["av_hydrophobicity"] = df_all["hydrophobicity"].map(lambda x: sum(x)/len(x))
    # calculate_discrimination is plain arithmetic, so it applies element-wise
    # to whole columns.
    df_all["discrimination"] = calculate_discrimination(df_all["HydroMoment"], df_all["charge"])

    df_all.to_pickle(folder+"pickles/all_sequences_with_NN_prop-hem.pkl")
else:
    df_all = pd.read_pickle(folder+"pickles/all_sequences_with_NN_prop-hem.pkl")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=90751), Label(value='0 / 90751')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=90751), Label(value='0 / 90751')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=90751), Label(value='0 / 90751')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=90751), Label(value='0 / 90751')))…

# SPIDER helicity prediction

In [12]:
def row_to_fasta(row):
    """Format a row as a two-line FASTA record: '>' header line, then the sequence.

    NOTE(review): the header is the sequence itself, not row["ID"] — confirm
    this is intended.
    """
    sequence = row["Sequence"]
    return ">{}\n{}".format(sequence, sequence)

def fastafile(row, folder="/data/PDGA_MAP4/spiderData/"):
    """Write the row's FASTA text, upper-cased, to ``<folder><ID>.seq``.

    The whole record is upper-cased, header line included. Returns None.
    """
    content = row["fasta"].upper()
    target = folder + str(row["ID"]) + ".seq"
    with open(target, "w") as output:
        output.write(content)
        
def filename(row, folder="/data/PDGA_MAP4/spiderData/"):
    """Return the bare file name ``<ID>.seq`` for a row (``folder`` is not used)."""
    return str(row["ID"]) + ".seq"

def fileloc(row, folder="/data/PDGA_MAP4/spiderData/"):
    """Return the path ``<folder><ID>.seq`` for a row (plain string concatenation)."""
    return folder + str(row["ID"]) + ".seq"

def read_spider(row, folder="/data/PDGA_MAP4/spider3/"):
    """Read ``<folder><ID>.seq.i2`` and return the third space-separated field of each line.

    Every line is stripped and split on single spaces; the value taken from the
    first line is discarded.
    presumably the first line is a column header and the third field is the
    predicted secondary-structure label — TODO confirm against the SPIDER
    output format.
    """
    path = folder + str(row["ID"]) + ".seq.i2"
    with open(path) as infile:
        third_fields = [line.strip().split(" ")[2] for line in infile]
    return third_fields[1:]

def count_ss(ss, pred = "H"):
    """Return how many entries of ``ss`` equal ``pred`` (default "H")."""
    occurrences = ss.count(pred)
    return occurrences
def fract_ss(ss, pred = "H"):
    """Return the fraction of entries of ``ss`` equal to ``pred``; 0 when ``ss`` is empty."""
    if len(ss) == 0:
        return 0
    return ss.count(pred)/len(ss)

In [13]:
# Build the FASTA text for every row and write one <ID>.seq file per row.
df_all["fasta"] = df_all.apply(row_to_fasta, axis = 1)
# Called for its side effect only: fastafile writes the file and returns None.
df_all.apply(fastafile, axis=1)
df_all["SpiderFilename"] = df_all.apply(filename, axis=1)
df_all["SpiderFileloc"] = df_all.apply(fileloc, axis=1)
# Space-separated "<file name> <file location>" list without header or index.
# NOTE(review): absolute, user-specific output path; presumably the file list
# read by the SPIDER script run below — confirm.
df_all[["SpiderFilename", "SpiderFileloc"]].to_csv("/home/alice/Code/AIpep/SPIDER3-Single_np/file_list-pdga", header=False, index=False, sep=' ')

In [14]:
# Display the generated SPIDER file names as a quick sanity check.
df_all.SpiderFilename

0                 118.seq
1                 119.seq
2                 120.seq
3                 122.seq
4                 124.seq
               ...       
725997    pdga_723736.seq
725998    pdga_723737.seq
725999    pdga_723738.seq
726000    pdga_723739.seq
726001    pdga_723740.seq
Name: SpiderFilename, Length: 726002, dtype: object

In [15]:
### Run SPIDER
# Change into the SPIDER3-Single directory, run its shell script, then return
# to the previous working directory. The captured output reports a run time of
# 57278 seconds.
%cd /home/alice/Code/AIpep/SPIDER3-Single_np
!./impute_script_np.sh
%cd -

/home/alice/Code/AIpep/SPIDER3-Single_np
doing iteration 0 - SS
doing iteration 0 - ASA THETA TAU PHI PSI HSEa CN
combining both prediction files
doing iteration 1 - SS
doing iteration 1 - ASA THETA TAU PHI PSI HSEa CN
combining both prediction files
doing iteration 2 - SS
doing iteration 2 - ASA THETA TAU PHI PSI HSEa CN
combining both prediction files
Time taken - 57278 seconds
/home/alice/Code/PDGA_MAP4


In [16]:
# Read each row's SPIDER output file into a list of per-line labels.
df_all["SS"] = df_all.apply(read_spider, axis=1)
# Count and fraction of "H" labels (the default `pred` of count_ss / fract_ss).
df_all["countH"] = df_all.SS.map(count_ss)
df_all["fraction_PredHelical"] = df_all.SS.map(fract_ss)
# Fractions of "E" and "C" labels, stored as beta-sheet and coil respectively.
df_all["fraction_PredBetaSheet"] = df_all.SS.map(lambda x : fract_ss(x, "E"))
df_all["fraction_PredCoil"] = df_all.SS.map(lambda x : fract_ss(x, "C"))

In [9]:
# Persist df_all including the SPIDER-derived columns. An earlier cell reads
# this same file back.
df_all.to_pickle(folder+"pickles/all_sequences_with_NN_prop_helicity-hem.pkl")