In [1]:
import pandas as pd
import numpy as np

In [3]:
negative_dataset = pd.read_csv('negative_dataset.txt', header = None)
negative_dataset.columns = ["Sequence"]
negative_dataset["label"] = np.zeros(len(negative_dataset))
#positive_dataset = pd.read_csv('positive_dataset.txt', header = None)

positive_dataset_1 = pd.read_csv('positive_dataset_1.txt', header = None)
positive_dataset_1.columns = ["Sequence"]
positive_dataset_1["label"] = np.ones(len(positive_dataset_1))

# Concatenate both dataset
data = pd.concat([negative_dataset, positive_dataset_1], ignore_index=True)
data

Unnamed: 0,Sequence,label
0,AACKCDDEGPDIRTAPLTGTVDLGSCNAGWEKCASYYTIIADCCRKKK,0.0
1,AADAPAQLDPAGEKLYRSACVVCHASGVANAPKLGDKQAWAPFLAQ...,0.0
2,AAKKTVTKADLVDQVAQATGLKKKDVKAMVDALLAKVEEALANGSK...,0.0
3,AAMKVYDVTAPIYEGMPVYKNKPEKQPKRTTITNGYVTESRIDMDV...,0.0
4,AAPANAVTADDPTAIALKYNQDATKSERVAAARPGLPPEEQHCANC...,0.0
...,...,...
5598,MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,1.0
5599,MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,1.0
5600,MDGIVPDIAVGTKRGSDELFSTCVTNGPFIMSSNSASAANGNDSKK...,1.0
5601,MDGIVPDIAVGTKRGSDELFSTCVTNGPFIMSSNSASAANGNDSKK...,1.0


### Formulate sequence feature using LQL method
- The composition of 20 amino acids formed the first 20 dimensions. 
- Then the amino acids were clustered into 3 types of amino acids for each of six physicochemical property (hydrophobicity, polarity, polarizability, solvent accessibility and normalized van der Waals volume). Composition, Transition and Distribution of each type and each amino acid type property was calculated. For example, amino acids were clustered into polar, neutral and hydrophobic for hydrophobicity property. For ‘Composition’, 3 dimensions were calculated: the percentage of polar, neutral and hydrophobic; For ‘Transition’, 3 dimensions were calculated: the percentage of polar transferred to neutral, neutral transferred to hydrophobicity and hydrophobicity transferred to polar; For ‘Distribution’, 5 dimensions were calculated for each of the 3 types of amino acids: the location percentage of the first, 25%, 50%, 75% of that type. A vector of total 146 dimensions were calculated for LQL method.

In [163]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis

#Calculate compositions
compositions = data['Sequence'].apply(lambda x: ProteinAnalysis(x).get_amino_acids_percent())
df = pd.DataFrame.from_dict(compositions)
df = df['Sequence'].apply(pd.Series)
data1 = data.copy()
data1 = pd.concat([data1, df], axis = 1)
data1

Unnamed: 0,Sequence,label,A,C,D,E,F,G,H,I,...,M,N,P,Q,R,S,T,V,W,Y
0,AACKCDDEGPDIRTAPLTGTVDLGSCNAGWEKCASYYTIIADCCRKKK,0.0,0.125000,0.125000,0.104167,0.041667,0.000000,0.083333,0.000000,0.062500,...,0.000000,0.020833,0.041667,0.000000,0.041667,0.041667,0.083333,0.020833,0.020833,0.041667
1,AADAPAQLDPAGEKLYRSACVVCHASGVANAPKLGDKQAWAPFLAQ...,0.0,0.275862,0.022989,0.068966,0.022989,0.011494,0.091954,0.011494,0.000000,...,0.034483,0.011494,0.068966,0.034483,0.045977,0.022989,0.034483,0.057471,0.011494,0.022989
2,AAKKTVTKADLVDQVAQATGLKKKDVKAMVDALLAKVEEALANGSK...,0.0,0.145833,0.000000,0.052083,0.041667,0.031250,0.072917,0.000000,0.020833,...,0.010417,0.010417,0.041667,0.041667,0.031250,0.010417,0.083333,0.104167,0.000000,0.010417
3,AAMKVYDVTAPIYEGMPVYKNKPEKQPKRTTITNGYVTESRIDMDV...,0.0,0.091787,0.004831,0.086957,0.067633,0.057971,0.072464,0.038647,0.072464,...,0.024155,0.024155,0.057971,0.019324,0.048309,0.014493,0.072464,0.077295,0.000000,0.028986
4,AAPANAVTADDPTAIALKYNQDATKSERVAAARPGLPPEEQHCANC...,0.0,0.180723,0.048193,0.048193,0.048193,0.024096,0.084337,0.012048,0.024096,...,0.012048,0.072289,0.072289,0.060241,0.024096,0.024096,0.048193,0.048193,0.036145,0.012048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5598,MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,1.0,0.093827,0.007407,0.056790,0.060494,0.046914,0.104938,0.009877,0.048148,...,0.033333,0.060494,0.045679,0.040741,0.033333,0.069136,0.046914,0.055556,0.017284,0.028395
5599,MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,1.0,0.093827,0.007407,0.056790,0.060494,0.040741,0.104938,0.009877,0.048148,...,0.033333,0.060494,0.045679,0.040741,0.033333,0.069136,0.046914,0.055556,0.017284,0.028395
5600,MDGIVPDIAVGTKRGSDELFSTCVTNGPFIMSSNSASAANGNDSKK...,1.0,0.072917,0.003906,0.039062,0.022135,0.031250,0.138021,0.019531,0.037760,...,0.018229,0.058594,0.050781,0.093750,0.028646,0.131510,0.042969,0.049479,0.000000,0.049479
5601,MDGIVPDIAVGTKRGSDELFSTCVTNGPFIMSSNSASAANGNDSKK...,1.0,0.086039,0.004870,0.040584,0.027597,0.040584,0.076299,0.024351,0.050325,...,0.037338,0.099026,0.060065,0.063312,0.037338,0.086039,0.038961,0.063312,0.000000,0.021104


In [165]:
# map primary protein sequence into tertiary sequence by hydrophobicity

def map_hydrophobicity(seq):
    hydrophobic_aas = ['F', 'I','W','L','V','M','Y','C','A'] # Marked as 0
    neutral_aas = ['T','H','G','S','Q'] # Marked as 1
    polar_aas = ['R','K','N','E','P','D'] # Marked as 2
    new_seq = []
    for each in seq:
        if each in hydrophobic_aas:
            new_seq.append(0)
        elif each in neutral_aas:
            new_seq.append(1)
        else:
            new_seq.append(2)
    return np.array(new_seq)
# https://www.researchgate.net/figure/Amino-acid-parameter-sets_tbl1_226809003
def map_vdW_volume(seq):
    large_aas = ['F','Y','W','R','K','H','M'] # >4
    medium_aas = ['V','L','I','E','Q'] 
    small_aas = ['A','G','T','S','D','N','P','C'] # < 3
    
    new_seq = []
    for each in seq:
        if each in hydrophobic_aas:
            new_seq.append(0)
        elif each in neutral_aas:
            new_seq.append(1)
        else:
            new_seq.append(2)
    return np.array(new_seq)

def mapped_polarizability(seq):
    high_aas = ['F','Y','W','R','K','H','M'] # >0.2
    medium_aas = ['V','L','I','E','N','Q','C'] # >0.12
    low_aas = ['A','G','T','S','D','P']
    
    new_seq = []
    for each in seq:
        if each in hydrophobic_aas:
            new_seq.append(0)
        elif each in neutral_aas:
            new_seq.append(1)
        else:
            new_seq.append(2)
    return np.array(new_seq)

def composition(mapped_seq):
    group_zero = np.sum(mapped_seq == 0)/np.sum(len(mapped_seq))
    group_one = np.sum(mapped_seq == 1)/np.sum(len(mapped_seq))
    group_two = np.sum(mapped_seq == 2)/np.sum(len(mapped_seq))
    return [group_zero, group_one, group_two]

def transition(mapped_seq):
    two_to_one = 0
    one_to_zero = 0
    zero_to_two = 0
    
    prev = mapped_seq[0]
    for each in mapped_seq[1:]:
        if (prev == 2) and (each == 1):
            two_to_one += 1
        elif (prev == 1) and (each == 0):
            one_to_zero += 1
        elif (prev == 0) and (each == 2):
            zero_to_two += 1
        prev = each
    total_transition = two_to_one + one_to_zero + zero_to_two
    if total_transition == 0:
        return [0,0,0]
    return [two_to_one / total_transition, one_to_zero / total_transition, zero_to_two / total_transition]

def distribution(mapped_seq):
    zeros = np.sum(mapped_seq == 0)
    ones = np.sum(mapped_seq == 1)
    twos = np.sum(mapped_seq == 2)
    length = len(mapped_seq)
    
    cnt_zero = 0
    zero_fractions = [] # store the index of zeros at 0%, 25%, 50%, 75%, 100%
    cnt_one = 0
    one_fractions = []
    cnt_two = 0
    two_fractions = []
    
    for idx, each in enumerate(mapped_seq):
        if each == 0:
            cnt_zero += 1
            if cnt_zero in [1,int(0.25*zeros), int(0.5*zeros), int(0.75*zeros),zeros]:
                zero_fractions.append((idx+1)/length)
        elif each == 1:
            cnt_one += 1
            if cnt_one in [1,int(0.25*ones), int(0.5*ones), int(0.75*ones),ones]:
                one_fractions.append((idx+1)/length)
        else:
            cnt_two += 1
            if cnt_two in [1,int(0.25*twos), int(0.5*twos), int(0.75*twos),twos]:
                two_fractions.append((idx+1)/length)
    return zero_fractions + one_fractions + two_fractions
def pipeline(mapped_seq):
    features = composition(mapped_seq) + transition(mapped_seq) + distribution(mapped_seq)
    return features

In [166]:
data = data1.copy()
data = data.assign(mapped_hydrophobicity=data["Sequence"].apply(map_hydrophobicity))
data = data.assign(mapped_vdW_volume=data["Sequence"].apply(map_vdW_volume))
data = data.assign(mapped_polarizability=data["Sequence"].apply(mapped_polarizability))
data = data.assign(hydrophobic_properties = data['mapped_hydrophobicity'].apply(pipeline))
data = data.assign(vdW_volume_properties = data['mapped_vdW_volume'].apply(pipeline))
data = data.assign(polarizability_properties = data['mapped_polarizability'].apply(pipeline))
data.drop(columns = ['mapped_hydrophobicity','mapped_vdW_volume','mapped_polarizability'],inplace = True)

In [189]:
name1 = []
name2 = []
name3 = []
for i in range(3):
    for j in range(21):
        if (i == 0):
            name1.append("H_" + str(j))
        if (i == 1):
            name2.append("V_" + str(j))
        if (i == 2):
            name3.append("P_" + str(j))

In [190]:
df1 = pd.DataFrame(data["hydrophobic_properties"].to_list(), columns = name1)
df2 = pd.DataFrame(data["vdW_volume_properties"].to_list(),columns = name2)
df3 = pd.DataFrame(data["polarizability_properties"].to_list(),columns = name3)
data_final = data.copy()
data_final = pd.concat([data_final, df1,df2,df3], axis = 1)
df = data_final.drop(columns = ['hydrophobic_properties', 'vdW_volume_properties','polarizability_properties'])
df

Unnamed: 0,Sequence,label,A,C,D,E,F,G,H,I,...,P_11,P_12,P_13,P_14,P_15,P_16,P_17,P_18,P_19,P_20
0,AACKCDDEGPDIRTAPLTGTVDLGSCNAGWEKCASYYTIIADCCRKKK,0.0,0.125000,0.125000,0.104167,0.041667,0.000000,0.083333,0.000000,0.062500,...,0.187500,0.291667,0.416667,0.520833,0.791667,0.083333,0.166667,0.333333,0.666667,1.000000
1,AADAPAQLDPAGEKLYRSACVVCHASGVANAPKLGDKQAWAPFLAQ...,0.0,0.275862,0.022989,0.068966,0.022989,0.011494,0.091954,0.011494,0.000000,...,0.080460,0.275862,0.436782,0.666667,0.850575,0.034483,0.160920,0.425287,0.735632,1.000000
2,AAKKTVTKADLVDQVAQATGLKKKDVKAMVDALLAKVEEALANGSK...,0.0,0.145833,0.000000,0.052083,0.041667,0.031250,0.072917,0.000000,0.020833,...,0.052083,0.197917,0.520833,0.677083,0.906250,0.031250,0.260417,0.614583,0.781250,1.000000
3,AAMKVYDVTAPIYEGMPVYKNKPEKQPKRTTITNGYVTESRIDMDV...,0.0,0.091787,0.004831,0.086957,0.067633,0.057971,0.072464,0.038647,0.072464,...,0.043478,0.231884,0.405797,0.705314,0.932367,0.019324,0.207729,0.473430,0.734300,1.000000
4,AAPANAVTADDPTAIALKYNQDATKSERVAAARPGLPPEEQHCANC...,0.0,0.180723,0.048193,0.048193,0.048193,0.024096,0.084337,0.012048,0.024096,...,0.096386,0.289157,0.566265,0.746988,1.000000,0.036145,0.216867,0.409639,0.662651,0.975904
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5598,MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,1.0,0.093827,0.007407,0.056790,0.060494,0.046914,0.104938,0.009877,0.048148,...,0.002469,0.262963,0.440741,0.633333,0.998765,0.003704,0.228395,0.546914,0.764198,1.000000
5599,MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,1.0,0.093827,0.007407,0.056790,0.060494,0.040741,0.104938,0.009877,0.048148,...,0.002469,0.262963,0.440741,0.633333,0.998765,0.003704,0.228395,0.546914,0.764198,1.000000
5600,MDGIVPDIAVGTKRGSDELFSTCVTNGPFIMSSNSASAANGNDSKK...,1.0,0.072917,0.003906,0.039062,0.022135,0.031250,0.138021,0.019531,0.037760,...,0.003906,0.372396,0.720052,0.858073,1.000000,0.002604,0.180990,0.428385,0.608073,0.996094
5601,MDGIVPDIAVGTKRGSDELFSTCVTNGPFIMSSNSASAANGNDSKK...,1.0,0.086039,0.004870,0.040584,0.027597,0.040584,0.076299,0.024351,0.050325,...,0.004870,0.240260,0.511364,0.756494,1.000000,0.003247,0.237013,0.563312,0.795455,0.998377


In [192]:
df.to_csv("LQL_features",index=False)

In [9]:
data_llps = pd.read_excel('LLPS_one_component/LLPS.xls') 
data_proteins = pd.read_excel("LLPS_one_component/protein.xls")

In [10]:
# Useful features
data_llps = data_llps[['Protein ID','Protein structure type','Nucleic acid',\
                       'Solute concentration','Salt concentration', 'Crowding agent','Temperature','other molecules',\
                      'Phase separation','Morphology']]
# Extract protein sequences
df_sequence = data_proteins[['PID','Sequence']]
sequences = df_sequence['Sequence'].apply(lambda x: x.split(';')[1:]).str.join("")
df_sequence['clean_sequences'] = sequences

# Add protein sequence into dataset
data = data_llps.join(df_sequence.set_index('PID'), on='Protein ID')
data.drop(columns=['Sequence','Phase separation'],inplace = True)

# Add label
data['label'] = 1 * data['Morphology'].str.contains("droplet")
data.drop(columns=['Morphology'], inplace = True)

# Replace salt concentration with values
data['Salt concentration'].fillna("", inplace = True)
data['Salt concentration'] = data['Salt concentration'].apply(lambda x: x.split(' ')[0])

# Replace Crowding agent with 1 (existed) or 0
data['Crowding agent'] = 1 * data['Crowding agent'].apply(lambda x: x != "-")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
