# Finding important variables for S/T positivity (ElasticNet, ANOVA, Random Forest)

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from os.path import exists

## original dataset

In [3]:
# Load the original feature table and keep only rows whose SEQ is 'S' or 'T'.
load_path = "./data/data_for_ml.csv"
original_data = pd.read_csv(load_path)
original_data = original_data[original_data['SEQ'].isin(['S', 'T'])]

# Position -> column name, displayed so the positional slices used later can be checked by eye.
original_features = dict(enumerate(original_data.columns))
display(original_features)

{0: '#',
 1: 'SEQ',
 2: 'SS',
 3: 'ASA',
 4: 'Phi',
 5: 'Psi',
 6: 'Theta(i-1=>i+1)',
 7: 'Tau(i-2=>i+2)',
 8: 'HSE_alpha_up',
 9: 'HSE_alpha_down',
 10: 'P(C)',
 11: 'P(H)',
 12: 'P(E)',
 13: 'flexibility',
 14: 'side_-1',
 15: 'side_1',
 16: 'side_2',
 17: 'side_3',
 18: 'side_4',
 19: 'side_5',
 20: 'nAli',
 21: 'nPos',
 22: 'nS/nT',
 23: 'Proline',
 24: 'phi_psi',
 25: 'positivity',
 26: 'protein'}

In [4]:
# Predictors are picked by column position: 3..13 (ASA .. flexibility) and 20..23 (nAli .. Proline).
original_x = [*original_data.columns[3:14], *original_data.columns[20:24]]
# Name of the target column.
original_y = 'positivity'

## augmented dataset

In [5]:
# Proteins from the original table that also have an augmented CSV on disk.
protein_list = list(original_data['protein'].unique())
protein_augmented = []
for name in protein_list:
    if exists(f'./data/data_for_ml(augmented)/{name}.csv'):
        protein_augmented.append(name)

# Read one frame per protein, stack them, then keep only rows whose SEQ is 'S' or 'T'.
augmented_data = []
for name in protein_augmented:
    load_path = f'./data/data_for_ml(augmented)/{name}.csv'
    augmented_data.append(pd.read_csv(load_path))
augmented_data = pd.concat(augmented_data, axis=0).reset_index(drop=True)
augmented_data = augmented_data[augmented_data['SEQ'].isin(['S', 'T'])].reset_index(drop=True)

# Position -> column name, displayed so the positional slice used later can be checked by eye.
augmented_features = dict(enumerate(augmented_data.columns))
display(augmented_features)

{0: '#',
 1: 'SEQ',
 2: 'SS',
 3: 'ASA',
 4: 'Phi',
 5: 'Psi',
 6: 'Theta(i-1=>i+1)',
 7: 'Tau(i-2=>i+2)',
 8: 'HSE_alpha_up',
 9: 'HSE_alpha_down',
 10: 'P(C)',
 11: 'P(H)',
 12: 'P(E)',
 13: 'flexibility',
 14: 'side_-1',
 15: 'side_1',
 16: 'side_2',
 17: 'side_3',
 18: 'side_4',
 19: 'side_5',
 20: 'nAli',
 21: 'nPos',
 22: 'nS/nT',
 23: 'Proline',
 24: 'phi_psi',
 25: 'positivity',
 26: 'residue_SER_THR',
 27: 'number_of_hydrophobic',
 28: 'number_of_hydrophilic',
 29: 'number_of_polar',
 30: 'number_of_aromatic',
 31: 'number_of_aliphatic',
 32: 'number_of_charged',
 33: 'number_of_positive',
 34: 'number_of_negative',
 35: 'number_of_g',
 36: 'number_of_v',
 37: 'number_of_s',
 38: 'number_of_n',
 39: 'number_of_l',
 40: 'number_of_p',
 41: 'number_of_A',
 42: 'number_of_b',
 43: 'number_of_d',
 44: 'number_of_e',
 45: 'number_of_f',
 46: 'number_of_ala',
 47: 'number_of_cys',
 48: 'number_of_asp',
 49: 'number_of_glu',
 50: 'number_of_phe',
 51: 'number_of_his',
 52: 'number_of

In [6]:
# Augmented predictors: every column from position 26 ('residue_SER_THR') to the end.
augmented_x = augmented_data.columns[26:].tolist()
# Name of the target column.
augmented_y = 'positivity'

In [7]:
# Combined predictor list: original features first, augmented features after.
both_x = [*original_x, *augmented_x]
# Name of the target column.
both_y = 'positivity'

# ElasticNet model

In [8]:
from sklearn.linear_model import ElasticNet

def elasticnet(data, x=[], y='',
               alpha=0.001, l1_ratio=0.5):
    """Fit an ElasticNet on min-max scaled predictors.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the predictor columns and the target column.
    x : list of str
        Names of the predictor columns (only read, never mutated).
    y : str
        Name of the target column.
    alpha, l1_ratio : float
        Passed straight to ``ElasticNet``.

    Returns
    -------
    ElasticNet
        The fitted model; its ``coef_`` refers to the scaled predictors.
    """
    data_x = data[x]
    data_y = data[y]

    # Column-wise min-max scaling to [0, 1].
    x_min = data_x.min(0)
    x_range = data_x.max(0) - x_min
    # A constant column has zero range; dividing by it gives 0/0 = NaN and
    # the fit then rejects the input. Using 1 instead scales such a column
    # to all zeros, which leaves every other column's scaling unchanged.
    x_range = x_range.replace(0, 1)
    data_x_sc = (data_x - x_min) / x_range

    elastic_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    elastic_model.fit(data_x_sc, data_y)

    return elastic_model

def elastic_coeff(model, x, rnd=3, disp=False):
    """Report the features whose (rounded) ElasticNet coefficient is non-zero.

    Parameters
    ----------
    model : fitted estimator exposing ``coef_``.
    x : sequence of str
        Feature names, in the same order as ``model.coef_``.
    rnd : int
        Number of decimals the coefficient table is rounded to.
    disp : int or bool
        If truthy, the first ``disp`` rows of the ranked table are displayed.

    Returns
    -------
    list
        Names of the retained features, largest absolute coefficient first.

    NOTE(review): the non-zero filter runs on the *rounded* table, so a
    coefficient that rounds to 0 at ``rnd`` decimals is reported as
    unimportant -- confirm this is intended.
    """
    rows = [(name, coef, abs(coef)) for name, coef in zip(x, model.coef_)]
    ranked = pd.DataFrame(rows, columns=['features', 'coefficient', 'absolute'])
    ranked = ranked.sort_values(by='absolute', ascending=False)
    ranked = ranked.round(rnd)

    if disp:
        display(ranked.head(disp))

    nonzero = ranked.loc[ranked['absolute'] > 0]
    print(f'the number of important features: {len(nonzero)} out of {len(x)} \n')
    display(dict(zip(nonzero['features'], nonzero['coefficient'])))

    return nonzero['features'].tolist()

## original dataset

In [9]:
# ElasticNet on the original feature set.
data, x, y = original_data, original_x, original_y
alpha = 0.0001

model = elasticnet(data, x=x, y=y, alpha=alpha)
features = elastic_coeff(model, x)

the number of important features: 11 out of 15 



{'nS/nT': 0.029,
 'ASA': 0.019,
 'nAli': 0.015,
 'HSE_alpha_down': -0.012,
 'nPos': 0.011,
 'flexibility': 0.01,
 'Proline': -0.009,
 'Theta(i-1=>i+1)': 0.009,
 'Tau(i-2=>i+2)': -0.006,
 'HSE_alpha_up': 0.004,
 'P(E)': -0.001}

## augmented dataset

In [10]:
# ElasticNet on the augmented feature set.
data, x, y = augmented_data, augmented_x, augmented_y
alpha = 0.0001

model = elasticnet(data, x=x, y=y, alpha=alpha)
features = elastic_coeff(model, x)

the number of important features: 38 out of 156 



{'sasa_ala': 0.047,
 'sasa_g': 0.034,
 'all_sasa_lys': -0.032,
 'number_of_asn': -0.025,
 'number_of_g': -0.023,
 'sasa_ser': 0.022,
 'sasa_cys': -0.021,
 'sasa_p': 0.02,
 'number_of_ser': 0.018,
 'number_of_thr': -0.017,
 'sasa_lys': -0.016,
 'sasa_val': -0.016,
 'number_of_leu': -0.014,
 'sasa_negative': 0.013,
 'net_charge_all_around_target_ser_thr': 0.012,
 'sasa_ile': -0.011,
 'sasa_s': -0.011,
 'sasa_leu': -0.01,
 'exposed_charge_all_sidechain_with_around_target_ser_thr': -0.009,
 'all_sasa_aliphatic': -0.009,
 'sasa_back': -0.009,
 'number_of_lys': 0.008,
 'exposed_charge_all_with_around_target_ser_thr': -0.008,
 'sasa_polar': -0.008,
 'sasa_phe': -0.008,
 'exposed_positive_charge_all_with_around_target_ser_thr': -0.007,
 'number_of_polar': -0.007,
 'number_of_charged': -0.007,
 'number_of_e': -0.007,
 'sasa_e': -0.007,
 'all_atom_negative_charge_all_backbone_around_target_ser_thr': 0.005,
 'number_of_l': -0.004,
 'sasa_f': 0.004,
 'all_sasa_met': -0.003,
 'sasa_met': -0.003,
 '

## both datasets

In [11]:
# ElasticNet on original + augmented features; both sets are read from the augmented table.
data, x, y = augmented_data, both_x, both_y
alpha = 0.0001

model = elasticnet(data, x=x, y=y, alpha=alpha)
features = elastic_coeff(model, x)

the number of important features: 43 out of 171 



{'sasa_ala': 0.059,
 'all_sasa_lys': -0.048,
 'nS/nT': 0.044,
 'HSE_alpha_down': -0.037,
 'sasa_ser': 0.029,
 'sasa_g': 0.026,
 'sasa_negative': 0.022,
 'ASA': 0.021,
 'nAli': 0.02,
 'number_of_asn': -0.019,
 'number_of_thr': -0.019,
 'sasa_val': -0.015,
 'Theta(i-1=>i+1)': 0.014,
 'number_of_v': 0.014,
 'exposed_charge_all_sidechain_with_around_target_ser_thr': -0.014,
 'net_charge_all_around_target_ser_thr': 0.013,
 'sasa_ile': -0.013,
 'Proline': -0.011,
 'sasa_cys': -0.01,
 'P(H)': 0.01,
 'number_of_lys': 0.01,
 'sasa_back': -0.01,
 'Tau(i-2=>i+2)': -0.01,
 'number_of_leu': -0.009,
 'all_sasa_leu': -0.009,
 'number_of_ser': 0.009,
 'number_of_l': -0.007,
 'all_sasa_cys': -0.007,
 'all_sasa_phe': -0.007,
 'P(E)': -0.007,
 'number_of_g': -0.006,
 'exposed_charge_all_with_around_target_ser_thr': -0.006,
 'number_of_arg': -0.005,
 'sasa_polar': -0.004,
 'sasa_s': -0.004,
 'sasa_e': -0.004,
 'sasa_leu': -0.003,
 'exposed_positive_charge_all_with_around_target_ser_thr': -0.003,
 'number_

# ANOVA model

In [12]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

def anova(data, x=[], y='',
          disp=False, rnd=4):
    """Backward-eliminate features using a type-II ANOVA on an OLS fit.

    Each feature is renamed ``f0``, ``f1``, ... so that column names with
    characters that are not valid in a formula can still be used. The model
    ``y ~ f0 + f1 + ...`` is refitted repeatedly, dropping every feature
    whose rounded p-value is above 0.05, until none is dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictor columns and the target column.
    x : list of str
        Names of the predictor columns (only read, never mutated).
    y : str
        Name of the target column; it is used verbatim in the formula.
    disp : bool
        If truthy, print the final ANOVA table ten rows at a time.
    rnd : int
        Decimals the ANOVA table is rounded to. The 0.05 comparison is made
        on the rounded p-values.

    Returns
    -------
    list of str
        Names of the surviving features, ordered by ascending rounded
        p-value. Empty if no feature survives.
    """
    # Selecting with a list yields a new frame, so the renaming below does
    # not touch the caller's table.
    data = data[x + [y]]

    f_num = [f"f{num}" for num in range(len(x))]
    data.columns = f_num + [y]

    n_not = len(f_num)  # features dropped in the last pass (starts at "all")
    i = 1
    result = None
    # Stop as well when nothing is left: an empty right-hand side would give
    # an invalid formula on the next pass.
    while n_not > 0 and f_num:
        print(f'{i}th iteration')
        fs = '+'.join(f_num)
        model = smf.ols(
            y + '~' + fs,
            data).fit()

        result = sm.stats.anova_lm(model, typ=2).round(rnd)
        p_values = result['PR(>F)']
        n_not = len(result[p_values > 0.05])
        f_num = [fn for fn in result[p_values <= 0.05].index]

        i += 1

    # Either no feature was supplied or every feature was eliminated.
    if result is None or not f_num:
        print(f'the number of important features: 0 out of {len(x)}')
        return []

    result = result.sort_values(by='PR(>F)')
    print(f'the number of important features: {len(f_num)} out of {len(x)}')

    if disp:
        n = 10
        for start in range(0, len(result), n):
            print(result.iloc[start:start + n, :])

    # Keep only the surviving feature rows (this skips the residual row by
    # name instead of assuming it is sorted last).
    kept = set(f_num)
    x_idx = [int(term.removeprefix('f')) for term in result.index if term in kept]
    x_names = [x[idx] for idx in x_idx]

    print('\norder variables by their importance')
    display(dict(zip(x_idx, x_names)))

    return x_names

## original dataset

In [13]:
# ANOVA backward elimination on the original feature set.
data, x, y = original_data, original_x, original_y

features = anova(data, x=x, y=y)

1th iteration
2th iteration
the number of important features: 10 out of 15

order variables by their importance


{0: 'ASA',
 11: 'nAli',
 12: 'nPos',
 13: 'nS/nT',
 14: 'Proline',
 6: 'HSE_alpha_down',
 10: 'flexibility',
 4: 'Tau(i-2=>i+2)',
 3: 'Theta(i-1=>i+1)',
 5: 'HSE_alpha_up'}

## augmented dataset

In [14]:
# ANOVA backward elimination on the augmented feature set.
data, x, y = augmented_data, augmented_x, augmented_y

features = anova(data, x=x, y=y)

1th iteration
2th iteration
3th iteration
the number of important features: 41 out of 156

order variables by their importance


{35: 'number_of_val',
 28: 'number_of_leu',
 17: 'number_of_d',
 11: 'number_of_s',
 27: 'number_of_lys',
 22: 'number_of_asp',
 19: 'number_of_f',
 18: 'number_of_e',
 12: 'number_of_n',
 8: 'number_of_negative',
 3: 'number_of_polar',
 6: 'number_of_charged',
 24: 'number_of_phe',
 124: 'net_charge_all_sidechain_around_target_ser_thr',
 126: 'net_charge_all_sidechain_with_around_target_ser_thr',
 123: 'net_charge_all_backbone_around_target_ser_thr',
 122: 'net_charge_all_with_around_target_ser_thr',
 121: 'net_charge_all_around_target_ser_thr',
 120: 'sasa_all_with_around_target_ser_thr',
 127: 'all_atom_positive_charge_all_around_target_ser_thr',
 128: 'all_atom_positive_charge_all_with_around_target_ser_thr',
 129: 'all_atom_positive_charge_all_backbone_around_target_ser_thr',
 125: 'net_charge_all_backbone_with_around_target_ser_thr',
 131: 'all_atom_positive_charge_all_backbone_with_around_target_ser_thr',
 130: 'all_atom_positive_charge_all_sidechain_around_target_ser_thr',
 2: 

## both datasets

In [15]:
# ANOVA backward elimination on original + augmented features (read from the augmented table).
data, x, y = augmented_data, both_x, both_y

features = anova(data, x=x, y=y)

1th iteration
2th iteration
3th iteration
4th iteration
5th iteration
the number of important features: 50 out of 171

order variables by their importance


{6: 'HSE_alpha_down',
 13: 'nS/nT',
 11: 'nAli',
 35: 'number_of_ala',
 48: 'number_of_ser',
 43: 'number_of_leu',
 32: 'number_of_d',
 37: 'number_of_asp',
 34: 'number_of_f',
 42: 'number_of_lys',
 26: 'number_of_s',
 46: 'number_of_gln',
 19: 'number_of_aromatic',
 39: 'number_of_phe',
 21: 'number_of_charged',
 144: 'all_atom_positive_charge_all_backbone_around_target_ser_thr',
 143: 'all_atom_positive_charge_all_with_around_target_ser_thr',
 135: 'sasa_all_with_around_target_ser_thr',
 141: 'net_charge_all_sidechain_with_around_target_ser_thr',
 136: 'net_charge_all_around_target_ser_thr',
 137: 'net_charge_all_with_around_target_ser_thr',
 139: 'net_charge_all_sidechain_around_target_ser_thr',
 140: 'net_charge_all_backbone_with_around_target_ser_thr',
 142: 'all_atom_positive_charge_all_around_target_ser_thr',
 138: 'net_charge_all_backbone_around_target_ser_thr',
 146: 'all_atom_positive_charge_all_backbone_with_around_target_ser_thr',
 145: 'all_atom_positive_charge_all_sidech

# Random Forest model

In [16]:
from sklearn.ensemble import RandomForestClassifier

def randomforest(data, x=[], y='',
                 n_estimators=500, rs=1):
    """Fit a RandomForestClassifier predicting ``data[y]`` from ``data[x]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictor columns and the target column.
    x : list of str
        Names of the predictor columns.
    y : str
        Name of the target column.
    n_estimators : int
        Number of trees.
    rs : int
        Seed passed as ``random_state``.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier (trained with ``n_jobs=-1``).
    """
    predictors = data[x]
    target = data[y]

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=rs,
        n_jobs=-1,
    )
    forest.fit(predictors, target)

    return forest

def FI(model, threshold=0.95, rnd=3, disp=False, features=None):
    """Rank features by importance and keep the ones covering ``threshold``.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    threshold : float
        A feature is kept while the share of importance still unaccounted
        for *after* including it (column ``sum``) is at least
        ``1 - threshold``.
    rnd : int
        Decimals the table is rounded to. Rounding happens before the
        threshold comparison.
    disp : int or bool
        If truthy, the first ``disp`` rows of the ranked table are displayed.
    features : sequence of str, optional
        Names aligned with ``model.feature_importances_``. When omitted, the
        names stored on the model (``feature_names_in_``) are used, so the
        result no longer depends on whichever ``x`` happens to be bound in
        the notebook. If the model stores no names, the notebook-level ``x``
        is used as before.

    Returns
    -------
    list
        Names of the kept features, most important first.
    """
    if features is None:
        features = getattr(model, 'feature_names_in_', None)
    if features is None:
        # Legacy behaviour: fall back to the notebook-level list.
        features = x

    df = pd.DataFrame(zip(features, model.feature_importances_), columns=['features', 'score'])
    df = df.sort_values(by='score', ascending=False)

    # Importance remaining after each feature, in ranked order.
    score_sum = []
    s = 1
    for score in df.score:
        s -= score
        score_sum.append(s)
    df['sum'] = np.array(score_sum)
    df = df.round(rnd)

    if disp:
        display(df.head(disp))

    df_important = df[df['sum'] >= 1 - threshold]

    print(f'the number of important features (threshold={threshold}): {len(df_important)} out of {len(df)}')
    display(dict(
        zip(df_important.index, zip(df_important['features'].values, df_important['score'].values, df_important['sum'].values))
    ))

    return list(df_important['features'])

## original dataset

In [17]:
# Random Forest on the original feature set.
# FI() looks up the notebook-level `x` for feature names, so keep it in sync with the fitted model.
data, x, y = original_data, original_x, original_y

model = randomforest(data, x=x, y=y)
features = FI(model)

the number of important features (threshold=0.95): 11 out of 15


{4: ('Tau(i-2=>i+2)', 0.096, 0.904),
 0: ('ASA', 0.09, 0.814),
 10: ('flexibility', 0.086, 0.728),
 2: ('Psi', 0.086, 0.641),
 1: ('Phi', 0.083, 0.559),
 7: ('P(C)', 0.081, 0.478),
 5: ('HSE_alpha_up', 0.079, 0.398),
 6: ('HSE_alpha_down', 0.078, 0.32),
 3: ('Theta(i-1=>i+1)', 0.077, 0.243),
 8: ('P(H)', 0.077, 0.166),
 9: ('P(E)', 0.073, 0.092)}

## augmented dataset

In [18]:
# Random Forest on the augmented feature set.
# FI() looks up the notebook-level `x` for feature names, so keep it in sync with the fitted model.
data, x, y = augmented_data, augmented_x, augmented_y

model = randomforest(data, x=x, y=y)
features = FI(model)

the number of important features (threshold=0.95): 129 out of 156


{107: ('all_sasa_ser', 0.014, 0.986),
 83: ('all_sasa_g', 0.013, 0.974),
 88: ('all_sasa_p', 0.012, 0.961),
 94: ('all_sasa_ala', 0.012, 0.949),
 70: ('sasa_ser', 0.011, 0.938),
 51: ('sasa_p', 0.011, 0.927),
 82: ('all_sasa_negative', 0.011, 0.916),
 108: ('all_sasa_thr', 0.011, 0.905),
 106: ('all_sasa_arg', 0.011, 0.893),
 78: ('all_sasa_aromatic', 0.01, 0.883),
 93: ('all_sasa_f', 0.01, 0.873),
 69: ('sasa_arg', 0.01, 0.863),
 76: ('all_sasa_hydrophilic', 0.01, 0.852),
 105: ('all_sasa_gln', 0.01, 0.842),
 99: ('all_sasa_his', 0.01, 0.832),
 39: ('sasa_hydrophilic', 0.01, 0.822),
 121: ('net_charge_all_around_target_ser_thr', 0.009, 0.813),
 46: ('sasa_g', 0.009, 0.804),
 117: ('all_sasa_side_with_whole_target', 0.009, 0.794),
 104: ('all_sasa_asn', 0.009, 0.785),
 96: ('all_sasa_asp', 0.009, 0.776),
 74: ('sasa_tyr', 0.009, 0.766),
 120: ('sasa_all_with_around_target_ser_thr', 0.009, 0.757),
 86: ('all_sasa_n', 0.009, 0.748),
 87: ('all_sasa_l', 0.009, 0.74),
 56: ('sasa_f', 0.009

## both datasets

In [19]:
# Random Forest on original + augmented features (read from the augmented table).
# FI() looks up the notebook-level `x` for feature names, so keep it in sync with the fitted model.
data, x, y = augmented_data, both_x, both_y

model = randomforest(data, x=x, y=y)
features = FI(model)

the number of important features (threshold=0.95): 140 out of 171


{4: ('Tau(i-2=>i+2)', 0.017, 0.983),
 5: ('HSE_alpha_up', 0.013, 0.97),
 122: ('all_sasa_ser', 0.012, 0.958),
 6: ('HSE_alpha_down', 0.012, 0.947),
 103: ('all_sasa_p', 0.011, 0.936),
 121: ('all_sasa_arg', 0.011, 0.925),
 1: ('Phi', 0.011, 0.914),
 109: ('all_sasa_ala', 0.011, 0.903),
 0: ('ASA', 0.01, 0.892),
 108: ('all_sasa_f', 0.01, 0.882),
 85: ('sasa_ser', 0.01, 0.872),
 13: ('nS/nT', 0.01, 0.862),
 66: ('sasa_p', 0.01, 0.852),
 97: ('all_sasa_negative', 0.01, 0.842),
 98: ('all_sasa_g', 0.01, 0.832),
 123: ('all_sasa_thr', 0.01, 0.822),
 120: ('all_sasa_gln', 0.01, 0.813),
 3: ('Theta(i-1=>i+1)', 0.009, 0.803),
 10: ('flexibility', 0.009, 0.794),
 91: ('all_sasa_hydrophilic', 0.009, 0.785),
 102: ('all_sasa_l', 0.009, 0.777),
 114: ('all_sasa_his', 0.009, 0.768),
 119: ('all_sasa_asn', 0.008, 0.76),
 61: ('sasa_g', 0.008, 0.751),
 106: ('all_sasa_d', 0.008, 0.743),
 84: ('sasa_arg', 0.008, 0.735),
 93: ('all_sasa_aromatic', 0.008, 0.726),
 136: ('net_charge_all_around_target_se

# Result of finding important variables

## original dataset

In [20]:
# Run all three selectors on the original feature set.
data, x, y = original_data, original_x, original_y

# ElasticNet
alpha = 0.0001
model = elasticnet(data, x=x, y=y, alpha=alpha)
EN_features = elastic_coeff(model, x)

# ANOVA
AN_features = anova(data, x=x, y=y)

# RandomForest
threshold = 0.95
model = randomforest(data, x=x, y=y)
RF_features = FI(model, threshold=threshold)

the number of important features: 11 out of 15 



{'nS/nT': 0.029,
 'ASA': 0.019,
 'nAli': 0.015,
 'HSE_alpha_down': -0.012,
 'nPos': 0.011,
 'flexibility': 0.01,
 'Proline': -0.009,
 'Theta(i-1=>i+1)': 0.009,
 'Tau(i-2=>i+2)': -0.006,
 'HSE_alpha_up': 0.004,
 'P(E)': -0.001}

1th iteration
2th iteration
the number of important features: 10 out of 15

order variables by their importance


{0: 'ASA',
 11: 'nAli',
 12: 'nPos',
 13: 'nS/nT',
 14: 'Proline',
 6: 'HSE_alpha_down',
 10: 'flexibility',
 4: 'Tau(i-2=>i+2)',
 3: 'Theta(i-1=>i+1)',
 5: 'HSE_alpha_up'}

the number of important features (threshold=0.95): 11 out of 15


{4: ('Tau(i-2=>i+2)', 0.096, 0.904),
 0: ('ASA', 0.09, 0.814),
 10: ('flexibility', 0.086, 0.728),
 2: ('Psi', 0.086, 0.641),
 1: ('Phi', 0.083, 0.559),
 7: ('P(C)', 0.081, 0.478),
 5: ('HSE_alpha_up', 0.079, 0.398),
 6: ('HSE_alpha_down', 0.078, 0.32),
 3: ('Theta(i-1=>i+1)', 0.077, 0.243),
 8: ('P(H)', 0.077, 0.166),
 9: ('P(E)', 0.073, 0.092)}

In [21]:
# One row per feature, one 0/1 column per selector; 'Score' counts how many selectors kept it.
selected = {
    'ElasticNet': EN_features,
    'ANOVA': AN_features,
    'RandomForest': RF_features,
}
importance = pd.DataFrame(0.0, index=x, columns=list(selected))
for method, chosen in selected.items():
    for feature in chosen:
        importance.loc[feature, method] = 1

importance['Score'] = importance.sum(axis=1)

# NOTE(review): an existing CSV is never overwritten, so a re-run with different
# results leaves the old file in place -- confirm this is intended.
save_path = './result/important_variables_original_only_cts.csv'
if not exists(save_path):
    importance.to_csv(save_path)
importance_original = importance
importance_original

Unnamed: 0,ElasticNet,ANOVA,RandomForest,Score
ASA,1.0,1.0,1.0,3.0
Phi,0.0,0.0,1.0,1.0
Psi,0.0,0.0,1.0,1.0
Theta(i-1=>i+1),1.0,1.0,1.0,3.0
Tau(i-2=>i+2),1.0,1.0,1.0,3.0
HSE_alpha_up,1.0,1.0,1.0,3.0
HSE_alpha_down,1.0,1.0,1.0,3.0
P(C),0.0,0.0,1.0,1.0
P(H),0.0,0.0,1.0,1.0
P(E),1.0,0.0,1.0,2.0


## augmented dataset

In [22]:
# Run all three selectors on the augmented feature set.
data, x, y = augmented_data, augmented_x, augmented_y

# ElasticNet
alpha = 0.0001
model = elasticnet(data, x=x, y=y, alpha=alpha)
EN_features = elastic_coeff(model, x)

# ANOVA
AN_features = anova(data, x=x, y=y)

# RandomForest
threshold = 0.95
model = randomforest(data, x=x, y=y)
RF_features = FI(model, threshold=threshold)

the number of important features: 38 out of 156 



{'sasa_ala': 0.047,
 'sasa_g': 0.034,
 'all_sasa_lys': -0.032,
 'number_of_asn': -0.025,
 'number_of_g': -0.023,
 'sasa_ser': 0.022,
 'sasa_cys': -0.021,
 'sasa_p': 0.02,
 'number_of_ser': 0.018,
 'number_of_thr': -0.017,
 'sasa_lys': -0.016,
 'sasa_val': -0.016,
 'number_of_leu': -0.014,
 'sasa_negative': 0.013,
 'net_charge_all_around_target_ser_thr': 0.012,
 'sasa_ile': -0.011,
 'sasa_s': -0.011,
 'sasa_leu': -0.01,
 'exposed_charge_all_sidechain_with_around_target_ser_thr': -0.009,
 'all_sasa_aliphatic': -0.009,
 'sasa_back': -0.009,
 'number_of_lys': 0.008,
 'exposed_charge_all_with_around_target_ser_thr': -0.008,
 'sasa_polar': -0.008,
 'sasa_phe': -0.008,
 'exposed_positive_charge_all_with_around_target_ser_thr': -0.007,
 'number_of_polar': -0.007,
 'number_of_charged': -0.007,
 'number_of_e': -0.007,
 'sasa_e': -0.007,
 'all_atom_negative_charge_all_backbone_around_target_ser_thr': 0.005,
 'number_of_l': -0.004,
 'sasa_f': 0.004,
 'all_sasa_met': -0.003,
 'sasa_met': -0.003,
 '

1th iteration
2th iteration
3th iteration
the number of important features: 41 out of 156

order variables by their importance


{35: 'number_of_val',
 28: 'number_of_leu',
 17: 'number_of_d',
 11: 'number_of_s',
 27: 'number_of_lys',
 22: 'number_of_asp',
 19: 'number_of_f',
 18: 'number_of_e',
 12: 'number_of_n',
 8: 'number_of_negative',
 3: 'number_of_polar',
 6: 'number_of_charged',
 24: 'number_of_phe',
 124: 'net_charge_all_sidechain_around_target_ser_thr',
 126: 'net_charge_all_sidechain_with_around_target_ser_thr',
 123: 'net_charge_all_backbone_around_target_ser_thr',
 122: 'net_charge_all_with_around_target_ser_thr',
 121: 'net_charge_all_around_target_ser_thr',
 120: 'sasa_all_with_around_target_ser_thr',
 127: 'all_atom_positive_charge_all_around_target_ser_thr',
 128: 'all_atom_positive_charge_all_with_around_target_ser_thr',
 129: 'all_atom_positive_charge_all_backbone_around_target_ser_thr',
 125: 'net_charge_all_backbone_with_around_target_ser_thr',
 131: 'all_atom_positive_charge_all_backbone_with_around_target_ser_thr',
 130: 'all_atom_positive_charge_all_sidechain_around_target_ser_thr',
 2: 

the number of important features (threshold=0.95): 129 out of 156


{107: ('all_sasa_ser', 0.014, 0.986),
 83: ('all_sasa_g', 0.013, 0.974),
 88: ('all_sasa_p', 0.012, 0.961),
 94: ('all_sasa_ala', 0.012, 0.949),
 70: ('sasa_ser', 0.011, 0.938),
 51: ('sasa_p', 0.011, 0.927),
 82: ('all_sasa_negative', 0.011, 0.916),
 108: ('all_sasa_thr', 0.011, 0.905),
 106: ('all_sasa_arg', 0.011, 0.893),
 78: ('all_sasa_aromatic', 0.01, 0.883),
 93: ('all_sasa_f', 0.01, 0.873),
 69: ('sasa_arg', 0.01, 0.863),
 76: ('all_sasa_hydrophilic', 0.01, 0.852),
 105: ('all_sasa_gln', 0.01, 0.842),
 99: ('all_sasa_his', 0.01, 0.832),
 39: ('sasa_hydrophilic', 0.01, 0.822),
 121: ('net_charge_all_around_target_ser_thr', 0.009, 0.813),
 46: ('sasa_g', 0.009, 0.804),
 117: ('all_sasa_side_with_whole_target', 0.009, 0.794),
 104: ('all_sasa_asn', 0.009, 0.785),
 96: ('all_sasa_asp', 0.009, 0.776),
 74: ('sasa_tyr', 0.009, 0.766),
 120: ('sasa_all_with_around_target_ser_thr', 0.009, 0.757),
 86: ('all_sasa_n', 0.009, 0.748),
 87: ('all_sasa_l', 0.009, 0.74),
 56: ('sasa_f', 0.009

In [23]:
# One row per feature, one 0/1 column per selector; 'Score' counts how many selectors kept it.
selected = {
    'ElasticNet': EN_features,
    'ANOVA': AN_features,
    'RandomForest': RF_features,
}
importance = pd.DataFrame(0.0, index=x, columns=list(selected))
for method, chosen in selected.items():
    for feature in chosen:
        importance.loc[feature, method] = 1

importance['Score'] = importance.sum(axis=1)

# NOTE(review): an existing CSV is never overwritten, so a re-run with different
# results leaves the old file in place -- confirm this is intended.
save_path = './result/important_variables_augmented_only_cts.csv'
if not exists(save_path):
    importance.to_csv(save_path)
importance_augmented = importance
importance_augmented

Unnamed: 0,ElasticNet,ANOVA,RandomForest,Score
residue_SER_THR,0.0,1.0,1.0,2.0
number_of_hydrophobic,0.0,0.0,1.0,1.0
number_of_hydrophilic,0.0,1.0,1.0,2.0
number_of_polar,1.0,1.0,0.0,2.0
number_of_aromatic,0.0,1.0,1.0,2.0
...,...,...,...,...
exposed_negative_charge_all_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_with_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_backbone_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_sidechain_around_target_ser_thr,0.0,0.0,1.0,1.0


## both datasets

In [24]:
# Run all three selectors on original + augmented features (read from the augmented table).
data, x, y = augmented_data, both_x, both_y

# ElasticNet
alpha = 0.0001
model = elasticnet(data, x=x, y=y, alpha=alpha)
EN_features = elastic_coeff(model, x)

# ANOVA
AN_features = anova(data, x=x, y=y)

# RandomForest
threshold = 0.95
model = randomforest(data, x=x, y=y)
RF_features = FI(model, threshold=threshold)

the number of important features: 43 out of 171 



{'sasa_ala': 0.059,
 'all_sasa_lys': -0.048,
 'nS/nT': 0.044,
 'HSE_alpha_down': -0.037,
 'sasa_ser': 0.029,
 'sasa_g': 0.026,
 'sasa_negative': 0.022,
 'ASA': 0.021,
 'nAli': 0.02,
 'number_of_asn': -0.019,
 'number_of_thr': -0.019,
 'sasa_val': -0.015,
 'Theta(i-1=>i+1)': 0.014,
 'number_of_v': 0.014,
 'exposed_charge_all_sidechain_with_around_target_ser_thr': -0.014,
 'net_charge_all_around_target_ser_thr': 0.013,
 'sasa_ile': -0.013,
 'Proline': -0.011,
 'sasa_cys': -0.01,
 'P(H)': 0.01,
 'number_of_lys': 0.01,
 'sasa_back': -0.01,
 'Tau(i-2=>i+2)': -0.01,
 'number_of_leu': -0.009,
 'all_sasa_leu': -0.009,
 'number_of_ser': 0.009,
 'number_of_l': -0.007,
 'all_sasa_cys': -0.007,
 'all_sasa_phe': -0.007,
 'P(E)': -0.007,
 'number_of_g': -0.006,
 'exposed_charge_all_with_around_target_ser_thr': -0.006,
 'number_of_arg': -0.005,
 'sasa_polar': -0.004,
 'sasa_s': -0.004,
 'sasa_e': -0.004,
 'sasa_leu': -0.003,
 'exposed_positive_charge_all_with_around_target_ser_thr': -0.003,
 'number_

1th iteration
2th iteration
3th iteration
4th iteration
5th iteration
the number of important features: 50 out of 171

order variables by their importance


{6: 'HSE_alpha_down',
 13: 'nS/nT',
 11: 'nAli',
 35: 'number_of_ala',
 48: 'number_of_ser',
 43: 'number_of_leu',
 32: 'number_of_d',
 37: 'number_of_asp',
 34: 'number_of_f',
 42: 'number_of_lys',
 26: 'number_of_s',
 46: 'number_of_gln',
 19: 'number_of_aromatic',
 39: 'number_of_phe',
 21: 'number_of_charged',
 144: 'all_atom_positive_charge_all_backbone_around_target_ser_thr',
 143: 'all_atom_positive_charge_all_with_around_target_ser_thr',
 135: 'sasa_all_with_around_target_ser_thr',
 141: 'net_charge_all_sidechain_with_around_target_ser_thr',
 136: 'net_charge_all_around_target_ser_thr',
 137: 'net_charge_all_with_around_target_ser_thr',
 139: 'net_charge_all_sidechain_around_target_ser_thr',
 140: 'net_charge_all_backbone_with_around_target_ser_thr',
 142: 'all_atom_positive_charge_all_around_target_ser_thr',
 138: 'net_charge_all_backbone_around_target_ser_thr',
 146: 'all_atom_positive_charge_all_backbone_with_around_target_ser_thr',
 145: 'all_atom_positive_charge_all_sidech

the number of important features (threshold=0.95): 140 out of 171


{4: ('Tau(i-2=>i+2)', 0.017, 0.983),
 5: ('HSE_alpha_up', 0.013, 0.97),
 122: ('all_sasa_ser', 0.012, 0.958),
 6: ('HSE_alpha_down', 0.012, 0.947),
 103: ('all_sasa_p', 0.011, 0.936),
 121: ('all_sasa_arg', 0.011, 0.925),
 1: ('Phi', 0.011, 0.914),
 109: ('all_sasa_ala', 0.011, 0.903),
 0: ('ASA', 0.01, 0.892),
 108: ('all_sasa_f', 0.01, 0.882),
 85: ('sasa_ser', 0.01, 0.872),
 13: ('nS/nT', 0.01, 0.862),
 66: ('sasa_p', 0.01, 0.852),
 97: ('all_sasa_negative', 0.01, 0.842),
 98: ('all_sasa_g', 0.01, 0.832),
 123: ('all_sasa_thr', 0.01, 0.822),
 120: ('all_sasa_gln', 0.01, 0.813),
 3: ('Theta(i-1=>i+1)', 0.009, 0.803),
 10: ('flexibility', 0.009, 0.794),
 91: ('all_sasa_hydrophilic', 0.009, 0.785),
 102: ('all_sasa_l', 0.009, 0.777),
 114: ('all_sasa_his', 0.009, 0.768),
 119: ('all_sasa_asn', 0.008, 0.76),
 61: ('sasa_g', 0.008, 0.751),
 106: ('all_sasa_d', 0.008, 0.743),
 84: ('sasa_arg', 0.008, 0.735),
 93: ('all_sasa_aromatic', 0.008, 0.726),
 136: ('net_charge_all_around_target_se

In [25]:
# One row per feature, one 0/1 column per selector; 'Score' counts how many selectors kept it.
selected = {
    'ElasticNet': EN_features,
    'ANOVA': AN_features,
    'RandomForest': RF_features,
}
importance = pd.DataFrame(0.0, index=x, columns=list(selected))
for method, chosen in selected.items():
    for feature in chosen:
        importance.loc[feature, method] = 1

importance['Score'] = importance.sum(axis=1)

# NOTE(review): an existing CSV is never overwritten, so a re-run with different
# results leaves the old file in place -- confirm this is intended.
save_path = './result/important_variables_both_only_cts.csv'
if not exists(save_path):
    importance.to_csv(save_path)
importance_both = importance
importance_both

Unnamed: 0,ElasticNet,ANOVA,RandomForest,Score
ASA,1.0,0.0,1.0,2.0
Phi,0.0,0.0,1.0,1.0
Psi,0.0,0.0,1.0,1.0
Theta(i-1=>i+1),1.0,0.0,1.0,2.0
Tau(i-2=>i+2),1.0,1.0,1.0,3.0
...,...,...,...,...
exposed_negative_charge_all_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_with_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_backbone_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_sidechain_around_target_ser_thr,0.0,0.0,1.0,1.0
