# Header

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from os.path import exists
# NOTE(review): star imports hide where names come from. `xy_variables` (called
# below) and `df_to_dummy` (called in the data-loading cells) are not defined in
# this notebook, so they presumably come from one of these two modules --
# confirm and import them by name.
from src.Data import *
from src.Preprocessing import *
# Project-level object; the data-loading cells read its `x_cts_window`,
# `x_cat_window` and `y_label` attributes.
variables = xy_variables()

## original dataset

In [3]:
# Read the main table and keep only the rows whose SEQ value is 'S' or 'T'.
load_path = "./data/data_for_ml.csv"
original_data = pd.read_csv(load_path)
keep_rows = original_data['SEQ'].isin(['S', 'T'])
original_data = original_data[keep_rows]

# Column groups are taken from the project-level `variables` object.
x_cts, x_cat, y_label = (
    variables.x_cts_window,
    variables.x_cat_window,
    variables.y_label,
)

# df_to_dummy hands back the predictors and the response separately; they are
# glued back together so the response ends up as the last column.
data_x, data_y = df_to_dummy(original_data, x_cts, x_cat, y_label)
original_data = pd.concat([data_x, data_y], axis=1)

# Lookup table: column position -> column name.
original_features = dict(enumerate(original_data.columns))

display(original_features)

dummy x shape: (41264, 87)
dummy y shape: (41264, 1)


{0: 'Proline',
 1: 'flexibility',
 2: 'SEQ_S',
 3: 'SEQ_T',
 4: 'nS/nT_1',
 5: 'nS/nT_2',
 6: 'nS/nT_3',
 7: 'nS/nT_4',
 8: 'nS/nT_5',
 9: 'nS/nT_6',
 10: 'nS/nT_7',
 11: 'nS/nT_8',
 12: 'nS/nT_9',
 13: 'nS/nT_10',
 14: 'nS/nT_11',
 15: 'nS/nT_12',
 16: 'nS/nT_13',
 17: 'nS/nT_14',
 18: 'nS/nT_15',
 19: 'nS/nT_16',
 20: 'nS/nT_17',
 21: 'nS/nT_18',
 22: 'nS/nT_19',
 23: 'nS/nT_20',
 24: 'nS/nT_21',
 25: 'nAli_0',
 26: 'nAli_1',
 27: 'nAli_2',
 28: 'nAli_3',
 29: 'nPos_0',
 30: 'nPos_1',
 31: 'nPos_2',
 32: 'nPos_3',
 33: 'phi_psi_alpha',
 34: 'phi_psi_beta',
 35: 'phi_psi_other',
 36: 'SS_C',
 37: 'SS_E',
 38: 'SS_H',
 39: 'side_-1_None',
 40: 'side_-1_cycle',
 41: 'side_-1_gly',
 42: 'side_-1_long',
 43: 'side_-1_normal',
 44: 'side_-1_pro',
 45: 'side_-1_small',
 46: 'side_-1_very_small',
 47: 'side_1_None',
 48: 'side_1_cycle',
 49: 'side_1_gly',
 50: 'side_1_long',
 51: 'side_1_normal',
 52: 'side_1_pro',
 53: 'side_1_small',
 54: 'side_1_very_small',
 55: 'side_2_None',
 56: 'side

In [4]:
# All columns but the last are predictors; the last column is the response.
original_columns = original_data.columns.tolist()
original_x, original_y = original_columns[:-1], original_columns[-1]

## augmented dataset

In [5]:
# Proteins named in the main table that also have a per-protein CSV in the
# augmented directory.
load_name = "./data/data_for_ml.csv"
augmented_dir = './data/data_for_ml(augmented)'
protein_list = list(pd.read_csv(load_name)['protein'].unique())
protein_augmented = [name for name in protein_list if exists(f'{augmented_dir}/{name}.csv')]
if not protein_augmented:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f'no augmented protein files found in {augmented_dir}')

# Stack the per-protein files into one frame, then keep only rows whose SEQ
# value is 'S' or 'T'. The list of frames gets its own name so that
# `augmented_data` is always a DataFrame, even if the cell is interrupted.
augmented_frames = [pd.read_csv(f'{augmented_dir}/{name}.csv') for name in protein_augmented]
augmented_data = pd.concat(augmented_frames, axis=0).reset_index(drop=True)
augmented_data = augmented_data[augmented_data['SEQ'].isin(['S', 'T'])].reset_index(drop=True)

# Extra continuous feature names come from the `name` column of the feature
# list file; the first and last entries are dropped.
# NOTE(review): presumably those two entries are not predictors -- confirm.
x_charge = pd.read_csv('./data/from_Krishna/features-all-names.csv')['name'].to_list()[1:-1]
x_cts   = variables.x_cts_window + x_charge
x_cat   = variables.x_cat_window
y_label = variables.y_label

data_x, data_y = df_to_dummy(augmented_data, x_cts, x_cat, y_label)
augmented_data = pd.concat([data_x, data_y], axis=1)

# Lookup table: column position -> column name.
augmented_features = dict(enumerate(augmented_data.columns))
display(augmented_features)

dummy x shape: (8910, 236)
dummy y shape: (8910, 1)


{0: 'Proline',
 1: 'flexibility',
 2: 'residue_SER_THR',
 3: 'number_of_hydrophobic',
 4: 'number_of_hydrophilic',
 5: 'number_of_polar',
 6: 'number_of_aromatic',
 7: 'number_of_aliphatic',
 8: 'number_of_charged',
 9: 'number_of_positive',
 10: 'number_of_negative',
 11: 'number_of_g',
 12: 'number_of_v',
 13: 'number_of_s',
 14: 'number_of_n',
 15: 'number_of_l',
 16: 'number_of_p',
 17: 'number_of_A',
 18: 'number_of_b',
 19: 'number_of_d',
 20: 'number_of_e',
 21: 'number_of_f',
 22: 'number_of_ala',
 23: 'number_of_cys',
 24: 'number_of_asp',
 25: 'number_of_glu',
 26: 'number_of_phe',
 27: 'number_of_his',
 28: 'number_of_ile',
 29: 'number_of_lys',
 30: 'number_of_leu',
 31: 'number_of_met',
 32: 'number_of_asn',
 33: 'number_of_gln',
 34: 'number_of_arg',
 35: 'number_of_ser',
 36: 'number_of_thr',
 37: 'number_of_val',
 38: 'number_of_trp',
 39: 'number_of_tyr',
 40: 'sasa_hydrophobic',
 41: 'sasa_hydrophilic',
 42: 'sasa_polar',
 43: 'sasa_aromatic',
 44: 'sasa_aliphatic',
 

In [6]:
# Predictors for the "augmented" runs: the block of columns at positions
# 2..157 of the augmented table.
# NOTE(review): these bounds are hard-coded; presumably the block is exactly
# the `x_charge` features -- confirm against the column listing above.
first_col, stop_col = 2, 158
augmented_x = augmented_data.columns[first_col:stop_col].tolist()
augmented_y = augmented_data.columns[-1]

In [7]:
# Combined predictor list: the first two columns, then every column from
# position 158 up to (not including) the last one, then the augmented block.
augmented_columns = augmented_data.columns
both_x = [*augmented_columns[:2], *augmented_columns[158:-1], *augmented_x]
both_y = 'positivity'

# ElasticNet model

In [8]:
from sklearn.linear_model import ElasticNet

def elasticnet(data, x=[], y='', 
               alpha=0.001, l1_ratio=0.5):
    """Fit an ElasticNet regression of ``data[y]`` on min-max scaled ``data[x]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the predictor columns and the response column.
    x : list
        Names of the predictor columns (never mutated here).
    y : str
        Name of the response column.
    alpha, l1_ratio : float
        Passed straight through to ``ElasticNet``.

    Returns
    -------
    The fitted ``ElasticNet`` model. Its coefficients refer to the scaled
    predictors, not to the raw columns.
    """
    data_x = data[x]
    data_y = data[y]

    # Column-wise min-max scaling.
    x_min = data_x.min(0)
    x_range = data_x.max(0) - x_min
    # A constant column has range 0; dividing by it would give 0/0 = NaN and
    # make the fit fail. Using 1 instead maps such a column to all zeros.
    x_range = x_range.where(x_range != 0, 1)
    data_x_sc = (data_x - x_min) / x_range

    elastic_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    elastic_model.fit(data_x_sc, data_y)

    return elastic_model

def elastic_coeff(model, x, rnd=3, disp=False):
    """Rank features by absolute model coefficient and return the non-zero ones.

    Parameters
    ----------
    model : object
        Fitted model exposing a 1-D ``coef_`` array.
    x : sequence
        Feature names, in the same order as ``model.coef_``.
    rnd : int
        Number of decimals the coefficients are rounded to.
    disp : bool or int
        Falsy: show no table. ``True``: show the first 5 rows of the ranked
        table. An integer: show that many rows.

    Returns
    -------
    list
        Names of the features whose rounded absolute coefficient is > 0,
        ordered from largest to smallest absolute coefficient.

    Raises
    ------
    ValueError
        If ``x`` and ``model.coef_`` have different lengths.
    """
    coefficients = np.asarray(model.coef_)
    names = list(x)
    if len(names) != len(coefficients):
        # zip() would silently truncate and hide the mismatch.
        raise ValueError(
            f'got {len(names)} feature names for {len(coefficients)} coefficients'
        )

    df = pd.DataFrame({
        'features': names,
        'coefficient': coefficients,
        'absolute': np.abs(coefficients),
    })
    df = df.sort_values(by='absolute', ascending=False).round(rnd)

    if disp:
        # head(True) is rejected by pandas, so a plain True means "default size".
        n_rows = 5 if disp is True else disp
        display(df.head(n_rows))

    # NOTE(review): the filter runs on the *rounded* values, so a coefficient
    # smaller than half a unit of the last kept decimal counts as zero and the
    # selection depends on `rnd`. Kept as is because downstream cells rely on it.
    df_important = df[df['absolute']>0]
    print(f'the number of important features: {len(df_important)} out of {len(x)} \n')
    display(dict(zip(df_important['features'], df_important['coefficient'])))

    return list(df_important['features'])

## original dataset

In [9]:
# ElasticNet screen on the original feature table.
data, x, y = original_data, original_x, original_y
alpha = 0.0001

model = elasticnet(data, x=x, y=y, alpha=alpha)
features = elastic_coeff(model, x)

the number of important features: 48 out of 87 



{'nS/nT_10': 0.017,
 'flexibility': -0.012,
 'nS/nT_11': 0.01,
 'side_2_gly': 0.008,
 'side_-1_pro': -0.007,
 'nAli_0': -0.007,
 'nAli_3': 0.007,
 'side_2_very_small': 0.007,
 'side_-1_very_small': 0.007,
 'Proline': -0.005,
 'side_4_gly': 0.005,
 'side_3_long': -0.005,
 'side_-1_small': 0.005,
 'nPos_0': -0.004,
 'SEQ_S': -0.004,
 'side_4_small': 0.004,
 'side_3_cycle': -0.004,
 'side_5_normal': -0.004,
 'SS_C': 0.004,
 'side_5_very_small': 0.003,
 'nS/nT_8': 0.003,
 'side_3_very_small': 0.003,
 'nAli_1': -0.003,
 'nS/nT_12': 0.003,
 'nS/nT_2': -0.003,
 'side_1_pro': -0.003,
 'nS/nT_7': 0.003,
 'side_1_small': 0.002,
 'nS/nT_3': -0.002,
 'side_5_cycle': -0.002,
 'side_4_long': 0.002,
 'side_4_normal': -0.002,
 'side_-1_normal': -0.002,
 'nPos_1': 0.002,
 'side_3_gly': 0.002,
 'side_2_normal': -0.002,
 'side_1_long': 0.001,
 'nS/nT_1': -0.001,
 'nS/nT_4': -0.001,
 'side_2_long': -0.001,
 'phi_psi_beta': -0.001,
 'side_2_cycle': -0.001,
 'side_2_small': 0.001,
 'side_1_very_small': 0.00

## augmented dataset

In [10]:
# ElasticNet screen on the augmented predictor block.
data, x, y = augmented_data, augmented_x, augmented_y
alpha = 0.0001

model = elasticnet(data, x=x, y=y, alpha=alpha)
features = elastic_coeff(model, x)

the number of important features: 38 out of 156 



{'sasa_ala': 0.047,
 'sasa_g': 0.034,
 'all_sasa_lys': -0.032,
 'number_of_asn': -0.025,
 'number_of_g': -0.023,
 'sasa_ser': 0.022,
 'sasa_cys': -0.021,
 'sasa_p': 0.02,
 'number_of_ser': 0.018,
 'number_of_thr': -0.017,
 'sasa_lys': -0.016,
 'sasa_val': -0.016,
 'number_of_leu': -0.014,
 'sasa_negative': 0.013,
 'net_charge_all_around_target_ser_thr': 0.012,
 'sasa_ile': -0.011,
 'sasa_s': -0.011,
 'sasa_leu': -0.01,
 'exposed_charge_all_sidechain_with_around_target_ser_thr': -0.009,
 'all_sasa_aliphatic': -0.009,
 'sasa_back': -0.009,
 'number_of_lys': 0.008,
 'exposed_charge_all_with_around_target_ser_thr': -0.008,
 'sasa_polar': -0.008,
 'sasa_phe': -0.008,
 'exposed_positive_charge_all_with_around_target_ser_thr': -0.007,
 'number_of_polar': -0.007,
 'number_of_charged': -0.007,
 'number_of_e': -0.007,
 'sasa_e': -0.007,
 'all_atom_negative_charge_all_backbone_around_target_ser_thr': 0.005,
 'number_of_l': -0.004,
 'sasa_f': 0.004,
 'all_sasa_met': -0.003,
 'sasa_met': -0.003,
 '

## both feature sets (original + augmented features)

In [11]:
# ElasticNet screen on the combined predictor list (rows from the augmented table).
data, x, y = augmented_data, both_x, both_y
alpha = 0.0001

model = elasticnet(data, x=x, y=y, alpha=alpha)
features = elastic_coeff(model, x)

the number of important features: 98 out of 236 



{'nS/nT_15': 0.596,
 'sasa_ala': 0.049,
 'all_sasa_lys': -0.038,
 'sasa_l': 0.029,
 'nS/nT_11': 0.027,
 'number_of_g': -0.026,
 'sasa_ser': 0.024,
 'flexibility': -0.024,
 'side_-1_pro': -0.024,
 'number_of_asn': -0.019,
 'nS/nT_8': 0.018,
 'number_of_ser': 0.017,
 'sasa_lys': -0.017,
 'sasa_val': -0.016,
 'side_2_gly': 0.015,
 'side_4_gly': 0.014,
 'nAli_0': -0.012,
 'net_charge_all_around_target_ser_thr': 0.011,
 'number_of_negative': -0.011,
 'Proline': -0.011,
 'number_of_thr': -0.011,
 'sasa_cys': -0.011,
 'nAli_3': 0.011,
 'nS/nT_5': -0.009,
 'nS/nT_1': -0.009,
 'number_of_arg': -0.009,
 'side_-1_very_small': 0.009,
 'exposed_charge_all_with_around_target_ser_thr': -0.008,
 'side_2_None': -0.008,
 'exposed_positive_charge_all_with_around_target_ser_thr': -0.008,
 'SS_E': -0.008,
 'sasa_negative': 0.008,
 'side_3_very_small': 0.008,
 'side_2_pro': -0.008,
 'number_of_leu': -0.008,
 'number_of_polar': -0.008,
 'number_of_l': -0.007,
 'sasa_back': -0.006,
 'side_5_normal': -0.006,
 

# ANOVA model

In [12]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

def anova(data, x=[], y='',
          disp=False, rnd=4):
    """Backward-eliminate predictors with repeated type-II ANOVA on an OLS fit.

    Each round fits ``y ~ f0 + f1 + ...`` on the remaining predictors, computes
    the type-II ANOVA table and keeps only the terms whose (rounded) p-value is
    <= 0.05. Rounds repeat until no remaining term is dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictor columns and the response column.
    x : list
        Predictor column names. They are aliased to ``f0, f1, ...`` internally
        so that names which are not valid formula identifiers can be used.
    y : str
        Response column name (aliased internally for the same reason).
    disp : bool
        If truthy, print the final ANOVA table in chunks of 10 rows.
    rnd : int
        Decimals the ANOVA table is rounded to *before* the 0.05 comparison.

    Returns
    -------
    list
        Names of the surviving predictors, ordered by ascending p-value.
        Empty when ``x`` is empty or when no predictor survives.
    """
    x = list(x)
    f_num = [f"f{num}" for num in range(len(x))]
    response = 'response'  # formula-safe alias for the response column

    # Explicit copy so renaming the columns never touches the caller's frame.
    data = data[x + [y]].copy()
    data.columns = f_num + [response]

    result = None
    n_not = len(f_num)  # number of terms dropped in the last round (start: all candidates)
    i = 1
    # `and f_num` stops the loop once every term has been eliminated; fitting
    # a formula with an empty right-hand side would fail.
    while n_not > 0 and f_num:
        print(f'{i}th iteration')
        fs = '+'.join(f_num)
        model = smf.ols(response + '~' + fs, data).fit()

        result = sm.stats.anova_lm(model, typ=2).round(rnd)
        # Look only at predictor rows; the residual row has no p-value.
        terms = result.drop(index='Residual', errors='ignore')
        # A NaN p-value compares False here, i.e. the term is treated as not
        # significant and triggers another round instead of lingering in the table.
        significant = (terms['PR(>F)'] <= 0.05).to_numpy()
        n_not = int((~significant).sum())
        f_num = list(terms.index[significant])

        i += 1

    print(f'the number of important features: {len(f_num)} out of {len(x)}')

    ranked = []
    if result is not None:
        result = result.sort_values(by='PR(>F)')
        if disp:
            n = 10
            for start in range(0, len(result), n):
                print(result.iloc[start:start+n, :])
        # Keep the sorted order but only the surviving terms; this also drops
        # the residual row by name rather than by position.
        kept = set(f_num)
        ranked = [term for term in result.index if term in kept]

    # Map the aliases f<k> back to positions in `x`, then to the real names.
    x_idx = [int(term.removeprefix('f')) for term in ranked]
    x_names = [x[idx] for idx in x_idx]

    print('\norder variables by their importance')
    display(dict(zip(x_idx, x_names)))

    return x_names

## original dataset

In [13]:
# ANOVA-based elimination on the original feature table.
data, x, y = original_data, original_x, original_y

features = anova(data, x=x, y=y)

1th iteration
2th iteration
the number of important features: 1 out of 87

order variables by their importance


{1: 'flexibility'}

## augmented dataset

In [14]:
# ANOVA-based elimination on the augmented predictor block.
data, x, y = augmented_data, augmented_x, augmented_y

features = anova(data, x=x, y=y)

1th iteration
2th iteration
3th iteration
the number of important features: 41 out of 156

order variables by their importance


{35: 'number_of_val',
 28: 'number_of_leu',
 17: 'number_of_d',
 11: 'number_of_s',
 27: 'number_of_lys',
 22: 'number_of_asp',
 19: 'number_of_f',
 18: 'number_of_e',
 12: 'number_of_n',
 8: 'number_of_negative',
 3: 'number_of_polar',
 6: 'number_of_charged',
 24: 'number_of_phe',
 124: 'net_charge_all_sidechain_around_target_ser_thr',
 126: 'net_charge_all_sidechain_with_around_target_ser_thr',
 123: 'net_charge_all_backbone_around_target_ser_thr',
 122: 'net_charge_all_with_around_target_ser_thr',
 121: 'net_charge_all_around_target_ser_thr',
 120: 'sasa_all_with_around_target_ser_thr',
 127: 'all_atom_positive_charge_all_around_target_ser_thr',
 128: 'all_atom_positive_charge_all_with_around_target_ser_thr',
 129: 'all_atom_positive_charge_all_backbone_around_target_ser_thr',
 125: 'net_charge_all_backbone_with_around_target_ser_thr',
 131: 'all_atom_positive_charge_all_backbone_with_around_target_ser_thr',
 130: 'all_atom_positive_charge_all_sidechain_around_target_ser_thr',
 2: 

## both feature sets (original + augmented features)

In [15]:
# ANOVA-based elimination on the combined predictor list.
data, x, y = augmented_data, both_x, both_y

features = anova(data, x=x, y=y)

1th iteration
2th iteration
3th iteration
4th iteration
5th iteration
the number of important features: 48 out of 236

order variables by their importance


{18: 'nS/nT_15',
 39: 'side_-1_very_small',
 38: 'side_-1_small',
 2: 'SEQ_S',
 115: 'number_of_val',
 3: 'SEQ_T',
 113: 'number_of_ser',
 88: 'number_of_negative',
 100: 'number_of_ala',
 95: 'number_of_A',
 108: 'number_of_leu',
 91: 'number_of_s',
 98: 'number_of_e',
 107: 'number_of_lys',
 111: 'number_of_gln',
 86: 'number_of_charged',
 200: 'sasa_all_with_around_target_ser_thr',
 206: 'net_charge_all_sidechain_with_around_target_ser_thr',
 208: 'all_atom_positive_charge_all_with_around_target_ser_thr',
 209: 'all_atom_positive_charge_all_backbone_around_target_ser_thr',
 207: 'all_atom_positive_charge_all_around_target_ser_thr',
 202: 'net_charge_all_with_around_target_ser_thr',
 203: 'net_charge_all_backbone_around_target_ser_thr',
 204: 'net_charge_all_sidechain_around_target_ser_thr',
 201: 'net_charge_all_around_target_ser_thr',
 205: 'net_charge_all_backbone_with_around_target_ser_thr',
 211: 'all_atom_positive_charge_all_backbone_with_around_target_ser_thr',
 50: 'side_2_gl

# Random Forest model

In [16]:
from sklearn.ensemble import RandomForestClassifier

def randomforest(data, x=[], y='',
                 n_estimators=500, rs=1):
    """Fit a RandomForestClassifier on ``data[x]`` against ``data[y]``.

    ``n_estimators`` is the number of trees and ``rs`` the random seed handed
    to the classifier; all available cores are used (``n_jobs=-1``).
    Returns the fitted classifier.
    """
    predictors, target = data[x], data[y]
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=rs,
        n_jobs=-1,
    )
    forest.fit(predictors, target)
    return forest

def FI(model, threshold= 0.95, rnd=3, disp=False, features=None):
    """Select features by cumulative impurity importance of a fitted forest.

    Features are sorted by decreasing importance. For each one, ``sum`` is the
    importance mass still *remaining* after it (1 minus the running total).
    A feature is kept while that rounded remainder is >= ``1 - threshold``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``feature_importances_``.
    threshold : float
        Cumulative-importance cut-off described above.
    rnd : int
        Decimals the table is rounded to before the cut-off is applied.
    disp : bool or int
        Falsy: show no table. ``True``: show the first 5 rows. An integer:
        show that many rows.
    features : sequence, optional
        Feature names in the order the model was fitted with. When omitted,
        ``model.feature_names_in_`` is used if the model has it; otherwise the
        notebook-level ``x`` is used, as this function originally did.

    Returns
    -------
    list
        Names of the kept features, most important first.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importances.
    """
    if features is None:
        # Prefer names recorded on the model over notebook state, which may
        # have been rebound by another cell since the model was fitted.
        features = getattr(model, 'feature_names_in_', None)
    if features is None:
        features = x  # legacy fallback: notebook-level global
    features = list(features)

    scores = np.asarray(model.feature_importances_)
    if len(features) != len(scores):
        # zip() would silently truncate and attach wrong labels.
        raise ValueError(
            f'got {len(features)} feature names for {len(scores)} importances'
        )

    df = pd.DataFrame({'features': features, 'score': scores})
    df = df.sort_values(by='score', ascending=False)

    # Running remainder 1 - s1 - s2 - ..., computed left to right.
    remaining = np.subtract.accumulate(
        np.concatenate(([1.0], df['score'].to_numpy()))
    )[1:]
    df['sum'] = remaining
    df = df.round(rnd)

    if disp:
        # head(True) is rejected by pandas, so a plain True means "default size".
        n_rows = 5 if disp is True else disp
        display(df.head(n_rows))

    df_important = df[df['sum']>= 1-threshold]

    print(f'the number of important features (threshold={threshold}): {len(df_important)} out of {len(df)}')
    display(dict(
        zip(df_important.index, zip(df_important['features'].values, df_important['score'].values, df_important['sum'].values))
    ))

    return list(df_important['features'])

## original dataset

In [17]:
# Random-forest importance screen on the original feature table.
# `x` must stay bound to the list the forest is fitted on: FI can read the
# notebook-level `x` to label the importances.
data, x, y = original_data, original_x, original_y

model = randomforest(data, x=x, y=y)
features = FI(model)

the number of important features (threshold=0.95): 62 out of 87


{1: ('flexibility', 0.155, 0.845),
 69: ('side_3_small', 0.022, 0.823),
 85: ('side_5_small', 0.021, 0.802),
 61: ('side_2_small', 0.021, 0.781),
 33: ('phi_psi_alpha', 0.02, 0.76),
 53: ('side_1_small', 0.02, 0.74),
 77: ('side_4_small', 0.02, 0.721),
 67: ('side_3_normal', 0.019, 0.702),
 34: ('phi_psi_beta', 0.019, 0.684),
 26: ('nAli_1', 0.018, 0.666),
 3: ('SEQ_T', 0.017, 0.648),
 70: ('side_3_very_small', 0.017, 0.631),
 2: ('SEQ_S', 0.017, 0.614),
 27: ('nAli_2', 0.016, 0.598),
 62: ('side_2_very_small', 0.016, 0.582),
 30: ('nPos_1', 0.016, 0.566),
 8: ('nS/nT_5', 0.016, 0.55),
 86: ('side_5_very_small', 0.016, 0.534),
 45: ('side_-1_small', 0.016, 0.518),
 29: ('nPos_0', 0.016, 0.502),
 35: ('phi_psi_other', 0.015, 0.487),
 7: ('nS/nT_4', 0.015, 0.472),
 51: ('side_1_normal', 0.015, 0.456),
 54: ('side_1_very_small', 0.015, 0.442),
 9: ('nS/nT_6', 0.014, 0.428),
 10: ('nS/nT_7', 0.014, 0.414),
 78: ('side_4_very_small', 0.014, 0.401),
 59: ('side_2_normal', 0.013, 0.387),
 50:

## augmented dataset

In [18]:
# Random-forest importance screen on the augmented predictor block.
# `x` must stay bound to the list the forest is fitted on: FI can read the
# notebook-level `x` to label the importances.
data, x, y = augmented_data, augmented_x, augmented_y

model = randomforest(data, x=x, y=y)
features = FI(model)

the number of important features (threshold=0.95): 129 out of 156


{107: ('all_sasa_ser', 0.014, 0.986),
 83: ('all_sasa_g', 0.013, 0.974),
 88: ('all_sasa_p', 0.012, 0.961),
 94: ('all_sasa_ala', 0.012, 0.949),
 70: ('sasa_ser', 0.011, 0.938),
 51: ('sasa_p', 0.011, 0.927),
 82: ('all_sasa_negative', 0.011, 0.916),
 108: ('all_sasa_thr', 0.011, 0.905),
 106: ('all_sasa_arg', 0.011, 0.893),
 78: ('all_sasa_aromatic', 0.01, 0.883),
 93: ('all_sasa_f', 0.01, 0.873),
 69: ('sasa_arg', 0.01, 0.863),
 76: ('all_sasa_hydrophilic', 0.01, 0.852),
 105: ('all_sasa_gln', 0.01, 0.842),
 99: ('all_sasa_his', 0.01, 0.832),
 39: ('sasa_hydrophilic', 0.01, 0.822),
 121: ('net_charge_all_around_target_ser_thr', 0.009, 0.813),
 46: ('sasa_g', 0.009, 0.804),
 117: ('all_sasa_side_with_whole_target', 0.009, 0.794),
 104: ('all_sasa_asn', 0.009, 0.785),
 96: ('all_sasa_asp', 0.009, 0.776),
 74: ('sasa_tyr', 0.009, 0.766),
 120: ('sasa_all_with_around_target_ser_thr', 0.009, 0.757),
 86: ('all_sasa_n', 0.009, 0.748),
 87: ('all_sasa_l', 0.009, 0.74),
 56: ('sasa_f', 0.009

## both feature sets (original + augmented features)

In [19]:
# Random-forest importance screen on the combined predictor list.
# `x` must stay bound to the list the forest is fitted on: FI can read the
# notebook-level `x` to label the importances.
data, x, y = augmented_data, both_x, both_y

model = randomforest(data, x=x, y=y)
features = FI(model)

the number of important features (threshold=0.95): 167 out of 236


{187: ('all_sasa_ser', 0.012, 0.988),
 168: ('all_sasa_p', 0.011, 0.977),
 163: ('all_sasa_g', 0.011, 0.965),
 131: ('sasa_p', 0.011, 0.955),
 174: ('all_sasa_ala', 0.01, 0.944),
 186: ('all_sasa_arg', 0.01, 0.934),
 150: ('sasa_ser', 0.01, 0.924),
 162: ('all_sasa_negative', 0.01, 0.914),
 185: ('all_sasa_gln', 0.01, 0.905),
 173: ('all_sasa_f', 0.01, 0.895),
 126: ('sasa_g', 0.009, 0.886),
 1: ('flexibility', 0.009, 0.877),
 156: ('all_sasa_hydrophilic', 0.009, 0.868),
 119: ('sasa_hydrophilic', 0.009, 0.859),
 188: ('all_sasa_thr', 0.009, 0.85),
 155: ('all_sasa_hydrophobic', 0.009, 0.841),
 149: ('sasa_arg', 0.009, 0.832),
 179: ('all_sasa_his', 0.009, 0.823),
 184: ('all_sasa_asn', 0.009, 0.815),
 197: ('all_sasa_side_with_whole_target', 0.008, 0.806),
 176: ('all_sasa_asp', 0.008, 0.798),
 158: ('all_sasa_aromatic', 0.008, 0.789),
 166: ('all_sasa_n', 0.008, 0.781),
 171: ('all_sasa_d', 0.008, 0.773),
 169: ('all_sasa_A', 0.008, 0.764),
 154: ('sasa_tyr', 0.008, 0.756),
 160: ('a

# Result of finding important variables

## original dataset

In [20]:
# Run the three selectors on the original feature table and keep each
# method's selected-feature list for the summary table in the next cell.
data, x, y = original_data, original_x, original_y

# ElasticNet
alpha = 0.0001
model = elasticnet(data, x=x, y=y, alpha=alpha)
EN_features = elastic_coeff(model, x)

# ANOVA
AN_features = anova(data, x=x, y=y)

# RandomForest
threshold = 0.95
model = randomforest(data, x=x, y=y)
RF_features = FI(model, threshold=threshold)

the number of important features: 48 out of 87 



{'nS/nT_10': 0.017,
 'flexibility': -0.012,
 'nS/nT_11': 0.01,
 'side_2_gly': 0.008,
 'side_-1_pro': -0.007,
 'nAli_0': -0.007,
 'nAli_3': 0.007,
 'side_2_very_small': 0.007,
 'side_-1_very_small': 0.007,
 'Proline': -0.005,
 'side_4_gly': 0.005,
 'side_3_long': -0.005,
 'side_-1_small': 0.005,
 'nPos_0': -0.004,
 'SEQ_S': -0.004,
 'side_4_small': 0.004,
 'side_3_cycle': -0.004,
 'side_5_normal': -0.004,
 'SS_C': 0.004,
 'side_5_very_small': 0.003,
 'nS/nT_8': 0.003,
 'side_3_very_small': 0.003,
 'nAli_1': -0.003,
 'nS/nT_12': 0.003,
 'nS/nT_2': -0.003,
 'side_1_pro': -0.003,
 'nS/nT_7': 0.003,
 'side_1_small': 0.002,
 'nS/nT_3': -0.002,
 'side_5_cycle': -0.002,
 'side_4_long': 0.002,
 'side_4_normal': -0.002,
 'side_-1_normal': -0.002,
 'nPos_1': 0.002,
 'side_3_gly': 0.002,
 'side_2_normal': -0.002,
 'side_1_long': 0.001,
 'nS/nT_1': -0.001,
 'nS/nT_4': -0.001,
 'side_2_long': -0.001,
 'phi_psi_beta': -0.001,
 'side_2_cycle': -0.001,
 'side_2_small': 0.001,
 'side_1_very_small': 0.00

1th iteration
2th iteration
the number of important features: 1 out of 87

order variables by their importance


{1: 'flexibility'}

the number of important features (threshold=0.95): 62 out of 87


{1: ('flexibility', 0.155, 0.845),
 69: ('side_3_small', 0.022, 0.823),
 85: ('side_5_small', 0.021, 0.802),
 61: ('side_2_small', 0.021, 0.781),
 33: ('phi_psi_alpha', 0.02, 0.76),
 53: ('side_1_small', 0.02, 0.74),
 77: ('side_4_small', 0.02, 0.721),
 67: ('side_3_normal', 0.019, 0.702),
 34: ('phi_psi_beta', 0.019, 0.684),
 26: ('nAli_1', 0.018, 0.666),
 3: ('SEQ_T', 0.017, 0.648),
 70: ('side_3_very_small', 0.017, 0.631),
 2: ('SEQ_S', 0.017, 0.614),
 27: ('nAli_2', 0.016, 0.598),
 62: ('side_2_very_small', 0.016, 0.582),
 30: ('nPos_1', 0.016, 0.566),
 8: ('nS/nT_5', 0.016, 0.55),
 86: ('side_5_very_small', 0.016, 0.534),
 45: ('side_-1_small', 0.016, 0.518),
 29: ('nPos_0', 0.016, 0.502),
 35: ('phi_psi_other', 0.015, 0.487),
 7: ('nS/nT_4', 0.015, 0.472),
 51: ('side_1_normal', 0.015, 0.456),
 54: ('side_1_very_small', 0.015, 0.442),
 9: ('nS/nT_6', 0.014, 0.428),
 10: ('nS/nT_7', 0.014, 0.414),
 78: ('side_4_very_small', 0.014, 0.401),
 59: ('side_2_normal', 0.013, 0.387),
 50:

In [21]:
# One row per candidate feature, one 0/1 column per selection method.
methods = {
    'ElasticNet': EN_features,
    'ANOVA': AN_features,
    'RandomForest': RF_features,
}
importance = pd.DataFrame(0.0, index=x, columns=list(methods))
for method, selected in methods.items():
    for feature in selected:
        importance.loc[feature, method] = 1

# Score = number of methods that selected the feature.
importance['Score'] = importance.sum(axis=1)

# The table is written only when the file does not exist yet; an existing
# file is left untouched.
save_path = './result/important_variables_original_cts_cat.csv'
if not exists(save_path):
    importance.to_csv(save_path)

importance_original = importance
importance_original

Unnamed: 0,ElasticNet,ANOVA,RandomForest,Score
Proline,1.0,0.0,0.0,1.0
flexibility,1.0,1.0,1.0,3.0
SEQ_S,1.0,0.0,1.0,2.0
SEQ_T,0.0,0.0,1.0,1.0
nS/nT_1,1.0,0.0,0.0,1.0
...,...,...,...,...
side_5_long,0.0,0.0,1.0,1.0
side_5_normal,1.0,0.0,1.0,2.0
side_5_pro,0.0,0.0,1.0,1.0
side_5_small,0.0,0.0,1.0,1.0


## augmented dataset

In [22]:
# Run the three selectors on the augmented predictor block and keep each
# method's selected-feature list for the summary table in the next cell.
data, x, y = augmented_data, augmented_x, augmented_y

# ElasticNet
alpha = 0.0001
model = elasticnet(data, x=x, y=y, alpha=alpha)
EN_features = elastic_coeff(model, x)

# ANOVA
AN_features = anova(data, x=x, y=y)

# RandomForest
threshold = 0.95
model = randomforest(data, x=x, y=y)
RF_features = FI(model, threshold=threshold)

the number of important features: 38 out of 156 



{'sasa_ala': 0.047,
 'sasa_g': 0.034,
 'all_sasa_lys': -0.032,
 'number_of_asn': -0.025,
 'number_of_g': -0.023,
 'sasa_ser': 0.022,
 'sasa_cys': -0.021,
 'sasa_p': 0.02,
 'number_of_ser': 0.018,
 'number_of_thr': -0.017,
 'sasa_lys': -0.016,
 'sasa_val': -0.016,
 'number_of_leu': -0.014,
 'sasa_negative': 0.013,
 'net_charge_all_around_target_ser_thr': 0.012,
 'sasa_ile': -0.011,
 'sasa_s': -0.011,
 'sasa_leu': -0.01,
 'exposed_charge_all_sidechain_with_around_target_ser_thr': -0.009,
 'all_sasa_aliphatic': -0.009,
 'sasa_back': -0.009,
 'number_of_lys': 0.008,
 'exposed_charge_all_with_around_target_ser_thr': -0.008,
 'sasa_polar': -0.008,
 'sasa_phe': -0.008,
 'exposed_positive_charge_all_with_around_target_ser_thr': -0.007,
 'number_of_polar': -0.007,
 'number_of_charged': -0.007,
 'number_of_e': -0.007,
 'sasa_e': -0.007,
 'all_atom_negative_charge_all_backbone_around_target_ser_thr': 0.005,
 'number_of_l': -0.004,
 'sasa_f': 0.004,
 'all_sasa_met': -0.003,
 'sasa_met': -0.003,
 '

1th iteration
2th iteration
3th iteration
the number of important features: 41 out of 156

order variables by their importance


{35: 'number_of_val',
 28: 'number_of_leu',
 17: 'number_of_d',
 11: 'number_of_s',
 27: 'number_of_lys',
 22: 'number_of_asp',
 19: 'number_of_f',
 18: 'number_of_e',
 12: 'number_of_n',
 8: 'number_of_negative',
 3: 'number_of_polar',
 6: 'number_of_charged',
 24: 'number_of_phe',
 124: 'net_charge_all_sidechain_around_target_ser_thr',
 126: 'net_charge_all_sidechain_with_around_target_ser_thr',
 123: 'net_charge_all_backbone_around_target_ser_thr',
 122: 'net_charge_all_with_around_target_ser_thr',
 121: 'net_charge_all_around_target_ser_thr',
 120: 'sasa_all_with_around_target_ser_thr',
 127: 'all_atom_positive_charge_all_around_target_ser_thr',
 128: 'all_atom_positive_charge_all_with_around_target_ser_thr',
 129: 'all_atom_positive_charge_all_backbone_around_target_ser_thr',
 125: 'net_charge_all_backbone_with_around_target_ser_thr',
 131: 'all_atom_positive_charge_all_backbone_with_around_target_ser_thr',
 130: 'all_atom_positive_charge_all_sidechain_around_target_ser_thr',
 2: 

the number of important features (threshold=0.95): 129 out of 156


{107: ('all_sasa_ser', 0.014, 0.986),
 83: ('all_sasa_g', 0.013, 0.974),
 88: ('all_sasa_p', 0.012, 0.961),
 94: ('all_sasa_ala', 0.012, 0.949),
 70: ('sasa_ser', 0.011, 0.938),
 51: ('sasa_p', 0.011, 0.927),
 82: ('all_sasa_negative', 0.011, 0.916),
 108: ('all_sasa_thr', 0.011, 0.905),
 106: ('all_sasa_arg', 0.011, 0.893),
 78: ('all_sasa_aromatic', 0.01, 0.883),
 93: ('all_sasa_f', 0.01, 0.873),
 69: ('sasa_arg', 0.01, 0.863),
 76: ('all_sasa_hydrophilic', 0.01, 0.852),
 105: ('all_sasa_gln', 0.01, 0.842),
 99: ('all_sasa_his', 0.01, 0.832),
 39: ('sasa_hydrophilic', 0.01, 0.822),
 121: ('net_charge_all_around_target_ser_thr', 0.009, 0.813),
 46: ('sasa_g', 0.009, 0.804),
 117: ('all_sasa_side_with_whole_target', 0.009, 0.794),
 104: ('all_sasa_asn', 0.009, 0.785),
 96: ('all_sasa_asp', 0.009, 0.776),
 74: ('sasa_tyr', 0.009, 0.766),
 120: ('sasa_all_with_around_target_ser_thr', 0.009, 0.757),
 86: ('all_sasa_n', 0.009, 0.748),
 87: ('all_sasa_l', 0.009, 0.74),
 56: ('sasa_f', 0.009

In [23]:
# One row per candidate feature, one 0/1 column per selection method.
methods = {
    'ElasticNet': EN_features,
    'ANOVA': AN_features,
    'RandomForest': RF_features,
}
importance = pd.DataFrame(0.0, index=x, columns=list(methods))
for method, selected in methods.items():
    for feature in selected:
        importance.loc[feature, method] = 1

# Score = number of methods that selected the feature.
importance['Score'] = importance.sum(axis=1)

# The table is written only when the file does not exist yet; an existing
# file is left untouched.
save_path = './result/important_variables_augmented_cts_cat.csv'
if not exists(save_path):
    importance.to_csv(save_path)

importance_augmented = importance
importance_augmented

Unnamed: 0,ElasticNet,ANOVA,RandomForest,Score
residue_SER_THR,0.0,1.0,1.0,2.0
number_of_hydrophobic,0.0,0.0,1.0,1.0
number_of_hydrophilic,0.0,1.0,1.0,2.0
number_of_polar,1.0,1.0,0.0,2.0
number_of_aromatic,0.0,1.0,1.0,2.0
...,...,...,...,...
exposed_negative_charge_all_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_with_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_backbone_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_sidechain_around_target_ser_thr,0.0,0.0,1.0,1.0


## both feature sets (original + augmented features)

In [26]:
# Run the three selectors on the combined predictor list and keep each
# method's selected-feature list for the summary table in the next cell.
data, x, y = augmented_data, both_x, both_y

# ElasticNet
alpha = 0.0001
model = elasticnet(data, x=x, y=y, alpha=alpha)
EN_features = elastic_coeff(model, x)

# ANOVA
AN_features = anova(data, x=x, y=y)

# RandomForest
threshold = 0.95
model = randomforest(data, x=x, y=y)
RF_features = FI(model, threshold=threshold)

the number of important features: 98 out of 236 



{'nS/nT_15': 0.596,
 'sasa_ala': 0.049,
 'all_sasa_lys': -0.038,
 'sasa_l': 0.029,
 'nS/nT_11': 0.027,
 'number_of_g': -0.026,
 'sasa_ser': 0.024,
 'flexibility': -0.024,
 'side_-1_pro': -0.024,
 'number_of_asn': -0.019,
 'nS/nT_8': 0.018,
 'number_of_ser': 0.017,
 'sasa_lys': -0.017,
 'sasa_val': -0.016,
 'side_2_gly': 0.015,
 'side_4_gly': 0.014,
 'nAli_0': -0.012,
 'net_charge_all_around_target_ser_thr': 0.011,
 'number_of_negative': -0.011,
 'Proline': -0.011,
 'number_of_thr': -0.011,
 'sasa_cys': -0.011,
 'nAli_3': 0.011,
 'nS/nT_5': -0.009,
 'nS/nT_1': -0.009,
 'number_of_arg': -0.009,
 'side_-1_very_small': 0.009,
 'exposed_charge_all_with_around_target_ser_thr': -0.008,
 'side_2_None': -0.008,
 'exposed_positive_charge_all_with_around_target_ser_thr': -0.008,
 'SS_E': -0.008,
 'sasa_negative': 0.008,
 'side_3_very_small': 0.008,
 'side_2_pro': -0.008,
 'number_of_leu': -0.008,
 'number_of_polar': -0.008,
 'number_of_l': -0.007,
 'sasa_back': -0.006,
 'side_5_normal': -0.006,
 

1th iteration
2th iteration
3th iteration
4th iteration
5th iteration
the number of important features: 48 out of 236

order variables by their importance


{18: 'nS/nT_15',
 39: 'side_-1_very_small',
 38: 'side_-1_small',
 2: 'SEQ_S',
 115: 'number_of_val',
 3: 'SEQ_T',
 113: 'number_of_ser',
 88: 'number_of_negative',
 100: 'number_of_ala',
 95: 'number_of_A',
 108: 'number_of_leu',
 91: 'number_of_s',
 98: 'number_of_e',
 107: 'number_of_lys',
 111: 'number_of_gln',
 86: 'number_of_charged',
 200: 'sasa_all_with_around_target_ser_thr',
 206: 'net_charge_all_sidechain_with_around_target_ser_thr',
 208: 'all_atom_positive_charge_all_with_around_target_ser_thr',
 209: 'all_atom_positive_charge_all_backbone_around_target_ser_thr',
 207: 'all_atom_positive_charge_all_around_target_ser_thr',
 202: 'net_charge_all_with_around_target_ser_thr',
 203: 'net_charge_all_backbone_around_target_ser_thr',
 204: 'net_charge_all_sidechain_around_target_ser_thr',
 201: 'net_charge_all_around_target_ser_thr',
 205: 'net_charge_all_backbone_with_around_target_ser_thr',
 211: 'all_atom_positive_charge_all_backbone_with_around_target_ser_thr',
 50: 'side_2_gl

the number of important features (threshold=0.95): 167 out of 236


{187: ('all_sasa_ser', 0.012, 0.988),
 168: ('all_sasa_p', 0.011, 0.977),
 163: ('all_sasa_g', 0.011, 0.965),
 131: ('sasa_p', 0.011, 0.955),
 174: ('all_sasa_ala', 0.01, 0.944),
 186: ('all_sasa_arg', 0.01, 0.934),
 150: ('sasa_ser', 0.01, 0.924),
 162: ('all_sasa_negative', 0.01, 0.914),
 185: ('all_sasa_gln', 0.01, 0.905),
 173: ('all_sasa_f', 0.01, 0.895),
 126: ('sasa_g', 0.009, 0.886),
 1: ('flexibility', 0.009, 0.877),
 156: ('all_sasa_hydrophilic', 0.009, 0.868),
 119: ('sasa_hydrophilic', 0.009, 0.859),
 188: ('all_sasa_thr', 0.009, 0.85),
 155: ('all_sasa_hydrophobic', 0.009, 0.841),
 149: ('sasa_arg', 0.009, 0.832),
 179: ('all_sasa_his', 0.009, 0.823),
 184: ('all_sasa_asn', 0.009, 0.815),
 197: ('all_sasa_side_with_whole_target', 0.008, 0.806),
 176: ('all_sasa_asp', 0.008, 0.798),
 158: ('all_sasa_aromatic', 0.008, 0.789),
 166: ('all_sasa_n', 0.008, 0.781),
 171: ('all_sasa_d', 0.008, 0.773),
 169: ('all_sasa_A', 0.008, 0.764),
 154: ('sasa_tyr', 0.008, 0.756),
 160: ('a

In [25]:
# One row per candidate feature, one 0/1 column per selection method.
methods = {
    'ElasticNet': EN_features,
    'ANOVA': AN_features,
    'RandomForest': RF_features,
}
importance = pd.DataFrame(0.0, index=x, columns=list(methods))
for method, selected in methods.items():
    for feature in selected:
        importance.loc[feature, method] = 1

# Score = number of methods that selected the feature.
importance['Score'] = importance.sum(axis=1)

# The table is written only when the file does not exist yet; an existing
# file is left untouched.
save_path = './result/important_variables_both_cts_cat.csv'
if not exists(save_path):
    importance.to_csv(save_path)

importance_both = importance
importance_both

Unnamed: 0,ElasticNet,ANOVA,RandomForest,Score
Proline,1.0,0.0,0.0,1.0
flexibility,1.0,0.0,1.0,2.0
SEQ_S,1.0,1.0,0.0,2.0
SEQ_T,0.0,1.0,0.0,1.0
nS/nT_1,1.0,0.0,0.0,1.0
...,...,...,...,...
exposed_negative_charge_all_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_with_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_backbone_around_target_ser_thr,0.0,0.0,1.0,1.0
exposed_negative_charge_all_sidechain_around_target_ser_thr,0.0,0.0,1.0,1.0
