# Feature selection: ElasticNet, ANOVA and Random Forest

In [1]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from os.path import exists
# NOTE(review): wildcard imports hide where helpers come from. `xy_variables` and
# `df_to_dummy` (used in later cells) are presumably provided by these two modules —
# confirm and import them explicitly.
from src.Data import *
from src.Preprocessing import *
# Holder of feature-name lists; later cells read `.x_cts_window`, `.x_cat_window`
# and `.y_label` from it.
variables = xy_variables()

## original dataset

In [3]:
# Read the raw table and keep only rows whose `SEQ` value is 'S' or 'T'.
load_path = "./data/data_for_ml.csv"
original_data = pd.read_csv(load_path)
original_data = original_data[original_data['SEQ'].isin(['S', 'T'])]

# Feature-name lists taken from `variables`.
x_cts, x_cat, y_label = (
    variables.x_cts_window,
    variables.x_cat_window,
    variables.y_label,
)

# `df_to_dummy` returns the predictor frame and the target frame (it presumably
# dummy-encodes the `x_cat` columns — see its definition); glue them back together
# so the target ends up as the last column.
data_x, data_y = df_to_dummy(original_data, x_cts, x_cat, y_label)
original_data = pd.concat([data_x, data_y], axis=1)

# Column position -> column name, shown for reference.
original_features = dict(enumerate(original_data.columns))

display(original_features)

dummy x shape: (41264, 87)
dummy y shape: (41264, 1)


{0: 'Proline',
 1: 'flexibility',
 2: 'SEQ_S',
 3: 'SEQ_T',
 4: 'nS/nT_1',
 5: 'nS/nT_2',
 6: 'nS/nT_3',
 7: 'nS/nT_4',
 8: 'nS/nT_5',
 9: 'nS/nT_6',
 10: 'nS/nT_7',
 11: 'nS/nT_8',
 12: 'nS/nT_9',
 13: 'nS/nT_10',
 14: 'nS/nT_11',
 15: 'nS/nT_12',
 16: 'nS/nT_13',
 17: 'nS/nT_14',
 18: 'nS/nT_15',
 19: 'nS/nT_16',
 20: 'nS/nT_17',
 21: 'nS/nT_18',
 22: 'nS/nT_19',
 23: 'nS/nT_20',
 24: 'nS/nT_21',
 25: 'nAli_0',
 26: 'nAli_1',
 27: 'nAli_2',
 28: 'nAli_3',
 29: 'nPos_0',
 30: 'nPos_1',
 31: 'nPos_2',
 32: 'nPos_3',
 33: 'phi_psi_alpha',
 34: 'phi_psi_beta',
 35: 'phi_psi_other',
 36: 'SS_C',
 37: 'SS_E',
 38: 'SS_H',
 39: 'side_-1_None',
 40: 'side_-1_cycle',
 41: 'side_-1_gly',
 42: 'side_-1_long',
 43: 'side_-1_normal',
 44: 'side_-1_pro',
 45: 'side_-1_small',
 46: 'side_-1_very_small',
 47: 'side_1_None',
 48: 'side_1_cycle',
 49: 'side_1_gly',
 50: 'side_1_long',
 51: 'side_1_normal',
 52: 'side_1_pro',
 53: 'side_1_small',
 54: 'side_1_very_small',
 55: 'side_2_None',
 56: 'side

In [4]:
# Every column but the last is a predictor; the last column is the target.
*original_x, original_y = original_data.columns

## augmented dataset

In [6]:
# Proteins named in the main table for which a per-protein augmented CSV exists.
load_name = "./data/data_for_ml.csv"
protein_list = list(pd.read_csv(load_name)['protein'].unique())
protein_augmented = []
for name in protein_list:
    if exists(f'./data/data_for_ml(augmented)/{name}.csv'):
        protein_augmented.append(name)

# Stack the per-protein files, then keep only rows whose `SEQ` value is 'S' or 'T'.
frames = []
for name in protein_augmented:
    load_path = f'./data/data_for_ml(augmented)/{name}.csv'
    frames.append(pd.read_csv(load_path))
augmented_data = pd.concat(frames, axis=0, ignore_index=True)
keep_rows = augmented_data['SEQ'].isin(['S', 'T'])
augmented_data = augmented_data[keep_rows].reset_index(drop=True)

# Extra feature names: the first and last entries of the `name` column are dropped,
# the first `n_cat_charge` remaining names are passed as categorical, the rest as continuous.
# NOTE(review): the recorded output contains dummy columns such as
# 'number_of_tyr_185.0639495849609', which suggests continuous features end up in the
# categorical group — confirm that 38 is the right split point.
x_charge = pd.read_csv('./data/from_Krishna/features-all-names.csv')['name'].to_list()[1:-1]
n_cat_charge = 38
x_cts = variables.x_cts_window + x_charge[n_cat_charge:]
x_cat = variables.x_cat_window + x_charge[:n_cat_charge]
y_label = variables.y_label

# Predictor and target frames come back separately; rejoin them with the target last.
data_x, data_y = df_to_dummy(augmented_data, x_cts, x_cat, y_label)
augmented_data = pd.concat([data_x, data_y], axis=1)

# Column position -> column name, shown for reference.
augmented_features = dict(enumerate(augmented_data.columns))
display(augmented_features)

dummy x shape: (8910, 9307)
dummy y shape: (8910, 1)


{0: 'Proline',
 1: 'flexibility',
 2: 'sasa_hydrophobic',
 3: 'sasa_hydrophilic',
 4: 'sasa_polar',
 5: 'sasa_aromatic',
 6: 'sasa_aliphatic',
 7: 'sasa_charged',
 8: 'sasa_positive',
 9: 'sasa_negative',
 10: 'sasa_g',
 11: 'sasa_v',
 12: 'sasa_s',
 13: 'sasa_n',
 14: 'sasa_l',
 15: 'sasa_p',
 16: 'sasa_A',
 17: 'sasa_b',
 18: 'sasa_d',
 19: 'sasa_e',
 20: 'sasa_f',
 21: 'sasa_ala',
 22: 'sasa_cys',
 23: 'sasa_asp',
 24: 'sasa_glu',
 25: 'sasa_phe',
 26: 'sasa_his',
 27: 'sasa_ile',
 28: 'sasa_lys',
 29: 'sasa_leu',
 30: 'sasa_met',
 31: 'sasa_asn',
 32: 'sasa_gln',
 33: 'sasa_arg',
 34: 'sasa_ser',
 35: 'sasa_thr',
 36: 'sasa_val',
 37: 'sasa_trp',
 38: 'sasa_tyr',
 39: 'all_sasa_hydrophobic',
 40: 'all_sasa_hydrophilic',
 41: 'all_sasa_polar',
 42: 'all_sasa_aromatic',
 43: 'all_sasa_aliphatic',
 44: 'all_sasa_charged',
 45: 'all_sasa_positive',
 46: 'all_sasa_negative',
 47: 'all_sasa_g',
 48: 'all_sasa_v',
 49: 'all_sasa_s',
 50: 'all_sasa_n',
 51: 'all_sasa_l',
 52: 'all_sasa_p',

In [7]:
# Augmented-only predictors, picked by column position: positions 2..119 and
# 198..second-to-last. Positional slices depend on the column order produced above.
augmented_x = [
    *augmented_data.columns[2:120],
    *augmented_data.columns[198:-1],
]
# The target is the last column.
augmented_y = augmented_data.columns[-1]

In [8]:
# Combined predictor list: the columns skipped by `augmented_x` (positions 0..1 and
# 120..197) come first, followed by every name in `augmented_x`.
both_x = [
    *augmented_data.columns[:2],
    *augmented_data.columns[120:198],
    *augmented_x,
]
both_y = 'positivity'

# ElasticNet model

In [9]:
from sklearn.linear_model import ElasticNet

def elasticnet(data, x=[], y='',
               alpha=0.001, l1_ratio=0.5, max_iter=1000):
    """Fit an ElasticNet regression on min-max scaled predictors.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the predictor columns and the target column.
    x : list of str
        Predictor column names (the default list is never mutated).
    y : str
        Target column name.
    alpha, l1_ratio : float
        Passed straight through to ``ElasticNet``.
    max_iter : int
        Passed straight through to ``ElasticNet``; raise it when the solver
        reports that it did not converge.

    Returns
    -------
    The fitted ``ElasticNet`` estimator.
    """
    data_x = data[list(x)]
    data_y = data[y]

    x_min = data_x.min(axis=0)
    x_range = data_x.max(axis=0) - x_min
    # A constant column has zero range; dividing by it would yield NaN (0/0) for the
    # whole column. Dividing by 1 instead leaves such a column at 0 after centring.
    x_range = x_range.where(x_range != 0, 1)
    data_x_sc = (data_x - x_min) / x_range

    elastic_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=max_iter)
    elastic_model.fit(data_x_sc, data_y)

    return elastic_model

def elastic_coeff(model, x, rnd=3, disp=False):
    """List the features whose rounded ElasticNet coefficient is non-zero.

    Parameters
    ----------
    model : object exposing ``coef_``
        Fitted linear model.
    x : sequence of str
        Feature names, in the same order as ``model.coef_``.
    rnd : int
        Number of decimals the coefficients are rounded to. Selection happens
        after rounding, so coefficients that round to zero are dropped.
    disp : int or bool
        When truthy, the top ``disp`` rows of the sorted table are displayed.

    Returns
    -------
    list
        Selected feature names, ordered by decreasing absolute coefficient.
    """
    coefficients = model.coef_
    rows = list(zip(x, coefficients, np.abs(coefficients)))
    table = pd.DataFrame(rows, columns=['features', 'coefficient', 'absolute'])
    table = table.sort_values(by='absolute', ascending=False).round(rnd)

    if disp:
        display(table.head(disp))

    # Keep only rows whose rounded magnitude is strictly positive.
    kept = table.loc[table['absolute'] > 0]
    print(f'the number of important features: {len(kept)} out of {len(x)} \n')
    display(dict(zip(kept['features'], kept['coefficient'])))

    return kept['features'].tolist()

## original dataset

In [10]:
# ElasticNet selection on the original feature set.
data, x, y = original_data, original_x, original_y
alpha = 0.0001

model = elasticnet(data, x=x, y=y, alpha=alpha)
features = elastic_coeff(model, x)

the number of important features: 48 out of 87 



{'nS/nT_10': 0.017,
 'flexibility': -0.012,
 'nS/nT_11': 0.01,
 'side_2_gly': 0.008,
 'side_-1_pro': -0.007,
 'nAli_0': -0.007,
 'nAli_3': 0.007,
 'side_2_very_small': 0.007,
 'side_-1_very_small': 0.007,
 'Proline': -0.005,
 'side_4_gly': 0.005,
 'side_3_long': -0.005,
 'side_-1_small': 0.005,
 'nPos_0': -0.004,
 'SEQ_S': -0.004,
 'side_4_small': 0.004,
 'side_3_cycle': -0.004,
 'side_5_normal': -0.004,
 'SS_C': 0.004,
 'side_5_very_small': 0.003,
 'nS/nT_8': 0.003,
 'side_3_very_small': 0.003,
 'nAli_1': -0.003,
 'nS/nT_12': 0.003,
 'nS/nT_2': -0.003,
 'side_1_pro': -0.003,
 'nS/nT_7': 0.003,
 'side_1_small': 0.002,
 'nS/nT_3': -0.002,
 'side_5_cycle': -0.002,
 'side_4_long': 0.002,
 'side_4_normal': -0.002,
 'side_-1_normal': -0.002,
 'nPos_1': 0.002,
 'side_3_gly': 0.002,
 'side_2_normal': -0.002,
 'side_1_long': 0.001,
 'nS/nT_1': -0.001,
 'nS/nT_4': -0.001,
 'side_2_long': -0.001,
 'phi_psi_beta': -0.001,
 'side_2_cycle': -0.001,
 'side_2_small': 0.001,
 'side_1_very_small': 0.00

## augmented dataset

In [11]:
# ElasticNet selection on the augmented-only feature set.
data, x, y = augmented_data, augmented_x, augmented_y
alpha = 0.0001

model = elasticnet(data, x=x, y=y, alpha=alpha)
features = elastic_coeff(model, x)

the number of important features: 407 out of 9227 



  model = cd_fast.enet_coordinate_descent(


{'number_of_tyr_185.0639495849609': 0.613,
 'number_of_tyr_336.95086669921875': 0.381,
 'number_of_tyr_280.422119140625': 0.38,
 'number_of_tyr_490.7499694824219': 0.38,
 'number_of_tyr_262.17724609375': 0.379,
 'number_of_tyr_793.8634033203125': 0.377,
 'number_of_tyr_567.920166015625': 0.376,
 'number_of_tyr_226.98684692382807': 0.375,
 'number_of_tyr_375.8166809082031': 0.374,
 'number_of_tyr_111.88921356201172': 0.373,
 'number_of_tyr_445.4957885742188': 0.373,
 'number_of_tyr_251.7066650390625': 0.373,
 'number_of_tyr_530.521484375': 0.373,
 'number_of_tyr_399.1080017089844': 0.373,
 'number_of_tyr_377.1577758789063': 0.372,
 'number_of_tyr_259.5963134765625': 0.372,
 'number_of_tyr_365.3014221191406': 0.372,
 'number_of_tyr_395.9352111816406': 0.372,
 'number_of_tyr_137.2529296875': 0.372,
 'number_of_tyr_205.7987060546875': 0.372,
 'number_of_tyr_361.7418212890625': 0.372,
 'number_of_tyr_158.60873413085938': 0.371,
 'number_of_tyr_535.701904296875': 0.371,
 'number_of_tyr_340.3

## both dataset

In [12]:
# ElasticNet selection on the combined (original + augmented) feature set.
data, x, y = augmented_data, both_x, both_y
alpha = 0.0001

model = elasticnet(data, x=x, y=y, alpha=alpha)
features = elastic_coeff(model, x)

the number of important features: 474 out of 9307 



  model = cd_fast.enet_coordinate_descent(


{'number_of_tyr_185.0639495849609': 0.621,
 'nS/nT_15': 0.602,
 'number_of_tyr_793.8634033203125': 0.384,
 'number_of_tyr_336.95086669921875': 0.383,
 'number_of_tyr_402.0872497558594': 0.382,
 'number_of_tyr_280.422119140625': 0.377,
 'number_of_tyr_567.920166015625': 0.377,
 'number_of_tyr_575.3804931640625': 0.377,
 'number_of_tyr_326.00634765625': 0.376,
 'number_of_tyr_262.17724609375': 0.376,
 'number_of_tyr_226.98684692382807': 0.376,
 'number_of_tyr_461.8367004394531': 0.375,
 'number_of_tyr_259.001220703125': 0.373,
 'number_of_tyr_183.03497314453125': 0.372,
 'number_of_tyr_490.7499694824219': 0.372,
 'number_of_tyr_375.8166809082031': 0.372,
 'number_of_tyr_535.701904296875': 0.372,
 'number_of_tyr_77.9828872680664': 0.371,
 'number_of_tyr_111.88921356201172': 0.37,
 'number_of_tyr_523.3176879882812': 0.37,
 'number_of_tyr_391.8186645507813': 0.37,
 'number_of_tyr_500.8352661132813': 0.369,
 'number_of_tyr_440.1490173339844': 0.368,
 'number_of_tyr_327.4897155761719': 0.367,

# ANOVA model

In [13]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

def anova(data, x=[], y='',
          disp=False, rnd=4):
    """Select features by repeated type-II ANOVA on an OLS fit.

    Each iteration fits ``y ~ f0 + f1 + ...`` on the currently retained terms,
    rounds the ANOVA table to ``rnd`` decimals and keeps only the terms whose
    ``PR(>F)`` is <= 0.05. Iteration stops once no retained term exceeds 0.05,
    or once nothing is left to fit.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the predictor columns and the target column.
    x : list of str
        Predictor column names (the default list is never mutated).
    y : str
        Target column name; it is used verbatim in the model formula.
    disp : bool
        When true, print the final ANOVA table ten rows at a time.
    rnd : int
        Decimals used when rounding the ANOVA table (this happens before the
        0.05 comparison).

    Returns
    -------
    list of str
        Names of the retained features, ordered by increasing p-value.
        Empty when ``x`` is empty or no feature is significant.
    """
    x = list(x)
    data = data[x + [y]]

    # Terms are referred to by positional aliases f0, f1, ... in the formula,
    # presumably because the real column names are not valid formula identifiers.
    f_num = [f"f{num}" for num in range(len(x))]
    data.columns = f_num + [y]

    result = None
    n_not = len(f_num)  # number of non-significant terms (starts at the number of features)
    i = 1
    # Also stop when no term is left: an empty right-hand side cannot be fitted.
    while n_not > 0 and f_num:
        print(f'{i}th iteration')
        fs = '+'.join(f_num)
        model = smf.ols(
            y + '~' + fs,
            data).fit()

        result = sm.stats.anova_lm(model, typ=2).round(rnd)
        n_not = len(result[result['PR(>F)'] > 0.05])
        f_num = [fn for fn in result[result['PR(>F)'] <= 0.05].index]

        i += 1

    # Nothing was fitted (empty `x`) or nothing survived the elimination.
    if result is None or not f_num:
        print(f'the number of important features: 0 out of {len(x)}')
        return []

    result = result.sort_values(by='PR(>F)')
    print(f'the number of important features: {len(f_num)} out of {len(x)}')

    if disp:
        n = 10
        for start in range(0, len(result), n):
            print(result.iloc[start:start + n, :])

    # Keep only the significant terms; this excludes the residual row (and any
    # term without a usable p-value) by name rather than by position.
    significant = set(f_num)
    x_idx = [int(term.removeprefix('f')) for term in result.index if term in significant]
    x_names = [x[idx] for idx in x_idx]

    print('\norder variables by their importance')
    display(dict(zip(x_idx, x_names)))

    return x_names

## original dataset

In [14]:
# ANOVA selection on the original feature set.
data, x, y = original_data, original_x, original_y

features = anova(data, x=x, y=y)

1th iteration
2th iteration
the number of important features: 1 out of 87

order variables by their importance


{1: 'flexibility'}

## augmented dataset

In [None]:
# ANOVA selection on the augmented-only feature set.
# NOTE(review): the recorded output stops after the first iteration and the cell has no
# execution count; with 9227 predictors for 8910 rows this fit presumably did not
# complete — confirm before relying on it.
data, x, y = augmented_data, augmented_x, augmented_y

features = anova(data, x=x, y=y)

1th iteration


## both dataset

In [None]:
# ANOVA selection on the combined (original + augmented) feature set.
data, x, y = augmented_data, both_x, both_y

features = anova(data, x=x, y=y)

# Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

def randomforest(data, x=[], y='',
                 n_estimators=500, rs=1):
    """Fit a random-forest classifier on the given columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the predictor columns and the target column.
    x : list of str
        Predictor column names.
    y : str
        Target column name.
    n_estimators : int
        Number of trees.
    rs : int
        Seed forwarded as ``random_state``.

    Returns
    -------
    The fitted ``RandomForestClassifier`` (built with ``n_jobs=-1``).
    """
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=rs,
        n_jobs=-1,
    )
    predictors, target = data[x], data[y]
    forest.fit(predictors, target)

    return forest

def FI(model, threshold= 0.95, rnd=3, disp=False, features=None):
    """Select the top features of a fitted model by cumulative importance.

    Features are sorted by decreasing ``feature_importances_``. For each one the
    'sum' column holds the importance mass still remaining after that feature
    has been included. A feature is kept while this remainder, rounded to
    ``rnd`` decimals, is at least ``1 - threshold``.

    Parameters
    ----------
    model : object exposing ``feature_importances_``
        Fitted model.
    threshold : float
        Cumulative-importance cut-off.
    rnd : int
        Decimals used for rounding (this happens before the threshold comparison).
    disp : int or bool
        When truthy, the top ``disp`` rows of the sorted table are displayed.
    features : sequence of str, optional
        Feature names in the order the model was fitted on. When omitted, the
        names are taken from ``model.feature_names_in_`` if present, otherwise
        from a module-level ``x`` (the previous implicit behaviour).

    Returns
    -------
    list
        Selected feature names, most important first.

    Raises
    ------
    ValueError
        If no feature names can be found or their count does not match the
        number of importances.
    """
    importances = model.feature_importances_

    # Resolve names explicitly instead of silently depending on a notebook global.
    if features is None:
        features = getattr(model, 'feature_names_in_', None)
    if features is None:
        features = globals().get('x')  # legacy fallback
    if features is None:
        raise ValueError('feature names are unknown: pass `features=`')
    # zip() would silently truncate and mislabel on a length mismatch.
    if len(features) != len(importances):
        raise ValueError(
            f'{len(features)} feature names for {len(importances)} importances'
        )

    df = pd.DataFrame(zip(features, importances), columns = ['features', 'score'])
    df = df.sort_values(by='score', ascending=False)

    # Running remainder: importance mass left after each successive feature.
    score_sum = []
    s = 1
    for score in df.score:
        s -= score
        score_sum.append(s)
    df['sum'] = np.array(score_sum)
    df = df.round(rnd)

    if disp:
        display(df.head(disp))

    df_important = df[df['sum']>= 1-threshold]

    print(f'the number of important features (threshold={threshold}): {len(df_important)} out of {len(df)}')
    display(dict(
        zip(df_important.index, zip(df_important['features'].values, df_important['score'].values, df_important['sum'].values))
    ))

    return list(df_important['features'])

## original dataset

In [None]:
# Random-forest selection on the original feature set.
# Keep `x` bound to the fitted feature list until FI has run.
data, x, y = original_data, original_x, original_y

model = randomforest(data, x=x, y=y)
features = FI(model)

## augmented dataset

In [None]:
# Random-forest selection on the augmented-only feature set.
# Keep `x` bound to the fitted feature list until FI has run.
data, x, y = augmented_data, augmented_x, augmented_y

model = randomforest(data, x=x, y=y)
features = FI(model)

## both dataset

In [None]:
# Random-forest selection on the combined (original + augmented) feature set.
# Keep `x` bound to the fitted feature list until FI has run.
data, x, y = augmented_data, both_x, both_y

model = randomforest(data, x=x, y=y)
features = FI(model)

# Result of finding important variables

## original dataset

In [None]:
# Run all three selectors on the original feature set.
data, x, y = original_data, original_x, original_y

# ElasticNet
alpha = 0.0001
EN_features = elastic_coeff(elasticnet(data, x, y, alpha), x)

# ANOVA
AN_features = anova(data, x, y)

# RandomForest
threshold = 0.95
model = randomforest(data, x, y)
RF_features = FI(model, threshold=threshold)

In [None]:
# One row per candidate feature, one 0/1 column per selection method.
selected_by = {
    'ElasticNet': EN_features,
    'ANOVA': AN_features,
    'RandomForest': RF_features,
}
importance = pd.DataFrame(
    {method: pd.Index(x).isin(chosen).astype(float) for method, chosen in selected_by.items()},
    index=x,
)

# Score = number of methods that selected the feature.
importance['Score'] = importance.sum(axis=1)

# The CSV is written only when it does not exist yet; an existing file is left untouched.
save_path = './result/important_variables_original_cts_cat_v2.csv'
if not exists(save_path):
    importance.to_csv(save_path)
importance_original = importance
importance_original

## augmented dataset

In [None]:
# Run all three selectors on the augmented-only feature set.
data, x, y = augmented_data, augmented_x, augmented_y

# ElasticNet
alpha = 0.0001
EN_features = elastic_coeff(elasticnet(data, x, y, alpha), x)

# ANOVA
AN_features = anova(data, x, y)

# RandomForest
threshold = 0.95
model = randomforest(data, x, y)
RF_features = FI(model, threshold=threshold)

In [None]:
# One row per candidate feature, one 0/1 column per selection method.
selected_by = {
    'ElasticNet': EN_features,
    'ANOVA': AN_features,
    'RandomForest': RF_features,
}
importance = pd.DataFrame(
    {method: pd.Index(x).isin(chosen).astype(float) for method, chosen in selected_by.items()},
    index=x,
)

# Score = number of methods that selected the feature.
importance['Score'] = importance.sum(axis=1)

# The CSV is written only when it does not exist yet; an existing file is left untouched.
save_path = './result/important_variables_augmented_cts_cat_v2.csv'
if not exists(save_path):
    importance.to_csv(save_path)
importance_augmented = importance
importance_augmented

## both dataset

In [None]:
# Run all three selectors on the combined (original + augmented) feature set.
data, x, y = augmented_data, both_x, both_y

# ElasticNet
alpha = 0.0001
EN_features = elastic_coeff(elasticnet(data, x, y, alpha), x)

# ANOVA
AN_features = anova(data, x, y)

# RandomForest
threshold = 0.95
model = randomforest(data, x, y)
RF_features = FI(model, threshold=threshold)

In [None]:
# One row per candidate feature, one 0/1 column per selection method.
selected_by = {
    'ElasticNet': EN_features,
    'ANOVA': AN_features,
    'RandomForest': RF_features,
}
importance = pd.DataFrame(
    {method: pd.Index(x).isin(chosen).astype(float) for method, chosen in selected_by.items()},
    index=x,
)

# Score = number of methods that selected the feature.
importance['Score'] = importance.sum(axis=1)

# The CSV is written only when it does not exist yet; an existing file is left untouched.
save_path = './result/important_variables_both_cts_cat_v2.csv'
if not exists(save_path):
    importance.to_csv(save_path)
importance_both = importance
importance_both