# header

In [1]:
"""
Created on Fri Apr 30 14:02:08 2021

@author: Fabian mohr (fmohr@mit.edu)
"""

"""
Revised on Mon Feb 6 2023

by author: Seokyoung Hong (skyhong@mit.edu)
"""

import pandas as pd
import numpy as np
import os
from os.path import exists

ns = [1000, 2000, 4000, 8000, 16000, 32000, 64000]

In [2]:
from sklearn import neighbors
def assess_nonlinearity(data, x_var=[], y_var='', 
                        N=10000, alpha=0.5, # parameters for nonlinearity measure
                        model_type='1NN', random_state=1): # parameter for nonlinear classifier
    n_samples  = len(data)
    n_features = len(x_var)
    data_x = data[x_var]
    data_y = data[y_var]
    
    # data scaling for KNN model
    x_min = data_x.min(0)
    x_max = data_x.max(0)
    data_x_sc = (data_x - x_min) / (x_max - x_min)
    
    # apply nonlinear classifier
    if model_type == '1NN':
        classifier = neighbors.KNeighborsClassifier(n_neighbors = 1)
        classifier.fit(data_x_sc, data_y)
        
    '''nonlinear measure'''
    # pick random data points X_i, shape=(N, n_features)
    np.random.seed(random_state)
    idx_i = np.random.randint(0, n_samples, size=N)
    X_i = data_x.iloc[idx_i]
    Y_i = data_y.iloc[idx_i]
    
    # choose random data point from the same class X_j, shape=(N, n_features)
    X_j = []
    for y_i in Y_i:
        x_j = data_x[data_y==y_i].sample(n=1, random_state=random_state)
        X_j.append(x_j)
    X_j = pd.concat(X_j, axis=0)
    
    # choose random point in between X_between, shape=(N, n_features)
    alpha = np.random.rand(N, 1)
    X_between = alpha*X_i.values + (1-alpha)*X_j.values
    X_between = pd.DataFrame(X_between, columns=x_var)
    X_between_sc = (X_between - x_min) / (x_max - x_min) # data scaling for KNN model
    
    # classifiy X_between
    if model_type == '1NN':
        Y_between = classifier.predict(X_between_sc)
        
    # note if the label is different
    n_between = Y_i.values != Y_between
    
    # calculate nonlinearity measure = # label switch / N
    nonlinearity_measure = n_between.sum() / N
    print("nonlinearity measure:", nonlinearity_measure)
    
    '''overlap measure'''
    # identify nearest neighbor
    X_i_sc = (X_i - x_min) / (x_max - x_min) # data scaling for KNN model
    nearest_idx = classifier.kneighbors(X_i_sc, 2, return_distance=False)[:,1]
    X_nearest = data_x.iloc[nearest_idx]
    
    # classify nearest neighbor
    X_nearest_sc = (X_nearest - x_min) / (x_max - x_min) # data scaling for KNN model
    Y_nearest = classifier.predict(X_nearest_sc)
 
    # note if the label is different
    n_nearest = Y_i.values != Y_nearest
    
    # calculate nonlinearity measure = # label switch / N
    overlap_measure = n_nearest.sum() / N
    print("overlap measure:", overlap_measure)
    
    '''nonlinearity'''
    # calculate nonlinearity = nonlinearity measure / overlap measure
    nonlinearity = nonlinearity_measure / overlap_measure
    print("nonlinearity:", nonlinearity)
    
    return nonlinearity_measure, overlap_measure, nonlinearity

# original dataset

In [3]:
load_name = "./data/data_for_ml.csv"
original_data = pd.read_csv(load_name)

original_data = original_data[(original_data['SEQ']=='S') | (original_data['SEQ']=='T')].reset_index(drop=True)
original_features = dict(
    zip(range(0, len(original_data.columns)), original_data.columns)
)
display(original_features)

{0: '#',
 1: 'SEQ',
 2: 'SS',
 3: 'ASA',
 4: 'Phi',
 5: 'Psi',
 6: 'Theta(i-1=>i+1)',
 7: 'Tau(i-2=>i+2)',
 8: 'HSE_alpha_up',
 9: 'HSE_alpha_down',
 10: 'P(C)',
 11: 'P(H)',
 12: 'P(E)',
 13: 'flexibility',
 14: 'side_-1',
 15: 'side_1',
 16: 'side_2',
 17: 'side_3',
 18: 'side_4',
 19: 'side_5',
 20: 'nAli',
 21: 'nPos',
 22: 'nS/nT',
 23: 'Proline',
 24: 'phi_psi',
 25: 'positivity',
 26: 'protein'}

In [4]:
data = original_data
x_var = list(original_data.columns[3:14]) + list(original_data.columns[20:23])
y_var = 'positivity'

x_var = ['ASA', 'Phi', 'Psi', 'Theta(i-1=>i+1)', 'Tau(i-2=>i+2)', 'HSE_alpha_up', 'HSE_alpha_down', 
         'P(C)', 'P(H)', 'P(E)']
y_var = 'positivity'

for n in ns:
    print(f"N = {n}")
    _, _, _ = assess_nonlinearity(data, x_var, y_var, N=n)
    print()

N = 1000
nonlinearity measure: 0.013
overlap measure: 0.025
nonlinearity: 0.5199999999999999

N = 2000
nonlinearity measure: 0.0145
overlap measure: 0.0215
nonlinearity: 0.6744186046511629

N = 4000
nonlinearity measure: 0.01275
overlap measure: 0.02225
nonlinearity: 0.5730337078651685

N = 8000
nonlinearity measure: 0.013125
overlap measure: 0.02275
nonlinearity: 0.5769230769230769

N = 16000
nonlinearity measure: 0.0126875
overlap measure: 0.022125
nonlinearity: 0.5734463276836159

N = 32000
nonlinearity measure: 0.01296875
overlap measure: 0.022375
nonlinearity: 0.5796089385474861

N = 64000
nonlinearity measure: 0.012234375
overlap measure: 0.023046875
nonlinearity: 0.5308474576271186



In [5]:
# without 'number' related features
data = original_data
x_var = list(original_data.columns[3:14])
y_var = 'positivity'

x_var = ['ASA', 'Phi', 'Psi', 'Theta(i-1=>i+1)', 'Tau(i-2=>i+2)', 'HSE_alpha_up', 'HSE_alpha_down', 
         'P(C)', 'P(H)', 'P(E)']
y_var = 'positivity'

for n in ns:
    print(f"N = {n}")
    _, _, _ = assess_nonlinearity(data, x_var, y_var, N=n)
    print()

N = 1000
nonlinearity measure: 0.013
overlap measure: 0.025
nonlinearity: 0.5199999999999999

N = 2000
nonlinearity measure: 0.0145
overlap measure: 0.0215
nonlinearity: 0.6744186046511629

N = 4000
nonlinearity measure: 0.01275
overlap measure: 0.02225
nonlinearity: 0.5730337078651685

N = 8000
nonlinearity measure: 0.013125
overlap measure: 0.02275
nonlinearity: 0.5769230769230769

N = 16000
nonlinearity measure: 0.0126875
overlap measure: 0.022125
nonlinearity: 0.5734463276836159

N = 32000
nonlinearity measure: 0.01296875
overlap measure: 0.022375
nonlinearity: 0.5796089385474861

N = 64000
nonlinearity measure: 0.012234375
overlap measure: 0.023046875
nonlinearity: 0.5308474576271186



# augmented dataset

In [6]:
protein_list = list(original_data.protein.unique())
protein_augmented = [name for name in protein_list if exists(f'./data/data_for_ml(augmented)/{name}.csv')]

augmented_data = []
for name in protein_augmented:
    load_path = f'./data/data_for_ml(augmented)/{name}.csv'
    augmented_data.append(pd.read_csv(load_path))
augmented_data = pd.concat(augmented_data, axis=0).reset_index(drop=True)
augmented_data = augmented_data[(augmented_data['SEQ']=='S') | (augmented_data['SEQ']=='T')].reset_index(drop=True)

augmented_features = dict(
    zip(range(0, len(augmented_data.columns)), augmented_data.columns)
)
display(augmented_features)

{0: '#',
 1: 'SEQ',
 2: 'SS',
 3: 'ASA',
 4: 'Phi',
 5: 'Psi',
 6: 'Theta(i-1=>i+1)',
 7: 'Tau(i-2=>i+2)',
 8: 'HSE_alpha_up',
 9: 'HSE_alpha_down',
 10: 'P(C)',
 11: 'P(H)',
 12: 'P(E)',
 13: 'flexibility',
 14: 'side_-1',
 15: 'side_1',
 16: 'side_2',
 17: 'side_3',
 18: 'side_4',
 19: 'side_5',
 20: 'nAli',
 21: 'nPos',
 22: 'nS/nT',
 23: 'Proline',
 24: 'phi_psi',
 25: 'positivity',
 26: 'residue_SER_THR',
 27: 'number_of_hydrophobic',
 28: 'number_of_hydrophilic',
 29: 'number_of_polar',
 30: 'number_of_aromatic',
 31: 'number_of_aliphatic',
 32: 'number_of_charged',
 33: 'number_of_positive',
 34: 'number_of_negative',
 35: 'number_of_g',
 36: 'number_of_v',
 37: 'number_of_s',
 38: 'number_of_n',
 39: 'number_of_l',
 40: 'number_of_p',
 41: 'number_of_A',
 42: 'number_of_b',
 43: 'number_of_d',
 44: 'number_of_e',
 45: 'number_of_f',
 46: 'number_of_ala',
 47: 'number_of_cys',
 48: 'number_of_asp',
 49: 'number_of_glu',
 50: 'number_of_phe',
 51: 'number_of_his',
 52: 'number_of

In [7]:
data = augmented_data
x_var = list(augmented_data.columns[26:])
y_var = 'positivity'

for n in ns:
    print(f"N = {n}")
    _, _, _ = assess_nonlinearity(data, x_var, y_var, N=n)
    print()

N = 1000
nonlinearity measure: 0.007
overlap measure: 0.028
nonlinearity: 0.25

N = 2000
nonlinearity measure: 0.0055
overlap measure: 0.0295
nonlinearity: 0.1864406779661017

N = 4000
nonlinearity measure: 0.00825
overlap measure: 0.0275
nonlinearity: 0.3

N = 8000
nonlinearity measure: 0.00975
overlap measure: 0.0295
nonlinearity: 0.3305084745762712

N = 16000
nonlinearity measure: 0.0100625
overlap measure: 0.031
nonlinearity: 0.3245967741935484

N = 32000
nonlinearity measure: 0.00984375
overlap measure: 0.03196875
nonlinearity: 0.30791788856304986

N = 64000
nonlinearity measure: 0.010921875
overlap measure: 0.03390625
nonlinearity: 0.32211981566820275



In [8]:
# without 'number' related features
data = augmented_data
x_var = list(augmented_data.columns[64:])
y_var = 'positivity'

for n in ns:
    print(f"N = {n}")
    _, _, _ = assess_nonlinearity(data, x_var, y_var, N=n)
    print()

N = 1000
nonlinearity measure: 0.01
overlap measure: 0.024
nonlinearity: 0.4166666666666667

N = 2000
nonlinearity measure: 0.008
overlap measure: 0.026
nonlinearity: 0.3076923076923077

N = 4000
nonlinearity measure: 0.0125
overlap measure: 0.027
nonlinearity: 0.462962962962963

N = 8000
nonlinearity measure: 0.01325
overlap measure: 0.02975
nonlinearity: 0.44537815126050423

N = 16000
nonlinearity measure: 0.015
overlap measure: 0.0323125
nonlinearity: 0.46421663442940037

N = 32000
nonlinearity measure: 0.01265625
overlap measure: 0.0326875
nonlinearity: 0.38718929254302104

N = 64000
nonlinearity measure: 0.01484375
overlap measure: 0.03428125
nonlinearity: 0.43299908842297175

