# header

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 30 14:02:08 2021

@author: Fabian mohr (fmohr@mit.edu)
"""

"""
Revised on Mon Feb 6

by author: Seokyoung Hong (skyhong@mit.edu)
"""

import pandas as pd
import numpy as np
import os
from os.path import exists

In [2]:
from sklearn import neighbors
def assess_nonlinearity(data, x_var=[], y_var='', 
                        N=10000, alpha=0.5, # parameters for nonlinearity measure
                        model_type='1NN', random_state=1): # parameter for nonlinear classifier
    n_samples  = len(data)
    n_features = len(x_var)
    data_x = data[x_var]
    data_y = data[y_var]
    
    # data scaling for KNN model
    x_min = data_x.min(0)
    x_max = data_x.max(0)
    data_x_sc = (data_x - x_min) / (x_max - x_min)
    
    # apply nonlinear classifier
    if model_type == '1NN':
        classifier = neighbors.KNeighborsClassifier(n_neighbors = 1)
        classifier.fit(data_x_sc, data_y)
        
    '''nonlinear measure'''
    # pick random data points X_i, shape=(N, n_features)
    np.random.seed(random_state)
    idx_i = np.random.randint(0, n_samples, size=N)
    X_i = data_x.iloc[idx_i]
    Y_i = data_y.iloc[idx_i]
    
    # choose random data point from the same class X_j, shape=(N, n_features)
    X_j = []
    for y_i in Y_i:
        x_j = data_x[data_y==y_i].sample(n=1, random_state=random_state)
        X_j.append(x_j)
    X_j = pd.concat(X_j, axis=0)
    
    # choose random point in between X_between, shape=(N, n_features)
    alpha = np.random.rand(N, 1)
    X_between = alpha*X_i.values + (1-alpha)*X_j.values
    X_between = pd.DataFrame(X_between, columns=x_var)
    X_between_sc = (X_between - x_min) / (x_max - x_min) # data scaling for KNN model
    
    # classifiy X_between
    if model_type == '1NN':
        Y_between = classifier.predict(X_between_sc)
        
    # note if the label is different
    n_between = Y_i.values != Y_between
    
    # calculate nonlinearity measure = # label switch / N
    nonlinearity_measure = n_between.sum() / N
    print("nonlinearity measure:", nonlinearity_measure)
    
    '''overlap measure'''
    # identify nearest neighbor
    X_i_sc = (X_i - x_min) / (x_max - x_min) # data scaling for KNN model
    nearest_idx = classifier.kneighbors(X_i_sc, 2, return_distance=False)[:,1]
    X_nearest = data_x.iloc[nearest_idx]
    
    # classify nearest neighbor
    X_nearest_sc = (X_nearest - x_min) / (x_max - x_min) # data scaling for KNN model
    Y_nearest = classifier.predict(X_nearest_sc)
 
    # note if the label is different
    n_nearest = Y_i.values != Y_nearest
    
    # calculate nonlinearity measure = # label switch / N
    overlap_measure = n_nearest.sum() / N
    print("overlap measure:", overlap_measure)
    
    '''nonlinearity'''
    # calculate nonlinearity = nonlinearity measure / overlap measure
    nonlinearity = nonlinearity_measure / overlap_measure
    print("nonlinearity:", nonlinearity)
    
    return nonlinearity_measure, overlap_measure, nonlinearity

# original dataset

In [33]:
n_sample = 100000
ns = [1000, 10000]
for n_features in range(2,10):
    print(f"the number of features = {n_features}")
    data_x = np.random.rand(n_sample, n_features)
    theta  = np.random.rand(1, n_features)
    data_y = np.sum(data_x * theta, axis=1).reshape(-1,1)
    mean_y = data_y.mean()
    data_y = data_y > mean_y

    data_xy = np.concatenate([data_x, data_y], axis=1)
    data_xy = pd.DataFrame(data_xy, columns = [f"f{x}" for x in range(n_features)] + ['y'])
    data = data_xy
    x_var = [f"f{x}" for x in range(n_features)]
    y_var = 'y'

    for n in ns:
        print(f"N = {n}")
        _, _, _ = assess_nonlinearity(data, x_var, y_var, N=n)
        print()

the number of features = 2
N = 1000
nonlinearity measure: 0.0
overlap measure: 0.002
nonlinearity: 0.0

N = 10000
nonlinearity measure: 0.0
overlap measure: 0.0014
nonlinearity: 0.0

the number of features = 3
N = 1000
nonlinearity measure: 0.0
overlap measure: 0.009
nonlinearity: 0.0

N = 10000
nonlinearity measure: 0.0
overlap measure: 0.007
nonlinearity: 0.0

the number of features = 4
N = 1000
nonlinearity measure: 0.0
overlap measure: 0.015
nonlinearity: 0.0

N = 10000
nonlinearity measure: 0.0007
overlap measure: 0.0158
nonlinearity: 0.044303797468354424

the number of features = 5
N = 1000
nonlinearity measure: 0.001
overlap measure: 0.039
nonlinearity: 0.02564102564102564

N = 10000
nonlinearity measure: 0.0032
overlap measure: 0.0363
nonlinearity: 0.0881542699724518

the number of features = 6
N = 1000
nonlinearity measure: 0.01
overlap measure: 0.04
nonlinearity: 0.25

N = 10000
nonlinearity measure: 0.0065
overlap measure: 0.0429
nonlinearity: 0.15151515151515152

the number

KeyboardInterrupt: 