In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

In [4]:
# Seed reused by every split and classifier in this notebook.
random_state = 8

# Read the recordings and show their size and contents.
data = pd.read_csv('EEG_Eye_State.csv')
print(data.shape)
print(data)

# Features are every column except the 'Eye_detection' label column;
# the labels are kept as a plain NumPy array.
X = data.drop(columns=['Eye_detection'])
y = np.array(data['Eye_detection'])
print(y)


(14980, 15)
           AF3       F7       F3      FC5       T7       P7       O1       O2  \
0      4329.23  4009.23  4289.23  4148.21  4350.26  4586.15  4096.92  4641.03   
1      4324.62  4004.62  4293.85  4148.72  4342.05  4586.67  4097.44  4638.97   
2      4327.69  4006.67  4295.38  4156.41  4336.92  4583.59  4096.92  4630.26   
3      4328.72  4011.79  4296.41  4155.90  4343.59  4582.56  4097.44  4630.77   
4      4326.15  4011.79  4292.31  4151.28  4347.69  4586.67  4095.90  4627.69   
...        ...      ...      ...      ...      ...      ...      ...      ...   
14975  4281.03  3990.26  4245.64  4116.92  4333.85  4614.36  4074.87  4625.64   
14976  4276.92  3991.79  4245.13  4110.77  4332.82  4615.38  4073.33  4621.54   
14977  4277.44  3990.77  4246.67  4113.85  4333.33  4615.38  4072.82  4623.59   
14978  4284.62  3991.79  4251.28  4122.05  4334.36  4616.41  4080.51  4628.72   
14979  4287.69  3997.44  4260.00  4121.03  4333.33  4616.41  4088.72  4638.46   

            P8 

In [5]:
# Baseline on the raw (unscaled) features.
# Split into train and test data (20% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)
# Train a multi-layer perceptron with two hidden layers of 100 units each
clf0 = MLPClassifier(hidden_layer_sizes=(100,100),random_state=random_state, verbose=False, max_iter=1000)
clf0.fit(X_train, y_train)
# Measure the accuracy of the classifier on the held-out rows
y_pred = clf0.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
# The value is the same either way, but the documented order keeps this call
# consistent with other metrics where the order does matter.
acc = accuracy_score(y_test, y_pred)
print('Accuracy on raw : ', acc*100)


Accuracy on raw :  44.45927903871829


In [6]:
# Feature scaling: standardize all feature columns at once.
# NOTE(review): the scaling statistics are computed over every row, including
# the rows that later land in the test split. Fitting the scaler on the
# training rows only would avoid that leakage; it is left as-is here because
# later cells consume `scaled_X` directly.
scaled_X = preprocessing.scale(X)
# Split into train and test data (20% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size = 0.2, random_state = random_state)
# Train a multi-layer perceptron with two hidden layers of 100 units each
clf0 = MLPClassifier(hidden_layer_sizes=(100,100),random_state=random_state, verbose=False, max_iter=1000)
clf0.fit(X_train, y_train)
# Measure the accuracy of the classifier on the held-out rows
y_pred = clf0.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
acc = accuracy_score(y_test, y_pred)
print('Accuracy on raw scaled : ', acc*100)


Accuracy on raw scaled :  90.82109479305741


In [24]:
################ split data #######################
# train: 0.2 total  
# pool: 0.6 total
# test: 0.2 total
# ################################################# 

def split_data(scaled_X, y, noise_probability = 0.0, add_noise_to_train=True):
    """Split the data into train / pool / test parts and flip some labels at random.

    The test part is 20% of all samples; the remainder is divided 25% / 75%
    into train and pool, i.e. roughly 0.2 / 0.6 / 0.2 of the total.
    Both splits use the module-level ``random_state``, and the global NumPy
    random generator is re-seeded with it before the noise is drawn.

    Args:
        scaled_X: feature matrix.
        y: labels; the flipping formula assumes they are 0/1.
        noise_probability (float, optional): chance that a single label is
            flipped. Defaults to 0.0.
        add_noise_to_train (bool, optional): whether the train labels are
            flipped as well. The pool labels always are. Defaults to True.

    Returns:
        X_train, X_pool, X_test, y_train, y_pool, y_test
    """
    # First cut: set the test part aside.
    X_rest, X_test, y_rest, y_test = train_test_split(
        scaled_X, y, test_size=0.2, random_state=random_state
    )

    np.random.seed(random_state)

    # Second cut: three quarters of what is left becomes the pool.
    X_train, X_pool, y_train, y_pool = train_test_split(
        X_rest, y_rest, test_size=0.75, random_state=random_state
    )

    def _flip(labels):
        # A drawn 1 turns the label into |1 - label|, a drawn 0 leaves |0 - label|.
        flip_mask = np.random.random(labels.shape) < noise_probability
        return np.abs(flip_mask.astype(int) - labels)

    # Draw order matters for reproducibility: train first (if requested), then pool.
    if add_noise_to_train:
        y_train = _flip(y_train)
    y_pool = _flip(y_pool)

    separator = "---------"
    print(separator)
    print(f"total: {y.size}\ntrain: {y_train.size} -> {y_train.size/y.size:.2f}x \npool: {y_pool.size} -> {y_pool.size/y.size:.2f}x \ntest: {y_test.size} -> {y_test.size/y.size:.2f}x")
    print(separator)
    return X_train, X_pool, X_test, y_train, y_pool, y_test


In [13]:


def find_most_ambigious(y_proba_pred, y, ambigious_amount =1, method='least_confidence') -> list:
    """Return the indexes of the samples whose predicted class is least certain.

    Only two classes are supported: a sample is ranked by the absolute gap
    between its two class probabilities, smallest gap first.

    Args:
        y_proba_pred: 2-D array of predicted probabilities, one row per sample;
            columns 0 and 1 are compared.
        y: ground truth labels (accepted for interface reasons, not used).
        ambigious_amount (int, optional): how many indexes to return. Defaults to 1.
        method (str, optional): selection strategy; only 'least_confidence'
            is implemented. Defaults to 'least_confidence'.

    Returns:
        The first ``ambigious_amount`` indexes in ascending order of probability
        gap, or an empty list (after printing a message) for any other method.
    """
    # Guard clause: unknown strategies select nothing.
    if method != 'least_confidence':
        print("method is not defined. Use 'least_confidence'")
        return []

    class_gap = np.abs(y_proba_pred[:, 0] - y_proba_pred[:, 1])
    ranked = np.argsort(class_gap)
    return ranked[:ambigious_amount]

def train_one_iter_active_learning(X_train, y_train, X_pool, y_pool, X_test, y_test, model, ambigious_amount=1  , method='least_confidence'):
    """Run one active-learning round: label the most ambiguous pool samples and refit.

    The model scores every pool sample, the ``ambigious_amount`` most ambiguous
    ones are moved (with their pool labels) from the pool to the train set,
    and the model is fitted again on the enlarged train set.

    Args:
        X_train, y_train: current train features and labels.
        X_pool, y_pool: current pool features and labels (NumPy arrays).
        X_test, y_test: data used to score the refitted model.
        model: fitted classifier exposing ``predict_proba``, ``fit`` and ``score``.
        ambigious_amount (int, optional): samples to move per round. Defaults to 1.
        method (str, optional): selection strategy passed on to
            ``find_most_ambigious``. Defaults to 'least_confidence'.

    Returns:
        X_train, y_train, X_pool, y_pool, model, acc — the updated arrays
        (new objects from ``np.append`` / ``np.delete``), the refitted model
        and its score on the test data.
    """
    y_proba_pred = model.predict_proba(X_pool)

    # Forward the caller's `method`; it used to be hard-coded to
    # 'least_confidence', which silently ignored the argument.
    most_ambigious_indexes = find_most_ambigious(y_proba_pred, y_pool, ambigious_amount=ambigious_amount, method=method)

    # Move the selected samples from the pool into the train set.
    X_train = np.append(X_train, X_pool[most_ambigious_indexes], axis=0)
    y_train = np.append(y_train, y_pool[most_ambigious_indexes])
    X_pool = np.delete(X_pool, most_ambigious_indexes, axis=0)
    y_pool = np.delete(y_pool, most_ambigious_indexes)

    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)

    return X_train, y_train, X_pool, y_pool, model, acc



In [27]:
## pure active learning
# Experiment settings
noise_probability = 0.2    # chance that a label gets flipped in split_data
ambigious_amount = 100     # pool samples moved into the train set per iteration
K = 10                     # number of active-learning iterations
add_noise_to_train = True  # pool always has noise
#############################
X_train, X_pool, X_test, y_train, y_pool, y_test = split_data(
    scaled_X,
    y,
    noise_probability=noise_probability,
    add_noise_to_train=add_noise_to_train,
)

# Starting point: a model fitted on the initial train part only.
clf1 = MLPClassifier(hidden_layer_sizes=(100,100), random_state=random_state, verbose=0)
clf1.fit(X_train, y_train)
acc = clf1.score(X_test, y_test)

print (f"iteration -1:   accuracy = {acc:0.5f}")
print ("--")

# Each round enlarges the train set from the pool and refits the model.
for k in range(K):
    X_train, y_train, X_pool, y_pool, clf1, acc = train_one_iter_active_learning(
        X_train,
        y_train,
        X_pool,
        y_pool,
        X_test,
        y_test,
        model=clf1,
        ambigious_amount=ambigious_amount,
        method='least_confidence',
    )
    print (f"iteration {k}:   accuracy = {acc:0.5f}")
    print ("--")

print(f"train size: {y_train.shape}")



---------
total: 14980
train: 2996 -> 0.20x 
pool: 8988 -> 0.60x 
test: 2996 -> 0.20x
---------
iteration -1:   accuracy = 0.82911
--
iteration 0:   accuracy = 0.82877
--
iteration 1:   accuracy = 0.83244
--
iteration 2:   accuracy = 0.84146
--
iteration 3:   accuracy = 0.82977
--
iteration 4:   accuracy = 0.83578
--
iteration 5:   accuracy = 0.83645
--
iteration 6:   accuracy = 0.83778
--
iteration 7:   accuracy = 0.83478
--
iteration 8:   accuracy = 0.83111
--
iteration 9:   accuracy = 0.84680
--
train size: (3996,)


In [28]:
## Active learning with Ransac
# Each outer iteration runs K active-learning rounds (growing the train set from
# the pool) and then M RANSAC-style draws: a model is fitted on a random
# `ransac_percent` subset of the train set, and the best-scoring subset and
# model replace the current train set and model.

##################### in total N*K active learning iterations and N*M Ransac iterations ##############

noise_probability = 0.2   # label-flip probability passed to split_data
K = 2                     # active-learning iterations per outer iteration
N = 5                     # outer iterations
ambigious_amount = 100    # pool samples moved into the train set per active-learning iteration
M = 10   # RANSAC iterations per outer iteration
ransac_percent = 0.95     # fraction of the current train set kept in each RANSAC draw
add_noise_to_train = True  # pool always has noise

################################################################################################################
X_train, X_pool, X_test, y_train, y_pool, y_test = split_data(scaled_X, y,noise_probability = noise_probability, add_noise_to_train=add_noise_to_train)

# Initial model fitted on the starting train part only.
clf1 = MLPClassifier(verbose=0, hidden_layer_sizes=(100,100),random_state = random_state)
clf1.fit(X_train, y_train)
acc = clf1.score(X_test,y_test)
print (f"iteration -1:   accuracy = {acc:0.5f}")
print ("--")

for n in range(N):

    print(f"################ Outer iteration {n} ################ ")

    ############ K iteration active learning -> everytime label ambigious_amount data ############
    for k in range(K):
        X_train, y_train, X_pool, y_pool, clf1, acc = train_one_iter_active_learning(X_train, y_train, X_pool, y_pool, X_test, y_test, clf1, ambigious_amount=ambigious_amount , method='least_confidence')
        print (f"AL iteration {k}:   accuracy = {acc:0.5f}")
    print(f"train size: {y_train.shape}")
    print ("--------")

    ###########################################################################


    ############ M iteration RANSAC ###########################################

    # One record per RANSAC draw: the fitted model, the subset it saw and its score.
    stats_history =[]
    for m in range(M):
        ransac_random_state = random_state + m # to make sure repeatable results

        # Keep `ransac_percent` of the train set; the remainder (r_X_outlier,
        # r_y_outlier) is not used again in this cell.
        r_X_train,r_X_outlier, r_y_train, r_y_outlier = train_test_split(X_train, y_train, train_size = ransac_percent, random_state = ransac_random_state)
        # A fresh classifier is fitted for every draw, always with the same seed.
        clf1 = MLPClassifier(verbose=0, hidden_layer_sizes=(100,100),random_state = random_state)
        clf1.fit(r_X_train, r_y_train)
        # NOTE(review): candidates are scored on (X_test, y_test), the same data
        # the "Final accuracy" below is reported on, so that figure is a
        # selection-biased estimate. A separate validation split would avoid this.
        acc = clf1.score(X_test, y_test)
        print (f"Ransac iteration {m}:   accuracy = {acc:0.5f}")

        stat = {"model":clf1, "X_train":r_X_train, "y_train":r_y_train, "accuracy":acc}
        stats_history.append(stat)

    # Take the best model and 95% data that gives best accuracy
    # (ascending sort, last element; on equal accuracy the later draw wins).
    best = sorted(stats_history, key=lambda x: x["accuracy"])[-1]
    # The winning subset becomes the train set for the next outer iteration,
    # so the rows left out of that draw are dropped from the experiment.
    clf1 = best["model"]
    X_train = best["X_train"]
    y_train = best["y_train"]
    acc = best["accuracy"]
    print ("----------------")

    print(f"train size: {y_train.shape}")
    print (f"Final accuracy = {acc:0.5f}")
    print ("----------------")




---------
total: 14980
train: 2996 -> 0.20x 
pool: 8988 -> 0.60x 
test: 2996 -> 0.20x
---------
iteration -1:   accuracy = 0.82911
--
################ Outer iteration 0 ################ 
AL iteration 0:   accuracy = 0.82877
AL iteration 1:   accuracy = 0.83244
train size: (3196,)
--------
Ransac iteration 0:   accuracy = 0.82810
Ransac iteration 1:   accuracy = 0.82710
Ransac iteration 2:   accuracy = 0.83344
Ransac iteration 3:   accuracy = 0.82076
Ransac iteration 4:   accuracy = 0.83344
Ransac iteration 5:   accuracy = 0.83511
Ransac iteration 6:   accuracy = 0.82844
Ransac iteration 7:   accuracy = 0.83311
Ransac iteration 8:   accuracy = 0.83211
Ransac iteration 9:   accuracy = 0.82610
----------------
train size: (3036,)
Final accuracy = 0.83511
----------------
################ Outer iteration 1 ################ 
AL iteration 0:   accuracy = 0.83044
AL iteration 1:   accuracy = 0.83545
train size: (3236,)
--------
Ransac iteration 0:   accuracy = 0.83478
Ransac iteration 1:   ac