In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score

In [2]:
# One seed shared by every split and every model in the notebook.
random_state = 8

data = pd.read_csv('EEG_Eye_State.csv')
print(data.shape)
print(data)

# Features are every column except the label; the label becomes a NumPy array.
X = data.drop(columns=['Eye_detection'])
y = data['Eye_detection'].to_numpy(copy=True)
print(y)


(14980, 15)
           AF3       F7       F3      FC5       T7       P7       O1       O2  \
0      4329.23  4009.23  4289.23  4148.21  4350.26  4586.15  4096.92  4641.03   
1      4324.62  4004.62  4293.85  4148.72  4342.05  4586.67  4097.44  4638.97   
2      4327.69  4006.67  4295.38  4156.41  4336.92  4583.59  4096.92  4630.26   
3      4328.72  4011.79  4296.41  4155.90  4343.59  4582.56  4097.44  4630.77   
4      4326.15  4011.79  4292.31  4151.28  4347.69  4586.67  4095.90  4627.69   
...        ...      ...      ...      ...      ...      ...      ...      ...   
14975  4281.03  3990.26  4245.64  4116.92  4333.85  4614.36  4074.87  4625.64   
14976  4276.92  3991.79  4245.13  4110.77  4332.82  4615.38  4073.33  4621.54   
14977  4277.44  3990.77  4246.67  4113.85  4333.33  4615.38  4072.82  4623.59   
14978  4284.62  3991.79  4251.28  4122.05  4334.36  4616.41  4080.51  4628.72   
14979  4287.69  3997.44  4260.00  4121.03  4333.33  4616.41  4088.72  4638.46   

            P8 

In [3]:
# Baseline on the unscaled features: hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# Multi-layer perceptron with two hidden layers of 100 units each.
clf0 = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=1000,
    random_state=random_state,
    verbose=False,
)
clf0.fit(X_train, y_train)

# Accuracy on the held-out rows (arguments in the documented y_true, y_pred order;
# accuracy is symmetric, so the value is unaffected).
y_pred = clf0.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy on raw : ', acc*100)


Accuracy on raw :  44.45927903871829


In [4]:
# Standardize the features. `scaled_X` is reused by the later cells.
# NOTE(review): scaling happens before the train/test split, so the held-out
# rows contribute to the scaling statistics -- confirm this is acceptable.
scaled_X = preprocessing.scale(X)

# Same 80/20 split and seed as the raw baseline, now on the scaled features.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X, y, test_size=0.2, random_state=random_state
)

# Same network configuration as the raw baseline.
clf0 = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    max_iter=1000,
    random_state=random_state,
    verbose=False,
)
clf0.fit(X_train, y_train)

# Accuracy on the held-out rows (arguments in the documented y_true, y_pred order;
# accuracy is symmetric, so the value is unaffected).
y_pred = clf0.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy on raw scaled : ', acc*100)


Accuracy on raw scaled :  90.82109479305741


In [5]:
################ split data #######################
# train: 0.1 total  
# pool: 0.5 total
# valid: 0.2 total
# test: 0.2 total
# ################################################# 

def split_data(scaled_X, y, noise_probability = 0.0):
    """Partition the data into train / pool / validation / test subsets.

    The split is done as a chain: 20% of all rows become the test set, 25% of
    the remainder becomes the validation set, and 84% of what is left becomes
    the pool; the rest is the initial training set. Before the train/pool
    split, each remaining label is flipped with probability
    ``noise_probability``, so only train and pool labels carry noise.

    All splits and the noise mask are seeded from the notebook-level
    ``random_state``. This reseeds NumPy's global random generator as a side
    effect. A summary of the subset sizes is printed.

    Args:
        scaled_X: feature matrix, one row per sample.
        y: label array aligned with ``scaled_X``. The flip is computed as
            ``|mask - label|``, which assumes 0/1 labels.
        noise_probability: probability of flipping each train/pool label.

    Returns:
        X_train, X_pool, X_valid, X_test, y_train, y_pool, y_valid, y_test
    """
    seed = random_state

    # Test hold-out first, then validation out of the remainder.
    X_rest, X_test, y_rest, y_test = train_test_split(
        scaled_X, y, test_size=0.2, random_state=seed)
    X_work, X_valid, y_work, y_valid = train_test_split(
        X_rest, y_rest, test_size=0.25, random_state=seed)

    # Label noise goes only into the part that becomes train + pool;
    # validation and test labels stay clean.
    np.random.seed(seed)
    flip_mask = np.random.random(y_work.shape) < noise_probability
    y_noisy = np.abs(flip_mask.astype(int) - y_work)

    X_train, X_pool, y_train, y_pool = train_test_split(
        X_work, y_noisy, test_size=0.84, random_state=seed)

    separator = "---------"
    print (separator)
    print(f"total: {y.size}\ntrain: {y_train.size} -> {y_train.size/y.size:.2f}x \npool: {y_pool.size} -> {y_pool.size/y.size:.2f}x")
    print(f"valid: {y_valid.size} -> {y_valid.size/y.size:.2f}x \ntest: {y_test.size} -> {y_test.size/y.size:.2f}x")
    print (separator)

    return X_train, X_pool, X_valid, X_test, y_train, y_pool, y_valid, y_test


In [6]:


def find_most_ambigious(y_proba_pred, y, ambigious_amount =1, method='least_confidence') -> np.ndarray:
    """Return the positions of the most ambiguous predictions.

    Ambiguity is measured as the margin between the two largest class
    probabilities of each sample: the smaller the margin, the more ambiguous
    the prediction. For two classes this is exactly ``|p0 - p1|``; for more
    classes it still compares the two most likely ones.

    Args:
        y_proba_pred: predicted class probabilities, shape
            (n_samples, n_classes) with at least two classes. Anything
            ``np.asarray`` accepts.
        y: ground-truth labels. Not used by the selection; kept so existing
            callers keep working.
        ambigious_amount (int, optional): how many positions to return.
            Defaults to 1.
        method (str, optional): selection strategy. Only 'least_confidence'
            is implemented. Defaults to 'least_confidence'.

    Returns:
        np.ndarray: integer positions into ``y_proba_pred``, most ambiguous
        first. Empty if ``method`` is not recognised (a message is printed).
    """
    if method != 'least_confidence':
        print("method is not defined. Use 'least_confidence'")
        # Empty integer index array: indexing or deleting with it is a no-op,
        # and the return type matches the normal path.
        return np.empty(0, dtype=np.intp)

    proba = np.asarray(y_proba_pred)
    # After an ascending sort, the last two columns hold the two largest
    # probabilities of every row.
    top_two = np.sort(proba, axis=1)[:, -2:]
    margin = top_two[:, 1] - top_two[:, 0]
    return np.argsort(margin)[:ambigious_amount]

def train_one_iter_active_learning(X_train, y_train, X_pool, y_pool, X_test, y_test, model, ambigious_amount=1  , method='least_confidence'):
    """Run one active-learning step: query the pool, grow the training set, refit.

    The model scores every pool sample, the ``ambigious_amount`` most
    ambiguous ones (according to ``method``) are moved from the pool to the
    training set together with their labels, and the model is refitted on the
    enlarged training set.

    Args:
        X_train, y_train: current training features and labels.
        X_pool, y_pool: candidate features and labels. They are indexed
            positionally, so NumPy arrays are assumed.
        X_test, y_test: data used to score the refitted model.
        model: a fitted classifier exposing ``predict_proba``, ``fit`` and
            ``score``. It is refitted in place.
        ambigious_amount (int, optional): number of pool samples to move.
            Defaults to 1.
        method (str, optional): selection strategy passed on to
            ``find_most_ambigious``. Defaults to 'least_confidence'.

    Returns:
        X_train, y_train, X_pool, y_pool, model, acc -- the updated arrays
        (new arrays; the inputs are not modified), the refitted model and its
        score on the test data.
    """
    y_proba_pred = model.predict_proba(X_pool)

    # Pass the caller's `method` through instead of a hard-coded strategy,
    # so the parameter actually takes effect.
    most_ambigious_indexes = find_most_ambigious(y_proba_pred, y_pool, ambigious_amount=ambigious_amount, method=method)

    # Move the selected samples from the pool to the training set.
    X_train = np.append(X_train, X_pool[most_ambigious_indexes], axis=0)
    y_train = np.append(y_train, y_pool[most_ambigious_indexes])
    X_pool = np.delete(X_pool, most_ambigious_indexes, axis=0)
    y_pool = np.delete(y_pool, most_ambigious_indexes)

    model.fit(X_train, y_train)
    acc = model.score(X_test, y_test)

    return X_train, y_train, X_pool, y_pool, model, acc



In [11]:
## pure active learning
X_train, X_pool, X_valid, X_test, y_train, y_pool, y_valid, y_test = split_data(scaled_X, y, noise_probability=0.3)

# Baseline: model fitted on the initial (noisy) training split only.
clf1 = MLPClassifier(verbose=0, hidden_layer_sizes=(100,100),random_state = random_state)
clf1.fit(X_train, y_train)
acc = clf1.score(X_test,y_test)

print (f"iteration -1:   accuracy = {acc:0.4f}")

print ("--")
# K active-learning steps; each one moves the 50 most ambiguous pool samples
# into the training set and refits the model.
K = 10
for k in range(K):
    X_train, y_train, X_pool, y_pool, clf1, acc = train_one_iter_active_learning(X_train, y_train, X_pool, y_pool, X_test, y_test, model=clf1, ambigious_amount=50 , method='least_confidence')
    print (f"iteration {k}:   accuracy = {acc:0.14f}")
    print ("--")

print(f"train size: {y_train.shape}")

##### comparison #####
# Refit on the final training set plus the validation split and report the
# result; previously this accuracy was computed but never shown.
clf1.fit(np.append(X_train,X_valid,axis=0), np.append(y_train,y_valid))
acc = clf1.score(X_test,y_test)
# The text matches the line recorded in the saved cell output.
print (f"iteration -1:   accuracy = {acc:0.4f}")


---------
total: 14980
train: 1438 -> 0.10x 
pool: 7550 -> 0.50x
valid: 2996 -> 0.20x 
test: 2996 -> 0.20x
---------
iteration -1:   accuracy = 0.6766
--
iteration 0:   accuracy = 0.66622162883845
--
iteration 1:   accuracy = 0.67957276368491
--
iteration 2:   accuracy = 0.67790387182911
--
iteration 3:   accuracy = 0.66021361815754
--
iteration 4:   accuracy = 0.66021361815754
--
iteration 5:   accuracy = 0.66822429906542
--
iteration 6:   accuracy = 0.66355140186916
--
iteration 7:   accuracy = 0.66488651535381
--
iteration 8:   accuracy = 0.68024032042724
--
iteration 9:   accuracy = 0.67022696929239
--
train size: (1938,)
iteration -1:   accuracy = 0.8111


In [9]:
## Active learning with Ransac

X_train, X_pool, X_valid, X_test, y_train, y_pool, y_valid, y_test = split_data(scaled_X, y,noise_probability = 0.3)

# Baseline: model fitted on the initial (noisy) training split only.
clf1 = MLPClassifier(verbose=0, hidden_layer_sizes=(100,100),random_state = random_state)
clf1.fit(X_train, y_train)
acc = clf1.score(X_test,y_test)
print (f"iteration -1:   accuracy = {acc:0.4f}")
print ("--")

# Schedule: N outer rounds, each running K active-learning steps followed by
# M RANSAC-style refits (N*K active-learning steps and N*M refits in total).
# These are the only place the schedule is set; M is no longer overwritten
# inside the loop.
K = 2
M = 5
N = 5

for n in range(N):

    print(f"################ Outer iteration {n} ################ ")

    # K active-learning steps; each one moves the 50 most ambiguous pool
    # samples into the training set and refits the model.
    for k in range(K):
        X_train, y_train, X_pool, y_pool, clf1, acc = train_one_iter_active_learning(X_train, y_train, X_pool, y_pool, X_test, y_test, clf1, ambigious_amount=50 , method='least_confidence')
        print (f"AL iteration {k}:   accuracy = {acc:0.4f}")
    print(f"train size: {y_train.shape}")
    print ("--------")

    # M candidate refits, each on a different random 95% of the training set.
    stats_history = []
    for m in range(M):
        ransac_random_state = random_state + m # distinct but repeatable subset per candidate

        r_X_train, r_X_outlier, r_y_train, r_y_outlier = train_test_split(X_train, y_train, test_size = 0.05, random_state = ransac_random_state)
        clf1 = MLPClassifier(verbose=0, hidden_layer_sizes=(100,100),random_state = random_state)
        clf1.fit(r_X_train, r_y_train)
        # Candidates are ranked on the validation split, not on the test split.
        acc = clf1.score(X_valid, y_valid)
        print (f"Ransac iteration {m}:   accuracy = {acc:0.14f}")

        stat = {"model":clf1, "X_train":r_X_train, "y_train":r_y_train, "accuracy":acc}
        stats_history.append(stat)

    # Keep the candidate with the best validation accuracy, together with the
    # 95% subset it was trained on. Iterating in reverse makes ties resolve to
    # the latest candidate, as a stable sort followed by [-1] would.
    best = max(reversed(stats_history), key=lambda s: s["accuracy"])
    clf1 = best["model"]
    X_train = best["X_train"]
    y_train = best["y_train"]
    print ("----------------")

    acc = clf1.score(X_test, y_test)
    print(f"train size: {y_train.shape}")
    print (f"Final accuracy = {acc:0.4f}")
    print ("----------------")




---------
total: 14980
train: 1438 -> 0.10x 
pool: 7550 -> 0.50x
valid: 2996 -> 0.20x 
test: 2996 -> 0.20x
---------
iteration -1:   accuracy = 0.6766
--
################ Outer iteration 0 ################ 
AL iteration 0:   accuracy = 0.6662
AL iteration 1:   accuracy = 0.6796
train size: (1538,)
--------
Ransac iteration 0:   accuracy = 0.64886515353805
Ransac iteration 1:   accuracy = 0.65620827770360
Ransac iteration 2:   accuracy = 0.67156208277704
Ransac iteration 3:   accuracy = 0.66355140186916
Ransac iteration 4:   accuracy = 0.66889185580774
----------------
train size: (1461,)
Final accuracy = 0.6706
----------------
################ Outer iteration 1 ################ 
AL iteration 0:   accuracy = 0.6656
AL iteration 1:   accuracy = 0.6706
train size: (1561,)
--------
Ransac iteration 0:   accuracy = 0.67022696929239
Ransac iteration 1:   accuracy = 0.65921228304406
Ransac iteration 2:   accuracy = 0.65720961281709
Ransac iteration 3:   accuracy = 0.66154873164219
Ransac ite