# Load libraries and data

In [7]:
import numpy as np
import utils as u
import pickle as pkl

from IPython.display import clear_output

In [8]:
# Load the train/test split through the project helper and keep only the
# leading `n_features` columns of each feature matrix.
file_name = 'sec3_1_data_output_teff.pkl'
n_features = 9

X_train, X_test, y_train, y_test = u.load_pkl_data(file_name)

# Column truncation is applied to the features only; the targets are untouched.
X_train = X_train[:, :n_features]
X_test = X_test[:, :n_features]

# Sanity check: show (features, targets) shapes for the train and test parts.
for X_part, y_part in ((X_train, y_train), (X_test, y_test)):
    print(X_part.shape, y_part.shape)

(53176, 9) (53176,)
(5909, 9) (5909,)


# Compare `margin`, `random` and `stratified` sampling sensitivity for `n` instances

## Random (10% of data)

In [9]:
# Baseline: repeat the experiment `n_runs` times, each time taking `n_rand`
# training instances (10% of the training set) from the project sampler and
# scoring a baseline model on the held-out test set.
n_runs = 20
n_rand = int(0.1 * X_train.shape[0])
scores_rand = []

for run in range(n_runs):
    # The helper returns four values; only the sampled part is used here.
    X_rand, y_rand, _, _ = u.get_initial_sample_pool(X_train, y_train, n_rand)
    baseline = u.get_baseline_model(X_rand, y_rand, X_test, y_test)
    # NOTE(review): `.sen.at[1]` is presumably the sensitivity for label 1 -
    # confirm against `utils.get_baseline_model`.
    scores_rand.append(baseline.sen.at[1])

# Summarise the per-run scores as mean and (population) standard deviation.
scores_rand = np.asarray(scores_rand)
rand_mean = scores_rand.mean()
rand_std = scores_rand.std()

Done!


## Stratified (10% of data)

In [10]:
from sklearn.model_selection import train_test_split

# Stratified baseline: repeat `n_runs` times, each time drawing a
# class-stratified subset of `n_strat` training instances (10% of the
# training set) and scoring a baseline model on the held-out test set.
n_runs = 20
n_strat = int(X_train.shape[0]*0.1)
scores_strat = []

for run in range(n_runs):
    # Pass the sample size as an integer count. Converting it to a fraction
    # (n_strat / N) and letting sklearn scale it back goes through floating
    # point and a round-up, which can yield n_strat + 1 rows; the integer form
    # gives exactly n_strat rows, matching the size used for random sampling.
    # The "test" half of the split is the sample we keep; the rest is discarded.
    _, X_strat, _, y_strat = train_test_split(
        X_train, y_train, test_size=n_strat, stratify=y_train
    )
    baseline = u.get_baseline_model(X_strat, y_strat, X_test, y_test)
    # NOTE(review): `.sen.at[1]` is presumably the sensitivity for label 1 -
    # confirm against `utils.get_baseline_model`.
    scores_strat.append(baseline.sen.at[1])

# Summarise the per-run scores as mean and (population) standard deviation.
scores_strat = np.array(scores_strat)
strat_mean = np.mean(scores_strat)
strat_std = np.std(scores_strat)

Done!


## Margin (5% of data)

In [11]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [12]:
# Active learning with margin sampling: start from `n_initial` instances and
# query until 5% of the training set has been used, repeated `n_runs` times.
n_initial = 10
n_runs = 5
n_instances = int(X_train.shape[0]*0.05 - n_initial)

scores_al = []

for run in range(n_runs):
    # Fresh initial batch / pool split for every run.
    X_initial, y_initial, X_pool, y_pool = u.get_initial_sample_pool(X_train, y_train, n_initial)

    # Replace the previous run's progress output with the current status.
    clear_output()
    print('Initial batch size:', n_initial)
    print('Run', run+1, 'of', n_runs)

    # NOTE(review): the meaning of the positional `7` and of result element
    # [2] is defined in `utils.test_sampling_method`; [2] is presumably the
    # sensitivity recorded after each query - confirm.
    al_result = u.test_sampling_method(
        X_initial, y_initial, X_pool, y_pool,
        'rf', 'margin', X_test, y_test, 7, n_instances,
    )
    scores_al.append(al_result[2])

# Aggregate across runs (axis 0), keeping one mean/std per remaining position.
scores_al = np.array(scores_al)
sen_al_mean = scores_al.mean(axis=0)
sen_al_std = scores_al.std(axis=0)

print('\nDone!')

Initial batch size: 10
Run 5 of 5
Query 2648 of 2648
Done!


# Save data to results file

In [13]:
# open results file
# The file is opened for reading, so 'results.pkl' must already exist.
# NOTE: unpickling can execute arbitrary code - only load a results file that
# was produced by a trusted source.
file_name = 'results.pkl'

with open(file_name, mode='rb') as results_file:
    data = pkl.load(results_file)

In [14]:
# add new results to file data
# Each entry is stored as a (mean, std) pair under its own key; existing
# entries with the same key are overwritten.
new_entries = (
    ('rand_10_percent', (rand_mean, rand_std)),
    ('strat_10_percent', (strat_mean, strat_std)),
    ('al_marg_5_percent', (sen_al_mean, sen_al_std)),
)

for key, stats in new_entries:
    data[key] = stats

In [15]:
# save updated data to results file
# Serialise in memory first: opening the file with 'wb' truncates it
# immediately, so a pickling error raised part-way through a direct dump
# would wipe the results accumulated so far. With the bytes already built,
# the file is only touched once serialisation has succeeded.
payload = pkl.dumps(data, protocol=pkl.HIGHEST_PROTOCOL)

with open(file_name, 'wb') as f:
    f.write(payload)

print("Results saved in file '" + file_name + "'.")

Results saved in file 'results.pkl'.
