# Load libraries

In [26]:
import utils as u
import pickle as pkl

# $T_{\text{eff}}$

## Load data

In [27]:
# Load the pickled train/test split for the T_eff section and keep only the
# first `n_features` feature columns.
file_name = 'sec3_1_data_output_teff.pkl'
n_features = 9

X_train, X_test, y_train, y_test = u.load_pkl_data(file_name)
X_train, X_test = X_train[:, :n_features], X_test[:, :n_features]

# Show the shapes first so they are visible even if a check below fails.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Fail fast: a column slice silently yields fewer columns when the stored
# arrays are narrower than `n_features`, and the experiment cells further
# down are flagged as potentially taking hours.
assert X_train.shape[1] == n_features and X_test.shape[1] == n_features, \
    "feature matrices have fewer than n_features columns"
assert X_train.shape[0] == y_train.shape[0], "X_train / y_train row counts differ"
assert X_test.shape[0] == y_test.shape[0], "X_test / y_test row counts differ"

# Collects every result table produced in this notebook.
# NOTE: re-running this cell resets it to an empty dict.
results_data = {}

(53176, 9) (53176,)
(5909, 9) (5909,)


## Establish baseline model and performance

In [28]:
# Build the baseline comparison table through the project helper.
# NOTE(review): presumably this fits and scores each candidate model on the
# train/test split -- confirm in utils.get_baseline_model.
baselines_df = u.get_baseline_model(X_train, y_train, X_test, y_test)

Done!


In [29]:
# Display the baseline table (one row per candidate model: knn, rf, gb, vote).
baselines_df

Unnamed: 0,model,auc,mcc,sen,spec
0,knn,0.94653,0.87618,0.786735,0.982464
1,rf,0.989042,0.906607,0.858847,0.987032
2,gb,0.954921,0.836016,0.765556,0.97675
3,vote,0.989141,0.891409,0.824532,0.984853


In [30]:
# Keep the T_eff baseline table in the shared results dict so it is pickled
# together with the other results at the end of the notebook.
results_data['teff_baselines'] = baselines_df

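**The Random Forest classifier is taken as the baseline model: it has the highest MCC, sensitivity and specificity of the four candidates above, and its AUC is on par with the voting ensemble.**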

## Performance vs sampling method

### Non-committee strategies

In [31]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [32]:
# Run every non-committee query strategy once per `n_initial` value and file
# each result under a 'teff_non_comm_<n_initial>' key.
# NOTE(review): the literals 7, 50 and 20 are passed positionally; their
# meaning is defined by utils.run_methods and is not visible here -- confirm.
n_initials = [20, 100, 500]
strategies = ['random', 'entropy', 'margin', 'uncertainty']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        7, n_initial, 50, 20, 'rf', strategies,
    )
    results_data[f'teff_non_comm_{n_initial}'] = scores

Calculating scores..
Done!


**Uncertainty sampling is taken as the baseline strategy for the non-committee models.**

### Committee strategies

In [33]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [34]:
# Run every committee query strategy once per `n_initial` value and file each
# result under a 'teff_comm_<n_initial>' key.
# NOTE(review): the literals 7, 50 and 20 are passed positionally; their
# meaning is defined by utils.run_methods and is not visible here -- confirm.
n_initials = [20, 100, 500]
strategies = ['random', 'consensus', 'disagreement', 'vote']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        7, n_initial, 50, 20, 'committee', strategies,
    )
    results_data[f'teff_comm_{n_initial}'] = scores

Calculating scores..
Done!


# $\log{g}$

## Load data

In [35]:
# Load the pickled train/test split for the log g section and keep only the
# first `n_features` feature columns.
file_name = 'sec3_1_data_output_logg.pkl'
n_features = 9

X_train, X_test, y_train, y_test = u.load_pkl_data(file_name)
X_train, X_test = X_train[:, :n_features], X_test[:, :n_features]

# Show the shapes first so they are visible even if a check below fails.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Fail fast: a column slice silently yields fewer columns when the stored
# arrays are narrower than `n_features`, and the experiment cells further
# down are flagged as potentially taking hours.
assert X_train.shape[1] == n_features and X_test.shape[1] == n_features, \
    "feature matrices have fewer than n_features columns"
assert X_train.shape[0] == y_train.shape[0], "X_train / y_train row counts differ"
assert X_test.shape[0] == y_test.shape[0], "X_test / y_test row counts differ"

(53176, 9) (53176,)
(5909, 9) (5909,)


## Establish baseline model and performance

In [36]:
# Build the baseline comparison table through the project helper.
# NOTE(review): presumably this fits and scores each candidate model on the
# train/test split -- confirm in utils.get_baseline_model.
baselines_df = u.get_baseline_model(X_train, y_train, X_test, y_test)

Done!


In [37]:
# Display the baseline table (one row per candidate model: knn, rf, gb, vote).
baselines_df

Unnamed: 0,model,auc,mcc,sen,spec
0,knn,0.88413,0.591118,0.629072,0.928726
1,rf,0.948981,0.683279,0.70669,0.941926
2,gb,0.928445,0.626683,0.646757,0.931524
3,vote,0.947739,0.672668,0.683193,0.939143


In [38]:
# Keep the log g baseline table in the shared results dict so it is pickled
# together with the other results at the end of the notebook.
results_data['logg_baselines'] = baselines_df

## Performance vs sampling method

### Non-committee strategies

In [39]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [40]:
# Run every non-committee query strategy once per `n_initial` value and file
# each result under a 'logg_non_comm_<n_initial>' key.
# NOTE(review): the literals 6, 50 and 20 are passed positionally; their
# meaning is defined by utils.run_methods and is not visible here -- confirm.
n_initials = [20, 100, 500]
strategies = ['random', 'entropy', 'margin', 'uncertainty']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        6, n_initial, 50, 20, 'rf', strategies,
    )
    results_data[f'logg_non_comm_{n_initial}'] = scores

Calculating scores..
Done!


### Committee strategies

In [41]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [42]:
# Run every committee query strategy once per `n_initial` value and file each
# result under a 'logg_comm_<n_initial>' key.
# NOTE(review): the literals 6, 50 and 20 are passed positionally; their
# meaning is defined by utils.run_methods and is not visible here -- confirm.
n_initials = [20, 100, 500]
strategies = ['random', 'consensus', 'disagreement', 'vote']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        6, n_initial, 50, 20, 'committee', strategies,
    )
    results_data[f'logg_comm_{n_initial}'] = scores

Calculating scores..
Done!


# $[\text{Fe}/\text{H}]$

## Load data

In [43]:
# Load the pickled train/test split for the Fe/H section and keep only the
# first `n_features` feature columns.
file_name = 'sec3_1_data_output_feh.pkl'
n_features = 9

X_train, X_test, y_train, y_test = u.load_pkl_data(file_name)
X_train, X_test = X_train[:, :n_features], X_test[:, :n_features]

# Show the shapes first so they are visible even if a check below fails.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# Fail fast: a column slice silently yields fewer columns when the stored
# arrays are narrower than `n_features`, and the experiment cells further
# down are flagged as potentially taking hours.
assert X_train.shape[1] == n_features and X_test.shape[1] == n_features, \
    "feature matrices have fewer than n_features columns"
assert X_train.shape[0] == y_train.shape[0], "X_train / y_train row counts differ"
assert X_test.shape[0] == y_test.shape[0], "X_test / y_test row counts differ"

(53176, 9) (53176,)
(5909, 9) (5909,)


## Establish baseline model and performance

In [44]:
# Build the baseline comparison table through the project helper.
# NOTE(review): presumably this fits and scores each candidate model on the
# train/test split -- confirm in utils.get_baseline_model.
baselines_df = u.get_baseline_model(X_train, y_train, X_test, y_test)

Done!


In [45]:
# Display the baseline table (one row per candidate model: knn, rf, gb, vote).
baselines_df

Unnamed: 0,model,auc,mcc,sen,spec
0,knn,0.919634,0.771872,0.711308,0.938733
1,rf,0.978527,0.856402,0.786596,0.960595
2,gb,0.975037,0.835774,0.775932,0.956443
3,vote,0.980077,0.845228,0.781399,0.957581


In [46]:
# Keep the Fe/H baseline table in the shared results dict so it is pickled
# together with the other results at the end of the notebook.
results_data['feh_baselines'] = baselines_df

## Performance vs sampling method

### Non-committee strategies

In [47]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [48]:
# Run every non-committee query strategy once per `n_initial` value and file
# each result under a 'feh_non_comm_<n_initial>' key.
# NOTE(review): the literals 4, 50 and 20 are passed positionally; their
# meaning is defined by utils.run_methods and is not visible here -- confirm.
n_initials = [20, 100, 500]
strategies = ['random', 'entropy', 'margin', 'uncertainty']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        4, n_initial, 50, 20, 'rf', strategies,
    )
    results_data[f'feh_non_comm_{n_initial}'] = scores

Calculating scores..
Done!


### Committee strategies

In [49]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [50]:
# Run every committee query strategy once per `n_initial` value and file each
# result under a 'feh_comm_<n_initial>' key.
# NOTE(review): the literals 4, 50 and 20 are passed positionally; their
# meaning is defined by utils.run_methods and is not visible here -- confirm.
n_initials = [20, 100, 500]
strategies = ['random', 'consensus', 'disagreement', 'vote']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        4, n_initial, 50, 20, 'committee', strategies,
    )
    results_data[f'feh_comm_{n_initial}'] = scores

Calculating scores..
Done!


# Save results

In [51]:
# Write every collected result table to a single pickle file in the current
# working directory, then confirm on stdout.
file_name = 'results.pkl'
with open(file_name, 'wb') as results_file:
    pkl.dump(results_data, results_file, protocol=pkl.HIGHEST_PROTOCOL)

print(f"Results saved in file '{file_name}'.")

Results saved in file 'results.pkl'.
