# Load libraries

In [44]:
import utils as u
import pickle as pkl

# $T_{\text{eff}}$

## Load data

In [25]:
# Load the pre-split T_eff dataset from the pickle file and keep only the
# first `n_features` columns of each feature matrix.
file_name = 'sec3_1_data_output_teff.pkl'
n_features = 10

X_train, X_test, y_train, y_test = u.load_pkl_data(file_name)
X_train = X_train[:, :n_features]
X_test = X_test[:, :n_features]

# Sanity check: feature-matrix and label shapes for the train and test splits.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(53176, 10) (53176,)
(5909, 10) (5909,)


## Establish baseline model and performance

In [27]:
# Build the baseline results table for T_eff from the train/test split.
# The returned table is displayed in the next cell (columns: model, auc, mcc,
# sen, spec). NOTE(review): how the models are fitted and scored is defined in
# utils.get_baseline_model, not in this notebook.
baselines_df = u.get_baseline_model(X_train, y_train, X_test, y_test)

Done!


In [28]:
# Display the T_eff baseline table (one row per model: knn, rf, gb, vote).
baselines_df

Unnamed: 0,model,auc,mcc,sen,spec
0,knn,0.968268,0.858405,0.778188,0.979325
1,rf,0.992198,0.896279,0.858892,0.984937
2,gb,0.955522,0.876054,0.834062,0.982393
3,vote,0.990975,0.891341,0.842314,0.984197


In [29]:
# Container for everything that is pickled at the end of the notebook;
# start it off with the T_eff baseline table.
results_data = {'teff_baselines': baselines_df}

**The Random Forest classifier (`rf`) is taken as the baseline model: it has the highest AUC and MCC in the table above.**

## Performance vs sampling method

### Non-committee strategies

In [1]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [30]:
# Call u.run_methods once per entry of `n_initials` with the 'rf' setting and
# store each result in `results_data` under 'teff_non_comm_<n_initial>'.
# NOTE(review): the meaning of the positional arguments 7, 50 and 20 is defined
# in utils.run_methods and is not visible in this notebook -- confirm there.
n_initials = [20, 100, 500]
strategies = ['random', 'entropy', 'margin', 'uncertainty']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        7, n_initial, 50, 20,
        'rf', strategies,
    )
    results_data['teff_non_comm_' + str(n_initial)] = scores

Calculating scores..
Done!


**Uncertainty sampling is taken as the baseline strategy for the non-committee models.**

### Committee strategies

In [2]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [31]:
# Call u.run_methods once per entry of `n_initials` with the 'committee'
# setting and store each result in `results_data` under 'teff_comm_<n_initial>'.
# NOTE(review): the meaning of the positional arguments 7, 50 and 20 is defined
# in utils.run_methods and is not visible in this notebook -- confirm there.
n_initials = [20, 100, 500]
strategies = ['random', 'consensus', 'disagreement', 'vote']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        7, n_initial, 50, 20,
        'committee', strategies,
    )
    results_data['teff_comm_' + str(n_initial)] = scores

Calculating scores..
Done!


# $\log{g}$

## Load data

In [32]:
# Load the pre-split log g dataset from the pickle file and keep only the
# first `n_features` columns of each feature matrix.
file_name = 'sec3_1_data_output_logg.pkl'
n_features = 10

X_train, X_test, y_train, y_test = u.load_pkl_data(file_name)
X_train = X_train[:, :n_features]
X_test = X_test[:, :n_features]

# Sanity check: feature-matrix and label shapes for the train and test splits.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(53176, 10) (53176,)
(5909, 10) (5909,)


## Establish baseline model and performance

In [33]:
# Build the baseline results table for log g from the train/test split.
# The returned table is displayed in the next cell (columns: model, auc, mcc,
# sen, spec). NOTE(review): how the models are fitted and scored is defined in
# utils.get_baseline_model, not in this notebook.
baselines_df = u.get_baseline_model(X_train, y_train, X_test, y_test)

Done!


In [34]:
# Display the log g baseline table (one row per model: knn, rf, gb, vote).
baselines_df

Unnamed: 0,model,auc,mcc,sen,spec
0,knn,0.86258,0.563473,0.581039,0.924503
1,rf,0.945514,0.670216,0.684342,0.939053
2,gb,0.926436,0.617199,0.629351,0.92988
3,vote,0.941655,0.652127,0.661236,0.935497


In [35]:
# Keep the log g baseline table alongside the other results to be pickled.
results_data.update({'logg_baselines': baselines_df})

## Performance vs sampling method

### Non-committee strategies

In [3]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [36]:
# Call u.run_methods once per entry of `n_initials` with the 'rf' setting and
# store each result in `results_data` under 'logg_non_comm_<n_initial>'.
# NOTE(review): the meaning of the positional arguments 6, 50 and 20 is defined
# in utils.run_methods and is not visible in this notebook -- confirm there.
n_initials = [20, 100, 500]
strategies = ['random', 'entropy', 'margin', 'uncertainty']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        6, n_initial, 50, 20,
        'rf', strategies,
    )
    results_data['logg_non_comm_' + str(n_initial)] = scores

Calculating scores..
Done!


### Committee strategies

In [4]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [37]:
# Call u.run_methods once per entry of `n_initials` with the 'committee'
# setting and store each result in `results_data` under 'logg_comm_<n_initial>'.
# NOTE(review): the meaning of the positional arguments 6, 50 and 20 is defined
# in utils.run_methods and is not visible in this notebook -- confirm there.
n_initials = [20, 100, 500]
strategies = ['random', 'consensus', 'disagreement', 'vote']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        6, n_initial, 50, 20,
        'committee', strategies,
    )
    results_data['logg_comm_' + str(n_initial)] = scores

Calculating scores..
Done!


# $[\text{Fe}/\text{H}]$

## Load data

In [38]:
# Load the pre-split Fe/H dataset from the pickle file and keep only the
# first `n_features` columns of each feature matrix.
file_name = 'sec3_1_data_output_feh.pkl'
n_features = 10

X_train, X_test, y_train, y_test = u.load_pkl_data(file_name)
X_train = X_train[:, :n_features]
X_test = X_test[:, :n_features]

# Sanity check: feature-matrix and label shapes for the train and test splits.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(53176, 10) (53176,)
(5909, 10) (5909,)


## Establish baseline model and performance

In [39]:
# Build the baseline results table for Fe/H from the train/test split.
# The returned table is displayed in the next cell (columns: model, auc, mcc,
# sen, spec). NOTE(review): how the models are fitted and scored is defined in
# utils.get_baseline_model, not in this notebook.
baselines_df = u.get_baseline_model(X_train, y_train, X_test, y_test)

Done!


In [40]:
# Display the Fe/H baseline table (one row per model: knn, rf, gb, vote).
baselines_df

Unnamed: 0,model,auc,mcc,sen,spec
0,knn,0.921536,0.744883,0.682772,0.933569
1,rf,0.97394,0.842836,0.766464,0.95582
2,gb,0.969269,0.813625,0.728126,0.948481
3,vote,0.975509,0.830038,0.747954,0.951636


In [41]:
# Keep the Fe/H baseline table alongside the other results to be pickled.
results_data.update({'feh_baselines': baselines_df})

## Performance vs sampling method

### Non-committee strategies

In [5]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [42]:
# Call u.run_methods once per entry of `n_initials` with the 'rf' setting and
# store each result in `results_data` under 'feh_non_comm_<n_initial>'.
# NOTE(review): the meaning of the positional arguments 4, 50 and 20 is defined
# in utils.run_methods and is not visible in this notebook -- confirm there.
n_initials = [20, 100, 500]
strategies = ['random', 'entropy', 'margin', 'uncertainty']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        4, n_initial, 50, 20,
        'rf', strategies,
    )
    results_data['feh_non_comm_' + str(n_initial)] = scores

Calculating scores..
Done!


### Committee strategies

In [6]:
"""
WARNING: the following cell might take hours to finish running!
"""



In [43]:
# Call u.run_methods once per entry of `n_initials` with the 'committee'
# setting and store each result in `results_data` under 'feh_comm_<n_initial>'.
# NOTE(review): the meaning of the positional arguments 4, 50 and 20 is defined
# in utils.run_methods and is not visible in this notebook -- confirm there.
n_initials = [20, 100, 500]
strategies = ['random', 'consensus', 'disagreement', 'vote']
for n_initial in n_initials:
    scores = u.run_methods(
        X_train, y_train, X_test, y_test,
        4, n_initial, 50, 20,
        'committee', strategies,
    )
    results_data['feh_comm_' + str(n_initial)] = scores

Calculating scores..
Done!


# Save results

In [45]:
# Serialise the whole `results_data` dictionary to a single pickle file,
# using the highest pickle protocol available in this Python version.
file_name = 'results.pkl'
with open(file_name, mode='wb') as f:
    pkl.dump(results_data, f, protocol=pkl.HIGHEST_PROTOCOL)

# Confirm where the results were written.
print("Results saved in file '" + file_name + "'.")

Results saved in file 'results.pkl'.
