In [None]:
import sys
# Add the parent directory to the module search path.
# NOTE(review): presumably so that the `gerumo` package imported below resolves
# from a repository checkout; the path is relative to the kernel's working directory.
sys.path.append('..')

In [None]:
import time
from gerumo.data.dataset import describe_dataset
from gerumo.data.generators import build_generator
from gerumo.utils.engine import (
    setup_cfg, setup_environment, setup_experiment, setup_model,
    build_dataset, build_callbacks, build_metrics, build_optimizer, build_loss
)
from gerumo.models.base import build_model
from gerumo.visualization.metrics import training_history


class dotdict(dict):
    """Dictionary whose keys can also be read, written and deleted as attributes.

    Reading a missing attribute returns ``None`` (same as ``dict.get``) instead of
    raising; deleting a missing attribute raises ``KeyError``.
    """

    def __getattr__(self, name):
        # Only called when normal attribute lookup fails; fall back to the key.
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        del self[name]


# Stand-in for the command-line namespace that `setup_cfg` receives.
args = dotdict()

## Select configuration

In [None]:
# Experiment configuration handed to `setup_cfg`.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
# Alternative config:
#   '/home/ir-riqu1/gerumo2/config/cnn_classification.yml'
args.update(
    config_file='/home/ir-riqu1/gerumo2/config/rf_classification_debug.yml',
    opts=[],  # no command-line style overrides
)

## Setup

In [None]:
# Build the experiment configuration from the selected config file and options.
cfg = setup_cfg(args)
# `output_dir` is used below with the `/` operator to place the dataset descriptions.
# NOTE(review): presumably a pathlib.Path created for this experiment; confirm in gerumo.
output_dir = setup_experiment(cfg)
# `logger` is passed to `describe_dataset` and used for the timing messages.
logger = setup_environment(cfg)

## Load Datasets

In [None]:
# Training split: build it, then log a description and save it next to the
# other experiment outputs.
train_dataset = build_dataset(cfg, 'train')
train_description_path = output_dir / "train_description.txt"
describe_dataset(train_dataset, logger, save_to=train_description_path)

# Validation split: same treatment.
validation_dataset = build_dataset(cfg, 'validation')
validation_description_path = output_dir / "validation_description.txt"
describe_dataset(validation_dataset, logger, save_to=validation_description_path)

## Build generators

In [None]:
# One batch generator per split, built in the order train -> validation.
train_generator, validation_generator = (
    build_generator(cfg, dataset)
    for dataset in (train_dataset, validation_dataset)
)

In [None]:
# Length of each generator (train first, then validation), one per line.
print(len(train_generator), len(validation_generator), sep='\n')

In [None]:
# Hyperparameters for the random forest built in the "Build model" section.
# Names follow sklearn.ensemble.RandomForestClassifier's keyword arguments.

# --- Ensemble ---
n_estimators = 100           # number of trees (int)
bootstrap = False            # draw bootstrap samples for each tree (bool)
oob_score = False            # compute the out-of-bag score (bool)
max_samples = None           # samples drawn per tree: None, int or float
warm_start = False           # reuse trees from the previous fit (bool)

# --- Split quality and tree size ---
criterion = 'gini'           # 'gini' or 'entropy'
max_depth = None             # None or int
max_leaf_nodes = None        # None or int
max_features = 3             # 'sqrt', 'log2', int or float
min_samples_split = 2        # int or float
min_samples_leaf = 1         # int or float
min_weight_fraction_leaf = 0.0
min_impurity_decrease = 0.0
ccp_alpha = 0.0              # non-negative float

# --- Class handling ---
class_weight = None          # 'balanced', 'balanced_subsample', None, dict or list of dicts

# --- Runtime ---
n_jobs = -1                  # None or int
random_state = 42            # None, int or RandomState
verbose = 0                  # int

In [None]:
# Overrides applied on top of the defaults from the previous cell for the
# incremental (warm-start) training run below.
n_estimators = 100
criterion = 'gini'
min_samples_split = 2
min_samples_leaf = 1
min_weight_fraction_leaf = 0.0
max_features = 3
min_impurity_decrease = 0.0
random_state = 42
verbose = 0
n_jobs = -1

# Bootstrapping is required for the out-of-bag score; warm_start lets each call
# to `fit` add trees to the existing forest instead of rebuilding it.
bootstrap = True
oob_score = True
warm_start = True

# This cell used to assign `max_samples = 0.7` and then overwrite it with None a
# few lines later. Only the effective value is kept: None means each tree draws
# as many samples as there are rows in the data passed to `fit`.
max_samples = None  # None, int or float

## Build model

In [None]:
# Keras model construction, not used for the scikit-learn forest:
#input_shape = train_generator.get_input_shape()
#model = build_model(cfg, input_shape)
from sklearn.ensemble import RandomForestClassifier


# Build the forest from the hyperparameters defined in the cells above.
# `max_leaf_nodes` and `class_weight` are forwarded as well; they were defined
# with the other hyperparameters but previously never reached the estimator,
# so editing them had no effect.
rf = RandomForestClassifier(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    min_weight_fraction_leaf=min_weight_fraction_leaf,
    max_features=max_features,
    max_leaf_nodes=max_leaf_nodes,
    min_impurity_decrease=min_impurity_decrease,
    bootstrap=bootstrap,
    oob_score=oob_score,
    n_jobs=n_jobs,
    random_state=random_state,
    verbose=verbose,
    warm_start=warm_start,
    class_weight=class_weight,
    ccp_alpha=ccp_alpha,
    max_samples=max_samples,
)
    

## Build training tools

In [None]:
# Disabled: these builders are imported at the top of the notebook but none of
# their results are used by the random-forest training below.
# NOTE(review): presumably only needed for the Keras models; confirm before deleting.
#callbacks = build_callbacks(cfg)
#metrics = build_metrics(cfg)
#optimizer = build_optimizer(cfg)
#loss = build_loss(cfg)

## Compile model

In [None]:
# Disabled: no `model` object is built in this notebook (the forest is `rf`),
# so there is nothing to compile.
#model = setup_model(
#    model, train_generator, optimizer, loss, metrics
#)

## Start training

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

train_generator.fit_mode()
validation_generator.fit_mode()
score = []  # out-of-bag score recorded after every incremental fit
#from sklearn.preprocessing import StandardScaler
#sc = StandardScaler()

# Fit the label encoder ONCE on every training label. Refitting it on each batch
# (as before) makes the one-hot column layout depend on which classes happen to
# be present in that batch, so the same column could mean different classes (or
# the number of columns could change) from one `rf.fit` call to the next.
# This costs one extra pass over the training generator, done before the timer
# starts so that `training_time` still measures the fitting loop only.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(np.concatenate([batch_outputs for _, batch_outputs in train_generator]))

start_time = time.time()
pbar = tqdm(train_generator, total=len(train_generator))
for batch_inputs, batch_outputs in pbar:
    labels = enc.transform(batch_outputs).toarray()
    #batch_inputs_norm=sc.fit_transform(batch_inputs)
    # With warm_start=True each call only trains the trees that are missing:
    # the first batch trains the initial `n_estimators` trees, and every later
    # batch adds the single tree requested by the increment below.
    rf.fit(batch_inputs, labels)
    rf.n_estimators += 1
    score.append(rf.oob_score_)

training_time = (time.time() - start_time) / 60.0  # minutes

In [None]:
# `training_time` is expressed in minutes (elapsed seconds / 60, see the training cell).
logger.info(f"Training time: {training_time:.3f} [min]")

# Testing

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

In [None]:
# Predict on the training data to obtain a training accuracy for comparison.
pbar = tqdm(train_generator, total=len(train_generator))
predictions_tr = []
true_tr = []

for batch_inputs, batch_outputs in pbar:
    # Encode with the encoder as fitted during training. `transform` (not
    # `fit_transform`) keeps the column layout identical to the one the forest
    # was trained on, even when a batch is missing a class.
    labels_tr = enc.transform(batch_outputs).toarray()
    #batch_inputs_norm=sc.fit_transform(batch_inputs)
    predictions_tr.extend(rf.predict(batch_inputs))
    true_tr.extend(labels_tr)

In [None]:
# Predict on the validation data and keep both raw and one-hot ground truth.
pbar = tqdm(validation_generator, total=len(validation_generator))
predictions = []
true = []
true_batches = []  # raw (un-encoded) labels, row by row
start_time = time.time()
for batch_inputs, batch_outputs in pbar:
    true_batches.extend(batch_outputs)
    # Never refit the encoder on validation labels: reuse the fitted encoder so
    # that the columns line up with the forest's outputs. Categories it has not
    # seen are encoded as all-zero rows (handle_unknown='ignore').
    labels = enc.transform(batch_outputs).toarray()
    #batch_inputs_norm=sc.fit_transform(batch_inputs)
    pre = rf.predict(batch_inputs)
    predictions.extend(pre)
    #print(accuracy_score(labels,np.array(pre)))
    true.extend(labels)

# NOTE(review): this is the evaluation time in minutes, but it is stored in
# `training_time` (overwriting the training duration) because the next cell
# logs that name as "Testing time". Rename both together if the training
# duration is needed later.
training_time = (time.time() - start_time)/60.0


In [None]:
# NOTE: `training_time` was reassigned by the validation loop in the previous
# cell, so the value logged here is the duration of that loop, in minutes.
logger.info(f"Testing time: {training_time:.3f} [min]")

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Stack the per-sample rows collected during evaluation into arrays.
y_val_true, y_val_pred = np.array(true), np.array(predictions)
y_train_true, y_train_pred = np.array(true_tr), np.array(predictions_tr)

acc = accuracy_score(y_val_true, y_val_pred)
acc_tr = accuracy_score(y_train_true, y_train_pred)

# NOTE(review): 58 is a hard-coded character offset into the events path,
# presumably the length of a directory prefix being stripped for display;
# it will cut at the wrong place if the dataset location changes.
events_offset = 58
telescope = cfg.MODEL.TELESCOPES[0]

print('trained on: {}'.format(cfg.DATASETS.TRAIN.EVENTS[events_offset:]))
print('tested on: {}'.format(cfg.DATASETS.VALIDATION.EVENTS[events_offset:]))
print('{0} Testing Accuracy : {1:.4f}'.format(telescope, acc))
print('{0} Training Accuracy : {1:.4f}'.format(telescope, acc_tr))

In [None]:
# Column-wise sum of the validation predictions (one total per output column).
predicted_column_totals = np.array(predictions).sum(axis=0)
predicted_column_totals

In [None]:
# Largest column total of the one-hot validation labels divided by the overall
# total: the accuracy a constant "most frequent column" predictor would reach.
true_column_totals = np.array(true).sum(axis=0)
true_column_totals.max() / true_column_totals.sum()

# Random Forest, 2nd try: a single fit on the full training set

In [None]:
# Hyperparameters for the second forest (`rf2`), which is fitted in one call on
# the whole training set. Names follow RandomForestClassifier's keyword arguments.

# --- Ensemble ---
n_estimators = 100           # number of trees (int)
bootstrap = True             # draw bootstrap samples for each tree (bool)
oob_score = True             # compute the out-of-bag score (bool)
max_samples = None           # samples drawn per tree: None, int or float
warm_start = True            # reuse trees from the previous fit (bool)

# --- Split quality and tree size ---
criterion = 'gini'           # 'gini' or 'entropy'
max_depth = None             # None or int
max_leaf_nodes = None        # None or int
max_features = 3             # 'sqrt', 'log2', int or float
min_samples_split = 2        # int or float
min_samples_leaf = 1         # int or float
min_weight_fraction_leaf = 0.0
min_impurity_decrease = 0.0
ccp_alpha = 0.0              # non-negative float

# --- Class handling ---
class_weight = None          # 'balanced', 'balanced_subsample', None, dict or list of dicts

# --- Runtime ---
n_jobs = -1                  # None or int
random_state = 42            # None, int or RandomState
verbose = 1                  # int

In [None]:
# Second forest, built from the hyperparameters of the previous cell.
# `max_leaf_nodes` and `class_weight` are forwarded as well; they were defined
# with the other hyperparameters but previously never reached the estimator.
rf2 = RandomForestClassifier(
    n_estimators=n_estimators,
    criterion=criterion,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    min_weight_fraction_leaf=min_weight_fraction_leaf,
    max_features=max_features,
    max_leaf_nodes=max_leaf_nodes,
    min_impurity_decrease=min_impurity_decrease,
    bootstrap=bootstrap,
    oob_score=oob_score,
    n_jobs=n_jobs,
    random_state=random_state,
    verbose=verbose,
    warm_start=warm_start,
    class_weight=class_weight,
    ccp_alpha=ccp_alpha,
    max_samples=max_samples,
)

In [None]:
# Fresh generators for the second attempt (train first, then validation).
train_generator, validation_generator = [
    build_generator(cfg, split)
    for split in (train_dataset, validation_dataset)
]

In [None]:
# Switch both generators to fit mode, training generator first.
for generator in (train_generator, validation_generator):
    generator.fit_mode()

In [None]:
import numpy as np

# Materialise the whole training set in memory: one row per sample.
features = []
label_batches = []
for batch_inputs, batch_outputs in train_generator:
    features.extend(batch_inputs)
    label_batches.append(batch_outputs)

# Fit the encoder once on ALL training labels. Refitting it per batch (as
# before) lets the one-hot column layout vary with the classes present in each
# batch, which silently mixes up the label columns.
output = list(enc.fit_transform(np.concatenate(label_batches)).toarray())
    

In [None]:
# Materialise the validation set in memory: one row per sample.
test_features = []
test_output = []
for batch_inputs, batch_outputs in validation_generator:
    test_features.extend(batch_inputs)
    # Reuse the encoder fitted on the training labels. Refitting it on
    # validation batches could reorder or drop columns relative to what the
    # forest was trained on.
    test_labels = enc.transform(batch_outputs).toarray()
    test_output.extend(test_labels)

In [None]:
# Single fit on the complete training set collected above.
X_train = np.array(features)
y_train = np.array(output)
rf2.fit(X_train, y_train)

In [None]:
# Predictions of the second forest on the collected validation features.
X_test = np.array(test_features)
pred = rf2.predict(X_test)

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Accuracy of rf2's predictions against the encoded validation labels.
acc = accuracy_score(y_true=np.array(test_output), y_pred=np.array(pred))

In [None]:
# Accuracy of rf2's predictions on the validation data (computed in the previous cell).
print(acc)

In [None]:
# Out-of-bag score of rf2; requested through `oob_score=True` in its hyperparameters.
rf2.oob_score_

In [None]:
# Largest column total of the one-hot validation labels divided by the overall
# total: the accuracy a constant "most frequent column" predictor would reach.
test_column_totals = np.array(test_output).sum(axis=0)
test_column_totals.max() / test_column_totals.sum()

In [None]:
from tensorflow.keras.metrics import CategoricalAccuracy

# Cross-check the accuracy with Keras' CategoricalAccuracy.
# `update_state` takes (y_true, y_pred); the arguments were previously passed in
# the opposite order. They are now passed by keyword so the roles are explicit.
# NOTE(review): this metric compares the argmax of each row, so a prediction row
# with no positive column would count as column 0; verify that is acceptable.
m = CategoricalAccuracy()
m.update_state(y_true=np.array(test_output), y_pred=np.array(pred))
m.result().numpy()