In [1]:
import pandas as pd
import json
import numpy as np
import os
from sklearn.model_selection import train_test_split, KFold, cross_validate, cross_val_score
from sklearn import metrics
import time
import optuna
from optuna.samplers import RandomSampler
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import requests

rs = 42

In [2]:
# Load the per-object feature table from CSV. The next cell reads its 'label'
# column as the target and every column from the third onward as features.
all_features = pd.read_csv('akb_obj_features_snad4.csv')

In [3]:
# Split the table into the feature matrix and the target vector.
# Features are selected positionally: everything after the first two columns
# (presumably the object id and the label -- TODO confirm against the CSV header).
feature_columns = all_features.columns[2:]
data = all_features[feature_columns]
target = all_features['label']

In [4]:
# Evaluate an ExtraTrees classifier with shuffled K-fold cross-validation and
# report mean +- std of every test metric across the folds.
# Relies on `data` / `target` having been built from the feature table above.
print('Training model...')
t_start = time.monotonic()

score_types = ('accuracy', 'roc_auc', 'f1')
model = ExtraTreesClassifier(
    n_estimators=251,
    max_depth=39,
    max_features=1,
    random_state=rs,
)
result = cross_validate(
    model,
    data,
    target,
    scoring=score_types,
    cv=KFold(shuffle=True, random_state=rs),
    return_train_score=True,
    return_estimator=True,
)

print('Scores for ExtraTreesClassifier:')
for score in score_types:
    fold_scores = result[f'test_{score}']
    mean, std = np.mean(fold_scores), np.std(fold_scores)
    print(f'{score} = {mean:.2f} +- {std:.2f}')

# Elapsed wall-clock time, converted from seconds to minutes.
t = (time.monotonic() - t_start) / 60
print(f'ExtraTreesClassifier trained (with cross-validation) in {t:.0f} m')

Training model...
Scores for ExtraTreesClassifier:
accuracy = 0.85 +- 0.01
roc_auc = 0.93 +- 0.01
f1 = 0.85 +- 0.01
ExtraTreesClassifier trained (with cross-validation) in 0 m


In [5]:
# Inspect the raw cross_validate output: fit/score timings, the fitted per-fold
# estimators (return_estimator=True) and the per-fold train/test scores.
result

{'fit_time': array([0.42321706, 0.40294003, 0.40281749, 0.40370536, 0.40613842]),
 'score_time': array([0.1031208 , 0.09785342, 0.09729123, 0.09673977, 0.09825945]),
 'estimator': [ExtraTreesClassifier(max_depth=40, max_features=1, n_estimators=186,
                       random_state=42),
  ExtraTreesClassifier(max_depth=40, max_features=1, n_estimators=186,
                       random_state=42),
  ExtraTreesClassifier(max_depth=40, max_features=1, n_estimators=186,
                       random_state=42),
  ExtraTreesClassifier(max_depth=40, max_features=1, n_estimators=186,
                       random_state=42),
  ExtraTreesClassifier(max_depth=40, max_features=1, n_estimators=186,
                       random_state=42)],
 'test_accuracy': array([0.87915408, 0.84743202, 0.82628399, 0.84138973, 0.84417549]),
 'train_accuracy': array([1., 1., 1., 1., 1.]),
 'test_roc_auc': array([0.94884576, 0.92573842, 0.91010378, 0.91916624, 0.93257698]),
 'train_roc_auc': array([1., 1., 1., 1.

# Optuna hyperparameter search

In [4]:
# Cache of already-loaded feature tables, keyed by CSV path.
_dataset_cache = {}


def _load_dataset(path='akb_obj_features_snad4.csv'):
    """Return ``(data, target)`` from the feature CSV, reading the file only once.

    ``target`` is the 'label' column and ``data`` is every column from the
    third onward.
    """
    if path not in _dataset_cache:
        objs = pd.read_csv(path)
        _dataset_cache[path] = (objs[objs.columns[2:]], objs['label'])
    return _dataset_cache[path]


def objective(trial):
    """Optuna objective: mean cross-validated score of an ExtraTreesClassifier.

    Samples ``max_depth`` (10..120) and ``n_estimators`` (10..1000) from
    ``trial`` and scores the resulting classifier with shuffled K-fold
    cross-validation, using ``cross_val_score``'s default scoring.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold scores; this is the value the study optimizes.
    """
    # The table is identical for every trial, so it is loaded once and reused
    # instead of re-parsing the CSV on each call.
    data, target = _load_dataset()

    # Keep this sampling order: with a seeded sampler it determines which
    # values each trial receives.
    max_depth = trial.suggest_int("max_depth", 10, 120)
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)

    # min_samples_split / min_samples_leaf are not searched and stay at their
    # defaults.
    clf = ExtraTreesClassifier(
        random_state=rs,
        n_jobs=-1,
        max_depth=max_depth,
        n_estimators=n_estimators,
        max_features=1,
    )

    scores = cross_val_score(
        clf, data, target,
        n_jobs=-1,
        cv=KFold(shuffle=True, random_state=rs),
    )
    return scores.mean()

In [5]:
# Random search over the objective's hyperparameters. The sampler seed is
# fixed so the sequence of sampled trials is repeatable.
sampler = RandomSampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=1000)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-05-15 22:29:56,995] A new study created in memory with name: no-name-c0f49843-60e4-4985-a7cf-828eb94e9502
[I 2024-05-15 22:30:01,165] Trial 0 finished with value: 0.8464786028675768 and parameters: {'max_depth': 51, 'n_estimators': 952}. Best is trial 0 with value: 0.8464786028675768.
[I 2024-05-15 22:30:03,598] Trial 1 finished with value: 0.8449675717922582 and parameters: {'max_depth': 91, 'n_estimators': 603}. Best is trial 0 with value: 0.8464786028675768.
[I 2024-05-15 22:30:04,915] Trial 2 finished with value: 0.8425506533632554 and parameters: {'max_depth': 27, 'n_estimators': 164}. Best is trial 0 with value: 0.8464786028675768.
[I 2024-05-15 22:30:08,769] Trial 3 finished with value: 0.8177690124365261 and parameters: {'max_depth': 16, 'n_estimators': 868}. Best is trial 0 with value: 0.8464786028675768.
[I 2024-05-15 22:30:11,355] Trial 4 finished with value: 0.8443633421850075 and parameters: {'max_depth': 76, 'n_estimators': 711}. Best is trial 0 with value: 0.8464

Number of finished trials:  1000
Best trial:
  Value: 0.8510135243222985
  Params: 
    max_depth: 39
    n_estimators: 251


## RandomForest

In [2]:
# Cache of already-loaded feature tables, keyed by CSV path.
_dataset_cache = {}


def _load_dataset(path='akb_obj_features_snad4.csv'):
    """Return ``(data, target)`` from the feature CSV, reading the file only once.

    ``target`` is the 'label' column and ``data`` is every column from the
    third onward.
    """
    if path not in _dataset_cache:
        objs = pd.read_csv(path)
        _dataset_cache[path] = (objs[objs.columns[2:]], objs['label'])
    return _dataset_cache[path]


def objective(trial):
    """Optuna objective: mean cross-validated score of a RandomForestClassifier.

    Samples ``max_depth`` (10..120) and ``n_estimators`` (10..1000) from
    ``trial`` and scores the resulting classifier with shuffled K-fold
    cross-validation, using ``cross_val_score``'s default scoring.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean of the per-fold scores; this is the value the study optimizes.
    """
    # The table is identical for every trial, so it is loaded once and reused
    # instead of re-parsing the CSV on each call.
    data, target = _load_dataset()

    # Keep this sampling order: with a seeded sampler it determines which
    # values each trial receives.
    max_depth = trial.suggest_int("max_depth", 10, 120)
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)

    # min_samples_split, min_samples_leaf and max_features are not searched
    # and stay at their defaults.
    clf = RandomForestClassifier(
        random_state=rs,
        n_jobs=-1,
        max_depth=max_depth,
        n_estimators=n_estimators,
    )

    scores = cross_val_score(
        clf, data, target,
        n_jobs=-1,
        cv=KFold(shuffle=True, random_state=rs),
    )
    return scores.mean()

In [3]:
# Random search over the objective's hyperparameters. The sampler seed is
# fixed so the sequence of sampled trials is repeatable.
sampler = RandomSampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=1000)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-05-13 18:00:37,697] A new study created in memory with name: no-name-4ddf7488-d08c-4ff9-9da5-596fcae5e053
[I 2024-05-13 18:00:43,341] Trial 0 finished with value: 0.8612877129315191 and parameters: {'max_depth': 51, 'n_estimators': 952}. Best is trial 0 with value: 0.8612877129315191.
[I 2024-05-13 18:00:46,800] Trial 1 finished with value: 0.8600783396026344 and parameters: {'max_depth': 91, 'n_estimators': 603}. Best is trial 0 with value: 0.8612877129315191.
[I 2024-05-13 18:00:48,295] Trial 2 finished with value: 0.8612872558743276 and parameters: {'max_depth': 27, 'n_estimators': 164}. Best is trial 0 with value: 0.8612877129315191.
[I 2024-05-13 18:00:52,950] Trial 3 finished with value: 0.8606825692098852 and parameters: {'max_depth': 16, 'n_estimators': 868}. Best is trial 0 with value: 0.8612877129315191.
[I 2024-05-13 18:00:56,784] Trial 4 finished with value: 0.8594736529381922 and parameters: {'max_depth': 76, 'n_estimators': 711}. Best is trial 0 with value: 0.8612

Number of finished trials:  1000
Best trial:
  Value: 0.8652161194930322
  Params: 
    max_depth: 17
    n_estimators: 326


In [4]:
# Evaluate a random forest with shuffled K-fold cross-validation and report
# mean +- std of every test metric across the folds.
# Relies on `data` / `target` already being bound to the full feature table.
# NOTE(review): the Optuna run printed above reports max_depth=17,
# n_estimators=326 as its best trial, and the hold-out cell further down uses
# max_depth=17 with n_estimators=830 -- confirm which values are intended here.
print('Training model...')
t_start = time.monotonic()

score_types = ('accuracy', 'roc_auc', 'f1')
model = RandomForestClassifier(
    n_estimators=830,
    max_depth=18,
    random_state=rs,
)
result = cross_validate(
    model,
    data,
    target,
    scoring=score_types,
    cv=KFold(shuffle=True, random_state=rs),
    return_train_score=True,
    return_estimator=True,
)

print('Scores for RandomForestClassifier:')
for score in score_types:
    fold_scores = result[f'test_{score}']
    mean, std = np.mean(fold_scores), np.std(fold_scores)
    print(f'{score} = {mean:.2f} +- {std:.2f}')

# Elapsed wall-clock time, converted from seconds to minutes.
t = (time.monotonic() - t_start) / 60
print(f'RandomForestClassifier trained (with cross-validation) in {t:.0f} m')

Training model...
Scores for RandomForestClassifier:
accuracy = 0.87 +- 0.02
roc_auc = 0.94 +- 0.01
f1 = 0.87 +- 0.02
RandomForestClassifier trained (with cross-validation) in 0 m


# Where the classifier is wrong

In [3]:
# Random forest that the following cells fit on a training split and evaluate
# on a held-out split.
model = RandomForestClassifier(random_state=rs, n_estimators=830, max_depth=17)

In [4]:
# Reload the full feature table and hold out 25% of its rows for validation.
# train_x / valid_x are whole rows (including the 'oid' and 'label' columns),
# not feature matrices; the following cells split them into data / target.
objs = pd.read_csv('akb_obj_features_snad4.csv')
train_x, valid_x = train_test_split(objs, test_size=0.25, random_state=rs)

In [5]:
# Fit the forest on the training rows: 'label' is the target and every column
# from the third onward is a feature.
train_feature_columns = train_x.columns[2:]
data = train_x[train_feature_columns]
target = train_x['label']
model.fit(data, target)

In [6]:
# Rebind data / target to the held-out rows so that the following cells score
# the model on the validation split rather than on the training split.
valid_feature_columns = valid_x.columns[2:]
data = valid_x[valid_feature_columns]
target = valid_x['label']

In [7]:
# Predict on the current `data` (the held-out rows at this point) and display
# the accuracy of those predictions against `target`.
preds = model.predict(data)
metrics.accuracy_score(target, preds)

0.8792270531400966

In [48]:
def get_oids(filepath, all_lab=False):
    """Read object ids and binary artefact labels from a JSON file.

    The file must hold a JSON list of objects, each providing an ``'oid'`` and
    a ``'tags'`` entry.

    Parameters
    ----------
    filepath : str or path-like
        Path of the JSON file to read.
    all_lab : bool, default False
        When true, the raw ``'tags'`` value of every object is returned as a
        third element.

    Returns
    -------
    oids : numpy.ndarray
        The ``'oid'`` of every object, in file order.
    targets : numpy.ndarray
        1 where ``'artefact'`` is among the object's tags, 0 otherwise.
    tags : list
        Only returned when ``all_lab`` is true: the ``'tags'`` value of every
        object, in the same order as ``oids``.
    """
    # The context manager closes the file even when json.load raises.
    with open(filepath) as file:
        obj_list = json.load(file)

    oids = [obj['oid'] for obj in obj_list]
    tags = [obj['tags'] for obj in obj_list]

    # 1 - artefact, 0 - anything else
    targets = [1 if 'artefact' in tag_list else 0 for tag_list in tags]

    if all_lab:
        return np.array(oids), np.array(targets), tags

    return np.array(oids), np.array(targets)

# Load ids, binary artefact labels and the raw tag lists of all objects.
oids, labels, all_tags = get_oids('akb.ztf.snad.space.json', all_lab=True)

# Statistics for false classifications in test set

In [34]:
# Count the misclassified validation objects by their first tag.
# Relies on `target` / `preds` currently holding the validation labels and the
# model's predictions for the rows of `valid_x`.
wrong_pred = {}
for oid in valid_x.loc[target != preds, 'oid']:
    tags = all_tags[np.where(oids == oid)[0][0]]
    # An object without tags cannot be attributed to a class. Skip it, as the
    # other per-tag statistics cells do, instead of raising IndexError on tags[0].
    if len(tags) == 0:
        continue
    # NOTE(review): grouping is by the first tag only, while the binary label
    # is 1 whenever 'artefact' appears anywhere in the tag list -- confirm that
    # first-tag grouping is the intended breakdown.
    first_tag = tags[0]
    wrong_pred[first_tag] = wrong_pred.get(first_tag, 0) + 1
        

In [35]:
# Number of misclassified validation objects per first tag.
wrong_pred

{'artefact': 44,
 '1-point': 9,
 'Galaxy': 4,
 'M_dwarf_flare': 16,
 'QSO': 2,
 'Cataclysmic': 1,
 'VAR': 9,
 'SN': 2,
 'SNIa': 1,
 'AGN': 1,
 'SLSN': 1,
 'non-catalogued': 3,
 'EB': 1,
 'Pulsating': 1,
 'STAR': 3,
 'Asteroid': 1,
 'transient': 1}

In [36]:
# Size of the validation split, for putting the error counts above in context.
len(valid_x)

828

# Statistics for objects from test set

In [52]:
# Tally the first tag of every object in the validation split; objects with no
# tags are left out of the tally.
test_tags = {}
for oid in valid_x['oid']:
    tags = all_tags[np.where(oids == oid)[0][0]]
    if len(tags) > 0:
        first_tag = tags[0]
        test_tags[first_tag] = test_tags.get(first_tag, 0) + 1
test_tags

{'QSO': 50,
 'artefact': 395,
 'VAR': 50,
 'RRAB': 5,
 'M_dwarf_flare': 33,
 '1-point': 16,
 'Galaxy': 7,
 'Pulsating': 21,
 'EA': 9,
 'AGN': 35,
 'STAR': 24,
 'M': 35,
 'UG': 3,
 'non-catalogued': 12,
 'BY': 1,
 'LPV': 6,
 'SN': 50,
 'Cataclysmic': 4,
 'EW': 11,
 'CCSN': 7,
 'SNIa': 8,
 'RR': 4,
 'Eclipsing': 17,
 'DCEP': 3,
 'SLSN': 1,
 'Eruptive': 1,
 'RSCVn': 2,
 'SR': 6,
 'YSO': 1,
 'UGSS': 2,
 'EB': 1,
 'RSG': 2,
 'Rotating': 1,
 'Asteroid': 1,
 'DSCT': 1,
 'transient': 1}

# Statistics for all objects from akb (~3k objects)

In [49]:
# Tally the first tag of every object loaded from the JSON file; objects with
# no tags are left out of the tally.
# NOTE(review): each oid is looked up again in `oids`, taking its first
# occurrence; if oids are unique this is simply the current position --
# confirm before simplifying to a direct loop over `all_tags`.
stat = {}
for oid in oids:
    first_index = np.where(oids == oid)[0][0]
    tags = all_tags[first_index]
    if len(tags) > 0:
        first_tag = tags[0]
        stat[first_tag] = stat.get(first_tag, 0) + 1

In [50]:
# Number of objects per first tag across the whole JSON file.
stat

{'artefact': 1646,
 'M_dwarf_flare': 140,
 'Eclipsing': 69,
 'LPV': 13,
 'M': 133,
 'UG': 19,
 'EA': 36,
 'RRAB': 23,
 'Cataclysmic': 44,
 'UGSS': 4,
 'Pulsating': 75,
 'QSO': 186,
 'non-catalogued': 25,
 'VAR': 155,
 'AGN': 162,
 'Rotating': 2,
 'EW': 22,
 'SN': 207,
 'SNIa': 32,
 'Galaxy': 25,
 'AM': 7,
 'CCSN': 23,
 'RR': 16,
 'L': 4,
 'STAR': 79,
 'SR': 23,
 '1-point': 81,
 'uncertain': 3,
 'RSG': 6,
 'N': 3,
 'RSCVn': 4,
 'DCEP': 3,
 'EB': 7,
 'BY': 4,
 'UGZ': 1,
 'DSCT': 5,
 'Eruptive': 3,
 'transient': 2,
 'SLSN': 4,
 'bright_star': 1,
 'YSO': 2,
 'Asteroid': 6,
 'defocusing': 1,
 'spike': 2}