In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import logging
import numpy as np
from sklearn import metrics
from sklearn.metrics import roc_auc_score, confusion_matrix, precision_recall_curve, auc, ndcg_score
from tdescore.classifier.train import train_classifier
from tdescore.classifier.features import host_columns, early_columns, peak_columns, post_peak, parse_columns
from tdescore.classifier.collate import get_classified_sources, convert_to_train_dataset
import matplotlib.patheffects as path_effects
import pandas as pd
from pathlib import Path
import shap
from tdescore.classifier.collate import convert_to_train_dataset
import joblib

In [None]:
# Raise the verbosity of the "tdescore" logger so INFO-level messages are shown.
tdescore_logger = logging.getLogger("tdescore")
tdescore_logger.setLevel("INFO")

In [None]:
# Each feature set is paired with the label used for it in tables and legends.
# The two lists built below are index-aligned: labels[k] describes all_features[k].
feature_sets = (
    ("Host Only", host_columns),
    ("Early", early_columns),
    ("At Peak", peak_columns),
    ("Full", post_peak),
)
all_features = [columns for _label, columns in feature_sets]
labels = [label for label, _columns in feature_sets]

In [None]:
# Column names and descriptions for the complete feature set.
full_columns, full_descriptions = parse_columns(post_peak)

# The last entry of all_features is that same set; these two names are
# re-assigned later whenever a different feature set is parsed.
relevant_columns, column_descriptions = parse_columns(all_features[-1])

In [None]:
# Load every classified source and build the feature matrix for the full
# column set (presumably one row per source -- verify against
# convert_to_train_dataset).
all_sources = get_classified_sources()
data_to_use = convert_to_train_dataset(all_sources, columns=full_columns)

# True for each row that contains at least one NaN feature value.
nan_mask = np.array([np.isnan(row).any() for row in data_to_use])

# Keep only the sources with no NaN, renumbered from zero.
full_info_sources = all_sources[~nan_mask].reset_index(drop=True)
full_info_sources

In [None]:
# Dataset handed to the classifier below: the sources with no NaN in the full
# feature set.
df_to_use = full_info_sources
df_to_use

In [None]:
n_iter = 10
n_estimators = [100.]

# Column of the training summary used to pick the best n_estimator.
metric = "precision_recall_area"


def flatten(res=None, n_runs=None):
    """Pool true labels and predicted scores over all training iterations.

    Parameters
    ----------
    res : optional
        Per-source results providing a "class" entry and one "probs_{k}" entry
        per iteration (each must support ``.tolist()``). Defaults to the
        notebook-level ``all_res``.
    n_runs : int, optional
        Number of iterations to pool. Defaults to the notebook-level ``n_iter``.

    Returns
    -------
    (list, list)
        The true classes repeated once per iteration, and the matching scores
        of every iteration concatenated in iteration order.
    """
    if res is None:
        res = all_res
    if n_runs is None:
        n_runs = n_iter

    # The true classes are the same for every iteration, so repeat them.
    true_class = res["class"].tolist() * n_runs
    all_probs = []
    for run in range(n_runs):
        all_probs += res[f"probs_{run}"].tolist()
    return true_class, all_probs


param_res = dict()
param_performance = []

for i, features in enumerate(all_features):
    relevant_columns, column_descriptions = parse_columns(features)

    # all_all_res: one summary row per n_estimator value;
    # clfs: trained classifiers keyed by n_estimator.
    all_all_res, clfs = train_classifier(
        train_sources=df_to_use,
        n_iter=n_iter,
        columns=relevant_columns,
        n_estimator_set=n_estimators,
    )

    # idxmax() returns an index *label*, so the row must be fetched with .loc;
    # .iloc would pick the wrong row whenever the index is not 0..n-1.
    best_index = all_all_res[metric].idxmax()
    best_estimator = all_all_res.loc[best_index, "n_estimator"]

    print(f"Best value is {best_estimator}")

    clf = clfs[best_estimator]
    all_res = all_all_res[all_all_res["n_estimator"] == best_estimator]["all_res"].iloc[0]

    # Score the pooled predictions of all iterations for this feature set.
    tclass, aprobs = flatten(all_res, n_iter)
    pr, recall, thresholds = metrics.precision_recall_curve(tclass, aprobs)

    roc_area = roc_auc_score(tclass, aprobs)
    pr_area = auc(recall, pr)

    param_performance.append(
        {
            "Parameter Set": labels[i],
            "Total parameters": len(relevant_columns),
            "ROC area": roc_area,
            "Precision/Recall Area": pr_area,
        }
    )

    # Keep the curve so the plotting cells can compare the feature sets.
    param_res[i] = (pr, recall, thresholds)

In [None]:
# One row per feature set; the columns follow the key order of the collected
# performance records.
param_df = pd.DataFrame(data=param_performance)
param_df

In [None]:
# Two stacked panels sharing the threshold axis: precision (top) and
# recall (bottom), one curve per feature set.
fscale = 4.
figsize = (fscale*1.618, 2*fscale)

plt.figure(figsize=figsize)

ax1 = plt.subplot(211)
ax2 = plt.subplot(212)

# Plot in reverse order so the full feature set is drawn and listed first.
keys = list(param_res.keys())[::-1]

for i, key in enumerate(keys):
    (pr, recall, thresholds) = param_res[key]
    c = f"C{i+1}"
    # Curves were stored under the same index as their label.
    lab = labels[key]
    # precision_recall_curve returns one more precision/recall value than
    # thresholds: element k belongs to thresholds[k] and the final element is
    # the threshold-less end point. Dropping only that last element keeps
    # every value paired with its own threshold.
    ax1.plot(thresholds, pr[:-1], color=c)
    ax2.plot(thresholds, recall[:-1], label=lab, color=c, linestyle="--")

for ax in [ax1, ax2]:
    ax.set_xlim(0.0, 1.0)
    ax.set_ylim(0.0, 1.0)

ax2.legend()
ax2.set_xlabel(r"$\it{tdescore}$ threshold")
ax1.set_ylabel("Precision")
ax2.set_ylabel("Recall")

# The panels touch, so the top panel carries no x ticks, and its y ticks start
# at 0.2 to avoid colliding with the bottom panel's 1.0 label.
ax1.set_xticks([])
ax1.set_xticklabels([])
ax1.set_yticks([0.2, 0.4, 0.6, 0.8, 1.0])
ax1.set_yticklabels([0.2, 0.4, 0.6, 0.8, 1.0])
plt.subplots_adjust(wspace=0, hspace=0)

# savefig does not create missing directories.
Path("figures").mkdir(parents=True, exist_ok=True)
plt.savefig("figures/precision_recall_comparison.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Square figure with one recall-versus-precision curve per feature set, drawn
# in the same order and colours as the threshold figure.
figsize = (1.5*fscale, 1.5*fscale)
plt.figure(figsize=figsize)

reversed_labels = labels[::-1]
for i, key in enumerate(keys):
    pr, recall, thresholds = param_res[key]
    plt.plot(pr, recall, label=reversed_labels[i], c=f"C{i+1}")

plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.show()

In [None]:
# Print a LaTeX table with one group of rows per feature set: the first row
# carries the set name and its scores, and every parameter that is new in that
# set is listed in the "New Parameters" column.
text_str = r"""\begin{table*}[]
        \begin{tabular}{c|c|c|c|c}
        \textbf{Parameter Set} & \textbf{New Parameters} & \textbf{Total Parameters} & \textbf{ROC Area} & \textbf{Precision/Recall Area} \\
        \hline
"""
print(text_str)

# Parameters already listed under an earlier feature set.
used_parameters = []

for i, row in param_df.iterrows():
    # Local names on purpose: re-using relevant_columns here would overwrite
    # the notebook-level list that other cells pair with the classifier.
    set_columns, set_descriptions = parse_columns(all_features[i])
    new_columns = [x for x in set_columns if x not in used_parameters]

    # Underscores must be escaped for LaTeX; a raw string avoids an invalid
    # Python escape sequence.
    new_names = [x.replace('_', r'\_') for x in new_columns]

    # A set that adds nothing new still gets its summary row.
    first_name = new_names[0] if new_names else ""
    print(
        "\t" + r" \textbf{" + row["Parameter Set"] + r"}"
        + f" & {first_name} & {row['Total parameters']} & {row['ROC area']:.2f} & {row['Precision/Recall Area']:.2f} \\\\"
    )
    for name in new_names[1:]:
        print(f"\t & {name} &  &  & \\\\")

    used_parameters += new_columns

    print("\t " + r"\hline")
print(r"\end{tabular}")
print(r"\caption{Performance of \tdes for four parameter sets: information only about the host, information available shortly after discovery, information available by the time of peak, and the full parameter set. The performance of \tdes improves substantially with more data, but high performance is only achieved for the full dataset.}")
print(r"""\label{tab:parameter_subset}
\end{table*}""")
    

In [None]:
# Feature importances of the last classifier trained above, i.e. the one for
# the full feature set. full_columns / full_descriptions are used rather than
# relevant_columns, which the table cell overwrites with a shortened list that
# no longer lines up with the importances.
# Rows: column name (0), description (1), importance (2); most important first.
features = (
    pd.DataFrame([full_columns, full_descriptions, list(clf.feature_importances_)])
    .T
    .sort_values(by=2, ascending=False)
)
features