In [None]:
import logging
import sys
from ast import literal_eval
from pathlib import Path

import pandas as pd
import torch
from catboost import Pool

from sec_certs.dataset import CCDataset
from sec_certs.model.references_nlp.evaluation import (
    evaluate_model,
)
from sec_certs.model.references_nlp.feature_extraction import (
    build_embeddings,
    dataframe_to_training_arrays,
    extract_geometrical_features,
    extract_language_features,
    extract_prediction_features,
    extract_segments,
    perform_dimensionality_reduction,
)
from sec_certs.model.references_nlp.training import train_model

# --- Paths -------------------------------------------------------------------
# Every path is anchored at the directory the kernel was started from, so the
# notebook is expected to be launched from the repository root.
REPO_ROOT = Path().resolve()
DATASET_PATH = REPO_ROOT.joinpath("dataset/cc_november_23/dataset.json")
TENSORBOARD_DATA_DIR = REPO_ROOT.joinpath("dataset/tensorboard_visualisation/")
TRAINED_MODEL_PATH = REPO_ROOT.joinpath("dataset/reference_prediction/final_model")

print(f"GPU available: {torch.cuda.is_available()}")

# --- Logging -----------------------------------------------------------------
logger = logging.getLogger(__name__)

# Only CRITICAL records from these third-party libraries are let through.
for _noisy_logger_name in ("setfit", "sentence_transformers"):
    logging.getLogger(_noisy_logger_name).setLevel(logging.CRITICAL)

# Despite its name, this handler streams to stderr; it does not write a file.
_log_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
file_handler = logging.StreamHandler(sys.stderr)
file_handler.setFormatter(_log_formatter)

# NOTE: basicConfig is a no-op when the root logger already has handlers
# (e.g. when this cell is executed a second time in the same kernel).
logging.basicConfig(level=logging.INFO, handlers=[file_handler])

In [None]:
mode = "evaluation"
cc_dset = CCDataset.from_json(DATASET_PATH)

# Extract the segments and persist them to CSV; the frame is then reloaded
# from that CSV, so everything downstream works on the serialized data.
segments_csv_path = REPO_ROOT / "dataset/reference_prediction/dataset.csv"
extract_segments(cc_dset, mode=mode).to_csv(segments_csv_path, index=False)

segments_raw = pd.read_csv(segments_csv_path)

# These two columns come back from the CSV as strings holding Python literals;
# literal_eval turns each cell back into the corresponding Python object.
segments_parsed = segments_raw.assign(
    segments=segments_raw["segments"].apply(literal_eval),
    actual_reference_keywords=segments_raw["actual_reference_keywords"].apply(literal_eval),
)

# Rows labelled IRRELEVANT are excluded from everything that follows.
df = segments_parsed.loc[segments_parsed["label"] != "IRRELEVANT"]

# Collapse the fine-grained annotation labels into two coarse classes.
# Keys are the coarse classes, values the fine-grained labels folded into them.
_fine_labels_by_coarse_label = {
    "COMPONENT_USED": ("COMPONENT_USED", "EVALUATION_REUSED", "COMPONENT_SHARED"),
    "PREVIOUS_VERSION": ("RE-EVALUATION", "PREVIOUS_VERSION"),
}
label_mapping = {
    fine_label: coarse_label
    for coarse_label, fine_labels in _fine_labels_by_coarse_label.items()
    for fine_label in fine_labels
}

# Series.map yields NaN for any value that is not a key of the mapping
# (including rows that have no label at all).
df["label"] = df["label"].map(label_mapping)

# Feature pipeline. Each step takes the current frame and returns an updated
# one, so the order of the calls matters.
df, annotator = build_embeddings(
    df,
    mode=mode,
    method="transformer",
    # Use the shared constant. The previous `REPO_ROOT / "/dataset/..."` had a
    # leading slash, which makes pathlib drop REPO_ROOT and point at the
    # filesystem root instead of the repository.
    model_path=TRAINED_MODEL_PATH,
)
df = perform_dimensionality_reduction(df, mode)
df = extract_language_features(df, cc_dset)
# NOTE(review): relies on the annotator's private `_model` attribute - confirm
# there is no public accessor before this API changes.
df = extract_prediction_features(df, annotator._model)
df = extract_geometrical_features(df)

# Feature subset used for training. The names were obtained by running the
# feature selection cell further down in this notebook; they are grouped here
# by name prefix only for readability - order and contents are unchanged.
_pca_features = [
    "pca_mean_x",
    "pca_mean_y",
    "pca_var_y",
    "pca_cov_xy",
    "pca_median_x",
    "pca_median_y",
    "pca_std_distance_to_centroid",
    "pca_point_density",
]
_umap_features = [
    "umap_mean_x",
    "umap_mean_y",
    "umap_skew_y",
    "umap_cov_xy",
    "umap_median_x",
    "umap_median_y",
    "umap_max_distance_to_centroid",
    "umap_aspect_ratio",
]
_lang_features = [
    "lang_partial_ratio",
    "lang_token_sort_ratio",
    "lang_n_segments",
    "lang_matches_recertification",
    "lang_n_intersection_versions",
    "lang_common_words",
    "lang_bigram_overlap",
    "lang_common_suffix_len",
    "lang_character_trigram_overlap",
    "lang_len_difference",
]
# "pred_1" is intentionally absent: it is not part of the selected subset.
_pred_features = [
    "pred_0",
    "pred_2",
    "pred_3",
    "pred_4",
]
features_to_use = [*_pca_features, *_umap_features, *_lang_features, *_pred_features]
# Restrict the frame to the selected features plus the two bookkeeping columns
# ("label" and "split") that are handed to dataframe_to_training_arrays.
_model_columns = [*features_to_use, "label", "split"]
df_ = df[_model_columns]

# All four feature-family switches are enabled.
_feature_family_flags = {"use_pca": True, "use_umap": True, "use_pred": True, "use_lang": True}
x_train, y_train, x_valid, y_valid, features = dataframe_to_training_arrays(
    df_, mode=mode, **_feature_family_flags
)

# Fit the classifier, then evaluate it on the validation arrays.
# output_path=None is passed through unchanged to evaluate_model.
clf = train_model(mode, x_train, y_train, x_valid, y_valid, train_baseline=False)
evaluate_model(clf, x_valid, y_valid, features, output_path=None)

# Classify every row of the frame (labelled or not) and serialize the result.
# NOTE(review): assumes the column order of `features_to_use` matches the
# feature order the classifier was trained with - confirm against
# dataframe_to_training_arrays.
x_all = df[features_to_use].values
df["y_pred"] = clf.predict(x_all)

# The existing label wins where there is one; the prediction only fills the
# rows whose label is missing.
df["reference_label"] = df.label.fillna(df.y_pred)

# No leading slash in the relative part: a segment that starts with "/" makes
# pathlib discard REPO_ROOT, which previously sent this file to
# /dataset/reference_prediction/ at the filesystem root.
# The frame index is written as the first CSV column (no index=False here).
df[["dgst", "canonical_reference_keyword", "reference_label"]].to_csv(
    REPO_ROOT / "dataset/reference_prediction/predictions.csv"
)

## Run feature selection algorithm

In [None]:
# Wrap the training and validation arrays in CatBoost pools that share the
# same feature names.
_pool_options = {"feature_names": features}
train_pool = Pool(x_train, y_train, **_pool_options)
valid_pool = Pool(x_valid, y_valid, **_pool_options)

# Ask the already fitted classifier to choose 30 of the candidate features.
_selection_options = {
    "eval_set": valid_pool,
    "features_for_select": features,
    "num_features_to_select": 30,
    "train_final_model": False,
    "verbose": False,
}
dct = clf.select_features(train_pool, **_selection_options)

# NOTE: this overwrites the hand-maintained `features_to_use` list defined in
# an earlier cell; cells run after this one see the freshly selected names.
features_to_use = dct["selected_features_names"]
# Rebuild the training arrays from the freshly selected features only.
# NOTE: this rebinds x_train / y_train / x_valid / y_valid / features and clf,
# so cells executed afterwards use the retrained model.
df_lim_features = df[features_to_use + ["label", "split"]]
x_train, y_train, x_valid, y_valid, features = dataframe_to_training_arrays(
    df_lim_features, mode=mode, use_pca=True, use_umap=True, use_pred=True, use_lang=True
)

# `mode` is passed as the first positional argument, matching the earlier
# train_model call in this notebook; without it every array argument is
# shifted by one position.
clf = train_model(mode, x_train, y_train, x_valid, y_valid, train_baseline=False)
evaluate_model(
    clf,
    x_valid,
    y_valid,
    features,
    output_path=REPO_ROOT / "dataset/cc_ref_annotator_evaluation/embeddings",
)

## Serialize misclassified instances

In [None]:
# Certificate digest -> scheme lookup, used to tag each row with its scheme.
scheme_mapping = {cert.dgst: cert.scheme for cert in cc_dset}

# Only rows that carry a label can be judged right or wrong.
_has_label = df.label.notnull()
_is_misclassified = _has_label & (df.y_pred != df.label)

all_classified_instances = df.loc[_has_label].assign(scheme=lambda frame: frame.dgst.map(scheme_mapping))

# Columns kept in the misclassification report.
_report_columns = [
    "dgst",
    "canonical_reference_keyword",
    "actual_reference_keywords",
    "label",
    "y_pred",
    "split",
    "segments",
    "referenced_cert_name",
    "cert_versions",
    "referenced_cert_versions",
    "lang_partial_ratio",
    "lang_token_sort_ratio",
]
misclassified_instances = df.loc[_is_misclassified, _report_columns].assign(
    # Links to the certification report and security target on seccerts.org.
    report_link=lambda frame: frame.dgst.map(lambda x: f"https://seccerts.org/cc/{x}/report.pdf"),
    st_link=lambda frame: frame.dgst.map(lambda x: f"https://seccerts.org/cc/{x}/target.pdf"),
    scheme=lambda frame: frame.dgst.map(scheme_mapping),
)

# pandas' to_json escapes forward slashes (they are written as "\/"), so the
# links in the output file need a manual pass replacing "\/" with "/" afterwards.
_report_path = REPO_ROOT / "dataset/misclassified_references_validation_set.json"
misclassified_instances.to_json(_report_path, orient="records", indent=4)

In [None]:
# Percentage of labelled references that were misclassified, per scheme.
# The two count series are aligned on the scheme name, so a scheme without any
# misclassified row shows up as NaN rather than 0.
# Author's observations from the recorded run: only DE and FR have enough
# support to draw any conclusion, and FR is about 4 times more likely to be
# misclassified than DE.
_misclassified_per_scheme = misclassified_instances["scheme"].value_counts()
_labelled_per_scheme = all_classified_instances["scheme"].value_counts()
_misclassified_per_scheme * 100 / _labelled_per_scheme