In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to the project package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

# Tab-separated input file, given as a relative path.
df_path = "../../data/raw/debunking_review.tsv"

# read_table uses a tab separator by default, which matches the .tsv input.
df = pd.read_table(df_path)

In [3]:
from eppi_text_classification import (
    get_features_and_labels,
    get_tfidf_and_names,
)

# Unpack the two values the helper returns for the loaded table. Going by the
# names these are per-record text features and inclusion labels — TODO confirm
# against get_features_and_labels.
word_features, labels = get_features_and_labels(df)

# Turn the text features into TF-IDF scores plus the matching feature names
# (meaning assumed from the helper's name — verify). feature_names is not
# referenced again in the cells shown here.
tfidf_scores, feature_names = get_tfidf_and_names(word_features)

In [4]:
from eppi_text_classification import OptunaHyperparameterOptimisation
from eppi_text_classification.utils import delete_optuna_study

# Name of the Optuna study used for this search.
study_name = "svc_binary"

# Search budget and cross-validation settings, forwarded as keyword arguments.
search_settings = {
    "n_trials_per_job": 10,
    "n_jobs": -1,
    "nfolds": 3,
    "num_cv_repeats": 1,
}

optimiser = OptunaHyperparameterOptimisation(
    tfidf_scores, labels, "SVC", **search_settings
)

# Presumably removes a previously stored study of the same name so the search
# starts fresh (verify in eppi_text_classification.utils); uncomment to use:
# delete_optuna_study(study_name)

# NOTE(review): the search runs on the full tfidf_scores/labels, which includes
# the rows that train_test_split later assigns to the test split — confirm this
# is intended, since it lets test data influence the chosen hyper-parameters.
best_params = optimiser.optimise_hyperparameters(study_name=study_name)

[I 2024-06-22 13:23:20,437] A new study created in RDB with name: svc_binary
[I 2024-06-22 13:23:22,476] Trial 0 finished with value: 0.949653909528477 and parameters: {'C': 811.2091434144492}. Best is trial 0 with value: 0.949653909528477.
[I 2024-06-22 13:23:22,568] Trial 6 finished with value: 0.949653909528477 and parameters: {'C': 76.05702111662264}. Best is trial 0 with value: 0.949653909528477.
[I 2024-06-22 13:23:22,587] Trial 7 finished with value: 0.949653909528477 and parameters: {'C': 139.1509428286575}. Best is trial 0 with value: 0.949653909528477.
[I 2024-06-22 13:23:22,588] Trial 2 finished with value: 0.949653909528477 and parameters: {'C': 107.68540619073511}. Best is trial 0 with value: 0.949653909528477.
[I 2024-06-22 13:23:22,612] Trial 10 finished with value: 0.949653909528477 and parameters: {'C': 3.4801004213494657}. Best is trial 0 with value: 0.949653909528477.
[I 2024-06-22 13:23:22,626] Trial 9 finished with value: 0.949653909528477 and parameters: {'C': 29.

In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Stratified hold-out: 33.3% of the rows go to the test split, with a fixed
# seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_scores,
    labels,
    test_size=0.333,
    stratify=labels,
    random_state=7,
)

# Build the classifier from the tuned hyper-parameters.
# Manual alternative, kept for reference only:
#   SVC(class_weight="balanced", cache_size=1000, probability=False, C=3000,
#       kernel="rbf", shrinking=False, tol=1e-8, gamma="auto")
model = SVC(**best_params)

# Left as the final expression so the fitted estimator is shown as cell output.
model.fit(X_train, y_train)

In [7]:
from eppi_text_classification.predict import predict_probabilities
from eppi_text_classification.plotly_roc import plotly_roc

# Score the test split with the fitted model, then plot those scores against
# the true test labels. Presumably this renders a ROC curve (going by the
# helper's name) — confirm in eppi_text_classification.plotly_roc.
# NOTE(review): whether these are probabilities or raw decision scores depends
# on predict_probabilities and on best_params, neither of which is visible here.
y_test_pred_probs = predict_probabilities(model, X_test)
plotly_roc(y_test, y_test_pred_probs)

In [8]:
from eppi_text_classification.predict import (
    get_threshold,
    threshold_predict,
)


# Pick a decision threshold from the test split with target_tpr=1 (presumably
# the cut-off at which every positive test record is still recovered, i.e. a
# 100% true-positive rate — confirm in get_threshold).
# NOTE(review): the threshold is derived from the same X_test/y_test that the
# test predictions below are computed on, so test-split results at this
# threshold are likely optimistic; a separate validation split may be more
# appropriate — confirm intent.
threshold = get_threshold(model, X_test, y_test, target_tpr=1)

# Apply that single threshold to both splits (assumed to yield label
# predictions rather than scores — verify in threshold_predict).
y_test_pred = threshold_predict(model, X_test, threshold)
y_train_pred = threshold_predict(model, X_train, threshold)

In [None]:
from eppi_text_classification import binary_train_valid_confusion_plotly

# Display names for the two classes, forwarded as keyword arguments.
# NOTE(review): the key "postive_label" is spelled this way on purpose so it
# keeps matching the keyword the call has always passed; it looks like a
# misspelling of "positive_label", but it has to agree with the library's
# parameter name — check the function signature before renaming it.
class_names = {"postive_label": "Included", "negative_label": "Excluded"}

# Compare true and thresholded labels for the train and test splits; kept as
# the final expression so the returned figure (if any) is displayed.
binary_train_valid_confusion_plotly(
    y_train, y_train_pred, y_test, y_test_pred, **class_names
)