---

In [None]:
from pathlib import Path

# Remember the directory the kernel starts in. Later cells build their
# asset paths from `path`, so it is captured before the working directory
# is changed.
path = Path.cwd()

In [None]:
import os

# Move the kernel into the parent of the directory recorded in `path`.
# An absolute target (instead of the relative "../") keeps this cell
# idempotent: re-running it does not climb one more level each time, and it
# no longer depends on IPython's automagic being enabled.
os.chdir(path.parent)

---

### Load Dataset
---

In [None]:
import pandas as pd

# The dataset is a semicolon-separated file located under the directory
# stored in `path`.
dataset_file = path / "assets/datasets/dataset-1.csv"
df = pd.read_csv(dataset_file, sep=";")

### Instantiate Classifier
---

In [None]:
from pipeline.classification import Classification

# Single pipeline object reused by every later step in this notebook
# (clean, tokenize, train_test_split, tuning, train, test, score).
# NOTE(review): n_jobs presumably sets the number of parallel workers and
# verbose the logging level — confirm against pipeline.classification.
clf = Classification(verbose=3, n_jobs=4)

### Data Preparation
---

In [None]:
from pipeline.classification import DEFAULT_POS

# POS tags dropped from the project default list before tokenising.
EXCLUDED_POS = {"DET", "INTJ", "NUM", "PRON", "PROPN", "PUNCT", "X"}

# Raw texts and their labels as plain Python lists.
X = list(df["texts"])
y = list(df["targets"])

X_cleaned = clf.clean(X)

# Filter DEFAULT_POS in its own order, keeping the first occurrence of each
# tag. A set difference would yield the same tags but in an arbitrary order
# that can change between kernel sessions (string hashing is randomised),
# which makes the tokenizer input non-reproducible.
pos_tags = list(dict.fromkeys(tag for tag in DEFAULT_POS if tag not in EXCLUDED_POS))
X_tokenized = clf.tokenize(X_cleaned, pos_tags)

# NOTE(review): split ratio and random seed are decided inside
# clf.train_test_split — confirm it is seeded if reproducibility matters.
X_train, X_test, y_train, y_test = clf.train_test_split(X_tokenized, y)

### Hyper-parameter Tuning
---

In [6]:
import pickle
import time

# Search space handed to clf.tuning: 2 n-gram ranges x 4 min_df x 5 max_df
# x 1 kernel x 7 values of C (280 combinations if searched exhaustively).
# NOTE(review): the "tfidfvectorizer__" / "svc__" prefixes presumably address
# steps of the pipeline built inside clf.tuning — confirm the step names.
param_grid = [
    {
        "tfidfvectorizer__ngram_range": ((1, 1), (1, 2)),
        "tfidfvectorizer__min_df": (1, 3, 5, 10),
        "tfidfvectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
        "svc__kernel": ("linear",),
        "svc__C": (0.01, 0.1, 1, 10, 100, 1000, 10000),
    }
]

grid_search, estimation = clf.tuning(X_train, y_train, param_grid)

res = {
    "estimation": estimation,
    "grid_search": grid_search
}

# Create the output directory up front: the dump happens only after the
# search has finished, so a missing directory would otherwise discard the
# whole result with a FileNotFoundError.
pickle_dir = path / "assets/pickles"
pickle_dir.mkdir(parents=True, exist_ok=True)

# The file name carries a millisecond timestamp so successive runs do not
# overwrite each other.
# NOTE(review): ":" is not a valid file-name character on Windows; the name
# is left unchanged here because other tooling may look for this pattern.
with open(pickle_dir / f"hyper-parameter-tuning:{round(time.time()*1000)}.pickle", "wb") as f:
    pickle.dump(res, f)

### Re-train Model
---

In [None]:
# Train on the training split using the best parameter combination found by
# the grid search.
best_params = grid_search.best_params_
model = clf.train(X_train, y_train, best_params)

### Evaluation
---

In [None]:
# Predict on the held-out split, then score the predictions against the
# true labels.
y_pred = clf.test(model, X_test)
accuracy, mcc = clf.score(y_test, y_pred)

# Report both metrics, one per line.
for label, value in (("Accuracy:", accuracy), ("MCC:", mcc)):
    print(label, value)

### Confusion Matrix
---

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Draw the confusion matrix of the held-out labels against the predictions.
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred)