---

In [None]:
from pathlib import Path

# Resolve the kernel's starting directory once, up front: the next cell
# changes the working directory, while later cells still build their
# dataset and model paths from this value.
path = Path.cwd()

In [None]:
# Move the kernel to the parent of the notebook's starting directory
# (presumably the project root, so that the `pipeline.*` imports below
# resolve -- TODO confirm).
# Anchoring on `path` (captured in the first cell) instead of a relative
# `cd ../` keeps this cell idempotent: re-running it no longer climbs one
# directory higher each time, and it does not depend on IPython automagic.
import os

os.chdir(path.parent)
print(os.getcwd())  # show where we ended up, as the `cd` magic used to do

In [None]:
import pickle
import time

---

### Instantiate Classifier
---

In [None]:
from pipeline.classification import Classification

# Build the project's Classification object; the cells below drive its
# text_preprocessing_pipeline, feature_selection_pipeline and
# classification_pipeline attributes.
clf = Classification()

### Load Data
---

In [None]:
import pandas as pd

# Read the train/test splits (semicolon-separated CSV files) relative to `path`
training_set_df, testing_set_df = (
    pd.read_csv(path / csv_file, delimiter=";")
    for csv_file in (
        "assets/datasets/training-set-1.csv",
        "assets/datasets/testing-set-1.csv",
    )
)

# Optional: restrict both splits to a subset of the target categories
# training_set_df = training_set_df[training_set_df["targets"].isin(["kaget","takut"])]
# testing_set_df = testing_set_df[testing_set_df["targets"].isin(["kaget","takut"])]

# Split each frame into inputs ("texts" column) and labels ("targets" column),
# both as plain Python lists
X_train, y_train = (list(training_set_df[column]) for column in ("texts", "targets"))
X_test, y_test = (list(testing_set_df[column]) for column in ("texts", "targets"))

### Data Preparation
---

In [None]:
from pipeline.pos_filter import POS

# Shorthand handles for the classifier's two data-preparation pipelines
preprocessor = clf.text_preprocessing_pipeline
selector = clf.feature_selection_pipeline

# Text Preprocessing
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

# Feature Selection

# emotion classification: keep every POS tag except the ones listed here
excluded_pos = {"DET", "INTJ", "NUM", "PRON", "PROPN", "PUNCT", "SYM", "X"}
pos_filter_params = {"pos": POS - excluded_pos}
selector.named_steps["pos_filter"].set_params(**pos_filter_params)

# aspect classification
# clf.feature_selection_pipeline.named_steps["pos_filter"].set_params(**{"pos": set(["NOUN","PROPN"])})

# extract word features
document_transformer_params = {"feat_attrs": ["lemma", "upos"]}
selector.named_steps["document_transformer"].set_params(**document_transformer_params)

# Run the configured feature-selection pipeline on both splits
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

### Hyper-parameter tuning
---

In [None]:
from datetime import timedelta

# Search settings; they are handed positionally to clf.tuning below
n_iter = 10
n_splits = 5
train_size = 0.8
n_jobs = 1
verbose = 1

# Candidate values per pipeline step. Single-element tuples pin a setting;
# only svc__gamma currently offers more than one candidate.
tfidf_space = {
    "tfidfvectorizer__ngram_range": ((1, 2),),
    "tfidfvectorizer__min_df": (1,),
    "tfidfvectorizer__max_df": (1.0,),
    "tfidfvectorizer__norm": ("l2",),
    "tfidfvectorizer__sublinear_tf": (True,),
}
svc_space = {
    "svc__kernel": ("rbf",),
    "svc__C": (1,),
    "svc__gamma": ("scale", 0.0001, 0.001, 0.01, 0.1, 1),
    "svc__class_weight": ("balanced",),
    "svc__decision_function_shape": ("ovo",),
}
param_distributions = {**tfidf_space, **svc_space}

# Wider search space kept for reference (linear kernel)
# param_distributions = {
#     "tfidfvectorizer__ngram_range": ((1, 1), (1, 2), (2, 2)),
#     "tfidfvectorizer__min_df": (0.01, 1, 3, 5, 10),
#     "tfidfvectorizer__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
#     "tfidfvectorizer__norm": (None, "l1", "l2"),
#     "tfidfvectorizer__sublinear_tf": (True, False),
#     "svc__kernel": ("linear",),
#     "svc__C": (0.01, 0.1, 1, 10, 100),
#     "svc__class_weight": (None, "balanced", {"kaget": 1, "cinta": 3, "takut": 4, "marah": 5, "gembira": 10, "sedih": 11}),
#     "svc__decision_function_shape": ("ovo", "ovr")
# }

# clf.tuning returns the search object plus a second value that a later cell
# formats with timedelta(seconds=...), i.e. it is treated as elapsed seconds.
search_settings = (n_iter, n_splits, train_size, n_jobs, verbose)
randomized_search, estimation = clf.tuning(X_train, y_train, param_distributions, *search_settings)

# Tabulate the per-candidate results for inspection
cv_results_df = pd.DataFrame(randomized_search.cv_results_)


In [None]:
# Display the cross-validation results table (one row per evaluated candidate)
cv_results_df

In [None]:
# Summarise the search in one call: fold/candidate counts with the elapsed
# time, then the best score and a header for the parameters shown below.
print(
    f'Fitted {randomized_search.n_splits_} folds of {len(cv_results_df)} candidates, finished in {str(timedelta(seconds=estimation))}.',
    f"Best score: {randomized_search.best_score_}",
    "Best hyper-parameters:",
    sep="\n",
)
# Last expression of the cell, so the notebook renders the best-parameter dict
randomized_search.best_params_

### EITHER update the classification pipeline with the best estimator
---

In [None]:
# Use the search's best_estimator_ directly as the classification pipeline
# (the alternative re-training cells further down are commented out).
clf.classification_pipeline = randomized_search.best_estimator_

### OR re-train the model
---

#### Apply the best hyper-parameters
---

In [None]:
# clf.classification_pipeline.set_params(**randomized_search.best_params_)

#### Train model
---

In [None]:
# clf.train_preprocessed(X_train, y_train)

### Save model
---

In [None]:
# Persist the classifier under assets/models/ through the project's to_disk method.
# NOTE(review): the commented-out load cell suggests a timestamped
# model.*.pickle file name -- confirm against to_disk.
clf.to_disk(path / "assets/models/")

### Load model
---

In [None]:
# filepath = path / "assets/models/model.2023.05.24.14.40.25.798494.pickle"
# clf.from_disk(filepath)

### Test model
---

In [None]:
# Predict labels for the test set; X_test has already gone through the
# text-preprocessing and feature-selection pipelines in the data-preparation cell.
y_pred = clf.test_preprocessed(X_test)

### Evaluation
---

In [None]:
# Compare the predictions with the gold labels; clf.score yields the pair (accuracy, mcc)
accuracy, mcc = clf.score(y_test, y_pred)

# Print one "label value" line per metric
for metric_label, metric_value in (("Accuracy:", accuracy), ("MCC:", mcc)):
    print(metric_label, metric_value)

### Confusion Matrix
---

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# normalize=None plots raw counts; swap in the commented "true" setting to
# normalize the matrix over the true labels (rows) instead.
normalize = None
# normalize="true"

# Plot the confusion matrix of gold labels vs. predictions with the "YlGn" colormap
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, normalize=normalize, cmap="YlGn")

## Analysis
---

In [None]:
# List the feature names exposed by the pipeline's "tfidfvectorizer" step
clf.classification_pipeline.named_steps["tfidfvectorizer"].get_feature_names_out()