---

In [None]:
from pathlib import Path

# Remember the directory the notebook was started in. It is captured before
# the working directory is changed in the next cell, so every location built
# from `path` stays anchored to this directory.
path = Path.cwd()

In [None]:
import os

# Switch to the parent of the notebook directory (same destination as the
# former relative `cd ../`). Targeting `path.parent` explicitly keeps the cell
# idempotent: re-running it no longer climbs one more level each time.
# NOTE(review): presumably needed so the `pipeline` imports in later cells
# resolve -- confirm.
os.chdir(path.parent)

In [None]:
import pickle
import time

from sklearn.pipeline import Pipeline

---

### Load Data
---

In [None]:
import pandas as pd

# Read the semicolon-separated training set located under the notebook directory
training_set_df = pd.read_csv(
    path / "assets/datasets/training-set-1.csv",
    sep=";",
)

# Pull the raw texts (X) and their labels (y) out as plain Python lists
X_train = training_set_df["texts"].tolist()
y_train = training_set_df["targets"].tolist()

### Data Preparation
---

#### Text Preprocessing

In [None]:
from pipeline.text_cleaning import TextCleaning
from pipeline.tokenize_mwt_pos_lemma import TokenizeMWTPOSLemma

# Ordered preprocessing steps: `text_cleaning` runs first and its output is
# handed to `tokenize_mwt_pos_lemma`.
text_preprocessing_steps = [
    ("text_cleaning", TextCleaning()),
    ("tokenize_mwt_pos_lemma", TokenizeMWTPOSLemma()),
]
text_preprocessing_pipeline: Pipeline = Pipeline(steps=text_preprocessing_steps)

# Only `transform` is called here (no `fit`). X_train is overwritten with the
# result, so re-running this cell alone would feed already-processed data back in.
X_train = text_preprocessing_pipeline.transform(X_train)

#### Feature Selection

In [None]:
from pipeline.data.stopwords import STOPWORDS
from pipeline.pos_filter import POSFilter
from pipeline.stopword_removal import StopWordRemoval
from pipeline.document_transformer import DocumentTransformer

# Per-step settings, kept in separate dicts so each one can be tuned on its own
pos_filter_hyperparameters = dict(pos=("ADJ", "ADV", "NOUN", "PART", "VERB"))
stopword_removal_hyperparameters = dict(stopwords=STOPWORDS)
document_transformer_hyperparameters = dict(feat_attrs=["lemma", "upos"])

# Instantiate every step with its settings and verbosity switched off
pos_filter_step = POSFilter(**pos_filter_hyperparameters, verbose=0)
stopword_removal_step = StopWordRemoval(**stopword_removal_hyperparameters, verbose=0)
document_transformer_step = DocumentTransformer(**document_transformer_hyperparameters, verbose=0)

feature_selection_pipeline: Pipeline = Pipeline(steps=[
    ("pos_filter", pos_filter_step),
    ("stopword_removal", stopword_removal_step),
    ("document_transformer", document_transformer_step),
])

# Only `transform` is called (no `fit`); X_train is replaced by the pipeline output.
X_train = feature_selection_pipeline.transform(X_train)

### Hyper-parameter Tuning
---

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC


def fun(arg):
    """Return ``arg`` unchanged.

    Passed to TfidfVectorizer as both ``preprocessor`` and ``tokenizer`` so
    the vectorizer forwards each document as-is instead of running its own
    preprocessing/tokenization.
    """
    return arg


# Fixed (non-searched) TF-IDF settings.
# NOTE(review): per the scikit-learn docs a custom `preprocessor` overrides the
# built-in strip_accents/lowercase stage, so `strip_accents` likely has no
# effect here -- confirm it is still intended.
tfidfvectorizer_hyperparameters = dict(
    encoding="ascii",
    decode_error="ignore",
    strip_accents="ascii",
    preprocessor=fun,
    tokenizer=fun,
    analyzer="word",
    token_pattern=None,
    max_df=1.0,
)

# Fixed (non-searched) LinearSVC settings
linearsvc_hyperparameters = dict(
    loss="squared_hinge",
    dual=False,
    multi_class="ovr",
    max_iter=1_000_000,
    random_state=42,
    tol=1e-4,
    fit_intercept=True,
)

# Vectorizer followed by the linear classifier; the step names are the
# prefixes used when addressing parameters as "<step>__<parameter>".
tfidf_step = TfidfVectorizer(**tfidfvectorizer_hyperparameters)
svc_step = LinearSVC(**linearsvc_hyperparameters)
classification_pipeline: Pipeline = Pipeline(steps=[
    ("tfidfvectorizer", tfidf_step),
    ("linearsvc", svc_step),
])

In [None]:
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit

# Silence scikit-learn ConvergenceWarning during the search; change "ignore"
# to "default" to see the warnings again.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Candidate values sampled by the randomized search, keyed as
# "<pipeline step>__<parameter>".
param_distributions = {
    "tfidfvectorizer__ngram_range": ((1, 1), (1, 2), (1, 3)),
    "tfidfvectorizer__min_df": (1, 3, 5, 10),
    "tfidfvectorizer__norm": ("l1", "l2"),
    "tfidfvectorizer__sublinear_tf": (True, False),
    "linearsvc__penalty": ("l1", "l2"),
    "linearsvc__C": (0.01, 0.1, 1),
    "linearsvc__intercept_scaling": (0.1, 1.0, 10, 100),
    "linearsvc__class_weight": (None, "balanced"),
}

# Search budget and cross-validation layout
n_iter = 200
n_splits = 5
train_size = 0.8
n_jobs = 1
verbose = 2

randomized_search = RandomizedSearchCV(
    estimator=classification_pipeline,
    param_distributions=param_distributions,
    n_iter=n_iter,
    scoring=make_scorer(matthews_corrcoef),
    n_jobs=n_jobs,
    cv=StratifiedShuffleSplit(n_splits=n_splits, train_size=train_size, random_state=42),
    verbose=verbose,
    random_state=42
)

# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments while the search is running.
t0 = time.perf_counter()
randomized_search.fit(X_train, y_train)
estimation = time.perf_counter() - t0

cv_results_df = pd.DataFrame(randomized_search.cv_results_)
# Shorten "param_<step>__<name>" columns to just "<name>"; other columns are kept as-is.
cv_results_df = cv_results_df.rename(
    lambda col_name: col_name.split("__")[-1] if col_name.startswith("param_") else col_name,
    axis="columns",
)

# Derive the exported columns from the search configuration so they stay in
# sync when the searched hyper-parameters or the number of splits change.
param_col_names = [param_name.split("__")[-1] for param_name in param_distributions]
split_score_col_names = [f"split{split_idx}_test_score" for split_idx in range(n_splits)]
col_names = [
    *param_col_names,
    *split_score_col_names,
    "mean_test_score",
    "mean_fit_time",
    "rank_test_score",
]

cv_results_df = cv_results_df.reindex(columns=col_names)
# Spell out missing hyper-parameter values (e.g. class_weight=None) as "None"
# so they survive the CSV round trip. Only the hyper-parameter columns are
# filled: NaN scores from failed fits stay NaN and the score columns stay numeric.
cv_results_df[param_col_names] = cv_results_df[param_col_names].fillna("None")
cv_results_df.to_csv(path / "assets/experiments/experiment_3_cv_results.csv", sep=";", index=False)

In [None]:
# Display the trimmed cross-validation results table (one row per sampled candidate).
cv_results_df

In [None]:
from datetime import timedelta

# Summarise the search: number of CV splits, number of candidates and elapsed time
elapsed = timedelta(seconds=estimation)
n_candidates = len(cv_results_df)

print(f'Fitted {randomized_search.n_splits_} folds of {n_candidates} candidates, finished in {elapsed}.')
print(f"Best score: {randomized_search.best_score_}")
print("Best hyper-parameters:")
# Kept as the final expression so the notebook renders the dict as the cell output
randomized_search.best_params_