---

In [1]:
from pathlib import Path

# Absolute working directory at the moment this cell runs, i.e. before the
# directory change in the next cell. Later cells locate the "assets/..." files
# relative to it.
path = Path.cwd()

In [2]:
cd ../

c:\Users\c10nGp4\OneDrive\Documents\GitHub\imbalance-multi-classification


In [3]:
import pickle
import time

from sklearn.pipeline import Pipeline

---

### Load Data
---

In [4]:
import pandas as pd

# Read the semicolon-separated training set from the assets folder.
training_set_path = path / "assets/datasets/training-set-1.csv"
training_set_df = pd.read_csv(training_set_path, sep=";")

# Pull the two columns used for modelling out as plain Python lists.
X_train = training_set_df["texts"].tolist()
y_train = training_set_df["targets"].tolist()

### Load Hyper-parameter Combinations
---

In [5]:
experiment_filename = "experiment_3_cv_results"

# Score, timing and rank columns of the earlier search; they are removed below
# once the rows have been ordered by rank.
score_and_rank_columns = [
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
    'split3_test_score',
    'split4_test_score',
    'mean_test_score',
    'mean_fit_time',
    'rank_test_score',
]

# Keep the 20 best-ranked rows of the earlier experiment, then drop the
# score/timing/rank columns listed above.
experiment_cv_results_df = (
    pd.read_csv(path / f'assets/experiments/{experiment_filename}.csv', delimiter=";")
    .sort_values("rank_test_score")
    .head(20)
    .drop(columns=score_and_rank_columns)
)

### Data Preparation
---

#### Text Preprocessing

In [6]:
from pipeline.text_cleaning import TextCleaning
from pipeline.tokenize_mwt_pos_lemma import TokenizeMWTPOSLemma

# Two-stage text preprocessing: cleaning first, then the
# tokenize / MWT / POS / lemma stage.
text_preprocessing_steps = [
    ("text_cleaning", TextCleaning()),
    ("tokenize_mwt_pos_lemma", TokenizeMWTPOSLemma()),
]
text_preprocessing_pipeline: Pipeline = Pipeline(steps=text_preprocessing_steps)

# The pipeline is applied with `transform` only (no `fit` call is made here).
# X_train is rebound from the raw texts to the preprocessed output, so running
# this cell twice would feed already-processed data back in.
X_train = text_preprocessing_pipeline.transform(X_train)

2023-06-12 09:52:00 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


2023-06-12 09:52:00 INFO: DOWNLOAD STANZA MODEL
2023-06-12 09:52:00 INFO: LOAD STANZA PIPELINE: tokenize,mwt,pos,lemma


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-12 09:52:01 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-06-12 09:52:01 INFO: Using device: cuda
2023-06-12 09:52:01 INFO: Loading: tokenize
2023-06-12 09:52:03 INFO: Loading: mwt
2023-06-12 09:52:03 INFO: Loading: pos
2023-06-12 09:52:03 INFO: Loading: lemma
2023-06-12 09:52:03 INFO: Done loading processors!


2023-06-12 09:52:03 INFO: TEXT CLEANING
2023-06-12 09:52:04 INFO: TOKENIZE, MWT, POS, LEMMA


### Hyper-parameter Tuning
---

In [7]:
from pipeline.data.stopwords import STOPWORDS
from pipeline.pos_filter import POSFilter
from pipeline.stopword_removal import StopWordRemoval
from pipeline.document_transformer import DocumentTransformer

# Baseline settings for the three feature-selection steps. The experiment loop
# further down restores these before overriding one parameter at a time.
pos_filter_hyperparameters = {"pos": ("ADJ","ADV","NOUN","PART","VERB")}
stopword_removal_hyperparameters = {"stopwords": STOPWORDS}
document_transformer_hyperparameters = {"feat_attrs": ["lemma","upos"]}

# Steps run in this order: POS filter -> stop-word removal -> document
# transformer. Every step is created with verbose=0.
feature_selection_steps = [
    ("pos_filter", POSFilter(verbose=0, **pos_filter_hyperparameters)),
    ("stopword_removal", StopWordRemoval(verbose=0, **stopword_removal_hyperparameters)),
    ("document_transformer", DocumentTransformer(verbose=0, **document_transformer_hyperparameters)),
]
feature_selection_pipeline: Pipeline = Pipeline(steps=feature_selection_steps)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

def fun(arg):
    """Identity function: hand ``arg`` back unchanged.

    Supplied to ``TfidfVectorizer`` as both its ``preprocessor`` and its
    ``tokenizer``.
    """
    return arg

# Fixed TfidfVectorizer settings. `fun` (identity) is plugged in as both the
# preprocessor and the tokenizer; `token_pattern` is explicitly None.
tfidfvectorizer_hyperparameters = {
    "encoding": "ascii",
    "decode_error": "ignore",
    "strip_accents": "ascii",
    "preprocessor": fun,
    "tokenizer": fun,
    "analyzer": "word",
    "token_pattern": None,
}

# Fixed LinearSVC settings; the searched parameters are supplied later through
# the `linearsvc__*` keys of the parameter distributions.
linearsvc_hyperparameters = {
    "loss": "squared_hinge",
    "dual": False,
    "multi_class": "ovr",
    "max_iter": 1000000,
    "random_state": 42,
}

# Vectorizer followed by the linear SVM; the step names are the prefixes used
# by `restore_param`.
classification_steps = [
    ("tfidfvectorizer", TfidfVectorizer(**tfidfvectorizer_hyperparameters)),
    ("linearsvc", LinearSVC(**linearsvc_hyperparameters)),
]
classification_pipeline: Pipeline = Pipeline(steps=classification_steps)

In [9]:
import ast

def restore_dtype(x):
    """Convert a hyper-parameter value read back from CSV into a Python object.

    Non-string inputs are returned untouched. Strings first have every space
    removed and are then parsed:

    * ``"None"``, ``"True"``, ``"False"`` and anything containing ``(``, ``{``
      or ``.`` are evaluated with ``ast.literal_eval`` (tuples, dicts, floats);
    * everything else is tried as an ``int`` and, failing that, as a float
      written in exponent notation without a decimal point (e.g. ``"1e-05"``).

    Args:
        x: Raw cell value.

    Returns:
        The parsed object, or the space-stripped string when the text is not a
        recognised literal (e.g. ``"balanced"``, ``"l2"``).
    """
    if not isinstance(x, str):
        return x

    compact = x.replace(" ", "")
    looks_like_literal = (
        compact in ("None", "True", "False")
        or any(marker in compact for marker in ("(", "{", "."))
    )

    if looks_like_literal:
        try:
            return ast.literal_eval(compact)
        except (ValueError, TypeError, SyntaxError):
            # Not a valid literal after all (e.g. "a.b"): keep it as text.
            return compact

    try:
        return int(compact)
    except ValueError:
        pass

    try:
        number = float(compact)
    except ValueError:
        return compact

    # float() also accepts words such as "nan" or "inf"; those stay strings so
    # that only digit-bearing exponent forms are converted.
    if any(ch.isdigit() for ch in compact):
        return number
    return compact

def restore_param(x):
    """Prefix a bare hyper-parameter name with its pipeline step name.

    Args:
        x: Column name from the experiment results, e.g. ``"min_df"`` or ``"C"``.

    Returns:
        ``"tfidfvectorizer__<x>"`` for vectorizer parameters and
        ``"linearsvc__<x>"`` for classifier parameters.

    Raises:
        ValueError: If ``x`` is not one of the known parameter names.
    """
    tfidf_params = ("ngram_range", "min_df", "max_df", "norm", "sublinear_tf")
    svc_params = ("penalty", "C", "fit_intercept", "intercept_scaling", "class_weight")

    if x in tfidf_params:
        return f'tfidfvectorizer__{x}'

    if x in svc_params:
        return f'linearsvc__{x}'

    # A bare `raise` here would only produce "No active exception to reraise";
    # name the offending column instead.
    raise ValueError(f"Unknown hyper-parameter column: {x!r}")

# One single-candidate search space per row of the loaded experiment results:
# every value is wrapped in a 1-tuple and keyed by its pipeline-prefixed name.
param_distributions = []

for row in experiment_cv_results_df.T.to_dict(orient="dict").values():
    candidate = {}

    for column, raw_value in row.items():
        value = restore_dtype(raw_value)

        # An `intercept_scaling` that restores to None is replaced by 1.0.
        if column == "intercept_scaling" and value is None:
            value = 1.0

        candidate[restore_param(column)] = (value,)

    param_distributions.append(candidate)

for candidate in param_distributions:
    print(candidate)

{'tfidfvectorizer__ngram_range': ((1, 2),), 'tfidfvectorizer__min_df': (1,), 'tfidfvectorizer__norm': ('l2',), 'tfidfvectorizer__sublinear_tf': (False,), 'linearsvc__penalty': ('l1',), 'linearsvc__C': (1.0,), 'linearsvc__intercept_scaling': (1.0,), 'linearsvc__class_weight': ('balanced',)}
{'tfidfvectorizer__ngram_range': ((1, 2),), 'tfidfvectorizer__min_df': (1,), 'tfidfvectorizer__norm': ('l2',), 'tfidfvectorizer__sublinear_tf': (True,), 'linearsvc__penalty': ('l1',), 'linearsvc__C': (1.0,), 'linearsvc__intercept_scaling': (100.0,), 'linearsvc__class_weight': (None,)}
{'tfidfvectorizer__ngram_range': ((1, 3),), 'tfidfvectorizer__min_df': (3,), 'tfidfvectorizer__norm': ('l2',), 'tfidfvectorizer__sublinear_tf': (True,), 'linearsvc__penalty': ('l2',), 'linearsvc__C': (0.1,), 'linearsvc__intercept_scaling': (10.0,), 'linearsvc__class_weight': ('balanced',)}
{'tfidfvectorizer__ngram_range': ((1, 3),), 'tfidfvectorizer__min_df': (3,), 'tfidfvectorizer__norm': ('l2',), 'tfidfvectorizer__sub

In [10]:
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit

from pipeline.pos_filter import POS

# ConvergenceWarning is silenced globally; change "ignore" to "default" to see
# the warnings again.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Search settings.
n_iter = 1000  # requested number of candidates; the run log reports 20 candidates per search
n_splits = 5
train_size = 0.8
n_jobs = 1
verbose = 2

# Each entry is (pipeline step name, parameter to override, {label: value}).
# Every label/value pair is evaluated in its own search.
experiments = [
    ("pos_filter", "pos", {"without": POS}),
    ("stopword_removal", "stopwords", {"without": []}),
    (
        "document_transformer",
        "feat_attrs",
        {
            "text": ["text"],
            "lemma": ["lemma"],
            "text.upos": ["text","upos"],
            "lemma.upos": ["lemma","upos"]
        }
    )
]

# Baseline parameters of each feature-selection step, restored before every run
# so that exactly one parameter differs from the baseline at a time.
baseline_step_params = {
    "pos_filter": pos_filter_hyperparameters,
    "stopword_removal": stopword_removal_hyperparameters,
    "document_transformer": document_transformer_hyperparameters,
}

# Scorer and splitter are identical for every run, so they are built once.
mcc_scorer = make_scorer(matthews_corrcoef)
cv_splitter = StratifiedShuffleSplit(n_splits=n_splits, train_size=train_size, random_state=42)

results = {}

for transformer_name, parameter_name, parameters in experiments:
    for parameter_label, parameter_value in parameters.items():
        steps = feature_selection_pipeline.named_steps

        # Reset all steps to the baseline, then apply the single override.
        for step_name, step_params in baseline_step_params.items():
            steps[step_name].set_params(**step_params)

        steps[transformer_name].set_params(**{parameter_name: parameter_value})

        X_selected = feature_selection_pipeline.transform(X_train)

        randomized_search = RandomizedSearchCV(
            estimator=classification_pipeline,
            param_distributions=param_distributions,
            n_iter=n_iter,
            scoring=mcc_scorer,
            n_jobs=n_jobs,
            cv=cv_splitter,
            verbose=verbose,
            random_state=42
        )

        randomized_search.fit(X_selected, y_train)

        # NOTE(review): cv_results_ rows follow the order in which
        # RandomizedSearchCV sampled the candidates, which may differ from the
        # order of `param_distributions` - confirm before aligning rows with
        # the ranks of the earlier experiment.
        cv_results = randomized_search.cv_results_
        results[f'{parameter_name}__{parameter_label}__mean_test_score'] = cv_results["mean_test_score"]
        results[f'{parameter_name}__{parameter_label}__mean_fit_time'] = cv_results["mean_fit_time"]




Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.5s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time= 2.4min
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.3s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, t



Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.5s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time= 3.1min
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.4s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, t



Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.5s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.5s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.5s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, t



Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time= 2.5min
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.3s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time= 2.3min
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, t



Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time= 2.6min
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.2s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.3s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, t



Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.3s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.3s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, tfidfvectorizer__ngram_range=(1, 2), tfidfvectorizer__norm=l2, tfidfvectorizer__sublinear_tf=False; total time=   0.3s
[CV] END linearsvc__C=1.0, linearsvc__class_weight=balanced, linearsvc__intercept_scaling=1.0, linearsvc__penalty=l1, tfidfvectorizer__min_df=1, t

In [11]:
# Collect the per-run score / fit-time arrays into one table (one column per
# `results` key) and write it next to the other experiment files.
results_path = path / f'assets/experiments/experiment_4_results_from_{experiment_filename}.csv'

results_df = pd.DataFrame(results)
results_df.to_csv(results_path, sep=";", index=False)

In [12]:
# Display the collected mean test scores and mean fit times, one pair of
# columns per experiment variant.
results_df

Unnamed: 0,pos__without__mean_test_score,pos__without__mean_fit_time,stopwords__without__mean_test_score,stopwords__without__mean_fit_time,feat_attrs__text__mean_test_score,feat_attrs__text__mean_fit_time,feat_attrs__lemma__mean_test_score,feat_attrs__lemma__mean_fit_time,feat_attrs__text.upos__mean_test_score,feat_attrs__text.upos__mean_fit_time,feat_attrs__lemma.upos__mean_test_score,feat_attrs__lemma.upos__mean_fit_time
0,0.636076,60.434651,0.63878,78.372572,0.654536,0.545763,0.639027,57.867685,0.648291,56.935168,0.638448,0.356971
1,0.63767,0.427876,0.638845,0.651813,0.651462,0.633233,0.634191,0.406069,0.6472,0.373502,0.639279,0.395181
2,0.638104,0.229345,0.637015,0.334634,0.64987,0.336482,0.6402,0.212455,0.640193,0.212092,0.63724,0.21245
3,0.636997,0.206539,0.63706,0.324946,0.648953,0.32111,0.639474,0.201209,0.639125,0.200206,0.637264,0.202652
4,0.635981,0.352052,0.635735,0.522549,0.645335,0.597543,0.635176,0.3186,0.636891,0.350436,0.636001,0.323011
5,0.637077,0.17955,0.635238,0.275357,0.650877,0.271384,0.640226,0.167505,0.635074,0.171846,0.635806,0.168918
6,0.635386,40.286697,0.635387,0.754641,0.651154,21.422666,0.63699,0.466986,0.644335,12.966672,0.636037,15.468637
7,0.634407,22.873041,0.636145,0.940769,0.652196,0.938247,0.636632,0.556394,0.644024,0.526441,0.637145,0.522645
8,0.636124,0.262423,0.633948,0.228952,0.649942,0.226364,0.636932,0.145435,0.640934,0.160106,0.634267,0.154384
9,0.633699,0.211444,0.632258,0.191719,0.645837,0.202513,0.637667,0.119472,0.633854,0.12189,0.632628,0.121913
