---

In [1]:
from pathlib import Path

# Remember the directory the kernel started in: a later cell changes the
# working directory, and the dataset is read relative to this location.
path = Path.cwd()

In [2]:
cd ../

c:\Users\c10nGp4\OneDrive\Documents\GitHub\imbalance-multi-classification


In [3]:
import pickle
import time

from sklearn.pipeline import Pipeline

---

### Load Data
---

In [4]:
import pandas as pd

# Read the semicolon-separated training set stored next to the notebook's
# start directory.
training_set_df = pd.read_csv(path / "assets/datasets/training-set-1.csv", sep=";")

# Split the frame into plain Python lists: raw texts (features) and their
# class labels (targets).
X_train = training_set_df["texts"].tolist()
y_train = training_set_df["targets"].tolist()

### Data Preparation
---

#### Text Preprocessing

In [5]:
from pipeline.text_cleaning import TextCleaning
from pipeline.tokenize_mwt_pos_lemma import TokenizeMWTPOSLemma

# Two-stage text preprocessing: clean the raw text, then run the combined
# tokenize / MWT / POS / lemma stage. Both stages are built with verbose=0.
text_preprocessing_pipeline: Pipeline = Pipeline(
    steps=[
        ("text_cleaning", TextCleaning(verbose=0)),
        ("tokenize_mwt_pos_lemma", TokenizeMWTPOSLemma(verbose=0)),
    ]
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.1.json:   0%|   …

2023-06-14 10:45:45 INFO: Downloading default packages for language: id (Indonesian) ...
2023-06-14 10:45:45 INFO: File exists: C:\Users\c10nGp4\stanza_resources\id\default.zip
2023-06-14 10:45:47 INFO: Finished downloading models and saved to C:\Users\c10nGp4\stanza_resources.
2023-06-14 10:45:47 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-id/resolve/v1.5.0/models/pos/gsd.pt:   0%|          | 0.…

2023-06-14 10:45:53 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-06-14 10:45:53 INFO: Using device: cuda
2023-06-14 10:45:53 INFO: Loading: tokenize
2023-06-14 10:45:55 INFO: Loading: mwt
2023-06-14 10:45:55 INFO: Loading: pos
2023-06-14 10:45:55 INFO: Loading: lemma
2023-06-14 10:45:55 INFO: Done loading processors!


### Hyperparameter Tuning
---

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

def fun(arg):
    """Return ``arg`` unchanged.

    Used below as both the ``preprocessor`` and the ``tokenizer`` of the
    TfidfVectorizer, so the vectorizer applies no string preprocessing or
    tokenization of its own (presumably because the documents arrive already
    tokenized -- confirm against the feature-selection output). It is a named
    module-level function rather than an inline expression.
    """
    return arg

# Keyword arguments for TfidfVectorizer. The identity `fun` is plugged in as
# both preprocessor and tokenizer, and `token_pattern` is explicitly None
# because a custom tokenizer is supplied.
# NOTE(review): with a custom preprocessor, `strip_accents` looks like it has
# no effect -- confirm against the scikit-learn documentation.
tfidfvectorizer_hyperparameters = dict(
    encoding="ascii",
    decode_error="ignore",
    strip_accents="ascii",
    preprocessor=fun,
    tokenizer=fun,
    analyzer="word",
    token_pattern=None,
    ngram_range=(1, 1),
    min_df=1,
    max_df=1.0,
    norm="l2",
    sublinear_tf=False,
)

# Keyword arguments for LinearSVC: L2-penalised squared hinge loss solved in
# the primal (dual=False), one-vs-rest multi-class handling, balanced class
# weights and a fixed random seed.
linearsvc_hyperparameters = dict(
    loss="squared_hinge",
    dual=False,
    multi_class="ovr",
    max_iter=1_000_000,
    random_state=42,
    tol=1e-4,
    penalty="l2",
    C=1,
    fit_intercept=True,
    intercept_scaling=1.0,
    class_weight="balanced",
)

# Classifier under evaluation: TF-IDF features fed into a linear SVM, each
# configured from the hyper-parameter dictionaries defined above in this cell.
classification_pipeline: Pipeline = Pipeline(
    steps=[
        ("tfidfvectorizer", TfidfVectorizer(**tfidfvectorizer_hyperparameters)),
        ("linearsvc", LinearSVC(**linearsvc_hyperparameters)),
    ]
)

In [7]:
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_validate

from pipeline.pos_filter import POS
from pipeline.data.stopwords import STOPWORDS
from pipeline.pos_filter import POSFilter
from pipeline.stopword_removal import StopWordRemoval
from pipeline.document_transformer import DocumentTransformer

# Silence scikit-learn ConvergenceWarning messages during the repeated
# cross-validation runs; change "ignore" to "default" to surface them again.
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

# Evaluation settings used by the loop in this cell.
# (`n_iter` is not referenced by any cell visible here.)
n_iter = 10_000
n_splits = 5
train_size = 0.8
n_jobs = 1
verbose = 0

# One (parameter override, best-fold vocabulary size, per-fold DataFrame)
# tuple is appended per evaluated setting.
results = []

# One-at-a-time ablation of the feature-selection steps: each entry names a
# pipeline step and the alternative settings to evaluate for it, while the
# other steps keep the baseline configuration built inside the loop.
ablation_grid = [
    (
        "pos_filter",
        [
            {"pos": ("ADJ", "ADV", "NOUN", "PART", "VERB")},
            {"pos": POS},
        ],
    ),
    (
        "stopword_removal",
        [
            {"stopwords": STOPWORDS},
            {"stopwords": None},
        ],
    ),
    (
        "document_transformer",
        [
            {"feat_attrs": ["text"]},
            {"feat_attrs": ["lemma"]},
            {"feat_attrs": ["text", "upos"]},
            {"feat_attrs": ["lemma", "upos"]},
        ],
    ),
]

# Loop-invariant evaluation objects. The integer random_state makes the
# splitter produce the same folds on every call, so one instance is shared.
mcc_scorer = make_scorer(matthews_corrcoef)
splitter = StratifiedShuffleSplit(n_splits=n_splits, train_size=train_size, random_state=42)

# Not referenced by the loop below; kept defined in case another cell reads it.
col_names = [
    "split0_test_score", "split1_test_score", "split2_test_score",
    "split3_test_score", "split4_test_score", "mean_test_score",
    "mean_fit_time",
]

for pipe, params in ablation_grid:
    for param in params:
        # Start from a fresh baseline pipeline, then override only the step
        # currently being varied.
        feature_selection_pipeline: Pipeline = Pipeline(steps=[
            ("pos_filter", POSFilter(pos=("ADJ", "ADV", "NOUN", "PART", "VERB"), verbose=0)),
            ("stopword_removal", StopWordRemoval(stopwords=STOPWORDS, verbose=0)),
            ("document_transformer", DocumentTransformer(feat_attrs=["lemma", "upos"], verbose=0)),
        ])
        feature_selection_pipeline.named_steps[pipe].set_params(**param)

        # NOTE(review): the text-preprocessing output does not depend on
        # `param`; it could be computed once before the loops if the
        # feature-selection transformers are confirmed not to mutate their
        # input. Left in place because that cannot be verified from here.
        X_temp = text_preprocessing_pipeline.transform(X_train)
        X_temp = feature_selection_pipeline.transform(X_temp)

        cv = cross_validate(
            classification_pipeline,
            X_temp,
            y_train,
            scoring=mcc_scorer,
            cv=splitter,
            n_jobs=n_jobs,
            verbose=verbose,
            return_estimator=True,
        )

        # Vocabulary size of the vectorizer fitted on the best-scoring fold
        # (the first fold wins ties).
        fold_scores = cv["test_score"].tolist()
        best_fold = fold_scores.index(max(fold_scores))
        best_vocabulary = cv["estimator"][best_fold].named_steps["tfidfvectorizer"].vocabulary_

        # Keep only the per-fold columns that are reported; the fitted
        # estimators and the scoring times are dropped.
        fold_table = pd.DataFrame(
            {key: values for key, values in cv.items() if key not in ("estimator", "score_time")}
        )

        results.append((param, len(best_vocabulary), fold_table))

In [32]:
# Report every evaluated setting: the parameter override, the vocabulary size
# of the best fold's vectorizer, the mean fit time, the mean test score and the
# per-fold table (transposed so the folds become columns).
for (param, n, cv_results_df) in results:
    print(param)
    print(n)
    # Average over however many folds were actually run instead of dividing by
    # a literal 5, which would silently be wrong if `n_splits` changes.
    print(cv_results_df["fit_time"].mean())
    print(cv_results_df["test_score"].mean())
    print(cv_results_df.T)
    print()

{'pos': ('ADJ', 'ADV', 'NOUN', 'PART', 'VERB')}
7309
0.23271293640136717
0.602673158318661
                   0         1         2         3         4
fit_time    0.238964  0.236616  0.239463  0.231777  0.216743
test_score  0.601934  0.599224  0.598257  0.602419  0.611532

{'pos': {'AUX', 'CCONJ', 'NOUN', 'PROPN', 'ADP', 'ADJ', 'ADV', 'SYM', 'PART', 'VERB', 'X', 'NUM', 'PUNCT', 'DET', 'INTJ', 'SCONJ', 'PRON'}}
8087
0.3093985080718994
0.6025636567351069
                   0         1         2         3         4
fit_time    0.319655  0.324573  0.287171  0.304089  0.311505
test_score  0.601963  0.596623  0.603343  0.604974  0.605915

{'stopwords': {'berdatangan', 'dibuat', 'dimisalkan', 'setibanya', 'dikarenakan', 'ibaratnya', 'rasanya', 'ditunjuki', 'kedua', 'menantikan', 'sebagai', 'harus', 'kelihatannya', 'kata', 'jadi', 'meyakini', 'berjumlah', 'padahal', 'beginilah', 'sepanjang', 'kalau', 'beri', 'bersama', 'berakhirlah', 'enggaknya', 'kan', 'pertama-tama', 'kini', 'selama-lamanya

In [21]:
# Show the POS tags the most recently built feature-selection pipeline keeps.
feature_selection_pipeline["pos_filter"].get_params()["pos"]

('ADJ', 'ADV', 'NOUN', 'PART', 'VERB')

In [23]:
# Number of items in the most recent feature-selection output `X_temp`
# (presumably one per training document -- TODO confirm).
len(X_temp)

14160