---

In [1]:
from pathlib import Path

# Absolute path of the directory the kernel is in when this cell runs.
# It is captured once here, so later working-directory changes do not move it.
path = Path.cwd()

In [2]:
cd ../

c:\Users\c10nGp4\OneDrive\Documents\GitHub\imbalance-multi-classification


In [3]:
import time

from sklearn.pipeline import Pipeline

---

### Load Data
---

In [4]:
import pandas as pd

# Read the semicolon-separated training set located relative to `path`
training_set_path = path / "assets/datasets/training-set-1.csv"
training_set_df = pd.read_csv(training_set_path, sep=";")

# Pull the "texts" (inputs) and "targets" (labels) columns out as plain Python lists
X_train, y_train = (list(training_set_df[column]) for column in ("texts", "targets"))

### Data Preparation
---

#### Text Preprocessing

In [5]:
from pipeline.text_cleaning import TextCleaning
from pipeline.tokenize_mwt_pos_lemma import TokenizeMWTPOSLemma

# Text preprocessing runs in two stages: cleaning first, then the combined
# tokenize / MWT / POS / lemma stage.
text_preprocessing_steps = [
    ("text_cleaning", TextCleaning()),
    ("tokenize_mwt_pos_lemma", TokenizeMWTPOSLemma()),
]
text_preprocessing_pipeline: Pipeline = Pipeline(text_preprocessing_steps)

# Only transform() is called (no fit step); X_train is replaced by the processed output,
# so re-running this cell would feed already-processed data back in.
X_train = text_preprocessing_pipeline.transform(X_train)

2023-06-12 06:52:13 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


2023-06-12 06:52:13 INFO: DOWNLOAD STANZA MODEL
2023-06-12 06:52:13 INFO: LOAD STANZA PIPELINE: tokenize,mwt,pos,lemma


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-12 06:52:13 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-06-12 06:52:13 INFO: Using device: cuda
2023-06-12 06:52:13 INFO: Loading: tokenize
2023-06-12 06:52:16 INFO: Loading: mwt
2023-06-12 06:52:16 INFO: Loading: pos
2023-06-12 06:52:16 INFO: Loading: lemma
2023-06-12 06:52:16 INFO: Done loading processors!


2023-06-12 06:52:16 INFO: TEXT CLEANING
2023-06-12 06:52:18 INFO: TOKENIZE, MWT, POS, LEMMA


#### Feature Selection

In [6]:
from pipeline.data.stopwords import STOPWORDS
from pipeline.pos_filter import POSFilter
from pipeline.stopword_removal import StopWordRemoval
from pipeline.document_transformer import DocumentTransformer

# Per-step settings, kept as separate dicts so each step's configuration is visible at a glance
pos_filter_hyperparameters = dict(pos=("ADJ", "ADV", "NOUN", "PART", "VERB"))
stopword_removal_hyperparameters = dict(stopwords=STOPWORDS)
document_transformer_hyperparameters = dict(feat_attrs=["lemma", "upos"])

# Feature selection stages, in order: POS filter -> stop-word removal -> document transformer.
# Every stage is created with verbose=0.
feature_selection_steps = [
    ("pos_filter", POSFilter(**pos_filter_hyperparameters, verbose=0)),
    ("stopword_removal", StopWordRemoval(**stopword_removal_hyperparameters, verbose=0)),
    ("document_transformer", DocumentTransformer(**document_transformer_hyperparameters, verbose=0)),
]
feature_selection_pipeline: Pipeline = Pipeline(feature_selection_steps)

# Only transform() is called (no fit step); X_train is overwritten with the selected features.
X_train = feature_selection_pipeline.transform(X_train)

### Hyper-parameter Tuning
---

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

def fun(arg):
    """Identity function: return the argument unchanged.

    Used below as both the ``preprocessor`` and the ``tokenizer`` entry of the
    TF-IDF settings (presumably because the documents are already tokenised
    by the earlier pipelines — confirm).
    """
    return arg


# Keyword arguments for TfidfVectorizer. Both callables are the identity function,
# and token_pattern is explicitly None alongside the custom tokenizer.
tfidfvectorizer_hyperparameters = dict(
    encoding="ascii",
    decode_error="ignore",
    strip_accents="ascii",
    preprocessor=fun,
    tokenizer=fun,
    analyzer="word",
    token_pattern=None,
)

# Fixed (non-searched) keyword arguments for LinearSVC; random_state is pinned to 42
linearsvc_hyperparameters = dict(
    loss="squared_hinge",
    dual=False,
    tol=0.0001,
    multi_class="ovr",
    max_iter=1_000_000,
    random_state=42,
)

# Classifier under tuning: TF-IDF features feeding a linear SVM.
# The step names are the prefixes used by the "<step>__<param>" keys of the search space.
classification_pipeline: Pipeline = Pipeline(
    steps=[
        ("tfidfvectorizer", TfidfVectorizer(**tfidfvectorizer_hyperparameters)),
        ("linearsvc", LinearSVC(**linearsvc_hyperparameters)),
    ]
)

In [8]:
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit

# Silence ConvergenceWarning for the rest of the session.
# To see the warnings again, change the action from "ignore" to "default".
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

# Candidate values on a log scale, used for both C and intercept_scaling
_log_scale_grid = (0.001, 0.01, 0.1, 1, 10, 100, 1000)

# Hyper-parameters searched identically in both sub-spaces
_shared_space = {
    "tfidfvectorizer__ngram_range": ((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)),
    "tfidfvectorizer__min_df": (1, 3, 5, 10, 25),
    "tfidfvectorizer__max_df": (0.001, 0.01, 0.1, 1.0),
    "tfidfvectorizer__norm": (None, "l1", "l2"),
    "tfidfvectorizer__sublinear_tf": (True, False),
    "linearsvc__penalty": ("l1", "l2"),
    "linearsvc__C": _log_scale_grid,
    "linearsvc__class_weight": (None, "balanced"),
}

# NOTE(review): a recorded run of this notebook reports 72 of 1000 fits failing inside the
# pipeline's transformer fit step. Some min_df / max_df combinations here may be mutually
# incompatible — confirm by re-running the search with error_score="raise".
param_distributions = [
    # With an intercept: intercept_scaling is part of the search
    {
        **_shared_space,
        "linearsvc__fit_intercept": (True,),
        "linearsvc__intercept_scaling": _log_scale_grid,
    },
    # Without an intercept: intercept_scaling is left out of the search
    {
        **_shared_space,
        "linearsvc__fit_intercept": (False,),
    },
]

# Search budget
n_iter = 200      # number of sampled hyper-parameter candidates
n_jobs = 1        # fits run sequentially
verbose = 2

# Cross-validation layout: stratified shuffle splits, 80% of the data for training in each
n_splits = 5
train_size = 0.8

# Candidates are scored with the Matthews correlation coefficient
mcc_scorer = make_scorer(matthews_corrcoef)
shuffle_split_cv = StratifiedShuffleSplit(n_splits=n_splits, train_size=train_size, random_state=42)

# Both the splitter and the candidate sampling are seeded with 42
randomized_search = RandomizedSearchCV(
    classification_pipeline,
    param_distributions,
    n_iter=n_iter,
    scoring=mcc_scorer,
    n_jobs=n_jobs,
    cv=shuffle_split_cv,
    verbose=verbose,
    random_state=42,
)

# Time the whole search. perf_counter() is a monotonic, high-resolution clock, so the
# measured interval is not distorted if the system wall clock is adjusted mid-run.
t0 = time.perf_counter()
randomized_search.fit(X_train, y_train)
estimation = time.perf_counter() - t0  # elapsed seconds

# Flatten the search results into a table
cv_results_df = pd.DataFrame(randomized_search.cv_results_)

# Shorten hyper-parameter columns: "param_<step>__<name>" -> "<name>"; other columns keep their names
cv_results_df = cv_results_df.rename(
    columns=lambda col_name: col_name.split("__")[-1] if col_name.startswith("param_") else col_name
)

# Per-split score columns follow the number of CV splits actually used by the search,
# so the table stays correct if n_splits is changed.
split_score_cols = [f"split{i}_test_score" for i in range(randomized_search.n_splits_)]

# Columns to keep, in report order: hyper-parameters, scores, fit time, rank
col_names = [
    "ngram_range", "min_df", "max_df", "norm", "sublinear_tf",
    "penalty", "C", "fit_intercept", "intercept_scaling", "class_weight",
    *split_score_cols, "mean_test_score",
    "mean_fit_time",
    "rank_test_score",
]

cv_results_df = cv_results_df.reindex(columns=col_names)
# NOTE(review): fillna applies to every column, so besides unset hyper-parameters it also
# turns NaN scores (e.g. from failed fits) into the string "None" — confirm this is intended.
cv_results_df = cv_results_df.fillna("None")
cv_results_df.to_csv(path / "assets/experiments/experiment_1_cv_results.csv", sep=";", index=False)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV] END linearsvc__C=1000, linearsvc__class_weight=None, linearsvc__fit_intercept=True, linearsvc__intercept_scaling=0.001, linearsvc__penalty=l2, tfidfvectorizer__max_df=0.01, tfidfvectorizer__min_df=5, tfidfvectorizer__ngram_range=(2, 3), tfidfvectorizer__norm=l1, tfidfvectorizer__sublinear_tf=True; total time=   4.4s
[CV] END linearsvc__C=1000, linearsvc__class_weight=None, linearsvc__fit_intercept=True, linearsvc__intercept_scaling=0.001, linearsvc__penalty=l2, tfidfvectorizer__max_df=0.01, tfidfvectorizer__min_df=5, tfidfvectorizer__ngram_range=(2, 3), tfidfvectorizer__norm=l1, tfidfvectorizer__sublinear_tf=True; total time=   4.6s
[CV] END linearsvc__C=1000, linearsvc__class_weight=None, linearsvc__fit_intercept=True, linearsvc__intercept_scaling=0.001, linearsvc__penalty=l2, tfidfvectorizer__max_df=0.01, tfidfvectorizer__min_df=5, tfidfvectorizer__ngram_range=(2, 3), tfidfvectorizer__norm=l1, tfidfvectorizer__subli

72 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\c10nGp4\anaconda3\envs\imbalance-multi-classification\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\c10nGp4\anaconda3\envs\imbalance-multi-classification\lib\site-packages\sklearn\pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\c10nGp4\anaconda3\envs\imbalance-multi-classification\lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\c10

In [9]:
# Display the reshaped cross-validation results table as the cell output
cv_results_df

Unnamed: 0,ngram_range,min_df,max_df,norm,sublinear_tf,penalty,C,fit_intercept,intercept_scaling,class_weight,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,mean_fit_time,rank_test_score
0,"(2, 3)",5,0.010,l1,True,l2,1000.00,True,0.001,,0.073749,0.08011,0.071846,0.041351,0.079678,0.069347,4.568763,125
1,"(1, 3)",25,1.000,l2,False,l2,0.01,False,,balanced,0.528253,0.517243,0.534561,0.531816,0.523547,0.527084,0.281454,40
2,"(2, 3)",25,0.001,l2,True,l2,1000.00,True,0.001,balanced,,,,,,,0.134429,185
3,"(2, 2)",1,0.001,l1,True,l1,100.00,True,0.1,,0.138052,0.113176,0.11629,0.10113,0.121588,0.118047,2.411102,98
4,"(3, 3)",25,0.010,l2,False,l1,100.00,True,1000,balanced,0.000834,-0.001632,0.021144,-0.005404,-0.007229,0.001543,0.123117,161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,"(1, 3)",25,1.000,,False,l2,1.00,True,1000,balanced,0.530497,0.534157,0.539728,0.533884,0.530841,0.533821,0.232899,35
196,"(2, 3)",25,0.010,l1,False,l2,0.01,True,0.01,,0.047227,0.072866,0.080138,0.037986,0.060941,0.059832,0.106526,134
197,"(2, 2)",1,0.001,l2,True,l2,100.00,False,,,0.120888,0.127675,0.108786,0.121595,0.12275,0.120339,2.614573,97
198,"(1, 1)",25,0.100,,True,l2,10.00,True,0.01,balanced,0.527736,0.531907,0.535517,0.541849,0.526237,0.532649,4.928613,36


In [10]:
from datetime import timedelta

# Summarise the search: split count, number of candidates, and elapsed time as H:MM:SS
n_candidates = len(cv_results_df)
elapsed = str(timedelta(seconds=estimation))
print(f'Fitted {randomized_search.n_splits_} folds of {n_candidates} candidates, finished in {elapsed}.')
print(f"Best score: {randomized_search.best_score_}")
print("Best hyper-parameters:")
# Left as the last expression so the notebook renders the dict as the cell output
randomized_search.best_params_

Fitted 5 folds of 200 candidates, finished in 1:20:13.142074.
Best score: 0.6400934433728541
Best hyper-parameters:


{'tfidfvectorizer__sublinear_tf': False,
 'tfidfvectorizer__norm': 'l2',
 'tfidfvectorizer__ngram_range': (1, 3),
 'tfidfvectorizer__min_df': 1,
 'tfidfvectorizer__max_df': 1.0,
 'linearsvc__penalty': 'l1',
 'linearsvc__intercept_scaling': 10,
 'linearsvc__fit_intercept': True,
 'linearsvc__class_weight': None,
 'linearsvc__C': 1}