---

In [1]:
from pathlib import Path

# Remember the directory the kernel starts in. It is captured before the next cell changes the
# working directory, and the dataset / experiment file paths used later are built from it.
path = Path.cwd()

In [2]:
import os

# Switch to the parent of the notebook's start directory (`path`, captured in the first cell),
# presumably so that the project-local `pipeline` package imported by later cells is importable.
# An absolute target keeps this cell idempotent: a relative `cd ../` climbs one more level
# every time the cell is re-run.
os.chdir(path.parent)
print(Path.cwd())

c:\Users\c10nGp4\OneDrive\Documents\GitHub\imbalance-multi-classification


In [3]:
import pickle
import time

from sklearn.pipeline import Pipeline

---

### Load Data
---

In [4]:
import pandas as pd

# Read the semicolon-separated training set stored under the notebook's start directory.
training_set_path = path / "assets/datasets/training-set-1.csv"
training_set_df = pd.read_csv(training_set_path, sep=";")

# Split the frame into the raw texts (inputs) and their labels (targets) as plain lists.
X_train = training_set_df["texts"].tolist()
y_train = training_set_df["targets"].tolist()

### Data Preparation
---

#### Text Preprocessing

In [5]:
from pipeline.text_cleaning import TextCleaning
from pipeline.tokenize_mwt_pos_lemma import TokenizeMWTPOSLemma

# Preprocessing applied to the raw texts, in order: text cleaning, then the combined
# tokenize / MWT / POS / lemma step (both are project-local transformers).
text_preprocessing_pipeline: Pipeline = Pipeline([
    ("text_cleaning", TextCleaning()),
    ("tokenize_mwt_pos_lemma", TokenizeMWTPOSLemma())
])

# Always start from the raw texts of the loaded frame instead of from the current `X_train`:
# re-running this cell then reproduces the same result rather than preprocessing data that
# has already been preprocessed.
# NOTE(review): transform() is called on a pipeline that was never fitted — confirm that both
# steps are stateless.
X_train = text_preprocessing_pipeline.transform(list(training_set_df["texts"]))

2023-06-12 08:45:56 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


2023-06-12 08:45:56 INFO: DOWNLOAD STANZA MODEL
2023-06-12 08:45:56 INFO: LOAD STANZA PIPELINE: tokenize,mwt,pos,lemma


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-12 08:45:56 INFO: Loading these models for language: id (Indonesian):
| Processor | Package |
-----------------------
| tokenize  | gsd     |
| mwt       | gsd     |
| pos       | gsd     |
| lemma     | gsd     |

2023-06-12 08:45:56 INFO: Using device: cuda
2023-06-12 08:45:56 INFO: Loading: tokenize
2023-06-12 08:45:58 INFO: Loading: mwt
2023-06-12 08:45:58 INFO: Loading: pos
2023-06-12 08:45:58 INFO: Loading: lemma
2023-06-12 08:45:58 INFO: Done loading processors!


2023-06-12 08:45:58 INFO: TEXT CLEANING
2023-06-12 08:46:00 INFO: TOKENIZE, MWT, POS, LEMMA


#### Feature Selection

In [6]:
from pipeline.data.stopwords import STOPWORDS
from pipeline.pos_filter import POSFilter
from pipeline.stopword_removal import StopWordRemoval
from pipeline.document_transformer import DocumentTransformer

# Settings of the three feature-selection steps, kept in named dicts so they can be inspected
# or reused independently of the pipeline object.
pos_filter_hyperparameters = {"pos": ("ADJ","ADV","NOUN","PART","VERB")}
stopword_removal_hyperparameters = {"stopwords": STOPWORDS}
document_transformer_hyperparameters = {"feat_attrs": ["lemma","upos"]}

# Instantiate every step explicitly (all with verbose=0), then chain them in order:
# POS filter -> stop-word removal -> document transformation.
pos_filter_step = POSFilter(verbose=0, **pos_filter_hyperparameters)
stopword_removal_step = StopWordRemoval(verbose=0, **stopword_removal_hyperparameters)
document_transformer_step = DocumentTransformer(verbose=0, **document_transformer_hyperparameters)

feature_selection_pipeline: Pipeline = Pipeline(steps=[
    ("pos_filter", pos_filter_step),
    ("stopword_removal", stopword_removal_step),
    ("document_transformer", document_transformer_step),
])

# NOTE: X_train is replaced by its own transformed version, so this cell expects the output of
# the text-preprocessing cell as input and must not be re-run on its own result.
X_train = feature_selection_pipeline.transform(X_train)

### Hyper-parameter Tuning
---

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

def fun(arg):
    """Return ``arg`` unchanged (identity function).

    It is passed below as both the ``preprocessor`` and the ``tokenizer`` of the
    TF-IDF vectorizer configuration, so neither of those stages modifies a document.
    Presumably the documents are already tokenized by the earlier pipelines — confirm.
    """
    return arg

# Baseline TF-IDF configuration. `fun` (identity) is plugged in as preprocessor and tokenizer,
# and token_pattern is disabled since a tokenizer is supplied.
# NOTE(review): per the scikit-learn docs a callable preprocessor overrides the strip_accents
# and lowercase stage — confirm whether "strip_accents" has any effect in this setup.
tfidfvectorizer_hyperparameters = {
    # Input decoding / normalisation
    "encoding": "ascii",
    "decode_error": "ignore",
    "strip_accents": "ascii",
    # Analysis chain
    "preprocessor": fun,
    "tokenizer": fun,
    "analyzer": "word",
    "token_pattern": None,
    # Vocabulary construction
    "ngram_range": (1, 1),
    "min_df": 1,
    "max_df": 1.0,
    # Weighting
    "norm": "l2",
    "sublinear_tf": False,
}

# Baseline LinearSVC configuration; random_state is fixed so repeated fits are reproducible.
linearsvc_hyperparameters = {
    # Problem formulation
    "loss": "squared_hinge",
    "dual": False,
    "multi_class": "ovr",
    # Optimisation
    "max_iter": 1000000,
    "random_state": 42,
    "penalty": "l2",
    "tol": 0.0001,
    "C": 1.0,
    # Intercept handling
    "fit_intercept": True,
    "intercept_scaling": 1.0,
    # Class imbalance handling
    "class_weight": "balanced",
}

# Build both estimators first, then chain them: TF-IDF features feed the linear SVM.
tfidf_step = TfidfVectorizer(**tfidfvectorizer_hyperparameters)
linearsvc_step = LinearSVC(**linearsvc_hyperparameters)

classification_pipeline: Pipeline = Pipeline(steps=[
    ("tfidfvectorizer", tfidf_step),
    ("linearsvc", linearsvc_step),
])

In [8]:
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import make_scorer, accuracy_score, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedShuffleSplit

# Silence ConvergenceWarning for the search so it does not flood the [CV] log.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# One dict per hyper-parameter (fit_intercept=True is coupled with intercept_scaling).
# A candidate drawn from one dict overrides only the keys of that dict; every other setting
# keeps the value `classification_pipeline` was constructed with.
param_distributions = [
    {"tfidfvectorizer__ngram_range": ((1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3))},
    {"tfidfvectorizer__min_df": (1, 3, 5, 10, 25)},
    {"tfidfvectorizer__max_df": (0.001, 0.01, 0.1, 1.0)},
    {"tfidfvectorizer__norm": (None, "l1", "l2")},
    {"tfidfvectorizer__sublinear_tf": (True, False)},
    {"linearsvc__penalty": ("l1", "l2")},
    {"linearsvc__C": (0.001, 0.01, 0.1, 1, 10, 100, 1000)},
    {
        "linearsvc__fit_intercept": (True,),
        "linearsvc__intercept_scaling": (0.001, 0.01, 0.1, 1, 10, 100, 1000)
    },
    {"linearsvc__fit_intercept": (False,)},
    {"linearsvc__class_weight": (None, "balanced")}
]

# NOTE: every entry above is a finite list (39 combinations in total), so the recorded run
# evaluated 39 candidates rather than `n_iter` of them.
n_iter = 10000
n_splits = 5
train_size = 0.8
n_jobs = 1
verbose = 2

randomized_search = RandomizedSearchCV(
    estimator=classification_pipeline,
    param_distributions=param_distributions,
    n_iter=n_iter,
    scoring=make_scorer(matthews_corrcoef),
    n_jobs=n_jobs,
    cv=StratifiedShuffleSplit(n_splits=n_splits, train_size=train_size, random_state=42),
    verbose=verbose,
    random_state=42
)

# perf_counter() is a monotonic clock, so the measured duration cannot be distorted by
# system clock adjustments during the search.
t0 = time.perf_counter()
randomized_search.fit(X_train, y_train)
estimation = time.perf_counter() - t0

cv_results_df = pd.DataFrame(randomized_search.cv_results_)
# Reduce "param_<step>__<name>" columns to the bare parameter name; other columns are kept.
cv_results_df = cv_results_df.rename(
    lambda col_name: col_name.split("__")[-1] if col_name.startswith("param_") else col_name,
    axis="columns",
)

# Per-split score columns follow n_splits instead of being hard-coded for five folds.
split_score_cols = [f"split{split_idx}_test_score" for split_idx in range(n_splits)]

col_names = [
    "ngram_range","min_df","max_df","norm","sublinear_tf",
    "penalty","C","fit_intercept","intercept_scaling","class_weight",
    *split_score_cols,"mean_test_score",
    "mean_fit_time",
    "rank_test_score",
    # Exact overrides of each candidate. Needed because the "None" fill below makes a parameter
    # that was not varied look identical to one explicitly set to None (norm, class_weight).
    "params",
]

cv_results_df = cv_results_df.reindex(columns=col_names)
cv_results_df = cv_results_df.fillna("None")

# Make sure the target folder exists so a finished search is not lost to a missing directory.
cv_results_path = path / "assets/experiments/experiment_2_cv_results.csv"
cv_results_path.parent.mkdir(parents=True, exist_ok=True)
cv_results_df.to_csv(cv_results_path, sep=";", index=False)



Fitting 5 folds for each of 39 candidates, totalling 195 fits
[CV] END ................tfidfvectorizer__ngram_range=(1, 1); total time=   0.2s
[CV] END ................tfidfvectorizer__ngram_range=(1, 1); total time=   0.1s
[CV] END ................tfidfvectorizer__ngram_range=(1, 1); total time=   0.1s
[CV] END ................tfidfvectorizer__ngram_range=(1, 1); total time=   0.1s
[CV] END ................tfidfvectorizer__ngram_range=(1, 1); total time=   0.1s
[CV] END ................tfidfvectorizer__ngram_range=(1, 2); total time=   0.4s
[CV] END ................tfidfvectorizer__ngram_range=(1, 2); total time=   0.5s
[CV] END ................tfidfvectorizer__ngram_range=(1, 2); total time=   0.7s
[CV] END ................tfidfvectorizer__ngram_range=(1, 2); total time=   0.8s
[CV] END ................tfidfvectorizer__ngram_range=(1, 2); total time=   0.7s
[CV] END ................tfidfvectorizer__ngram_range=(2, 2); total time=   0.7s
[CV] END ................tfidfvectorizer__ngram

In [9]:
# Display the cross-validation results table.
# NOTE(review): the saved output of this cell contains `Unnamed: 0` and `tol` columns that the
# frame built in the search cell does not define — the stored output looks stale; re-run to confirm.
cv_results_df

Unnamed: 0,ngram_range,min_df,max_df,norm,sublinear_tf,penalty,tol,C,fit_intercept,intercept_scaling,class_weight,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,mean_fit_time,rank_test_score
0,"(1, 1)",,,,,,,,,,,0.601934,0.599224,0.598257,0.602419,0.611532,0.602673,0.219851,12
1,"(1, 2)",,,,,,,,,,,0.617502,0.619286,0.612693,0.629683,0.631827,0.622198,0.697648,4
2,"(2, 2)",,,,,,,,,,,0.165468,0.165107,0.16078,0.15154,0.188462,0.166271,0.584062,36
3,"(1, 3)",,,,,,,,,,,0.618792,0.617633,0.610566,0.629861,0.635524,0.622475,0.654165,3
4,"(2, 3)",,,,,,,,,,,0.168206,0.166085,0.158762,0.149691,0.166421,0.161833,0.400776,37
5,"(3, 3)",,,,,,,,,,,0.050884,0.046324,0.029242,0.00644,0.04327,0.035232,0.23101,39
6,,1.0,,,,,,,,,,0.601934,0.599224,0.598257,0.602419,0.611532,0.602673,0.213691,12
7,,3.0,,,,,,,,,,0.591653,0.601493,0.600736,0.601016,0.61098,0.601176,0.139511,24
8,,5.0,,,,,,,,,,0.602883,0.605923,0.60302,0.604719,0.616016,0.606512,0.13822,6
9,,10.0,,,,,,,,,,0.596422,0.582668,0.592378,0.587082,0.599255,0.591561,0.128613,28


In [10]:
from datetime import timedelta

# Summarise the search: number of CV folds, number of rows in the results table and the
# measured duration rendered as H:MM:SS.
elapsed = timedelta(seconds=estimation)
candidate_count = len(cv_results_df)
print(f'Fitted {randomized_search.n_splits_} folds of {candidate_count} candidates, finished in {elapsed}.')

best_score = randomized_search.best_score_
print(f"Best score: {best_score}")
print("Best hyper-parameters:")
# Kept as the final expression so the notebook renders the dict as the cell result.
randomized_search.best_params_

Fitted 5 folds of 39 candidates, finished in 0:02:27.616682.
Best score: 0.6337162376913333
Best hyper-parameters:


{'linearsvc__C': 0.1}

In [11]:
# Report how each vocabulary-affecting TF-IDF setting changes the number of extracted terms.
vocabulary_settings = [
    ("ngram_range", [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3), (3, 3)]),
    ("min_df", [1, 3, 5, 10, 25]),
    ("max_df", [0.001, 0.01, 0.1, 1.0]),
]

for name, values in vocabulary_settings:
    for value in values:
        # Fit a fresh vectorizer (baseline settings plus one override) instead of re-configuring
        # and re-fitting the shared `classification_pipeline`: only the vocabulary is needed, so
        # the LinearSVC fit is skipped and the shared pipeline is not mutated by this report.
        vectorizer = TfidfVectorizer(**{**tfidfvectorizer_hyperparameters, name: value})
        vectorizer.fit(X_train)
        print(name, value, len(vectorizer.vocabulary_))

ngram_range (1, 1) 8058
ngram_range (1, 2) 58121
ngram_range (2, 2) 50063
ngram_range (1, 3) 105068
ngram_range (2, 3) 97010
ngram_range (3, 3) 46947
min_df 1 8058
min_df 3 3004
min_df 5 2067
min_df 10 1236
min_df 25 576
max_df 0.001 7168
max_df 0.01 7986
max_df 0.1 8058
max_df 1.0 8058
