In [4]:
!pip install optuna



In [44]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix
import xgboost as xgb
import time
from sklearn.feature_extraction.text import CountVectorizer
import optuna
from sklearn.base import clone
import copy

Set paths to preprocessed train and test datasets \
**Prerequisite**: run preprocess_extract_features.ipynb

In [8]:
PATH_TO_PREPROCESSED_TRAIN_DATA = "../../bigdata2023duplicatedetection/q_3_2/preprocessed_data/preprocessed_train_df.pkl"
PATH_TO_PREPROCESSED_TEST_DATA = "../../bigdata2023duplicatedetection/q_3_2/preprocessed_data/preprocessed_test_df.pkl"

Set path where results will be saved as csv 

In [None]:
PATH_TO_SAVE_RESULTS = "../../results/q_3_2/predicted_labels.csv"

In [9]:
train_df = pd.read_pickle(PATH_TO_PREPROCESSED_TRAIN_DATA)
test_df = pd.read_pickle(PATH_TO_PREPROCESSED_TEST_DATA)

In [10]:
train_df.head(5)

Unnamed: 0,Id,Question1,Question2,IsDuplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,...,mean_len,jaccard_sim,word_overlap,share_n_grams,n_words_diff,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,what be the step by step guide to invest in sh...,what be the step by step guide to invest in sh...,0,1.0,0.833333,1.0,1.0,0.916667,0.785714,...,0.096296,0.833333,0.227273,0.244186,0.014925,1.0,0.93,0.93,1.0,0.982456
1,1,what be the story of kohinoor kohinoor diamond,what would happen if the indian government ste...,0,0.666667,0.25,0.666667,0.5,0.5,0.307692,...,0.077778,0.222222,0.090909,0.034884,0.074627,0.81,0.6,0.62,0.72,0.553191
2,2,how can i increase the speed of my internet co...,how can internet speed be increase by hack thr...,0,0.6,0.5,0.4,0.25,0.5,0.357143,...,0.088889,0.375,0.136364,0.011628,0.014925,0.73,0.63,0.37,0.44,0.181818
3,3,why be i mentally very lonely how can i solve it,find the remainder when math2324math be divide...,0,0.0,0.0,0.25,0.142857,0.111111,0.090909,...,0.074074,0.0,0.0,0.0,0.029851,0.28,0.25,0.2,0.26,0.081633
4,4,which one dissolve in water quikly sugar salt ...,which fish would survive in salt water,0,0.4,0.2,1.0,0.666667,0.571429,0.307692,...,0.074074,0.153846,0.090909,0.0,0.074627,0.67,0.47,0.36,0.55,0.153846


In [11]:
test_df.head(5)

Unnamed: 0,Id,Question1,Question2,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,...,mean_len,jaccard_sim,word_overlap,share_n_grams,n_words_diff,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,283003,what can someone do if theyve lose the wireles...,what be the best usb wireless mouse that can b...,0.428571,0.333333,0.5,0.444444,0.411765,0.388889,0.0,...,0.12963,0.230769,0.136364,0.0,0.029851,0.67,0.49,0.46,0.45,0.131579
1,283004,why india need to elect prime minister,be prime minister of india elect or appoint,0.8,0.8,0.0,0.0,0.571429,0.5,0.0,...,0.055556,0.666667,0.181818,0.011628,0.0,0.81,0.67,0.4,0.42,0.384615
2,283005,how can i make money online with free of cost,how can i make money online for free,1.0,0.8,0.75,0.6,0.875,0.7,0.0,...,0.066667,0.8,0.181818,0.104651,0.014925,0.94,0.84,0.81,0.89,0.756757
3,283006,do mdma affect the first and high order moment...,do antipsychotic affect the first and high ord...,0.909091,0.909091,1.0,1.0,0.944444,0.944444,1.0,...,0.133333,0.833333,0.454545,0.337209,0.0,0.97,0.91,0.93,0.94,0.919192
4,283007,i be a saudi national and have sr 3 million in...,where should i invest money to get high return,0.2,0.0625,0.5,0.181818,0.333333,0.103448,0.0,...,0.140741,0.05,0.045455,0.0,0.164179,0.39,0.35,0.24,0.39,0.170213


In [12]:
print(f"Train_df shape: {train_df.shape}")
print(f"Test_df shape: {test_df.shape}")

Train_df shape: (283002, 23)
Test_df shape: (121287, 22)


In [13]:
y = train_df['IsDuplicate']
X = train_df[train_df.drop(columns=['Id', 'IsDuplicate']).columns.tolist()]

## K-Fold

The features corresponding to these names will be used among with features after applying vectorization of Question1 and Question2

In [14]:
extra_features_names = train_df.drop(columns=['Id', 'Question1', 'Question2', 'IsDuplicate']).columns.tolist()

In [15]:
# Apply vectorization to train_q1, test_q1, train_q2, test_q2
def vectorize_questions(q1_vectorizer, q2_vectorizer, train_q1, test_q1, train_q2, test_q2):
    train_q1_vectors = q1_vectorizer.fit_transform(train_q1)
    test_q1_vectors = q1_vectorizer.transform(test_q1)
    
    train_q2_vectors = q2_vectorizer.fit_transform(train_q2)
    test_q2_vectors = q2_vectorizer.transform(test_q2)
    return train_q1_vectors, test_q1_vectors, train_q2_vectors, test_q2_vectors

In [16]:
def train_eval_model(model, X, y, vectorizer, extra_features, k=5, xgb_parameters=False):
    """
    Trains and evaluates a machine learning model using k-fold cross-validation. The function supports
    both traditional sklearn models and XGBoost models. For XGBoost models, additional parameters can be
    passed through the `xgb_parameters` dictionary. The dataset is vectorized using the provided vectorizer,
    and additional features can be included.

    Parameters:
    - model: The machine learning model to be trained. If `xgb_parameters` is provided, this model is ignored.
    - X: DataFrame containing the input features, must include 'Question1' and 'Question2' columns for vectorization.
    - y: Series or array-like containing the target variable.
    - vectorizer: The vectorizer to be used for transforming 'Question1' and 'Question2' into numerical features.
    - extra_features: Array-like or DataFrame containing additional features to be concatenated with the vectorized questions.
    - k: int, optional (default=5), The number of folds for k-fold cross-validation.
    - xgb_parameters: dict, optional (default=False), Parameters for training an XGBoost model. If False, the function
      uses the provided `model` parameter for training. If provided, it should include 'n_estimators' and any other
      XGBoost-specific parameters.

    Returns:
    A tuple containing the mean accuracy, mean precision, mean recall, mean F1 score, and total training time across all folds.
    """
    
    if xgb_parameters:
        n_estimators = xgb_parameters.get('n_estimators') 
        xgb_parameters.pop('n_estimators')
        
    kf = KFold(n_splits=k, shuffle=True, random_state=21)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        extra_features_train, extra_features_test = extra_features[train_index], extra_features[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        q1_vectorizer = clone(vectorizer)
        q2_vectorizer = clone(vectorizer)
        train_q1_vectors, test_q1_vectors, train_q2_vectors, test_q2_vectors = vectorize_questions(q1_vectorizer, q2_vectorizer, X_train['Question1'], 
                                                                                                  X_test['Question1'], X_train['Question2'], X_test['Question2'])
        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []
        start_time = time.time()

        # Concatenate all features
        X_train_combined = hstack([train_q1_vectors, train_q2_vectors, extra_features_train])
        X_test_combined = hstack([test_q1_vectors, test_q2_vectors, extra_features_test])

        if xgb_parameters:
            dtrain = xgb.DMatrix(X_train_combined, label=y_train)
            dvalid = xgb.DMatrix(X_test_combined, label=y_test)
            xgb_model = xgb.train(xgb_parameters, dtrain, num_boost_round=n_estimators, evals=[(dvalid, "validation")], 
                                  early_stopping_rounds=10, verbose_eval=False)
            preds = xgb_model.predict(dvalid)
            y_pred = np.rint(preds)
        else:
            clf = clone(model)
            clf.fit(X_train_combined, y_train)
            y_pred = clf.predict(X_test_combined)

        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, average='macro'))
        recalls.append(recall_score(y_test, y_pred, average='macro'))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    total_time = time.time() - start_time
    mean_accuracy = np.mean(accuracies)
    mean_precision = np.mean(precisions)
    mean_recall = np.mean(recalls)
    mean_f1_score = np.mean(f1_scores)
    return mean_accuracy, mean_precision, mean_recall, mean_f1_score, total_time

Create a dictionary mapping weight factors to weighted_extra_features in order to experiment with diferrent weights later 

In [17]:
extra_features_names = train_df.drop(columns=['Id', 'Question1', 'Question2', 'IsDuplicate']).columns.tolist()
extra_features_weight_factors = [1, 1.5, 2, 3]
weighted_extra_features_sets={}
for weight_factor in extra_features_weight_factors:
    weighted_extra_features = X[extra_features_names] * weight_factor
    weighted_extra_features = csr_matrix(weighted_extra_features.to_numpy())
    weighted_extra_features_sets[weight_factor] = weighted_extra_features

We will try **Logistic Regression** and **XGBoost** using the features **Bag of Words (BoW)** and **TF-IDF** to evaluate them.

**Logistic Regression**:
Logistic Regression is a statistical method for analyzing a dataset in which there are one or more independent variables that determine an outcome. The outcome is measured with a dichotomous variable (in which there are only two possible outcomes). It is used for binary classification tasks, where it models the probability that a given input belongs to a particular category (e.g., 0 or 1, true or false) based on a logistic function. It is a linear model, with the addition of a sigmoid function to convert linear output to a probability between 0 and 1.

**XGBoost (Extreme Gradient Boosting)**:
XGBoost is an advanced implementation of gradient boosting algorithm, designed for speed and performance. It is a scalable and highly efficient end-to-end tree boosting system. XGBoost improves upon the base gradient boosting framework through system optimization and algorithm enhancements, including handling sparse data, tree pruning, and regularized learning to prevent overfitting. It is capable of performing both classification and regression tasks.

We are going to evaluate and report the performance of every model + feature combination
with 5-fold Cross Validation using:
* Accuracy
* Precision
* Recall
* F1
* Training Time

In order to evaluate the model performance we expirement with the hyperparameters of BoW method, TF-IDF method, Logistic Regression model and XGBoost Model. In order to find the optimal ones we use optuna:\
A modern hyperparameter optimization framework that implements both grid search and more sophisticated algorithms like TPE (Tree-structured Parzen Estimator). It's designed for efficiency and ease of use, providing a flexible and clear interface for defining search spaces and optimization targets.

The hyperparameters we experiment with are:

**BoW Parameters**
* **max_df**: Limits the maximum frequency within the documents a given word can have to be considered
* **min_df**: Sets the minimum frequency a term must have to be included in the vocabulary
* **ngram_range**: Specifies the range of n-value for different n-grams to be extracted
* **max_features**: Limits the number of features to the top N features ordered by term frequency across the corpus

**TF-IDF Parameters**
* **max_df**: This parameter is used to remove terms that appear too frequently
* **min_df**: This parameter is used to remove terms that appear too infrequently
* **ngram_range**: This specifies the range of n-grams to be extracted
* **norm**: The norm used to normalize term vectors
* **smooth_idf**: Smooths idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection
* **sublinear_tf**: Apply sublinear tf scaling, i.e., replace tf with 1 + log(tf)
* **max_features**: Limits the number of features to the top N features ordered by term frequency across the corpus

**Logistic Regression Parameters**
* **C**: Controls the amount of shrinkage applied to the model
* **penalty**: Specifies the norm used in the penalization
* **class_weight**: Adjust weights inversely proportional to class frequencies in the input data
* **solver**: Algorithm to use in the optimization problem
* **max_iter**: Maximum number of iterations taken for the solvers to converge

**XGBoost**
* **n_estimators**: It determines how many trees are built. Too few might underfit, while too many can lead to overfitting
* **max_depth**: Controls the depth of the trees. Deeper trees can model more complex patterns but also can lead to overfitting
* **learning_rate**: Determines the step size at each iteration while moving toward a minimum of a loss function
* **subsample**: The fraction of samples to be used for each tree
* **colsample_bytree**: The fraction of features to be used for each tree

Additionally, we explored varying the weight factor applied to the extracted features, excluding the TF-IDF and BoW vectors. This involved adjusting the weight factor to modify the influence of these features in the model, apart from the primary text vectorization techniques.

In [18]:
results=[]

# Logistic Regression with BoW

In [19]:
def objective(trial):
    # Bow parameters
    max_df = trial.suggest_float('max_df', 0.7, 0.95)
    min_df = trial.suggest_int('min_df', 1, 5)  
    ngram_range = (1, trial.suggest_int('ngram_2', 2, 4))   
    bow_max_features = trial.suggest_int('bow_max_features', 10000, 20000)      
    vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, ngram_range=ngram_range, max_features=bow_max_features)

    extra_features_weight_factor = trial.suggest_categorical('extra_features_weight_factor', extra_features_weight_factors)
    extra_features = weighted_extra_features_sets[extra_features_weight_factor]

    C = trial.suggest_float('C', 1e-5, 10.0, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
    solver = 'liblinear' if penalty == 'l1' else trial.suggest_categorical('solver', ['liblinear', 'saga'])
    max_iter = trial.suggest_int('max_iter', 100, 1000)

    model = LogisticRegression(C=C, penalty=penalty, class_weight=class_weight, solver=solver, max_iter=max_iter, random_state=21, n_jobs=-1)
    accuracy, _, _, _, _ = train_eval_model(model, X, y, vectorizer, extra_features, k=3)
    return accuracy 

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)

[I 2024-02-13 15:30:13,209] A new study created in memory with name: no-name-300886eb-de6e-400c-9514-219533de74b7


  0%|          | 0/50 [00:00<?, ?it/s]



[I 2024-02-13 15:32:25,005] Trial 0 finished with value: 0.8008565310492506 and parameters: {'max_df': 0.9250145097354617, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 14418, 'extra_features_weight_factor': 1, 'C': 0.1004377449272904, 'penalty': 'l2', 'class_weight': 'balanced', 'solver': 'liblinear', 'max_iter': 188}. Best is trial 0 with value: 0.8008565310492506.




[I 2024-02-13 15:39:44,633] Trial 1 finished with value: 0.8078317467721077 and parameters: {'max_df': 0.8828411142505704, 'min_df': 2, 'ngram_2': 3, 'bow_max_features': 17422, 'extra_features_weight_factor': 1, 'C': 1.0286631341667167, 'penalty': 'l1', 'class_weight': None, 'max_iter': 330}. Best is trial 1 with value: 0.8078317467721077.




[I 2024-02-13 15:48:48,687] Trial 2 finished with value: 0.7989166154302797 and parameters: {'max_df': 0.7515192410111597, 'min_df': 5, 'ngram_2': 4, 'bow_max_features': 19624, 'extra_features_weight_factor': 1.5, 'C': 3.042558559080279, 'penalty': 'l1', 'class_weight': None, 'max_iter': 374}. Best is trial 1 with value: 0.8078317467721077.




[I 2024-02-13 15:56:10,514] Trial 3 finished with value: 0.7665846884474314 and parameters: {'max_df': 0.9462708422964814, 'min_df': 2, 'ngram_2': 3, 'bow_max_features': 16496, 'extra_features_weight_factor': 3, 'C': 0.001990212316999337, 'penalty': 'l2', 'class_weight': 'balanced', 'solver': 'saga', 'max_iter': 490}. Best is trial 1 with value: 0.8078317467721077.




[I 2024-02-13 16:01:03,127] Trial 4 finished with value: 0.7995314520745437 and parameters: {'max_df': 0.7930023088813857, 'min_df': 3, 'ngram_2': 2, 'bow_max_features': 17158, 'extra_features_weight_factor': 1.5, 'C': 0.181320852311232, 'penalty': 'l1', 'class_weight': None, 'max_iter': 821}. Best is trial 1 with value: 0.8078317467721077.




[I 2024-02-13 16:02:18,862] Trial 5 finished with value: 0.75076854580533 and parameters: {'max_df': 0.876775732676067, 'min_df': 4, 'ngram_2': 2, 'bow_max_features': 12715, 'extra_features_weight_factor': 3, 'C': 0.0007494306915828534, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 708}. Best is trial 1 with value: 0.8078317467721077.




[I 2024-02-13 16:04:13,141] Trial 6 finished with value: 0.6924120677592385 and parameters: {'max_df': 0.7645754269249617, 'min_df': 4, 'ngram_2': 3, 'bow_max_features': 10997, 'extra_features_weight_factor': 1, 'C': 0.001411990886008604, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 696}. Best is trial 1 with value: 0.8078317467721077.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-02-13 16:05:55,223] Trial 7 finished with value: 0.6276316068437678 and parameters: {'max_df': 0.7281224865567089, 'min_df': 3, 'ngram_2': 3, 'bow_max_features': 17415, 'extra_features_weight_factor': 2, 'C': 2.7175862025882936e-05, 'penalty': 'l1', 'class_weight': None, 'max_iter': 287}. Best is trial 1 with value: 0.8078317467721077.




[I 2024-02-13 16:08:38,551] Trial 8 finished with value: 0.6910657875209363 and parameters: {'max_df': 0.9472616212619236, 'min_df': 5, 'ngram_2': 4, 'bow_max_features': 14008, 'extra_features_weight_factor': 3, 'C': 0.0010783240567717534, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 769}. Best is trial 1 with value: 0.8078317467721077.




[I 2024-02-13 16:15:27,206] Trial 9 finished with value: 0.8039519155341658 and parameters: {'max_df': 0.7142407657092962, 'min_df': 2, 'ngram_2': 3, 'bow_max_features': 15728, 'extra_features_weight_factor': 2, 'C': 1.0453816074110662, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 478}. Best is trial 1 with value: 0.8078317467721077.




[I 2024-02-13 16:26:38,978] Trial 10 finished with value: 0.7904149087285602 and parameters: {'max_df': 0.8496547021993366, 'min_df': 1, 'ngram_2': 4, 'bow_max_features': 19887, 'extra_features_weight_factor': 1, 'C': 8.437189488312203, 'penalty': 'l1', 'class_weight': None, 'max_iter': 983}. Best is trial 1 with value: 0.8078317467721077.




[I 2024-02-13 16:31:38,880] Trial 11 finished with value: 0.8079695549854772 and parameters: {'max_df': 0.7040892823952472, 'min_df': 2, 'ngram_2': 3, 'bow_max_features': 15943, 'extra_features_weight_factor': 2, 'C': 0.388773877161932, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 517}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 16:34:36,369] Trial 12 finished with value: 0.807174507600653 and parameters: {'max_df': 0.8855248150049375, 'min_df': 2, 'ngram_2': 3, 'bow_max_features': 18365, 'extra_features_weight_factor': 2, 'C': 0.06196190513972375, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 114}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 16:39:19,612] Trial 13 finished with value: 0.8050543812411219 and parameters: {'max_df': 0.8155177394569909, 'min_df': 2, 'ngram_2': 3, 'bow_max_features': 15122, 'extra_features_weight_factor': 2, 'C': 0.720599461810114, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 328}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 16:44:00,920] Trial 14 finished with value: 0.7634257001717303 and parameters: {'max_df': 0.8918548728800539, 'min_df': 1, 'ngram_2': 4, 'bow_max_features': 18138, 'extra_features_weight_factor': 1, 'C': 0.022644061194428532, 'penalty': 'l1', 'class_weight': None, 'max_iter': 578}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 16:47:27,188] Trial 15 finished with value: 0.8067186804333538 and parameters: {'max_df': 0.8398367337954193, 'min_df': 3, 'ngram_2': 2, 'bow_max_features': 13367, 'extra_features_weight_factor': 2, 'C': 0.4431370676198216, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 582}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 16:53:31,447] Trial 16 finished with value: 0.7902877011469883 and parameters: {'max_df': 0.8050383111403421, 'min_df': 2, 'ngram_2': 3, 'bow_max_features': 16104, 'extra_features_weight_factor': 1, 'C': 0.00935068442159744, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 390}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 17:02:06,379] Trial 17 finished with value: 0.7969448979159158 and parameters: {'max_df': 0.7738198111565597, 'min_df': 4, 'ngram_2': 3, 'bow_max_features': 11686, 'extra_features_weight_factor': 1.5, 'C': 3.8279074828881527, 'penalty': 'l1', 'class_weight': None, 'max_iter': 196}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 17:05:14,026] Trial 18 finished with value: 0.7892276379672228 and parameters: {'max_df': 0.8588261771326966, 'min_df': 1, 'ngram_2': 4, 'bow_max_features': 18472, 'extra_features_weight_factor': 1, 'C': 0.012650739609422848, 'penalty': 'l2', 'class_weight': 'balanced', 'solver': 'liblinear', 'max_iter': 442}. Best is trial 11 with value: 0.8079695549854772.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-02-13 17:06:14,010] Trial 19 finished with value: 0.6276316068437678 and parameters: {'max_df': 0.7004597980199825, 'min_df': 2, 'ngram_2': 2, 'bow_max_features': 15208, 'extra_features_weight_factor': 2, 'C': 2.706619703995508e-05, 'penalty': 'l1', 'class_weight': None, 'max_iter': 267}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 17:14:35,069] Trial 20 finished with value: 0.8036868997392245 and parameters: {'max_df': 0.9055961578579996, 'min_df': 3, 'ngram_2': 3, 'bow_max_features': 10131, 'extra_features_weight_factor': 2, 'C': 0.31825204296341264, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 635}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 17:17:26,954] Trial 21 finished with value: 0.8058494286259461 and parameters: {'max_df': 0.8767062614165133, 'min_df': 2, 'ngram_2': 3, 'bow_max_features': 18727, 'extra_features_weight_factor': 2, 'C': 0.047306651841524675, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 112}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 17:20:32,392] Trial 22 finished with value: 0.8073547183412131 and parameters: {'max_df': 0.9102135883146212, 'min_df': 2, 'ngram_2': 3, 'bow_max_features': 17296, 'extra_features_weight_factor': 2, 'C': 1.7868859754666255, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 130}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 17:24:37,556] Trial 23 finished with value: 0.8057752242033626 and parameters: {'max_df': 0.9037661128752484, 'min_df': 1, 'ngram_2': 3, 'bow_max_features': 17010, 'extra_features_weight_factor': 2, 'C': 2.168656693828978, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 219}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 17:30:58,438] Trial 24 finished with value: 0.8062098501070664 and parameters: {'max_df': 0.9202595677281497, 'min_df': 2, 'ngram_2': 3, 'bow_max_features': 17610, 'extra_features_weight_factor': 2, 'C': 0.7700357127974151, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 379}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 17:35:47,115] Trial 25 finished with value: 0.8049377742913477 and parameters: {'max_df': 0.8267297547633592, 'min_df': 3, 'ngram_2': 3, 'bow_max_features': 16480, 'extra_features_weight_factor': 1, 'C': 1.6067805950832785, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 283}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 17:47:32,686] Trial 26 finished with value: 0.7878071533063371 and parameters: {'max_df': 0.8611763259032363, 'min_df': 2, 'ngram_2': 4, 'bow_max_features': 19094, 'extra_features_weight_factor': 1.5, 'C': 7.3763354974240505, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 526}. Best is trial 11 with value: 0.8079695549854772.




[I 2024-02-13 17:53:01,529] Trial 27 finished with value: 0.8097398604956856 and parameters: {'max_df': 0.9254521151171553, 'min_df': 3, 'ngram_2': 2, 'bow_max_features': 15655, 'extra_features_weight_factor': 3, 'C': 0.1648181388385525, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 427}. Best is trial 27 with value: 0.8097398604956856.




[I 2024-02-13 17:58:40,833] Trial 28 finished with value: 0.7992240337524117 and parameters: {'max_df': 0.9329252391538257, 'min_df': 3, 'ngram_2': 2, 'bow_max_features': 14442, 'extra_features_weight_factor': 3, 'C': 0.17974368412997963, 'penalty': 'l1', 'class_weight': None, 'max_iter': 436}. Best is trial 27 with value: 0.8097398604956856.




[I 2024-02-13 18:01:46,445] Trial 29 finished with value: 0.8041957300655119 and parameters: {'max_df': 0.7902545760666787, 'min_df': 4, 'ngram_2': 2, 'bow_max_features': 15549, 'extra_features_weight_factor': 3, 'C': 0.2077371641737866, 'penalty': 'l2', 'class_weight': 'balanced', 'solver': 'liblinear', 'max_iter': 627}. Best is trial 27 with value: 0.8097398604956856.




[I 2024-02-13 18:08:16,369] Trial 30 finished with value: 0.7804609154705621 and parameters: {'max_df': 0.9286685365119814, 'min_df': 3, 'ngram_2': 2, 'bow_max_features': 14048, 'extra_features_weight_factor': 3, 'C': 0.004790755650081039, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 532}. Best is trial 27 with value: 0.8097398604956856.




[I 2024-02-13 18:12:53,447] Trial 31 finished with value: 0.807481925922785 and parameters: {'max_df': 0.9070363734381807, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 16712, 'extra_features_weight_factor': 3, 'C': 0.07273753559944748, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 336}. Best is trial 27 with value: 0.8097398604956856.




[I 2024-02-13 18:17:27,977] Trial 32 finished with value: 0.807757542349524 and parameters: {'max_df': 0.7397044144945459, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 16333, 'extra_features_weight_factor': 3, 'C': 0.07652706761183257, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 332}. Best is trial 27 with value: 0.8097398604956856.




[I 2024-02-13 18:22:56,151] Trial 33 finished with value: 0.8010685436852036 and parameters: {'max_df': 0.7415619780447327, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 16018, 'extra_features_weight_factor': 3, 'C': 0.028299092766752517, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 421}. Best is trial 27 with value: 0.8097398604956856.




[I 2024-02-13 18:27:28,248] Trial 34 finished with value: 0.8108741280980346 and parameters: {'max_df': 0.7300510548118829, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 17787, 'extra_features_weight_factor': 3, 'C': 0.1511698403377804, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 331}. Best is trial 34 with value: 0.8108741280980346.




[I 2024-02-13 18:30:54,221] Trial 35 finished with value: 0.8107787224118558 and parameters: {'max_df': 0.7025093275558895, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 17803, 'extra_features_weight_factor': 3, 'C': 0.15866097044537372, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 488}. Best is trial 34 with value: 0.8108741280980346.




[I 2024-02-13 18:34:20,368] Trial 36 finished with value: 0.8109377318888206 and parameters: {'max_df': 0.7026725408375757, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 19340, 'extra_features_weight_factor': 3, 'C': 0.15042289892477853, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 477}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 18:37:25,916] Trial 37 finished with value: 0.8058706298895414 and parameters: {'max_df': 0.7227372458391973, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 19188, 'extra_features_weight_factor': 3, 'C': 0.1631072781295116, 'penalty': 'l2', 'class_weight': 'balanced', 'solver': 'liblinear', 'max_iter': 469}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 18:38:36,759] Trial 38 finished with value: 0.7320266287870757 and parameters: {'max_df': 0.759252451938205, 'min_df': 5, 'ngram_2': 2, 'bow_max_features': 17927, 'extra_features_weight_factor': 3, 'C': 0.000247549492311072, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 396}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 18:40:36,328] Trial 39 finished with value: 0.8014607670617169 and parameters: {'max_df': 0.7329283980849165, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 19327, 'extra_features_weight_factor': 3, 'C': 0.026893740878836742, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 586}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 18:44:07,109] Trial 40 finished with value: 0.7692560476604405 and parameters: {'max_df': 0.7139268996483664, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 19863, 'extra_features_weight_factor': 3, 'C': 0.0030806647587049035, 'penalty': 'l2', 'class_weight': 'balanced', 'solver': 'saga', 'max_iter': 240}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 18:49:10,566] Trial 41 finished with value: 0.8072275107596413 and parameters: {'max_df': 0.7003955139807795, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 14585, 'extra_features_weight_factor': 3, 'C': 0.44242246000318963, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 541}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 18:52:20,135] Trial 42 finished with value: 0.8104077002989378 and parameters: {'max_df': 0.71337710856187, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 17784, 'extra_features_weight_factor': 3, 'C': 0.14521555697516636, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 490}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 18:55:41,863] Trial 43 finished with value: 0.8105349078805096 and parameters: {'max_df': 0.7175582764924363, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 17776, 'extra_features_weight_factor': 3, 'C': 0.14818694355055187, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 477}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 18:58:28,347] Trial 44 finished with value: 0.8094854453325419 and parameters: {'max_df': 0.7168155324955549, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 17833, 'extra_features_weight_factor': 3, 'C': 0.10622244321374816, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 483}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 19:00:14,917] Trial 45 finished with value: 0.7956940233637925 and parameters: {'max_df': 0.7465263714146342, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 18679, 'extra_features_weight_factor': 3, 'C': 0.014819420092290965, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 891}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 19:02:26,179] Trial 46 finished with value: 0.8045243496512392 and parameters: {'max_df': 0.7280489789864468, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 17028, 'extra_features_weight_factor': 3, 'C': 0.04736797130343949, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 676}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 19:05:13,446] Trial 47 finished with value: 0.8101002819768058 and parameters: {'max_df': 0.7125005594130907, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 18861, 'extra_features_weight_factor': 3, 'C': 0.1083008633420337, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 752}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 19:08:31,799] Trial 48 finished with value: 0.8099836750270316 and parameters: {'max_df': 0.754211843794511, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 18384, 'extra_features_weight_factor': 1.5, 'C': 0.2811121766619799, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 497}. Best is trial 36 with value: 0.8109377318888206.




[I 2024-02-13 19:10:04,903] Trial 49 finished with value: 0.7873619267708355 and parameters: {'max_df': 0.7704193073215974, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 19396, 'extra_features_weight_factor': 3, 'C': 0.007315769213112299, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 349}. Best is trial 36 with value: 0.8109377318888206.


In [20]:
best_params = study.best_params
print("Best parameters for BoW with Logistic Regression:", best_params)

Best parameters for BoW with Logistic Regression: {'max_df': 0.7026725408375757, 'min_df': 1, 'ngram_2': 2, 'bow_max_features': 19340, 'extra_features_weight_factor': 3, 'C': 0.15042289892477853, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 477}


In [21]:
vectorizer = CountVectorizer(max_df=best_params["max_df"], min_df=best_params["min_df"], ngram_range=(1, best_params["ngram_2"]), max_features=best_params["bow_max_features"])
extra_features = weighted_extra_features_sets[best_params["extra_features_weight_factor"]]
model = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'], class_weight=best_params['class_weight'], solver=best_params['solver'], max_iter=best_params['max_iter'], random_state=21, n_jobs=-1)
mean_accuracy, mean_precision, mean_recall, mean_f1_score, total_time = train_eval_model(model, X, y, vectorizer, extra_features, k=5)
results.append({"Method": "Logistic Regression with BoW", "Accuracy": mean_accuracy, "Precision": mean_precision, "Recall": mean_recall, "F-Measure": mean_f1_score})



In [None]:
LR_BoW_best_params = best_params

In [22]:
print("Logistic Regression Forest Model with BoW:")
print(f"- Accuracy: {mean_accuracy:.2f}")
print(f"- Precision: {mean_precision:.2f}")
print(f"- Recall: {mean_recall:.2f}")
print(f"- F1: {mean_f1_score:.2f}")
print(f"- Total Time: {total_time:.2f}")

Logistic Regression Forest Model with BoW:
- Accuracy: 0.81
- Precision: 0.80
- Recall: 0.80
- F1: 0.80
- Total Time: 60.37


# Logistic Regression with TF-IDF

In [14]:
def objective(trial):
    # TF-IDF parameters
    max_df = trial.suggest_float('max_df', 0.7, 0.95)
    min_df = trial.suggest_int('min_df', 1, 5)
    ngram_range = (1, trial.suggest_int('ngram_2', 2, 3))      
    norm = trial.suggest_categorical('norm', ['l1', 'l2'])
    smooth_idf = trial.suggest_categorical('smooth_idf', [True, False])
    sublinear_tf = trial.suggest_categorical('sublinear_tf', [True, False])
    max_features_tfidf = trial.suggest_int('max_features_tfidf', 10000, 20000)      

    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, ngram_range=ngram_range, norm=norm, 
                                 smooth_idf=smooth_idf, sublinear_tf=sublinear_tf, max_features=max_features_tfidf)
    
    extra_features_weight_factor = trial.suggest_categorical('extra_features_weight_factor', extra_features_weight_factors)
    extra_features = weighted_extra_features_sets[extra_features_weight_factor]

    C = trial.suggest_float('C', 1e-5, 10.0, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])
    solver = 'liblinear' if penalty == 'l1' else trial.suggest_categorical('solver', ['liblinear', 'saga'])
    max_iter = trial.suggest_int('max_iter', 100, 1000)

    model = LogisticRegression(C=C, penalty=penalty, class_weight=class_weight, solver=solver, max_iter=max_iter, random_state=21, n_jobs=-1)

    accuracy, _, _, _, _ = train_eval_model(model, X, y, vectorizer, extra_features, k=3)
    return accuracy 
 
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True) 

[I 2024-02-14 06:11:17,760] A new study created in memory with name: no-name-89af4475-2703-413f-890a-6882ae049aad


  0%|          | 0/50 [00:00<?, ?it/s]



[I 2024-02-14 06:12:30,466] Trial 0 finished with value: 0.6988042487332246 and parameters: {'max_df': 0.9199435167279855, 'min_df': 2, 'ngram_2': 2, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': False, 'max_features_tfidf': 13073, 'extra_features_weight_factor': 1, 'C': 0.005724052480584991, 'penalty': 'l2', 'class_weight': 'balanced', 'solver': 'liblinear', 'max_iter': 870}. Best is trial 0 with value: 0.6988042487332246.




[I 2024-02-14 06:14:54,842] Trial 1 finished with value: 0.794358343757288 and parameters: {'max_df': 0.8941482856316219, 'min_df': 2, 'ngram_2': 3, 'norm': 'l2', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 14104, 'extra_features_weight_factor': 1.5, 'C': 0.13786912219363018, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 971}. Best is trial 1 with value: 0.794358343757288.




[I 2024-02-14 06:18:44,153] Trial 2 finished with value: 0.7878177539381347 and parameters: {'max_df': 0.8171987577616154, 'min_df': 2, 'ngram_2': 3, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 11671, 'extra_features_weight_factor': 3, 'C': 1.1515274992320799, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 603}. Best is trial 1 with value: 0.794358343757288.




[I 2024-02-14 06:28:46,739] Trial 3 finished with value: 0.8006975215722857 and parameters: {'max_df': 0.9004696444200713, 'min_df': 1, 'ngram_2': 3, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 19687, 'extra_features_weight_factor': 2, 'C': 6.1962070029410805, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 749}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 06:30:32,884] Trial 4 finished with value: 0.6941611720058516 and parameters: {'max_df': 0.8828169422446907, 'min_df': 5, 'ngram_2': 3, 'norm': 'l1', 'smooth_idf': False, 'sublinear_tf': True, 'max_features_tfidf': 16975, 'extra_features_weight_factor': 1.5, 'C': 0.00021609654613261468, 'penalty': 'l2', 'class_weight': None, 'solver': 'liblinear', 'max_iter': 109}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 06:31:37,314] Trial 5 finished with value: 0.6793520893845273 and parameters: {'max_df': 0.852752416608433, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': False, 'sublinear_tf': True, 'max_features_tfidf': 18658, 'extra_features_weight_factor': 2, 'C': 0.0006525210787958934, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 353}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 06:35:25,602] Trial 6 finished with value: 0.7967010833845697 and parameters: {'max_df': 0.7435728725447797, 'min_df': 2, 'ngram_2': 3, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 15492, 'extra_features_weight_factor': 1, 'C': 2.692801090805718, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 193}. Best is trial 3 with value: 0.8006975215722857.
[I 2024-02-14 06:39:52,666] Trial 7 finished with value: 0.787531536879598 and parameters: {'max_df': 0.7208977423738201, 'min_df': 4, 'ngram_2': 3, 'norm': 'l2', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 12270, 'extra_features_weight_factor': 1, 'C': 0.10911019028199953, 'penalty': 'l2', 'class_weight': 'balanced', 'solver': 'saga', 'max_iter': 527}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 06:47:34,234] Trial 8 finished with value: 0.7972841181334408 and parameters: {'max_df': 0.7974122265226217, 'min_df': 5, 'ngram_2': 3, 'norm': 'l2', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 10237, 'extra_features_weight_factor': 1.5, 'C': 1.0972709780321779, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 160}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 06:52:30,066] Trial 9 finished with value: 0.7436025187101151 and parameters: {'max_df': 0.7602329540604393, 'min_df': 4, 'ngram_2': 3, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 17030, 'extra_features_weight_factor': 1, 'C': 0.057230528648159334, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 716}. Best is trial 3 with value: 0.8006975215722857.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-02-14 06:53:29,912] Trial 10 finished with value: 0.6276316068437678 and parameters: {'max_df': 0.9434424850401708, 'min_df': 1, 'ngram_2': 2, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 19911, 'extra_features_weight_factor': 2, 'C': 3.389843151646095e-05, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 772}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 07:04:01,791] Trial 11 finished with value: 0.7921958148705663 and parameters: {'max_df': 0.8023526879831568, 'min_df': 5, 'ngram_2': 3, 'norm': 'l2', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 10119, 'extra_features_weight_factor': 1.5, 'C': 9.754136491389506, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 327}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 07:11:14,757] Trial 12 finished with value: 0.7962982593762589 and parameters: {'max_df': 0.7764505818507839, 'min_df': 1, 'ngram_2': 3, 'norm': 'l2', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 15547, 'extra_features_weight_factor': 2, 'C': 0.6960625222268066, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 521}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 07:20:59,472] Trial 13 finished with value: 0.795460809464244 and parameters: {'max_df': 0.8460221041219045, 'min_df': 4, 'ngram_2': 3, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': False, 'max_features_tfidf': 10789, 'extra_features_weight_factor': 3, 'C': 7.0901325357678004, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 373}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 07:23:28,178] Trial 14 finished with value: 0.7055992537155215 and parameters: {'max_df': 0.7930211109348145, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 18120, 'extra_features_weight_factor': 1.5, 'C': 0.010170103087195017, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 685}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 07:29:58,310] Trial 15 finished with value: 0.7608073441177094 and parameters: {'max_df': 0.8654318546128268, 'min_df': 3, 'ngram_2': 3, 'norm': 'l1', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 13977, 'extra_features_weight_factor': 2, 'C': 0.4784554895475713, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 448}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 07:34:11,186] Trial 16 finished with value: 0.7178111815464202 and parameters: {'max_df': 0.7002208013915862, 'min_df': 5, 'ngram_2': 3, 'norm': 'l2', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 19885, 'extra_features_weight_factor': 1.5, 'C': 0.02166504731143696, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 838}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 07:41:25,910] Trial 17 finished with value: 0.794262938071109 and parameters: {'max_df': 0.8309500544688843, 'min_df': 1, 'ngram_2': 2, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 16540, 'extra_features_weight_factor': 2, 'C': 1.987475807976533, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 224}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 07:47:56,012] Trial 18 finished with value: 0.7845739606080523 and parameters: {'max_df': 0.9032118895920013, 'min_df': 4, 'ngram_2': 3, 'norm': 'l2', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 14319, 'extra_features_weight_factor': 3, 'C': 0.24431450029332127, 'penalty': 'l1', 'class_weight': None, 'max_iter': 605}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 07:50:12,769] Trial 19 finished with value: 0.6858502766764899 and parameters: {'max_df': 0.936566853635318, 'min_df': 5, 'ngram_2': 3, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 18564, 'extra_features_weight_factor': 1.5, 'C': 0.002011476693943901, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 988}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 07:59:03,428] Trial 20 finished with value: 0.7985031907901711 and parameters: {'max_df': 0.8707246277452617, 'min_df': 1, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 12781, 'extra_features_weight_factor': 2, 'C': 3.6683872328800016, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 114}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 08:07:59,353] Trial 21 finished with value: 0.7974219263468103 and parameters: {'max_df': 0.8694063535802163, 'min_df': 1, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 11610, 'extra_features_weight_factor': 2, 'C': 3.7758424020291783, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 139}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 08:16:57,169] Trial 22 finished with value: 0.7979201560413001 and parameters: {'max_df': 0.8742408476996748, 'min_df': 1, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 12159, 'extra_features_weight_factor': 2, 'C': 3.710742856539992, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 267}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 08:27:32,554] Trial 23 finished with value: 0.7933088812093201 and parameters: {'max_df': 0.9129914722813903, 'min_df': 1, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 12983, 'extra_features_weight_factor': 2, 'C': 9.705944759059934, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 241}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 08:33:41,589] Trial 24 finished with value: 0.7898106727160938 and parameters: {'max_df': 0.8828373827066209, 'min_df': 2, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 13022, 'extra_features_weight_factor': 2, 'C': 0.39766970268613894, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 286}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 08:42:21,720] Trial 25 finished with value: 0.798630398371743 and parameters: {'max_df': 0.8402037457674014, 'min_df': 1, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 11468, 'extra_features_weight_factor': 2, 'C': 2.8463289982762774, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 272}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 08:46:11,842] Trial 26 finished with value: 0.7193270718934849 and parameters: {'max_df': 0.8317242045983848, 'min_df': 1, 'ngram_2': 2, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 11090, 'extra_features_weight_factor': 2, 'C': 0.061182690190191936, 'penalty': 'l1', 'class_weight': 'balanced', 'max_iter': 406}. Best is trial 3 with value: 0.8006975215722857.




[I 2024-02-14 08:53:59,477] Trial 27 finished with value: 0.8063900608476265 and parameters: {'max_df': 0.8516229056978248, 'min_df': 2, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 13668, 'extra_features_weight_factor': 2, 'C': 1.6089788731451606, 'penalty': 'l1', 'class_weight': None, 'max_iter': 449}. Best is trial 27 with value: 0.8063900608476265.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[I 2024-02-14 08:54:58,913] Trial 28 finished with value: 0.6276316068437678 and parameters: {'max_df': 0.8486441946171666, 'min_df': 2, 'ngram_2': 2, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 15086, 'extra_features_weight_factor': 2, 'C': 1.1263432370203035e-05, 'penalty': 'l1', 'class_weight': None, 'max_iter': 439}. Best is trial 27 with value: 0.8063900608476265.
[I 2024-02-14 08:59:39,786] Trial 29 finished with value: 0.7907647295778828 and parameters: {'max_df': 0.9277776297696372, 'min_df': 2, 'ngram_2': 2, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 13771, 'extra_features_weight_factor': 2, 'C': 1.2407555934065209, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 902}. Best is trial 27 with value: 0.8063900608476265.




[I 2024-02-14 09:05:16,430] Trial 30 finished with value: 0.7843089448131109 and parameters: {'max_df': 0.9122476910741082, 'min_df': 2, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 13503, 'extra_features_weight_factor': 2, 'C': 0.21811741583158872, 'penalty': 'l1', 'class_weight': None, 'max_iter': 618}. Best is trial 27 with value: 0.8063900608476265.




[I 2024-02-14 09:14:16,909] Trial 31 finished with value: 0.803178069412937 and parameters: {'max_df': 0.815858531116743, 'min_df': 1, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 12396, 'extra_features_weight_factor': 2, 'C': 3.3958482629238746, 'penalty': 'l1', 'class_weight': None, 'max_iter': 186}. Best is trial 27 with value: 0.8063900608476265.




[I 2024-02-14 09:22:34,739] Trial 32 finished with value: 0.8060084381029109 and parameters: {'max_df': 0.819711455987925, 'min_df': 1, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 14697, 'extra_features_weight_factor': 2, 'C': 1.9953630451876583, 'penalty': 'l1', 'class_weight': None, 'max_iter': 486}. Best is trial 27 with value: 0.8063900608476265.




[I 2024-02-14 09:29:13,565] Trial 33 finished with value: 0.803517289630462 and parameters: {'max_df': 0.8171747305334653, 'min_df': 2, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 14251, 'extra_features_weight_factor': 2, 'C': 0.7395830659729348, 'penalty': 'l1', 'class_weight': None, 'max_iter': 469}. Best is trial 27 with value: 0.8063900608476265.




[I 2024-02-14 09:35:15,141] Trial 34 finished with value: 0.8076303347679522 and parameters: {'max_df': 0.8149653602393341, 'min_df': 2, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 14479, 'extra_features_weight_factor': 3, 'C': 0.735800354813108, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 473}. Best is trial 34 with value: 0.8076303347679522.




[I 2024-02-14 09:41:37,199] Trial 35 finished with value: 0.7750227913583649 and parameters: {'max_df': 0.8161203650509772, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 14496, 'extra_features_weight_factor': 3, 'C': 0.031414992918531996, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 504}. Best is trial 34 with value: 0.8076303347679522.




[I 2024-02-14 09:47:46,848] Trial 36 finished with value: 0.8079483537218818 and parameters: {'max_df': 0.7806064669447217, 'min_df': 2, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 16282, 'extra_features_weight_factor': 3, 'C': 0.7065566962380841, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 479}. Best is trial 36 with value: 0.8079483537218818.




[I 2024-02-14 09:54:54,129] Trial 37 finished with value: 0.7946869633430153 and parameters: {'max_df': 0.7720832626021882, 'min_df': 2, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 16148, 'extra_features_weight_factor': 3, 'C': 0.12841770581078307, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 577}. Best is trial 36 with value: 0.8079483537218818.
[I 2024-02-14 10:01:47,736] Trial 38 finished with value: 0.803485487735069 and parameters: {'max_df': 0.7835592284296398, 'min_df': 2, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 14770, 'extra_features_weight_factor': 3, 'C': 0.3341589929740676, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 667}. Best is trial 36 with value: 0.8079483537218818.




[I 2024-02-14 10:07:03,138] Trial 39 finished with value: 0.8086903979477177 and parameters: {'max_df': 0.7590369630639975, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 16103, 'extra_features_weight_factor': 3, 'C': 0.9241048121016162, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 401}. Best is trial 39 with value: 0.8086903979477177.




[I 2024-02-14 10:12:20,013] Trial 40 finished with value: 0.7406661437021647 and parameters: {'max_df': 0.7525502741366124, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 17690, 'extra_features_weight_factor': 3, 'C': 0.005287294371958565, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 393}. Best is trial 39 with value: 0.8086903979477177.




[I 2024-02-14 10:18:27,961] Trial 41 finished with value: 0.8084465834163717 and parameters: {'max_df': 0.7352626576177363, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 15487, 'extra_features_weight_factor': 3, 'C': 1.5530014003759842, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 475}. Best is trial 39 with value: 0.8086903979477177.




[I 2024-02-14 10:23:57,083] Trial 42 finished with value: 0.8086691966841224 and parameters: {'max_df': 0.7329615350681864, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 15748, 'extra_features_weight_factor': 3, 'C': 1.1720252924743748, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 414}. Best is trial 39 with value: 0.8086903979477177.




[I 2024-02-14 10:28:42,423] Trial 43 finished with value: 0.8081815676214302 and parameters: {'max_df': 0.7327316489888404, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 16185, 'extra_features_weight_factor': 3, 'C': 0.7672318084555475, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 342}. Best is trial 39 with value: 0.8086903979477177.




[I 2024-02-14 10:33:14,886] Trial 44 finished with value: 0.7978565522505141 and parameters: {'max_df': 0.7328779804234051, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 15791, 'extra_features_weight_factor': 3, 'C': 0.16716703237047684, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 327}. Best is trial 39 with value: 0.8086903979477177.




[I 2024-02-14 10:40:20,655] Trial 45 finished with value: 0.7890898297538533 and parameters: {'max_df': 0.7184746557556797, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 17063, 'extra_features_weight_factor': 3, 'C': 0.08068105384625134, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 562}. Best is trial 39 with value: 0.8086903979477177.




[I 2024-02-14 10:44:56,183] Trial 46 finished with value: 0.8086267941569317 and parameters: {'max_df': 0.7385543133325222, 'min_df': 4, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 16168, 'extra_features_weight_factor': 3, 'C': 0.9522805036520903, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 330}. Best is trial 39 with value: 0.8086903979477177.
[I 2024-02-14 10:49:24,331] Trial 47 finished with value: 0.7000657239171455 and parameters: {'max_df': 0.7380240686676922, 'min_df': 4, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 15165, 'extra_features_weight_factor': 3, 'C': 0.0003438420266052422, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 341}. Best is trial 39 with value: 0.8086903979477177.




[I 2024-02-14 10:53:38,671] Trial 48 finished with value: 0.8100578794496152 and parameters: {'max_df': 0.7075074852832529, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 17339, 'extra_features_weight_factor': 3, 'C': 1.1926655328748625, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 300}. Best is trial 48 with value: 0.8100578794496152.
[I 2024-02-14 10:57:00,746] Trial 49 finished with value: 0.8064006614794241 and parameters: {'max_df': 0.7003589222167774, 'min_df': 4, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': False, 'sublinear_tf': True, 'max_features_tfidf': 17705, 'extra_features_weight_factor': 1, 'C': 5.677696322117042, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 396}. Best is trial 48 with value: 0.8100578794496152.


In [15]:
best_params = study.best_params
print("Best parameters for TF-IDF with Logistic Regression:", best_params)

Best parameters for TF-IDF with Logistic Regression: {'max_df': 0.7075074852832529, 'min_df': 3, 'ngram_2': 2, 'norm': 'l2', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 17339, 'extra_features_weight_factor': 3, 'C': 1.1926655328748625, 'penalty': 'l2', 'class_weight': None, 'solver': 'saga', 'max_iter': 300}


In [16]:
vectorizer = TfidfVectorizer(max_df=best_params["max_df"], min_df=best_params["min_df"], ngram_range=(1, best_params["ngram_2"]), norm=best_params["norm"], 
                                 smooth_idf=best_params["smooth_idf"], sublinear_tf=best_params["sublinear_tf"], max_features=best_params['max_features_tfidf'])
extra_features = weighted_extra_features_sets[best_params["extra_features_weight_factor"]]
model = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'], class_weight=best_params['class_weight'], solver=best_params['solver'], max_iter=best_params['max_iter'], random_state=21, n_jobs=-1)

mean_accuracy, mean_precision, mean_recall, mean_f1_score, total_time = train_eval_model(model, X, y, vectorizer, extra_features, k=5)
results.append({"Method": "Logistic Regression with TF-IDF", "Accuracy": mean_accuracy, "Precision": mean_precision, "Recall": mean_recall, "F-Measure": mean_f1_score})



In [None]:
LR_tf_idf_best_params = best_params

In [17]:
print("Logistic Regression with TF-IDF:")
print(f"- Accuracy: {mean_accuracy:.2f}")
print(f"- Precision: {mean_precision:.2f}")
print(f"- Recall: {mean_recall:.2f}")
print(f"- F1: {mean_f1_score:.2f}")
print(f"- Total Time: {total_time:.2f}")

Logistic Regression with TF-IDF:
- Accuracy: 0.81
- Precision: 0.80
- Recall: 0.79
- F1: 0.80
- Total Time: 77.23


# XGB with BoW

In [22]:
def objective(trial):
    # Bow parameters
    max_df = trial.suggest_float('max_df', 0.7, 0.95)
    min_df = trial.suggest_int('min_df', 1, 5)  
    ngram_range = (1, trial.suggest_int('ngram_2', 2, 4))   
    bow_max_features = trial.suggest_int('bow_max_features', 10000, 20000)      
    vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, ngram_range=ngram_range, max_features=bow_max_features)

    extra_features_weight_factor = trial.suggest_categorical('extra_features_weight_factor', extra_features_weight_factors)
    extra_features = weighted_extra_features_sets[extra_features_weight_factor]

    param = {
        'objective': 'binary:logistic',  
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'n_jobs': -1,
    }

    accuracy, _, _, _, _ = train_eval_model(None, X, y, vectorizer, extra_features, 3, param)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)

[I 2024-02-14 11:42:52,008] A new study created in memory with name: no-name-e26fc725-90b4-4e32-b212-af17e27b0ef8


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-02-14 11:48:11,125] Trial 0 finished with value: 0.8299658659656115 and parameters: {'max_df': 0.7665174882296566, 'min_df': 3, 'ngram_2': 3, 'bow_max_features': 10262, 'extra_features_weight_factor': 2, 'n_estimators': 444, 'max_depth': 9, 'learning_rate': 0.23434869459499993, 'subsample': 0.6420566055900587, 'colsample_bytree': 0.8572448756900397}. Best is trial 0 with value: 0.8299658659656115.
[I 2024-02-14 11:51:45,546] Trial 1 finished with value: 0.8168422838001145 and parameters: {'max_df': 0.7047153605147536, 'min_df': 1, 'ngram_2': 4, 'bow_max_features': 11509, 'extra_features_weight_factor': 3, 'n_estimators': 167, 'max_depth': 8, 'learning_rate': 0.2314214085133344, 'subsample': 0.897912552947423, 'colsample_bytree': 0.33020763972702805}. Best is trial 0 with value: 0.8299658659656115.
[I 2024-02-14 11:57:04,592] Trial 2 finished with value: 0.81790234697988 and parameters: {'max_df': 0.7093859397186328, 'min_df': 3, 'ngram_2': 4, 'bow_max_features': 12131, 'extra_f

In [23]:
best_params = study.best_params
print("Best parameters for BoW with XGBooster:", best_params)

Best parameters for BoW with XGBooster: {'max_df': 0.8806869371783079, 'min_df': 2, 'ngram_2': 4, 'bow_max_features': 16276, 'extra_features_weight_factor': 3, 'n_estimators': 547, 'max_depth': 10, 'learning_rate': 0.1821439563726806, 'subsample': 0.9474658762043557, 'colsample_bytree': 0.8229953057814048}


In [24]:
vectorizer = CountVectorizer(max_df=best_params["max_df"], min_df=best_params["min_df"], ngram_range=(1, best_params["ngram_2"]), max_features=best_params["bow_max_features"])
extra_features = weighted_extra_features_sets[best_params["extra_features_weight_factor"]]
param = {
    'objective': 'binary:logistic',  
    'n_estimators': best_params['n_estimators'],
    'max_depth': best_params['max_depth'],
    'learning_rate': best_params['learning_rate'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree'],
    'n_jobs': -1,
}

mean_accuracy, mean_precision, mean_recall, mean_f1_score, total_time = train_eval_model(None, X, y, vectorizer, extra_features, 5, param)
results.append({"Method": "XGBooster with BoW", "Accuracy": mean_accuracy, "Precision": mean_precision, "Recall": mean_recall, "F-Measure": mean_f1_score})

In [None]:
XGB_BoW_best_params = best_params

In [25]:
print("XGBooster Model with BoW:")
print(f"- Accuracy: {mean_accuracy:.2f}")
print(f"- Precision: {mean_precision:.2f}")
print(f"- Recall: {mean_recall:.2f}")
print(f"- F1: {mean_f1_score:.2f}")
print(f"- Total Time: {total_time:.2f}")

XGBooster Model with BoW:
- Accuracy: 0.83
- Precision: 0.82
- Recall: 0.82
- F1: 0.82
- Total Time: 141.58


# XGB with TF-IDF

In [None]:
def objective(trial):
    # TF-IDF parameters
    max_df = trial.suggest_float('max_df', 0.7, 0.95)
    min_df = trial.suggest_int('min_df', 1, 5)
    ngram_range = (1, trial.suggest_int('ngram_2', 2, 3))      
    norm = trial.suggest_categorical('norm', ['l1', 'l2'])
    smooth_idf = trial.suggest_categorical('smooth_idf', [True, False])
    sublinear_tf = trial.suggest_categorical('sublinear_tf', [True, False])
    max_features_tfidf = trial.suggest_int('max_features_tfidf', 10000, 20000)      

    vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, ngram_range=ngram_range, norm=norm, 
                                 smooth_idf=smooth_idf, sublinear_tf=sublinear_tf, max_features=max_features_tfidf)
    
    extra_features_weight_factor = trial.suggest_categorical('extra_features_weight_factor', extra_features_weight_factors)
    extra_features = weighted_extra_features_sets[extra_features_weight_factor]

    param = {
        'objective': 'binary:logistic',  
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'n_jobs': -1,
    }

    accuracy, _, _, _, _ = train_eval_model(None, X, y, vectorizer, extra_features, 3, param)
    return accuracy

study = optuna.create_study(direction='maximize')   
study.optimize(objective, n_trials=50, show_progress_bar=True) 

[I 2024-02-17 15:13:39,781] A new study created in memory with name: no-name-87b32033-d072-4986-8c57-c14cea2bd725


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-02-17 15:30:34,971] Trial 0 finished with value: 0.8203192910297453 and parameters: {'max_df': 0.731377952188161, 'min_df': 1, 'ngram_2': 3, 'norm': 'l2', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 10759, 'extra_features_weight_factor': 3, 'n_estimators': 175, 'max_depth': 10, 'learning_rate': 0.17925346690025834, 'subsample': 0.6152894413906231, 'colsample_bytree': 0.7565823793228494}. Best is trial 0 with value: 0.8203192910297453.
[I 2024-02-17 15:45:04,824] Trial 1 finished with value: 0.8146055504908093 and parameters: {'max_df': 0.7161070165383437, 'min_df': 5, 'ngram_2': 3, 'norm': 'l1', 'smooth_idf': True, 'sublinear_tf': True, 'max_features_tfidf': 12327, 'extra_features_weight_factor': 3, 'n_estimators': 435, 'max_depth': 5, 'learning_rate': 0.1343715561875565, 'subsample': 0.9246878222422912, 'colsample_bytree': 0.7409376466567301}. Best is trial 0 with value: 0.8203192910297453.
[I 2024-02-17 16:05:16,536] Trial 2 finished with value: 0.822757

In [19]:
# best_params = {'max_df': 0.8247885633014839, 'min_df': 1, 'ngram_2': 2, 'norm': 'l1', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 19777, 'extra_features_weight_factor': 2, 'n_estimators': 546, 'max_depth': 10, 'learning_rate': 0.19694488539952623, 'subsample': 0.8574307213675778, 'colsample_bytree': 0.5711394501156387}

In [20]:
best_params = study.best_params
print("Best parameters for TF-IDF with XGBooster:", best_params)

Best parameters for TF-IDF with XGBooster: {'max_df': 0.8247885633014839, 'min_df': 1, 'ngram_2': 2, 'norm': 'l1', 'smooth_idf': False, 'sublinear_tf': False, 'max_features_tfidf': 19777, 'extra_features_weight_factor': 2, 'n_estimators': 546, 'max_depth': 10, 'learning_rate': 0.19694488539952623, 'subsample': 0.8574307213675778, 'colsample_bytree': 0.5711394501156387}


In [22]:
vectorizer = TfidfVectorizer(max_df=best_params["max_df"], min_df=best_params["min_df"], ngram_range=(1, best_params["ngram_2"]), norm=best_params["norm"], 
                                 smooth_idf=best_params["smooth_idf"], sublinear_tf=best_params["sublinear_tf"], max_features=best_params['max_features_tfidf'])
extra_features = weighted_extra_features_sets[best_params["extra_features_weight_factor"]]
param = {
    'objective': 'binary:logistic',  
    'n_estimators': best_params['n_estimators'],
    'max_depth': best_params['max_depth'],
    'learning_rate': best_params['learning_rate'],
    'subsample': best_params['subsample'],
    'colsample_bytree': best_params['colsample_bytree'],
    'n_jobs': -1,
}

mean_accuracy, mean_precision, mean_recall, mean_f1_score, total_time = train_eval_model(None, X, y, vectorizer, extra_features, 5, param)
results.append({"Method": "XGBooster with TF-IDF", "Accuracy": mean_accuracy, "Precision": mean_precision, "Recall": mean_recall, "F-Measure": mean_f1_score, "Time": total_time})

In [40]:
XGB_tf_idf_best_params = best_params

In [23]:
print("XGBooster Model with TF-IDF:")
print(f"- Accuracy: {mean_accuracy:.2f}")
print(f"- Precision: {mean_precision:.2f}")
print(f"- Recall: {mean_recall:.2f}")
print(f"- F1: {mean_f1_score:.2f}")
print(f"- Total Time: {total_time:.2f}")

XGBooster Model with TF-IDF:
- Accuracy: 0.83
- Precision: 0.82
- Recall: 0.82
- F1: 0.82
- Total Time: 700.15


In [39]:
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Method,Accuracy,Precision,Recall,F-Measure,Time
0,Logistic Regression with BoW,0.81,0.8,0.8,0.8,60.37
1,Logistic Regression with TF-IDF,0.81,0.8,0.79,0.8,77.23
2,XGBooster with BoW,0.83,0.82,0.82,0.82,141.58
3,XGBooster with TF-IDF,0.83,0.82,0.82,0.82,700.15


# Comments on Results

**Logistic Regression with BoW and TF-IDF**
* **BoW Configuration**: The optimal parameters for the Bag of Words (BoW) approach with Logistic Regression include a specific range for max_df and min_df, a bigram (ngram_2: 2), and a substantial number of features (bow_max_features: 19340). The inclusion of an extra_features_weight_factor of 3 indicates the model benefits from emphasizing additional features alongside the textual content. The regularization strength (C: 0.15042289892477853) and choice of solver as 'liblinear' suggest a model that balances complexity and computational efficiency, achieving solid performance across all metrics (Accuracy, Precision, Recall, F1) at 0.80, with a relatively quick total time of 60.37 seconds.

* **TF-IDF Configuration**: The TF-IDF variant introduces slightly different parameters, with adjustments in max_df, min_df, and max_features_tfidf (17339), alongside TF-IDF specific parameters (norm: 'l2', smooth_idf: True, sublinear_tf: True). The extra_features_weight_factor remains at 3, underscoring the consistent value of additional features. The higher C value (1.1926655328748625) and use of the 'saga' solver indicate a preference for a more robust regularization approach, which maintains comparable performance metrics to the BoW model but with a slightly longer computation time (77.23 seconds).

**XGBooster with BoW and TF-IDF**
* **BoW Configuration**: The XGBoost model with BoW features highlights an aggressive approach to feature selection and ensemble complexity (n_estimators: 547, max_depth: 10, learning_rate: 0.1821439563726806). The extra_features_weight_factor of 3 continues to play a significant role, enhancing model performance with an optimal mix of textual and additional features. This configuration leads to improved accuracy and F1 scores (0.83) compared to Logistic Regression at the cost of increased computational time (141.58 seconds).

* **TF-IDF Configuration**: With TF-IDF, the XGBoost model adjusts its feature handling (max_features_tfidf: 19777) and slightly alters its ensemble strategy (n_estimators: 546, max_depth: 10). Notably, the extra_features_weight_factor is reduced to 2, possibly indicating a different valuation of additional features when combined with TF-IDF's normalized feature set. Despite these changes, performance metrics mirror those of the BoW configuration, but with a significant increase in computation time (700.15 seconds), suggesting a more complex model that is computationally intensive to train.

**Feature Representation**: The results indicate that both Logistic Regression and XGBoost models achieve similar performance levels with BoW and TF-IDF representations\
**Model Selection**: XGBoost shows a slight advantage over Logistic Regression in terms of accuracy and F1 scores, which could justify the additional computational resources required.\
**Computational Efficiency**: The significant difference in training times, especially for the TF-IDF variant of XGBoost, highlights the trade-offs between computational efficiency and model performance.

# Predictions on test dataset

Based on the score table we choose **XGBoost** with **TF-IDF** for our best model

In [47]:
class DuplicatesClassifier:
    def __init__(self, xgb_parameters, vectorizer, extra_features_weight_factor, extra_features_names):
        self.xgb_parameters = xgb_parameters
        self.vectorizer = vectorizer
        self.extra_features_weight_factor = extra_features_weight_factor
        self.extra_features_names = extra_features_names
        self.model = None
        self.q1_vectorizer = None
        self.q2_vectorizer = None
        
        
    def train(self, X, y):
        n_estimators = self.xgb_parameters.get('n_estimators') 
        xgb_parameters = copy.deepcopy(self.xgb_parameters)
        xgb_parameters.pop('n_estimators')

        self.q1_vectorizer = clone(self.vectorizer)
        self.q2_vectorizer = clone(self.vectorizer)

        train_q1_vectors = self.q1_vectorizer.fit_transform(X['Question1'])
        train_q2_vectors = self.q2_vectorizer.fit_transform(X['Question2'])
        
        train_weighted_extra_features = X[self.extra_features_names] * self.extra_features_weight_factor
        train_weighted_extra_features = csr_matrix(train_weighted_extra_features.to_numpy())

        X_train_combined = hstack([train_q1_vectors, train_q2_vectors, train_weighted_extra_features])

        dtrain = xgb.DMatrix(X_train_combined, label=y)
        self.model = xgb.train(xgb_parameters, dtrain, num_boost_round=n_estimators, verbose_eval=False)
        
        
    def predict(self, X):
        test_q1_vectors = self.q1_vectorizer.transform(X['Question1'])
        test_q2_vectors = self.q2_vectorizer.transform(X['Question2'])
        
        test_weighted_extra_features = X[self.extra_features_names] * self.extra_features_weight_factor
        test_weighted_extra_features = csr_matrix(test_weighted_extra_features.to_numpy())

        X_test_combined = hstack([test_q1_vectors, test_q2_vectors, test_weighted_extra_features])

        dvalid = xgb.DMatrix(X_test_combined)
        preds = self.model.predict(dvalid)
        y_pred = np.rint(preds)
        
        return y_pred


Train duplicates_classifier

In [48]:
vectorizer = TfidfVectorizer(max_df=XGB_tf_idf_best_params["max_df"], min_df=XGB_tf_idf_best_params["min_df"], ngram_range=(1, XGB_tf_idf_best_params["ngram_2"]), norm=XGB_tf_idf_best_params["norm"], 
                                 smooth_idf=XGB_tf_idf_best_params["smooth_idf"], sublinear_tf=XGB_tf_idf_best_params["sublinear_tf"], max_features=XGB_tf_idf_best_params['max_features_tfidf'])

param = {
    'objective': 'binary:logistic',  
    'n_estimators': XGB_tf_idf_best_params['n_estimators'],
    'max_depth': XGB_tf_idf_best_params['max_depth'],
    'learning_rate': XGB_tf_idf_best_params['learning_rate'],
    'subsample': XGB_tf_idf_best_params['subsample'],
    'colsample_bytree': XGB_tf_idf_best_params['colsample_bytree'],
    'n_jobs': -1,
}

extra_features_names = X.drop(columns=['Question1', 'Question2']).columns.tolist()

duplicates_classifier = DuplicatesClassifier(param, vectorizer, XGB_tf_idf_best_params["extra_features_weight_factor"], extra_features_names)
duplicates_classifier.train(X, y)


Predict labels for test dataset

In [49]:
X_test = test_df[test_df.drop(columns=['Id']).columns.tolist()]
y_pred = duplicates_classifier.predict(X_test)

We save the predictions separating the fields Id, Predicted with the comma (",") character specifying the article Id from the test set and the predicted label to indicate
whether the documents in the pair with the specified Id are similar or not

In [60]:
output_df = pd.DataFrame({'Id': test_df['Id'], 'Predicted': y_pred})

In [61]:
display(output_df.head(10))

Unnamed: 0,Id,Predicted
0,283003,0.0
1,283004,0.0
2,283005,1.0
3,283006,0.0
4,283007,0.0
5,283008,1.0
6,283009,0.0
7,283010,0.0
8,283011,0.0
9,283012,0.0


In [62]:
output_df.to_csv(PATH_TO_SAVE_RESULTS, index=False)