# Presentation

Pour ce code, nous allons modifier les valeurs d'input dans le Pipeline, en transformant directement les valeurs textuelles en valeurs binaires pour optimiser les valeurs de sortie du pipeline et par rapport aux précédents tests. 

In [15]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score, hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

from nltk.stem import WordNetLemmatizer
import nltk

# Loading dataset
data = pd.read_parquet('val-00000-of-00001-66ce8665444026dc.parquet')
data = data.dropna()

# Preprocessing	
# Extracting title and abstract from the dataset
X = data['title'] + ' ' + data['abstract']

# Extracting labels from the dataset (target)
Y_list = data['verified_uat_labels']

# Multi-label binarization of the target
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(Y_list)

# Parameters for the vectorizer and classifier
sgd_params = dict(alpha=1e-5, penalty="l2", loss="log_loss", n_jobs=-1, verbose=1,random_state=42,class_weight="balanced")
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8,stop_words="english",strip_accents="ascii",max_features=3000,
                         lowercase=True)

# Pipeline for text classification
pipeline = Pipeline(
    [   
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", OneVsRestClassifier(SGDClassifier(**sgd_params))),
    ]
)

# Evaluation function
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    global y_pred
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # Calculate micro-averaged F1 score for multi-label classification
    print("Micro-averaged F1 score on test set: %0.3f" % f1_score(y_test, y_pred, average="micro"))
    print("-" * 10)

# Main script
if __name__ == "__main__":
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Supervised SGDClassifier with multi-label classification:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)


Supervised SGDClassifier with multi-label classification:
-- Epoch 1
Norm: 6653.35, NNZs: 3000, Bias: 35.425471, T: 2416, Avg. loss: 125.848221
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 6064.46, NNZs: 3000, Bias: -1.062653, T: 4832, Avg. loss: 197.273062
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 4838.53, NNZs: 3000, Bias: -27.288941, T: 7248, Avg. loss: 47.669046
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 4035.41, NNZs: 3000, Bias: -36.973116, T: 9664, Avg. loss: 18.085442
Total training time: 0.00 seconds.
-- Epoch 5
Norm: 3460.21, NNZs: 3000, Bias: -42.296486, T: 12080, Avg. loss: 10.571022
Total training time: 0.00 seconds.
-- Epoch 6
Norm: 3027.37, NNZs: 3000, Bias: -45.901370, T: 14496, Avg. loss: 6.969975
Total training time: 0.00 seconds.
-- Epoch 7
Norm: 2690.32, NNZs: 3000, Bias: -48.277335, T: 16912, Avg. loss: 4.710537
Total training time: 0.00 seconds.
-- Epoch 8
Norm: 2420.37, NNZs: 3000, Bias: -49.903127, T: 19328, Avg. loss: 3.421862
Total tr

In [None]:
print(np.count_nonzero(y_pred), np.count_nonzero(y_test))

# print the accuracy, precision, recall and f1_score
print("Accuracy: ", accuracy_score(y_pred,y_test))
print("Precision: ", precision_score(y_pred, y_test, average='micro'))
print("Recall: ", recall_score(y_pred, y_test, average='micro'))
print("F1 Score: ", f1_score(y_pred, y_test, average='micro'))
print("Hamming Loss: ", hamming_loss(y_test, y_pred)) # Getting near to 0 is better for hamming loss metric --> pretty good result

2752 2606
Accuracy:  0.006622516556291391
Precision:  0.2701458173445894
Recall:  0.2558139534883721
F1 Score:  0.15924938290936932
Hamming Loss:  0.008123894533338817


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Conclusion
Le nombre de 1 prédits est supérieur que lors de la 1ère tentative, et le f1-score a aussi augmenté. Prédire plus de labels permet une meilleure précision et un meilleur f1-score. Cela est dû au paramètre class_weight = "balanced" qui détermine les poids du modèle pour ajuster le nombre d'output de sortie 

# Utilisation d'une 2ème méthode

Nous allons utiliser la méthode de LinearSVC

In [17]:
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

# Parameters for the vectorizer and classifier
sgd_params = dict( penalty="l2", loss="hinge",random_state=42,class_weight="balanced")
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8,stop_words="english",strip_accents="ascii",max_features=3000,
                         lowercase=True)

# Pipeline for text classification
pipeline = Pipeline(
    [   
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", OneVsRestClassifier(LinearSVC(**sgd_params))),
    ]
)

# Evaluation function
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    global y_pred
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # Calculate micro-averaged F1 score for multi-label classification
    print("Micro-averaged F1 score on test set: %0.3f" % f1_score(y_test, y_pred, average="micro"))
    print("-" * 10)

# Main script
if __name__ == "__main__":
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Supervised SGDClassifier with multi-label classification:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

Supervised SGDClassifier with multi-label classification:
Micro-averaged F1 score on test set: 0.324
----------


In [18]:
print(np.count_nonzero(y_pred), np.count_nonzero(y_test))

# print the accuracy, precision, recall and f1_score
print("Accuracy: ", accuracy_score(y_pred,y_test))
print("Precision: ", precision_score(y_pred, y_test, average='micro'))
print("Recall: ", recall_score(y_pred, y_test, average='micro'))
print("F1 Score: ", f1_score(y_pred, y_test, average='micro'))
print("Hamming Loss: ", hamming_loss(y_test, y_pred)) # Getting near to 0 is better for hamming loss metric --> pretty good result

1362 2606
Accuracy:  0.019867549668874173
Precision:  0.24635456638526476
Recall:  0.4713656387665198
F1 Score:  0.3235887096774194
Hamming Loss:  0.005520134918349719


# Conclusion 
Nous voyons que la méthode du LinearSVC est plus efficace que la méthode du SGD (gradient de descente stochastique) car elle permet une amélioration du f1-score pour 