In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier

from nltk.stem import WordNetLemmatizer
import nltk

# Loading dataset
data = pd.read_parquet('val-00000-of-00001-66ce8665444026dc.parquet')
data = data.dropna()

# Preprocessing	
# Extracting title and abstract from the dataset
X = data['title'] + ' ' + data['abstract']

nltk.download('wordnet')
# Lemmatization function
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w) for w in text.split()])

X = X.apply(lemmatize_text)

# Extracting labels from the dataset (target)
Y_list = data['verified_uat_labels']

# Multi-label binarization of the target
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(Y_list)

# Parameters for the vectorizer and classifier
sgd_params = dict(alpha=1e-5, penalty="l2", loss="log_loss", n_jobs=-1, verbose=1)
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Pipeline for text classification
pipeline = Pipeline(
    [   
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", OneVsRestClassifier(SGDClassifier(**sgd_params))),
    ]
)

# Evaluation function
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    global y_pred
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    # Calculate micro-averaged F1 score for multi-label classification
    print("Micro-averaged F1 score on test set: %0.3f" % f1_score(y_test, y_pred, average="micro"))
    print("-" * 10)

# Main script
if __name__ == "__main__":
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Supervised SGDClassifier with multi-label classification:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Quent\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Supervised SGDClassifier with multi-label classification:
-- Epoch 1
Norm: 34.59, NNZs: 19588, Bias: -2.146553, T: 2416, Avg. loss: 0.014425
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 30.92, NNZs: 19588, Bias: -2.479704, T: 4832, Avg. loss: 0.001540
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 29.20, NNZs: 19588, Bias: -2.685695, T: 7248, Avg. loss: 0.001738
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 27.79, NNZs: 19588, Bias: -2.895115, T: 9664, Avg. loss: 0.001596
Total training time: 0.00 seconds.
-- Epoch 5
Norm: 27.00, NNZs: 19588, Bias: -3.038365, T: 12080, Avg. loss: 0.001703
Total training time: 0.01 seconds.
-- Epoch 6
Norm: 26.45, NNZs: 19588, Bias: -3.148638, T: 14496, Avg. loss: 0.001683
Total training time: 0.01 seconds.
-- Epoch 7
Norm: 25.84, NNZs: 19588, Bias: -3.266900, T: 16912, Avg. loss: 0.001566
Total training time: 0.01 seconds.
Convergence after 7 epochs took 0.01 seconds
-- Epoch 1
Norm: 36.09, NNZs: 19588, Bias: -1.997192, T: 2416, Avg. 

In [3]:
print(np.mean(y_pred == y_test))

# print the accuracy, precision, recall and f1_score
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, average='micro'))
print("Recall: ", recall_score(y_test, y_pred, average='micro'))
print("F1 Score: ", f1_score(y_test, y_pred, average='micro'))


0.9949302784747646
Accuracy:  0.011589403973509934
Precision:  0.7373737373737373
Recall:  0.08403683806600154
F1 Score:  0.15087840165346195


In [39]:
# Doing my own accuracy calculation
# Search for each word in the predicted labels in the Y_test labels and calculate the percentage of words found
percentage = 0
for i in range(len(y_pred)):
    print(y_pred[i])
    print(y_test[i])
    for word in y_pred[i].split(' '):
        #print(word, y_test[i])
        #print(word in y_test[i][0])
        if word in y_test[i][0]:
            percentage += 1/len(y_pred[i].split(' '))
            
print(f"Accuracy: {percentage/len(y_pred)*100:.2f}%")

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

AttributeError: 'numpy.ndarray' object has no attribute 'split'