In [3]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import LabelSpreading, SelfTrainingClassifier


# Loading dataset containing first five categories
data = pd.read_parquet('val-00000-of-00001-66ce8665444026dc.parquet')
data = data.dropna()

# Preprocessing	
# Extracting title and abstract from the dataset
X = data['title'] + data['abstract']
X = X.to_numpy()

# Extracting labels from the dataset (target)
Y_list = data['verified_uat_labels']

# Concatenate all labels to a single string to be understood by the classifier
Y = []
for i in Y_list:
    labels = ''
    for j in i:
        labels += j + ' '
    labels = labels[:-1]
    Y.append(labels)
y = pd.DataFrame(Y)
y = y.to_numpy()
# Parameters
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log_loss",n_jobs=-1)
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline(
    [   
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier(**sdg_params)),
    ]
)

def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    global y_pred
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:", sum(1 for x in y_train if x == -1))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(
        "Micro-averaged F1 score on test set: %0.3f"
        % f1_score(y_test, y_pred, average="micro")
    )
    print("-" * 10)
    print()


if __name__ == "__main__":
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print("Supervised SGDClassifier on 100% of the data:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)


Supervised SGDClassifier on 100% of the data:
Number of training samples: 2265
Unlabeled samples in training set: 0


  y = column_or_1d(y, warn=True)


Micro-averaged F1 score on test set: 0.013
----------



In [7]:
# Doing my own accuracy calculation
# Search for each word in the predicted labels in the Y_test labels and calculate the percentage of words found
percentage = 0
for i in range(len(y_pred)):
    for word in y_pred[i].split(' '):
        print(word, y_test[i])
        print(word in y_test[i][0])
        if word in y_test[i][0]:
            percentage += 1/len(y_pred[i].split(' '))
            
print(f"Accuracy: {percentage/len(y_pred)*100:.2f}%")

gamma-ray ['gamma-ray bursts']
True
bursts ['gamma-ray bursts']
True
sunspots ['solar chromosphere sunspots solar atmosphere solar oscillations computational methods observational astronomy']
True
magnetohydrodynamics ['solar chromosphere sunspots solar atmosphere solar oscillations computational methods observational astronomy']
False
alfven ['solar chromosphere sunspots solar atmosphere solar oscillations computational methods observational astronomy']
False
waves ['solar chromosphere sunspots solar atmosphere solar oscillations computational methods observational astronomy']
False
solar ['solar chromosphere sunspots solar atmosphere solar oscillations computational methods observational astronomy']
True
atmosphere ['solar chromosphere sunspots solar atmosphere solar oscillations computational methods observational astronomy']
True
solar ['solar chromosphere sunspots solar atmosphere solar oscillations computational methods observational astronomy']
True
chromosphere ['solar chromosp