In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import LabelSpreading, SelfTrainingClassifier


from nltk.stem import WordNetLemmatizer
import nltk


# Loading dataset containing first five categories
data = pd.read_parquet('val-00000-of-00001-66ce8665444026dc.parquet')
data = data.dropna()

# Preprocessing	
# Extracting title and abstract from the dataset
X = data['title'] + data['abstract']

nltk.download('wordnet')
# Lemmatization of the text for better results

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w) for w in text.split()])

X = X.apply(lemmatize_text)

# Extracting labels from the dataset (target)
Y_list = data['verified_uat_labels']

# Concatenate all labels to a single string to be understood by the classifier
Y = []
for i in Y_list:
    labels = ''
    for j in i:
        labels += j + ' '
    labels = labels[:-1]
    Y.append(labels)
y = pd.DataFrame(Y)
y = y.to_numpy()

# Parameters
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log_loss",n_jobs=-1,verbose=1)
vectorizer_params = dict(ngram_range=(1, 2), min_df=5, max_df=0.8)

# Supervised Pipeline
pipeline = Pipeline(
    [   
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SGDClassifier(**sdg_params)),
    ]
)


def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    global y_pred
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:", sum(1 for x in y_train if x == -1))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print(
        "Micro-averaged F1 score on test set: %0.3f"
        % f1_score(y_test, y_pred, average="micro")
    )
    print("-" * 10)
    print()


if __name__ == "__main__":
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print("Supervised SGDClassifier on 100% of the data:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Quent\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Supervised SGDClassifier on 100% of the data:
Number of training samples: 2265
Unlabeled samples in training set: 0


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.1s


-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 30.57, NNZs: 18451, Bias: -2.073782, T: 2265, Avg. loss: 0.009410
Total training time: 0.04 seconds.
-- Epoch 2
Norm: 31.17, NNZs: 18451, Bias: -2.121871, T: 2265, Avg. loss: 0.009241
Total training time: 0.03 seconds.
-- Epoch 2
Norm: 29.84, NNZs: 18451, Bias: -1.977987, T: 2265, Avg. loss: 0.008690
Total training time: 0.02 seconds.
Norm: 30.33, NNZs: 18451, Bias: -2.025867, T: 2265, Avg. loss: 0.009351
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 31.30, NNZs: 18451, Bias: -2.119390, T: 2265, Avg. loss: 0.008878
Total training time: 0.00 seconds.
Norm: 29.53, NNZs: 18451, Bias: -1.988942, T: 2265, Avg. loss: 0.009402
Total training time: 0.04 seconds.
-- Epoch 2
Norm: 29.93, NNZs: 18451, Bias: -1.987749, T: 2265, Avg. loss: 0.008994
Total training time: 0.02 seconds.
-- Epoch 2
-- Epoch 2
Norm: 30.39, NNZs: 18451, Bias: -2.056252, T: 2265, A

[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.6s


-- Epoch 6
-- Epoch 4
Norm: 22.70, NNZs: 18451, Bias: -3.220515, T: 15855, Avg. loss: 0.001122
Total training time: 0.04 seconds.
Norm: 25.33, NNZs: 18451, Bias: -2.811040, T: 9060, Avg. loss: 0.001325
Total training time: 0.01 seconds.
Norm: 28.21, NNZs: 18451, Bias: -2.400162, T: 4530, Avg. loss: 0.001444
Total training time: 0.01 seconds.
Norm: 23.06, NNZs: 18451, Bias: -3.242636, T: 15855, Avg. loss: 0.001207
Total training time: 0.04 seconds.
Convergence after 7 epochs took 0.04 seconds
-- Epoch 1
Norm: 28.09, NNZs: 18451, Bias: -2.421896, T: 4530, Avg. loss: 0.001487
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 26.67, NNZs: 18451, Bias: -2.624726, T: 6795, Avg. loss: 0.001469
Total training time: 0.03 seconds.
Convergence after 7 epochs took 0.04 seconds
-- Epoch 1
-- Epoch 4
-- Epoch 3
Norm: 25.67, NNZs: 18451, Bias: -2.840236, T: 9060, Avg. loss: 0.001343
Total training time: 0.03 seconds.
Norm: 26.40, NNZs: 18451, Bias: -2.653151, T: 6795, Avg. loss: 0.001327
Total trai

[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:    1.4s


-- Epoch 1
-- Epoch 7
Norm: 27.18, NNZs: 18451, Bias: -2.635836, T: 6795, Avg. loss: 0.001613
Total training time: 0.02 seconds.
Norm: 24.63, NNZs: 18451, Bias: -2.999145, T: 11325, Avg. loss: 0.001372
Total training time: 0.02 seconds.
Norm: 28.43, NNZs: 18451, Bias: -2.451271, T: 4530, Avg. loss: 0.001764
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 26.69, NNZs: 18451, Bias: -2.667130, T: 6795, Avg. loss: 0.001390
Total training time: 0.02 seconds.
-- Epoch 4
-- Epoch 6
-- Epoch 3
Norm: 25.33, NNZs: 18451, Bias: -2.847273, T: 9060, Avg. loss: 0.001298
Total training time: 0.02 seconds.
Norm: 22.95, NNZs: 18451, Bias: -3.233728, T: 15855, Avg. loss: 0.001196
Total training time: 0.04 seconds.
-- Epoch 5
Norm: 23.09, NNZs: 18451, Bias: -3.220389, T: 15855, Avg. loss: 0.001221
Total training time: 0.04 seconds.
Norm: 26.60, NNZs: 18451, Bias: -2.676446, T: 6795, Avg. loss: 0.001325
Total training time: 0.01 seconds.
Norm: 22.97, NNZs: 18451, Bias: -3.206077, T: 15855, Avg. loss: 

[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:    2.6s


-- Epoch 3-- Epoch 4
-- Epoch 8
-- Epoch 7
Norm: 29.44, NNZs: 18451, Bias: -2.032927, T: 2265, Avg. loss: 0.010215
Total training time: 0.00 seconds.
Norm: 30.15, NNZs: 18451, Bias: -2.072237, T: 2265, Avg. loss: 0.010025
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 27.11, NNZs: 18451, Bias: -2.693185, T: 6795, Avg. loss: 0.001365
Total training time: 0.02 seconds.
-- Epoch 4
-- Epoch 2
Norm: 31.11, NNZs: 18451, Bias: -2.042066, T: 2265, Avg. loss: 0.008848
Total training time: 0.00 seconds.

Norm: 27.09, NNZs: 18451, Bias: -2.693066, T: 6795, Avg. loss: 0.001429
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 23.26, NNZs: 18451, Bias: -3.210503, T: 15855, Avg. loss: 0.001207
Total training time: 0.03 seconds.
Norm: 23.85, NNZs: 18451, Bias: -3.124098, T: 13590, Avg. loss: 0.001201
Total training time: 0.03 seconds.
-- Epoch 7
-- Epoch 3
Norm: 25.17, NNZs: 18451, Bias: -2.830529, T: 9060, Avg. loss: 0.001253
Total training time: 0.02 seconds.
-- Epoch 5
Norm: 23.23, NNZs: 18

[Parallel(n_jobs=-1)]: Done 1226 tasks      | elapsed:    4.1s


Norm: 24.56, NNZs: 18451, Bias: -3.018589, T: 11325, Avg. loss: 0.001330Norm: 30.98, NNZs: 18451, Bias: -2.091298, T: 2265, Avg. loss: 0.009161
Total training time: 0.01 seconds.
Norm: 25.56, NNZs: 18451, Bias: -2.860168, T: 9060, Avg. loss: 0.001332
Total training time: 0.02 seconds.
Norm: 23.00, NNZs: 18451, Bias: -3.243036, T: 15855, Avg. loss: 0.001183
Total training time: 0.03 seconds.
Norm: 28.12, NNZs: 18451, Bias: -2.404425, T: 4530, Avg. loss: 0.001698
Total training time: 0.01 seconds.
Norm: 23.76, NNZs: 18451, Bias: -3.154174, T: 13590, Avg. loss: 0.001249
Total training time: 0.03 seconds.

Total training time: 0.03 seconds.
-- Epoch 6
Norm: 31.11, NNZs: 18451, Bias: -2.059820, T: 2265, Avg. loss: 0.008795
Total training time: 0.00 seconds.
-- Epoch 5
-- Epoch 3
-- Epoch 2
Norm: 24.23, NNZs: 18451, Bias: -2.997174, T: 11325, Avg. loss: 0.001257
Total training time: 0.02 seconds.
-- Epoch 2
Norm: 23.22, NNZs: 18451, Bias: -3.256769, T: 15855, Avg. loss: 0.001196
Total traini

[Parallel(n_jobs=-1)]: Done 1776 tasks      | elapsed:    5.9s


-- Epoch 3Norm: 26.62, NNZs: 18451, Bias: -2.692947, T: 6795, Avg. loss: 0.001365
Total training time: 0.02 seconds.
-- Epoch 1
Norm: 23.62, NNZs: 18451, Bias: -3.106459, T: 13590, Avg. loss: 0.001231
Total training time: 0.04 seconds.
Norm: 28.22, NNZs: 18451, Bias: -2.395478, T: 4530, Avg. loss: 0.001638
Total training time: 0.01 seconds.
-- Epoch 3

Norm: 24.38, NNZs: 18451, Bias: -2.996893, T: 11325, Avg. loss: 0.001244
Total training time: 0.03 seconds.
Norm: 25.39, NNZs: 18451, Bias: -2.812303, T: 9060, Avg. loss: 0.001283
Total training time: 0.02 seconds.
Norm: 31.06, NNZs: 18451, Bias: -2.153165, T: 2265, Avg. loss: 0.008991
Total training time: 0.01 seconds.
-- Epoch 4
-- Epoch 6
-- Epoch 5
Norm: 26.54, NNZs: 18451, Bias: -2.648521, T: 6795, Avg. loss: 0.001368
Total training time: 0.02 seconds.
-- Epoch 2
-- Epoch 7
Norm: 31.48, NNZs: 18451, Bias: -2.099035, T: 2265, Avg. loss: 0.008636
Total training time: 0.00 seconds.
Norm: 28.35, NNZs: 18451, Bias: -2.428033, T: 4530, Av

[Parallel(n_jobs=-1)]: Done 2211 out of 2211 | elapsed:    7.3s finished


Micro-averaged F1 score on test set: 0.011
----------



In [None]:
print(np.mean(y_pred == y_test))

# Print the accuracy, precision, recall and f1_score
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, average='micro'))
print("Recall: ", recall_score(y_test, y_pred, average='micro'))
print("F1 Score: ", f1_score(y_test, y_pred, average='micro'))

0.0001420990307442656
Accuracy:  0.010596026490066225
Precision:  0.010596026490066225
Recall:  0.010596026490066225
F1 Score:  0.010596026490066225


In [11]:
# Doing my own accuracy calculation
# Search for each word in the predicted labels in the Y_test labels and calculate the percentage of words found
percentage = 0
for i in range(len(y_pred)):
    for word in y_pred[i].split(' '):
        print(word, y_test[i])
        print(word in y_test[i][0])
        if word in y_test[i][0]:
            percentage += 1/len(y_pred[i].split(' '))
            
print(f"Accuracy: {percentage/len(y_pred)*100:.2f}%")

mars ['mars planetary atmospheres atmospheric clouds atmospheric variability remote sensing']
True
planetary ['mars planetary atmospheres atmospheric clouds atmospheric variability remote sensing']
True
atmospheres ['mars planetary atmospheres atmospheric clouds atmospheric variability remote sensing']
True
galaxy ['catalogs surveys galaxy evolution photometry high-redshift galaxies observational astronomy astronomical methods']
True
evolution ['catalogs surveys galaxy evolution photometry high-redshift galaxies observational astronomy astronomical methods']
True
galaxy ['catalogs surveys galaxy evolution photometry high-redshift galaxies observational astronomy astronomical methods']
True
formation ['catalogs surveys galaxy evolution photometry high-redshift galaxies observational astronomy astronomical methods']
False
cosmology ['catalogs surveys galaxy evolution photometry high-redshift galaxies observational astronomy astronomical methods']
False
galaxy ['catalogs surveys galaxy ev