In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the data
data = pd.read_parquet('val-00000-of-00001-66ce8665444026dc.parquet')

# Drop rows with any missing values
data = data.dropna(ignore_index=True)

# Display the first 5 rows of the data
data.head()

Unnamed: 0,bibcode,title,abstract,verified_uat_ids,verified_uat_labels
0,2020RNAAS...4..137D,Recommendations for Teaching Introductory Astr...,Colleges and universities around the world wer...,"[1529, 1583, 563, 486, 1145, 74]","[solar system astronomy, stellar astronomy, ga..."
1,2023ApJ...949..109L,The ALMA Survey of 70 μm Dark High-mass Clumps...,We present dynamical properties of 294 cores e...,"[787, 1565, 1569, 732, 1302, 844, 847, 1297]","[infrared dark clouds, star forming regions, s..."
2,2020RNAAS...4....3G,L-band Calibration of the Green Bank Telescope...,"Since 2016, the HI-MaNGA survey has been obtai...","[544, 1360, 1671, 690]","[flux calibration, radio telescopes, surveys, ..."
3,2022RNAAS...6..165V,Search for Extended Sources in the Images from...,We present a convenient tool (ChaSES) which al...,"[1858, 2306, 1968, 1861]","[astronomy data analysis, astronomy image proc..."
4,2021ApJ...910...54K,The Connection between Warm Carbon-chain Chemi...,Some observations of warm carbon-chain chemist...,"[75, 267, 329, 838, 849, 1569, 371]","[astrochemistry, collapsing clouds, cosmic ray..."


After Loading the data, we're going to prepare the data

On va créer un dictionnaire avec les valeurs des ids vérifiés, associé à chaque label vérifié et créer une liste Y

In [34]:
ids_and_labels = data[['verified_uat_ids','verified_uat_labels']]

maxi = 0
for i in data['verified_uat_ids']: 
    for j in i:
        if j>maxi:
            maxi = j

dict_ids_labels = {}

for i in range(len(ids_and_labels['verified_uat_ids'])):
    for j in range(len(ids_and_labels['verified_uat_ids'][i])):
        dict_ids_labels[ids_and_labels['verified_uat_ids'][i][j]] = ids_and_labels['verified_uat_labels'][i][j]


Create Y with columns equals to the value of the labels, with 1 if the label appears in the data for each line

In [35]:
from sklearn.preprocessing import MultiLabelBinarizer

X = data['title'] + data['abstract']

Y = MultiLabelBinarizer().fit_transform(data['verified_uat_ids'])
X

0       Recommendations for Teaching Introductory Astr...
1       The ALMA Survey of 70 μm Dark High-mass Clumps...
2       L-band Calibration of the Green Bank Telescope...
3       Search for Extended Sources in the Images from...
4       The Connection between Warm Carbon-chain Chemi...
                              ...                        
3015    Constraints to Efficiently Find Interstellar O...
3016    Evidence for He I 10830 Å Absorption during th...
3017    The 2020 Eclipse of R Aquarii in the Near-infr...
3018    A Bayesian Analysis of Physical Parameters for...
3019    SKYSURF-3: Testing Crowded Object Catalogs in ...
Length: 3020, dtype: object

We are going to separate our data into train and test datas

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

We are going to tokenize the text, by the frequence of each word, so our model can handle it

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect= TfidfVectorizer(stop_words='english')
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

print(X_train_vect.shape)
print(Y_train.shape)

(2416, 18756)
(2416, 805)


We are now going to train our model

In [38]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

clf = OneVsRestClassifier(LinearSVC(), verbose=1, n_jobs=-1)
clf.fit(X_train_vect, Y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 664 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 805 out of 805 | elapsed:    8.9s finished


Prediction of the model

In [39]:
from sklearn.metrics import accuracy_score, f1_score , precision_score, hamming_loss

Y_pred = clf.predict(X_test_vect)

print(np.count_nonzero(Y_pred), np.count_nonzero(Y_test))

accuracy = accuracy_score(Y_pred, Y_test)

print(f'Accuracy: {accuracy*100:.2f}%')
print(f"Precision: {precision_score(Y_pred, Y_test, average='micro'):.2f}")
print(f'F1 Score: {f1_score(Y_pred, Y_test, average="micro"):.2f}')
print(f'Hamming Loss: {hamming_loss(Y_pred, Y_test):.2f}')

308 2606
Accuracy: 1.16%
Precision: 0.08
F1 Score: 0.15
Hamming Loss: 0.01
