In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
data = pd.read_parquet('train-00000-of-00001-b21313e511aa601a.parquet')

# Drop rows with any missing values
data = data.dropna(ignore_index=True)

# Display the first 5 rows of the data
data.head()

Unnamed: 0,bibcode,title,abstract,verified_uat_ids,verified_uat_labels
0,2020ApJ...891..100S,Dynamic Potential Sputtering of Lunar Analog M...,"Pyroxenes ((Ca, Mg, Fe, Mn)<SUB>2</SUB>Si<SUB>...","[1534, 499, 1692, 948, 1024, 2004]","[solar wind, exosphere, the moon, lunar compos..."
1,2024ApJ...966L...8B,"Generation of Low-inclination, Neptune-crossin...",The solar system's distant reaches exhibit a w...,"[1705, 1184, 2293]","[trans-neptunian objects, orbits, solar system..."
2,2024PSJ.....5...45C,Leveraging the Gravity Field Spectrum for Icy ...,Understanding the interior structures of icy m...,"[2189, 1248, 770, 1889, 627, 1255]","[europa, planetary interior, hydrosphere, mark..."
3,2022ApJ...932...52H,Inverse Multiview. I. Multicalibrator Inverse ...,Very Long Baseline Interferometry (VLBI) astro...,"[1769, 1337, 1713, 1295]","[very long baseline interferometry, radio astr..."
4,2024ApJS..271...25C,The First LHAASO Catalog of Gamma-Ray Sources,We present the first catalog of very-high-ener...,"[628, 632, 205]","[gamma-ray astronomy, gamma-ray observatories,..."


After Loading the data, we're going to prepare the data

On va créer un dictionnaire avec les valeurs des ids vérifiés, associé à chaque label vérifié et créer une liste Y

In [None]:
ids_and_labels = data[['verified_uat_ids','verified_uat_labels']]

maxi = 0
for i in data['verified_uat_ids']: 
    for j in i:
        if j>maxi:
            maxi = j

dict_ids_labels = {}

for i in range(len(ids_and_labels['verified_uat_ids'])):
    for j in range(len(ids_and_labels['verified_uat_ids'][i])):
        dict_ids_labels[ids_and_labels['verified_uat_ids'][i][j]] = ids_and_labels['verified_uat_labels'][i][j]


Create Y with columns equals to the value of the labels, with 1 if the label appears in the data for each line

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

X = data['title'] + data['abstract']
X = X.apply(lambda x: ' '.join([word for word in x.split() if len(word) > 3]))

Y = MultiLabelBinarizer().fit_transform(data['verified_uat_ids'])
X

0        Dynamic Potential Sputtering Lunar Analog Mate...
1        Generation Low-inclination, Neptune-crossing T...
2        Leveraging Gravity Field Spectrum Satellite In...
3        Inverse Multiview. Multicalibrator Inverse Pha...
4        First LHAASO Catalog Gamma-Ray SourcesWe prese...
                               ...                        
18628    X-Ray Spectroscopy Microcalorimeter Era. III. ...
18629    Fast Iterative Techniques Polarized Radiative ...
18630    Characterization Supernovae Based Spectral-Tem...
18631    Evidence Coronal Temperature Variation Seyfert...
18632    LoVoCCS. Survey Introduction, Data Processing ...
Length: 18633, dtype: object

We are going to separate our data into train and test datas

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

We are going to tokenize the text, by the frequence of each word, so our model can handle it

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect= TfidfVectorizer()
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

print(X_train_vect.shape)
print(Y_train.shape)

(14906, 47040)
(14906, 2372)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1565026 stored elements and shape (14906, 47040)>
  Coords	Values
  (0, 16789)	0.0641474073605073
  (0, 41797)	0.12546661965716513
  (0, 36953)	0.07746819223684998
  (0, 12109)	0.2483468815673644
  (0, 32222)	0.22011280998651084
  (0, 33491)	0.12230722545290991
  (0, 20225)	0.09160147048607467
  (0, 15428)	0.08271653084556393
  (0, 36958)	0.08271653084556393
  (0, 39833)	0.4022546635872688
  (0, 22333)	0.03138847210395375
  (0, 10602)	0.042048344428250325
  (0, 14042)	0.1376661955174148
  (0, 34851)	0.08200906881668871
  (0, 40001)	0.05044312017615956
  (0, 20912)	0.04534683926249641
  (0, 14521)	0.056315744913267796
  (0, 36123)	0.1343515298110958
  (0, 14806)	0.10460951093944178
  (0, 45976)	0.06540631028568886
  (0, 7999)	0.09948539162996513
  (0, 35260)	0.16631336925205778
  (0, 18068)	0.0703824494629968
  (0, 43125)	0.05694319449308712
  (0, 16812)	0.08573609492888531
  :	:
  (14905, 44804)	

We are now going to train our model

In [29]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

clf = MultiOutputClassifier(OneVsRestClassifier(LogisticRegression()))
clf.fit(X_train_vect, Y_train)



Prediction of the model

In [30]:
from sklearn.metrics import accuracy_score

Y_pred = clf.predict(X_test_vect)

print(np.count_nonzero(Y_pred), np.count_nonzero(Y_test))

accuracy = accuracy_score(Y_test, Y_pred)

print(f'Accuracy: {accuracy*100:.2f}%')

1468 15986
Accuracy: 1.48%
