# Exploitation des papiers

## Importation des modules

In [1]:
# Modules de base
import pandas as pd
# Modules de NLP
import torch
from transformers import AutoTokenizer, AutoModel
# Modules de clustering
from sklearn.cluster import KMeans

## Importation des données

In [4]:
# Load the papers dataset from its CSV export into a DataFrame
data_papers = pd.read_csv(filepath_or_buffer='../data/papers.csv')
# Show the first five rows as a quick sanity check of the load
data_papers.head(n=5)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


## Regroupement des papiers

In [5]:
# Load the pretrained model and its matching tokenizer.
# Both are fetched by the same checkpoint name so that the token ids produced
# by the tokenizer correspond to the vocabulary the model was trained with.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Text embedding function
def embed_text(texts, batch_size=32):
    """Embed texts as mean-pooled vectors of the model's last hidden layer.

    The texts are tokenized with padding and truncation, run through the
    module-level ``model`` without gradient tracking, and the token vectors
    of each text are averaged into one vector.

    Args:
        texts: A single string or an iterable of strings to embed.
        batch_size: Number of texts sent through the model per forward pass.
            Processing in batches keeps memory bounded on large corpora.

    Returns:
        A 2-D ``torch.Tensor`` with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A lone string is treated as a batch of one, as the tokenizer would do.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
        with torch.no_grad():
            model_output = model(**inputs)
        token_states = model_output.last_hidden_state

        # Average only over real tokens: padding positions are zeroed out by
        # the attention mask, otherwise shorter texts would be diluted by
        # padding vectors and the result would depend on the batch contents.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled_batches.append((token_states * mask).sum(dim=1) / token_counts)

    return torch.cat(pooled_batches, dim=0)

# Embed the abstracts.
# Missing values are replaced by empty strings and everything is cast to str,
# because the tokenizer only accepts strings.
# NOTE(review): the dataset preview shows the placeholder "Abstract Missing"
# in the `abstract` column; identical placeholder texts carry no topical
# information, so confirm whether those rows should be filtered out or
# embedded from another column before clustering.
abstracts = data_papers['abstract'].fillna('').astype(str).tolist()
embeddings = embed_text(abstracts)

# Convert to a NumPy array; detach/cpu make the conversion valid even if the
# tensor is attached to a graph or lives on an accelerator.
embeddings_np = embeddings.detach().cpu().numpy()

# Number of clusters the abstracts are grouped into
num_clusters = 3

# Fit K-means. n_init is pinned explicitly because its default differs across
# scikit-learn releases, which would otherwise change the clustering obtained
# for the same random_state.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(embeddings_np)
clusters = kmeans.labels_

# Attach the cluster label of each paper to the DataFrame
data_papers['cluster'] = clusters

data_papers.head()

NameError: name 'AutoTokenizer' is not defined