In [1]:
from pandas import read_csv
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('french')

import sys
sys.path.insert(0, '..')

from preprocessing import preprocessing

model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rachidj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = read_csv('../data/petitions_fr_combi20233.csv')

In [3]:
from preprocessing.dataLoaders import petition_data_loader

In [4]:
petitionsDataLoaders = petition_data_loader.PetitionDataLoader(df, min_nb_signature= 50)

In [5]:
preprocessing = preprocessing.Preprocessor(petitionsDataLoaders, ['title', 'description'])

In [6]:
df_preprocessed = preprocessing.preprocess(
    filter_rows= False,
    replace_words= False
)

In [7]:
from vocabulary.vocabulary import VocabularyCreator
import utils
import os

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
keybert_kwargs={
    "top_n": 8,
    "use_mmr": True,
    "stop_words": "french",
    "keyphrase_ngram_range": (1, 1),
    "nr_candidates": 17
}

api_token = "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"
os.environ['HF_HOME'] = api_token

vocabulary_creator = VocabularyCreator(model_name= model_name, **keybert_kwargs)

In [9]:

import torch
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

docs = df_preprocessed["processed_data"].astype(str).tolist()

# Initialize a GPU-enabled SentenceTransformer model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentenceTransformer(model_name, device=device)

# Extract keywords using KeyBERT
kw_model = KeyBERT(model=model)
keywords = kw_model.extract_keywords(docs, **keybert_kwargs)

# Flatten the list of lists and remove duplicates to create the vocabulary
vocabulary = list(set([word for sublist in keywords for word, _ in sublist]))

In [10]:
vocabulary_list = vocabulary_creator.keybert_vocabulary(df_preprocessed)

In [11]:
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

from clustering.clustering import ClusteringMethod

clustering = ClusteringMethod(model_name= model_name)

bertopic_kwargs={
    # "nr_topics": 40,
    "ctfidf_model": ClassTfidfTransformer(reduce_frequent_words=True),
    "representation_model": [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)], # possible values : KeyBERTInspired() or MaximalMarginalRelevance(diversity=0.5)
    "top_n_words":10,
    "min_topic_size":50,
    "calculate_probabilities": True,
    "n_gram_range": (1,3),
    'vectorizer_model': CountVectorizer(
                    vocabulary=vocabulary_list, 
                    stop_words=stopwords, 
                    lowercase=True, 
                    ngram_range=(1, 3)
                )
}

topics, probs, topic_model, embeddings = clustering.run_bertopic(
    df= df_preprocessed,
    **bertopic_kwargs
    )

clustering.save('../models/model_petitions_fr')

Batches: 100%|██████████| 1892/1892 [04:56<00:00,  6.38it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

ValueError: empty vocabulary passed to fit

In [14]:
topic_model.visualize_topics()

In [15]:
topic_model.visualize_barchart(top_n_topics=65)

In [16]:
hierarchical_topics = topic_model.hierarchical_topics(df_preprocessed['processed_data'])
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 60/60 [16:58<00:00, 16.98s/it]


In [17]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,24037,-1_cette pétition_citoyens_contre_paris,"[cette pétition, citoyens, contre, paris, fran...",[AMÉLIORER LA VIE DU QUARTIER BAGNOLET / BELGR...
1,0,8199,0_fermeture classe école_fermeture classe_non ...,"[fermeture classe école, fermeture classe, non...",[NON à la fermeture d'une classe à l’École R​....
2,1,2908,1_permis construire_urbanisme_projet immobilie...,"[permis construire, urbanisme, projet immobili...",[NON aux lotissements sur la commune de Montes...
3,2,2418,2_médias_public_comédienne_festival,"[médias, public, comédienne, festival, émissio...",[Pour que DataGueule retire leur vidéo «Santé ...
4,3,1543,3_démocratique_démocratie_politiques_tour élec...,"[démocratique, démocratie, politiques, tour él...",[Pour donner une légitimité absolue à l'absten...
...,...,...,...,...,...
57,56,123,56_élèves situation handicap_enfants situation...,"[élèves situation handicap, enfants situation ...",[Projet de Loi Coralie pour une scolarité Egal...
58,57,122,57_contre pesticides_exposition pesticides_épa...,"[contre pesticides, exposition pesticides, épa...","[CONTRE le retour des néonicotinoïdes, ces pes..."
59,58,117,58_politique agricole_politique agricole commu...,"[politique agricole, politique agricole commun...",[Intégrer les coûts environnementaux et de san...
60,59,117,59_assiettes enfants_repas enfants_cantine sco...,"[assiettes enfants, repas enfants, cantine sco...","[La cantine pour tous, en respectant les diffé..."


In [18]:
len(df_preprocessed)

60543

In [18]:
import torch
torch.cuda.empty_cache()