Import des bibliothèques nécessaires

In [34]:
# Importations standard
import json
import string
import re

# Importations de bibliothèques tierces
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
import copy

# Importations des modules sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF

# Importations nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Check for the NLTK resources used below and download only the missing ones.
# 'punkt_tab' is part of the same guarded loop so that re-running the cell does
# not trigger a download attempt when the resource is already installed.
nltk_resources = ['stopwords', 'punkt', 'wordnet', 'punkt_tab']
for resource in nltk_resources:
    # Tokenizer models live under tokenizers/, everything else under corpora/.
    category = 'tokenizers' if resource.startswith('punkt') else 'corpora'
    try:
        nltk.data.find(f'{category}/{resource}')
    except LookupError:
        nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cboisron\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\cboisron\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

Classe Data regroupant tous les prétraitements, la vectorisation et la réduction de dimension de la matrice.

In [35]:
class Data:
    """Load the labelled text dataset and transform it in place.

    The working frame is ``self.data``. Its last column is always the label
    column ``cal_réponse_signalement``; the vectorisation and reduction steps
    replace every other column with the computed features and keep that label
    column last. ``self.data_brut`` keeps a copy of the frame as loaded so that
    ``process_and_export`` can restart from it for each combination.
    """

    LABEL_COLUMN = "cal_réponse_signalement"

    def __init__(self, data = "train.jsonl"  , limit=5000, language="french", text_column="texte_annonce"):
        """Read a JSON Lines file and keep the text and label columns.

        Args:
            data: path (or any input accepted by ``pd.read_json``) of the JSON Lines file.
            limit: maximum number of leading rows to keep.
            language: language name handed to the NLTK stop-word list and stemmer.
            text_column: name of the column holding the raw text.
        """
        colonnes = [text_column, self.LABEL_COLUMN]
        # .copy() so the label re-encoding below writes to an independent frame
        # instead of a slice of the frame returned by read_json.
        self.data = pd.read_json(data, lines=True)[colonnes].iloc[:limit].copy()
        # Labels outside this mapping become NaN (pandas ``Series.map`` semantics).
        self.data[self.LABEL_COLUMN] = self.data[self.LABEL_COLUMN].map({"Pris en compte": 0,"Rejete (hors specs)": 1})
        self.text_column = text_column
        self.language = language
        # Snapshot of the loaded state, used by process_and_export.
        self.data_brut = self.data.copy()

    def get_data(self):
        """Return the current working DataFrame."""
        return self.data

    def supprimer_stopwords(self):
        """Strip punctuation and digits, lowercase, tokenize and drop stop words."""
        # A set gives constant-time membership tests; the filtering result is unchanged.
        stop_words = set(stopwords.words(self.language))

        def nettoyer_texte(texte):
            texte = re.sub(r'[^\w\s]', '', texte)  # remove punctuation
            texte = re.sub(r'\d+', '', texte)      # remove digits
            tokens = nltk.word_tokenize(texte.lower())
            return ' '.join([word for word in tokens if word not in stop_words])

        # Apply the cleaning function to the text column
        self.data[self.text_column] = self.data[self.text_column].apply(nettoyer_texte)

    def rien(self, x=None):
        """No-op preprocessing step; ``x`` is accepted and ignored."""
        pass

    def racinisation(self, ignore_stopwords=None):
        """Stem every whitespace-separated word, then remove stop words.

        Args:
            ignore_stopwords: forwarded (as a bool) to ``SnowballStemmer``;
                ``None`` is treated as ``False``.
        """
        stemmer = SnowballStemmer(self.language, ignore_stopwords=bool(ignore_stopwords))
        self.data[self.text_column] = self.data[self.text_column].apply(
            lambda x: ' '.join(stemmer.stem(word) for word in str(x).split())
        )
        self.supprimer_stopwords()

    def lemmatisation(self, mode=None, overwrite=None):
        """Lemmatize every whitespace-separated word, then remove stop words.

        ``mode`` and ``overwrite`` are accepted for interface compatibility only:
        ``WordNetLemmatizer.lemmatize`` does not read such attributes, so they do
        not influence the result.
        NOTE(review): WordNetLemmatizer is backed by the English WordNet; confirm
        this is the intended lemmatizer when ``language`` is "french".
        """
        lemmatizer = WordNetLemmatizer()
        self.data[self.text_column] = self.data[self.text_column].apply(
            lambda x: ' '.join(lemmatizer.lemmatize(word) for word in str(x).split())
        )
        self.supprimer_stopwords()

    def _remplacer_features(self, features):
        """Replace every feature column by ``features`` and keep the label column last."""
        # Both sides get a fresh positional index so that concat aligns row by row.
        etiquettes = self.data.iloc[:, -1].reset_index(drop=True)
        self.data = pd.concat([features.reset_index(drop=True), etiquettes], axis=1)

    def _appliquer_vectoriseur(self, vectorizer):
        """Fit ``vectorizer`` on the text column and store the dense term matrix."""
        vect_data = vectorizer.fit_transform(self.data[self.text_column])
        vect_df = pd.DataFrame(vect_data.toarray(), columns=vectorizer.get_feature_names_out())
        self._remplacer_features(vect_df)

    def vectorisation_simple(self, ngram_range=None, max_features=None):
        """Bag-of-words vectorisation (``CountVectorizer``); falsy options keep the defaults."""
        options = {}
        if ngram_range:
            options["ngram_range"] = tuple(ngram_range)
        if max_features:
            options["max_features"] = max_features
        self._appliquer_vectoriseur(CountVectorizer(**options))

    def vectorisation_ponderee(self, ngram_range=None, max_features=None, norm=None):
        """TF-IDF vectorisation (``TfidfVectorizer``); falsy options keep the defaults."""
        options = {}
        if ngram_range:
            options["ngram_range"] = tuple(ngram_range)
        if max_features:
            options["max_features"] = max_features
        if norm:
            options["norm"] = norm
        self._appliquer_vectoriseur(TfidfVectorizer(**options))

    def reduction_svd(self, n_components=None, n_iter=None):
        """Reduce the feature columns with ``TruncatedSVD``.

        Falsy options keep the estimator defaults. The configured estimator is
        the one that is fitted, so ``n_iter`` is honoured and ``n_components=None``
        no longer reaches the constructor.
        """
        options = {}
        if n_components:
            options["n_components"] = n_components
        if n_iter:
            options["n_iter"] = n_iter
        svd = TruncatedSVD(**options)
        svd_result = svd.fit_transform(self.data.iloc[:, :-1])
        self._remplacer_features(pd.DataFrame(svd_result))

    def reduction_nmf(self, n_components=50, init=None):
        """Reduce the feature columns with ``NMF``; a falsy ``init`` keeps the default."""
        options = {"n_components": n_components}
        if init:
            options["init"] = init
        nmf = NMF(**options)
        nmf_result = nmf.fit_transform(self.data.iloc[:, :-1])
        self._remplacer_features(pd.DataFrame(nmf_result))

    def process_and_export(self, n_components=100):
        """Run every preprocessing x vectorisation x reduction combination and export each result.

        Each combination starts again from ``self.data_brut`` and is written to
        ``output_combination_<i>.jsonl`` (1-based) in the current directory.
        """
        # Every possible combination of the three stages
        preprocessing_methods = [self.rien, self.racinisation, self.lemmatisation]
        vectorisation_methods = [self.vectorisation_simple, self.vectorisation_ponderee]
        reduction_methods = [self.reduction_svd, self.reduction_nmf]

        combinations = product(preprocessing_methods, vectorisation_methods, reduction_methods)

        for i, (preprocess, vectorize, reduce) in enumerate(combinations, start=1):
            # Restart from the data as loaded
            self.data = self.data_brut.copy()

            # The steps read the text column from self.text_column; their
            # positional parameters are hyper-parameters, so none is passed here.
            preprocess()
            vectorize()
            reduce(n_components)

            # Export the result
            output_file = f"output_combination_{i}.jsonl"
            self.data.to_json(output_file, orient="records", lines=True)
            print(f"Exporté : {output_file}")


Classe globale des classifieurs, servant de classe de base pour l'héritage

In [36]:
class Classifieur:
    """Common base for the classifier wrappers.

    Splits the incoming frame into features (every column but the last) and
    target (last column), builds a train/test split, and offers scoring and
    grid-search helpers that operate on ``self.classifier``, which subclasses
    are expected to set.
    """

    def __init__(self, data):
        # Load the frame, then split it into train and test sets
        self.chargement(data)
        self.decoupage()

    def chargement(self, data):
        """Store ``data`` and separate it into ``self.X`` and ``self.y``."""
        self.data = data
        # Features: all columns except the last; target: the last column
        self.X, self.y = data.iloc[:, :-1], data.iloc[:, -1]

    def decoupage(self):
        """Create the train/test split (60% of the rows go to the test set, fixed seed)."""
        decoupes = train_test_split(self.X, self.y, test_size=0.6, random_state=0)
        self.X_train, self.X_test, self.Y_train, self.Y_test = decoupes

    def entrainement(self):
        """Training hook; does nothing in the base class."""
        pass

    def rien(self):
        """No-op placeholder."""
        pass

    def taux_reussite(self):
        """Return the test-set accuracy formatted with three decimals (as a string)."""
        predictions = self.classifier.predict(self.X_test)
        return format(accuracy_score(self.Y_test, predictions), ".3f")

    def f1_score(self):
        """Return the weighted test-set F1 score formatted with three decimals (as a string)."""
        from sklearn.metrics import f1_score as calcul_f1
        predictions = self.classifier.predict(self.X_test)
        return format(calcul_f1(self.Y_test, predictions, average='weighted'), ".3f")

    def recherche_hyperparametres(self,param_grid):
        """Grid-search ``param_grid`` with 5-fold cross-validation on the training set.

        The best estimator found replaces ``self.classifier``; the matching
        parameter dict is returned.
        """
        # NOTE(review): the original remark here said the positive label matters;
        # scoring='f1' relies on the scorer's default positive label -- confirm.
        recherche = GridSearchCV(
            estimator=self.classifier,
            param_grid=param_grid,
            scoring='f1',
            cv=5,
        )
        recherche.fit(self.X_train, self.Y_train)
        # Keep the best classifier for later predictions
        self.classifier = recherche.best_estimator_
        return recherche.best_params_


In [37]:
from sklearn.naive_bayes import MultinomialNB


class Multinomial(Classifieur):
    """Wrapper around ``MultinomialNB``.

    Args:
        data: DataFrame whose last column is the target.
        alpha, fit_prior: forwarded to the estimator when not ``None``;
            ``None`` keeps the estimator default.
    """

    def __init__(self, data, alpha = None, fit_prior = None):
        super().__init__(data)
        self.classifier = MultinomialNB()
        # Compare with None instead of truthiness so that explicit falsy values
        # (fit_prior=False, alpha=0) are applied rather than silently dropped.
        if alpha is not None : self.classifier.alpha = alpha
        if fit_prior is not None : self.classifier.fit_prior = fit_prior

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)


In [38]:
from sklearn.naive_bayes import BernoulliNB


class Bernoulli(Classifieur):
    """Wrapper around ``BernoulliNB``.

    Args:
        data: DataFrame whose last column is the target.
        alpha, fit_prior, binarize: forwarded to the estimator when not
            ``None``; ``None`` keeps the estimator default.
    """

    def __init__(self, data, alpha = None, fit_prior = None, binarize = None):
        super().__init__(data)

        self.classifier = BernoulliNB()
        # Compare with None instead of truthiness so that explicit falsy values
        # (fit_prior=False, binarize=0.0) are applied rather than silently dropped.
        if alpha is not None : self.classifier.alpha = alpha
        if fit_prior is not None : self.classifier.fit_prior = fit_prior
        if binarize is not None : self.classifier.binarize = binarize

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)

In [39]:
from sklearn.naive_bayes import GaussianNB


class Gaussian(Classifieur):
    """Wrapper around ``GaussianNB``.

    Args:
        data: DataFrame whose last column is the target.
        var_smoothing: forwarded to the estimator when not ``None``;
            ``None`` keeps the estimator default.
    """

    def __init__(self, data, var_smoothing = None):
        super().__init__(data)
        self.classifier = GaussianNB()
        # "is not None" so that an explicit 0 is applied instead of being ignored.
        if var_smoothing is not None : self.classifier.var_smoothing = var_smoothing

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)

In [40]:
from sklearn.linear_model import LogisticRegression

class RegressionLogistique(Classifieur):
    """Wrapper around ``LogisticRegression``.

    Args:
        data: DataFrame whose last column is the target.
        penalty, C, solver, max_iter, l1_ratio, random_state: forwarded to the
            estimator when not ``None``; ``None`` keeps the estimator default
            (so ``penalty=None`` cannot be used here to request "no penalty").
    """

    def __init__(self, data, penalty = None, C = None, solver = None, max_iter = None, l1_ratio = None, random_state = None):
        super().__init__(data)
        reglages = {
            "penalty": penalty,
            "C": C,
            "solver": solver,
            "max_iter": max_iter,
            # Previously a bare attribute access: the value was never assigned.
            "l1_ratio": l1_ratio,
            "random_state": random_state,
        }
        # Filter on None rather than truthiness so that random_state=0 is kept.
        self.classifier = LogisticRegression(
            **{nom: valeur for nom, valeur in reglages.items() if valeur is not None}
        )

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)


In [41]:
from sklearn.neighbors import KNeighborsClassifier

class KPlusProchesVoisins(Classifieur):
    """Wrapper around ``KNeighborsClassifier``.

    Every truthy argument overrides the matching estimator attribute; falsy
    arguments leave the estimator default untouched.
    """

    def __init__(self, data, n_neighbors = 3, weights = None, algorithm = None, p = None):
        super().__init__(data)

        self.classifier = KNeighborsClassifier()
        reglages = (
            ("n_neighbors", n_neighbors),
            ("weights", weights),
            ("algorithm", algorithm),
            ("p", p),
        )
        for attribut, valeur in reglages:
            if valeur:
                setattr(self.classifier, attribut, valeur)

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)


In [42]:
from sklearn.tree import DecisionTreeClassifier

class ArbreDeDecision(Classifieur):
    """Wrapper around ``DecisionTreeClassifier``.

    Args:
        data: DataFrame whose last column is the target.
        criterion, max_depth, min_samples_split, min_samples_leaf,
        max_features, ccp_alpha, random_state: forwarded to the estimator when
            not ``None``; ``None`` keeps the estimator default.
    """

    def __init__(self, data, criterion = None, max_depth = None, min_samples_split = None, min_samples_leaf = None, max_features = None, ccp_alpha = None, random_state = None):
        super().__init__(data)
        reglages = {
            "criterion": criterion,
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "max_features": max_features,
            "ccp_alpha": ccp_alpha,
            "random_state": random_state,
        }
        # Filter on None rather than truthiness: random_state=0 is a valid seed
        # and used to be dropped, leaving the tree unseeded.
        self.classifier = DecisionTreeClassifier(
            **{nom: valeur for nom, valeur in reglages.items() if valeur is not None}
        )

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)


In [43]:
from sklearn.ensemble import RandomForestClassifier

class ForetAleatoire(Classifieur):
    """Wrapper around ``RandomForestClassifier``.

    Args:
        data: DataFrame whose last column is the target.
        n_estimators, criterion, max_depth, min_samples_split,
        min_samples_leaf, max_features, bootstrap, ccp_alpha, random_state:
            forwarded to the estimator when not ``None``; ``None`` keeps the
            estimator default.
    """

    def __init__(self, data, n_estimators = 100, criterion = None, max_depth = None, min_samples_split = None, min_samples_leaf = None, max_features = None, bootstrap = None, ccp_alpha = None, random_state = 0 ):
        super().__init__(data)
        reglages = {
            "n_estimators": n_estimators,
            "criterion": criterion,
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "ccp_alpha": ccp_alpha,
            "random_state": random_state,
        }
        # Filter on None rather than truthiness: the default random_state=0 and
        # an explicit bootstrap=False used to be discarded, so the forest was
        # never actually seeded.
        self.classifier = RandomForestClassifier(
            **{nom: valeur for nom, valeur in reglages.items() if valeur is not None}
        )

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)


In [46]:
class Projet() :
    """Evaluate every enabled classifier on every enabled data-preparation combination.

    Construction loads the two hyper-parameter files, enumerates the data
    combinations and immediately runs ``main_loop``. F1 scores are collected in
    ``self.final_res`` and the grid-search winners in ``self.best_params``, both
    keyed by a string describing the full combination.

    Args:
        sequence_data: dict mapping data-step names (e.g. "lemmatisation",
            "vectorisation_simple", "reduction_svd") to booleans enabling them.
            If no vectorisation is enabled, "vectorisation_simple" is switched on.
        sequence_classifieur: dict mapping classifier classes to booleans.
        data_json: path of the JSON Lines dataset handed to ``Data``.
        limit_data: row limit handed to ``Data``.
        json_resultat: name stored for the result export.
    """

    def __init__(self, sequence_data, sequence_classifieur, data_json = "train.jsonl", limit_data = 100, json_resultat = "json_résultat"):
        # Use the caller's configuration (it used to be overwritten by hard-coded
        # copies). The dicts are copied because the vectorisation fallback below
        # mutates sequence_data and must not leak into the caller's dict.
        self.sequence_data = dict(sequence_data)
        self.sequence_classifieur = dict(sequence_classifieur)
        self.data_json = data_json
        self.limit_data = limit_data
        self.json_resultat = json_resultat

        self.verif_json_name()

        with open("hyperpamètres_classifieur.jsonl","r") as file_1:
            self.hyper_classifieur = json.load(file_1)
        with open("hyperparametres_traitement.jsonl", "r") as file_2:
            self.hyper_data = json.load(file_2)

        # At least one vectorisation is required: fall back to the simple one.
        if not self.sequence_data.get("vectorisation_simple") and not self.sequence_data.get("vectorisation_ponderee"):
            self.sequence_data["vectorisation_simple"] = True

        ### All possible data combinations ###
        self.combo_data = self.generate_data_hyperparam_combinations()
        self.nb_combo_data = len(self.combo_data)
        print("Combo data généré,  nombre : " + str(self.nb_combo_data))

        # Caches of Data objects, keyed by the JSON description of the steps applied so far.
        self.etape_1_sauv = {}
        self.etape_1_2_sauv = {}
        self.etape_1_2_3_sauv = {}
        self.final_res = {}
        self.best_params = {}

        self.main_loop()

    def main_loop(self):
        """Prepare the data for each combination (reusing cached steps) and score each classifier."""
        for combinaison in self.combo_data :
            preprocessing_str = json.dumps(combinaison["preprocessing"], ensure_ascii=False)
            vectorization_str = json.dumps(combinaison["vectorization"], ensure_ascii=False)
            dimension_reduction = json.dumps(combinaison["dimension_reduction"], ensure_ascii=False)
            etape_1_2_str = preprocessing_str + "," + vectorization_str
            etape_1_2_3_str = etape_1_2_str + "," + dimension_reduction

            # Resume from the most advanced cached step. apply_etape_3 always
            # receives the step-1+2 key: it appends the reduction description
            # itself, so the stored key equals etape_1_2_3_str.
            if etape_1_2_3_str in self.etape_1_2_3_sauv :
                self.data = copy.deepcopy(self.etape_1_2_3_sauv[etape_1_2_3_str])
            elif etape_1_2_str in self.etape_1_2_sauv :
                self.data = copy.deepcopy(self.etape_1_2_sauv[etape_1_2_str])
                self.apply_etape_3(combinaison["dimension_reduction"], etape_1_2_str)
            elif preprocessing_str in self.etape_1_sauv :
                self.data = copy.deepcopy(self.etape_1_sauv[preprocessing_str])
                self.apply_etape_2(combinaison["vectorization"], etape_1_2_str)
                self.apply_etape_3(combinaison["dimension_reduction"], etape_1_2_str)
            else :
                # Nothing cached: only now is the dataset read from disk.
                self.data = Data(data=self.data_json, limit = self.limit_data)
                self.apply_etape_1(combinaison["preprocessing"], preprocessing_str)
                self.apply_etape_2(combinaison["vectorization"], etape_1_2_str)
                self.apply_etape_3(combinaison["dimension_reduction"], etape_1_2_str)

            # From here on self.data is the prepared DataFrame, not the Data wrapper.
            self.data = self.data.get_data()

            for classifieur_type, booleen_classifier in self.sequence_classifieur.items() :
                if not booleen_classifier :
                    continue
                classifieur = classifieur_type(self.data)
                classifieur_name = classifieur.__class__.__name__
                param_classifieur = self.hyper_classifieur[classifieur_name]
                best_param = classifieur.recherche_hyperparametres(param_classifieur)
                nom_final = etape_1_2_3_str + "," + classifieur_name + "," + json.dumps(param_classifieur, ensure_ascii=False)
                # The stored/printed value is the weighted F1 score returned by f1_score().
                self.final_res[nom_final] = classifieur.f1_score()
                self.best_params[nom_final] = best_param
                print("-"*40)
                print(nom_final)
                print("precision = " + str(self.final_res[nom_final]))
                print("-"*40)

    def mise_en_forme_final_export_json(self):
        """Placeholder for formatting and exporting the final results; not implemented."""
        pass

    def verif_json_name(self):
        """Placeholder for validating ``self.json_resultat``; currently does nothing.

        The original definition had no body, which made the whole class fail to
        parse. TODO: implement the intended check.
        """
        return None

    def apply_etape_1(self,param, nom):
        """Apply the preprocessing step described by ``param`` and cache a copy under ``nom``.

        ``param`` is ``None`` (step skipped, nothing cached) or a dict with
        "method" and "parameters" keys.
        """
        if param is None :
            return

        if param["method"] == "racinisation" :
            self.data.racinisation(ignore_stopwords = param["parameters"]["ignore_stopwords"])

        if param["method"] == "lemmatisation" :
            self.data.lemmatisation(mode = param["parameters"]["mode"], overwrite = param["parameters"]["overwrite"])

        self.etape_1_sauv[nom] = copy.deepcopy(self.data)

    def apply_etape_2(self,  param, nom):
        """Apply the vectorisation step described by ``param`` and cache a copy under ``nom``."""
        if param is None :
            return

        if param["method"] == "vectorisation_simple" :
            self.data.vectorisation_simple(ngram_range = tuple(param["parameters"]["ngram_range"]), max_features = param["parameters"]["max_features"])
        if param["method"] == "vectorisation_ponderee" :
            self.data.vectorisation_ponderee(ngram_range = tuple(param["parameters"]["ngram_range"]), max_features = param["parameters"]["max_features"], norm = param["parameters"]["norm"])

        # Cache an independent copy: the reduction step mutates self.data in
        # place and would otherwise alter the cached vectorised state.
        self.etape_1_2_sauv[nom] = copy.deepcopy(self.data)

    def apply_etape_3(self, param, nom):
        """Apply the reduction step described by ``param`` and cache a copy.

        ``nom`` is the key of the preprocessing+vectorisation state; the cache
        key is ``nom`` followed by "," and the JSON dump of ``param``.
        """
        if param is None :
            return

        if param["method"] == "reduction_svd" :
            self.data.reduction_svd(n_components = param["parameters"]["n_components"], n_iter = param["parameters"]["n_iter"])

        if param["method"] == "reduction_nmf" :
            self.data.reduction_nmf(n_components = param["parameters"]["n_components"], init = param["parameters"]["init"])

        self.etape_1_2_3_sauv[nom + "," + json.dumps(param, ensure_ascii=False)] = copy.deepcopy(self.data)

    def generate_data_hyperparam_combinations(self):
        """Return every preprocessing x vectorisation x reduction combination.

        Each item is a dict with keys "preprocessing", "vectorization" and
        "dimension_reduction"; a stage with no enabled method contributes ``None``.
        """
        etapes = ("preprocessing", "vectorization", "dimension_reduction")
        combos_par_etape = []
        for etape in etapes:
            steps = self.hyper_data.get(etape, {})
            # A stage whose methods are all disabled is dropped entirely
            if not any(self.sequence_data.get(step, False) for step in steps):
                steps = {}
            combos = self._generate_combos_data(steps, etape)
            combos_par_etape.append(combos if combos else [None])

        return [dict(zip(etapes, choix)) for choix in product(*combos_par_etape)]

    def _generate_combos_data(self, steps, step_type):
        """Generate all valid combinations for a specific step type."""
        step_combinations = []
        for step_name, params in steps.items():
            if self.sequence_data.get(step_name, False):
                for combo in self._generate_param_data_combinations(params):
                    step_combinations.append({"method": step_name, "parameters": combo})
        return step_combinations

    def _generate_param_data_combinations(self, params):
        """Generate all combinations of parameters for a given step."""
        if not params:
            return [{}]

        keys, values = zip(*params.items())
        return [dict(zip(keys, combination)) for combination in product(*values)]



In [47]:
# =============================================================================
#                            PARAMETER DEFINITIONS
# =============================================================================

# ----- Edit the sequence settings below -----

# -- Data preparation --
# If both methods of a stage are False (e.g. lemmatisation and racinisation for
# the preprocessing stage), that stage is always null.
# Warning: one vectorisation must be enabled; otherwise vectorisation_simple is
# switched on by default.
sequence_data = dict(
    lemmatisation=True,
    racinisation=False,
    vectorisation_simple=False,
    vectorisation_ponderee=True,
    reduction_svd=False,
    reduction_nmf=False,
)

# -- Classifiers (all enabled) --
sequence_classifieur = dict.fromkeys(
    (
        Multinomial,
        Bernoulli,
        Gaussian,
        RegressionLogistique,
        KPlusProchesVoisins,
        ArbreDeDecision,
        ForetAleatoire,
    ),
    True,
)

# ----- Data source -----
data_json = "train.jsonl"
limit_data = 50

json_resultat = "json_résultat"

# =============================================================================
#                             END OF DEFINITIONS
# =============================================================================

# =============================================================================
#                             MAIN SCRIPT LAUNCH
# =============================================================================

projet = Projet(sequence_data, sequence_classifieur, data_json, limit_data, json_resultat)

Combo data généré,  nombre : 48
Meilleurs hyperparamètres : {'alpha': 0.1, 'fit_prior': True}
----------------------------------------
{"method": "lemmatisation", "parameters": {"mode": "lookup", "overwrite": true}},{"method": "vectorisation_ponderee", "parameters": {"ngram_range": [1, 1], "max_features": null, "norm": "l2"}},null,Multinomial,{"alpha": [0.1, 0.5, 1.0, 2.0], "fit_prior": [true, false]}
precision = 0.450
----------------------------------------
Meilleurs hyperparamètres : {'alpha': 0.5, 'binarize': 0.0, 'fit_prior': True}
----------------------------------------
{"method": "lemmatisation", "parameters": {"mode": "lookup", "overwrite": true}},{"method": "vectorisation_ponderee", "parameters": {"ngram_range": [1, 1], "max_features": null, "norm": "l2"}},null,Bernoulli,{"alpha": [0.1, 0.5, 1.0, 2.0], "binarize": [0.0, 0.5, 1.0, 2.0], "fit_prior": [true, false]}
precision = 0.560
----------------------------------------
Meilleurs hyperparamètres : {'var_smoothing': 1e-09}
--

180 fits failed out of a total of 1440.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ANACONDA\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ANACONDA\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1216, in fit
    self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
                                                ^^^^^^^^^^^^^^^
  File "c:\ANACONDA\Lib\site-packages\sklearn\svm\_base.py", line 1223, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
                  ^^^^^^^^^^^^^^^^^^

Meilleurs hyperparamètres : {'C': 0.01, 'l1_ratio': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
----------------------------------------
{"method": "lemmatisation", "parameters": {"mode": "lookup", "overwrite": true}},{"method": "vectorisation_ponderee", "parameters": {"ngram_range": [1, 1], "max_features": null, "norm": "l2"}},null,RegressionLogistique,{"penalty": ["l2", null], "C": [0.01, 0.1, 1.0, 10.0], "solver": ["liblinear", "saga", "lbfgs", "newton-cg"], "max_iter": [100, 500, 1000], "l1_ratio": [0.1, 0.5, 0.9]}
precision = 0.450
----------------------------------------
Meilleurs hyperparamètres : {'algorithm': 'auto', 'leaf_size': 20, 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
----------------------------------------
{"method": "lemmatisation", "parameters": {"mode": "lookup", "overwrite": true}},{"method": "vectorisation_ponderee", "parameters": {"ngram_range": [1, 1], "max_features": null, "norm": "l2"}},null,KPlusProchesVoisins,{"n_neighbors": [3, 5, 7

KeyboardInterrupt: 

 À PARTIR D'ICI, CE SONT DES TESTS --------------------------------------------

TEST COMBO DATA

In [None]:
from itertools import product

class DataPipeline:
    """Enumerate the data-preparation configurations to evaluate.

    A configuration selects, for each stage (preprocessing, vectorization,
    dimension reduction), one enabled method together with one value for each
    of that method's hyperparameters. A stage with no enabled method is
    represented by ``None``.
    """

    def __init__(self):
        # On/off switch per method: a method whose flag is False (or missing)
        # is left out of the generated combinations.
        self.sequence_data = {
            "racinisation": True,
            "lemmatisation": True,
            "vectorisation_simple": True,
            "vectorisation_ponderee": True,
            "reduction_svd": True,
            "reduction_nmf": True
        }

        # Candidate values, organised as stage -> method -> hyperparameter.
        self.hyper_data = {
            "preprocessing": {
                "racinisation": {
                    "ignore_stopwords": [True, False]
                },
                "lemmatisation": {
                    "mode": ["lookup", "rule"],
                    "overwrite": [True, False]
                }
            },
            "vectorization": {
                "vectorisation_simple": {
                    "ngram_range": [[1, 1], [1, 2]],
                    "max_features": [None, 5000]
                },
                "vectorisation_ponderee": {
                    "ngram_range": [[1, 1], [1, 2], [2, 2]],
                    "max_features": [None, 5000],
                    "norm": ["l2", "l1"]
                }
            },
            "dimension_reduction": {
                "reduction_svd": {
                    "n_components": [50, 100, 200],
                    "n_iter": [5, 10, 15]
                },
                "reduction_nmf": {
                    "n_components": [50, 100, 200],
                    # Presumably forwarded to scikit-learn's NMF(init=...), so
                    # only valid initialisation strategies are listed here
                    # ("use_idf" is a TfidfVectorizer flag, not an NMF init).
                    "init": ["random", "nndsvd", "nndsvda"]
                }
            }
        }

    def generate_data_hyperparam_combinations(self):
        """Return the Cartesian product of the per-stage choices.

        Returns:
            list[dict]: one dict per configuration with the keys
            ``"preprocessing"``, ``"vectorization"`` and
            ``"dimension_reduction"``. Each value is either a
            ``{"method": ..., "parameters": {...}}`` dict or ``None`` when
            the stage has no enabled method.
        """
        stage_names = ("preprocessing", "vectorization", "dimension_reduction")

        per_stage_choices = []
        for stage in stage_names:
            choices = self._generate_combos_data(self.hyper_data.get(stage, {}), stage)
            # A stage without any enabled method still contributes exactly one
            # "skip it" choice so that the product below is never empty.
            per_stage_choices.append(choices or [None])

        # product() iterates the last stage fastest, i.e. the same order as
        # nested loops over preprocessing -> vectorization -> reduction.
        return [dict(zip(stage_names, choice)) for choice in product(*per_stage_choices)]

    def _generate_combos_data(self, steps, step_type):
        """List every (method, parameters) choice of one stage.

        Args:
            steps (dict): method name -> {hyperparameter: candidate values}.
            step_type (str): name of the stage; currently unused, kept so
                existing calls remain valid.

        Returns:
            list[dict]: ``{"method": name, "parameters": {...}}`` entries for
            the methods enabled in ``self.sequence_data``.
        """
        return [
            {"method": step_name, "parameters": combo}
            for step_name, params in steps.items()
            if self.sequence_data.get(step_name, False)
            for combo in self._generate_param_data_combinations(params)
        ]

    def _generate_param_data_combinations(self, params):
        """Expand ``{name: [values]}`` into one dict per value combination.

        An empty ``params`` yields a single empty dict, so a method without
        hyperparameters is still enumerated once.
        """
        if not params:
            return [{}]

        names = list(params)
        return [dict(zip(names, values)) for values in product(*params.values())]

# Example usage: enumerate every enabled data-preparation configuration.
pipeline = DataPipeline()
result = pipeline.generate_data_hyperparam_combinations()

# The grid can hold well over a thousand entries, so report its size and
# pretty-print only the first two instead of flooding the cell output.
# (`json` is already imported in the notebook's first cell.)
print(f"{len(result)} combinations generated")
print(json.dumps(result[:2], indent=4))


[
    {
        "preprocessing": {
            "method": "lemmatisation",
            "parameters": {
                "mode": "lookup",
                "overwrite": true
            }
        },
        "vectorization": {
            "method": "comptages_simples",
            "parameters": {
                "ngram_range": [
                    1,
                    1
                ],
                "max_features": null
            }
        },
        "dimension_reduction": {
            "method": "reduction_svd",
            "parameters": {
                "n_components": 50,
                "n_iter": 5
            }
        }
    },
    {
        "preprocessing": {
            "method": "lemmatisation",
            "parameters": {
                "mode": "lookup",
                "overwrite": true
            }
        },
        "vectorization": {
            "method": "comptages_simples",
            "parameters": {
                "ngram_range": [
                    1,
      

Fonction naïve (recherche exhaustive) qui teste tous les classifieurs, prétraitements, vectorisations et réductions afin d'obtenir le meilleur taux de réussite.

In [None]:
def _preparer_donnees(pretraitement, parametres, reduction, limit=100, n_components=100):
    """Build a freshly prepared dataset for one configuration.

    A new ``Data`` object is created on every call: the preparation methods
    are invoked for their effect on the object (their return value is not
    used), so reusing one instance would stack each configuration on top of
    the previous one.

    Args:
        pretraitement (str): "lemmatisation", "racinisation" or "rien"
            (no text preprocessing).
        parametres (tuple): positional arguments for the preprocessing method.
        reduction (str): "nmf" selects ``reduction_nmf``; any other value
            selects ``reduction_svd``.
        limit (int): number of rows loaded by ``Data``.
        n_components (int): target dimension of the reduction.

    Returns:
        Whatever ``Data.get_data()`` returns after the three stages ran.
    """
    data = Data(limit=limit)
    if pretraitement == "lemmatisation":
        data.lemmatisation(*parametres)
    elif pretraitement == "racinisation":
        data.racinisation(*parametres)
    # "rien": baseline, the raw text goes straight to the vectoriser.

    data.vectorisation_ponderee()

    if reduction == "nmf":
        data.reduction_nmf(n_components=n_components)
    else:
        data.reduction_svd(n_components=n_components)
    return data.get_data()


def possibilite():
    """Brute-force every preprocessing / reduction / classifier combination.

    For each combination the classifier's hyperparameters are first tuned
    with ``recherche_hyperparametres`` (grids are read from
    "hyperpamètres_classifieur.jsonl"), then a classifier built with the best
    values is scored. Each result is printed as soon as it is available and a
    ranking sorted by decreasing success rate is printed at the end.

    Result tuples are laid out as
    ``(preprocessing, param 1, param 2, vectorisation, reduction,
    classifier name, success rate, f1 score)``; unused parameter slots hold
    ``None``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open("hyperpamètres_classifieur.jsonl", "r", encoding="utf-8") as file:
        hyper_classifieur = json.load(file)

    # Preprocessing method -> positional-argument tuples to try.
    pretraitements = [
        ("lemmatisation", list(product(["lookup", "rule"], [True, False]))),
        ("racinisation", [(True,), (False,)]),
        ("rien", [()]),
    ]
    # Classifier class -> key of its grid in the hyperparameter file.
    classifieurs = [(ForetAleatoire, "RandomForestClassifier")]

    results = []
    for pretraitement, jeux_parametres in pretraitements:
        for parametres in jeux_parametres:
            # Pad to two slots so every result tuple has the same layout.
            param_1, param_2 = (parametres + (None, None))[:2]
            for reduction in ["nmf", "svd"]:
                donnees = _preparer_donnees(pretraitement, parametres, reduction)

                for classe, cle_hyper in classifieurs:
                    param_opti = classe(donnees).recherche_hyperparametres(
                        hyper_classifieur[cle_hyper]
                    )
                    classifieur = classe(
                        donnees,
                        max_depth=param_opti["max_depth"],
                        min_samples_leaf=param_opti["min_samples_leaf"],
                        min_samples_split=param_opti["min_samples_split"],
                        n_estimators=param_opti["n_estimators"],
                    )
                    # Same sequence as the notebook's regular usage cell:
                    # train explicitly before asking for scores.
                    # NOTE(review): assumes training twice is harmless if the
                    # scoring methods already train internally; confirm.
                    classifieur.entrainement()
                    taux_reussite = classifieur.taux_reussite()
                    score_f1 = classifieur.f1_score()

                    results.append((
                        pretraitement, param_1, param_2,
                        "vectorisation_ponderee", reduction,
                        classe.__name__, taux_reussite, score_f1,
                    ))
                    print(results[-1])

    # Rank by success rate (index 6), best first.
    results_sorted = sorted(results, key=lambda resultat: resultat[6], reverse=True)

    for res in results_sorted:
        print(f"{res[0]} - {res[1]} - {res[2]} - {res[3]} - {res[4]} - {res[5]} : {res[6]}")

possibilite()

Exemple d'utilisation en temps normal pour l'optimisation

In [None]:
# Regular workflow: prepare the corpus step by step, then train a random
# forest on the prepared data and print its F1 score.
preparation = Data()
preparation.lemmatisation()
preparation.vectorisation_ponderee()
preparation.reduction_nmf()

# Keep the prepared data under `data`; the Data object keeps its own name.
data = preparation.get_data()

foret = ForetAleatoire(data)
foret.entrainement()
print(foret.f1_score())

0.643


Optimisation de la forêt

In [None]:
# Search for the best random-forest hyperparameters over the grid below.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=['sqrt', 'log2', None],
)

# Small sample (50 rows) to keep the grid search fast; the "etape N" prints
# trace progress through the preparation steps.
preparation = Data(limit=50)
print("etape 1")
preparation.lemmatisation()
print("etape 2")
preparation.vectorisation_ponderee()
print("etape 3")
preparation.reduction_nmf()
print("etape 4")
data = preparation.get_data()
print("etape 5")

foret = ForetAleatoire(data)
print("etape 6")
print(param_grid)
print(data)
# Run the search over the grid; its return value is not used here.
foret.recherche_hyperparametres(param_grid)
print("etape 7")
foret.f1_score()
