Import des bibliothèques nécessaires

In [None]:
# Importations standard
import json
import string
import re

# Importations de bibliothèques tierces
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product

# Importations des modules sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF

# Importations nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

# Make sure every NLTK resource listed below is installed; download only the ones
# that nltk.data.find() cannot locate.
nltk_resources = ['stopwords', 'punkt', 'wordnet']
for resource in nltk_resources:
    # 'punkt' is looked up under tokenizers/, every other entry under corpora/.
    if resource == 'punkt':
        resource_path = f'tokenizers/{resource}'
    else:
        resource_path = f'corpora/{resource}'
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Brasi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Classe Data regroupant tous les pré-traitements, la vectorisation et la réduction de la matrice.

In [None]:
class Data:
    """Load the labelled ads and prepare them for classification.

    The working frame ``self.data`` starts with two columns: the text column and
    the label column ``cal_réponse_signalement`` (mapped to 0 / 1). The
    vectorisation and reduction methods replace every column except the last
    one, so the label always stays the LAST column of ``self.data``.

    Attributes:
        data: current working ``pandas.DataFrame`` (rewritten by each step).
        data_brut: untouched copy of the frame as loaded; ``process_and_export``
            restarts from it for every combination.
        text_column: name of the column holding the raw text.
        language: language name given to the NLTK stop-word list and stemmer.
    """

    def __init__(self, data = "train.jsonl"  , limit=5000, language="french", text_column="texte_annonce"):
        """Read a JSON-lines file and keep the first ``limit`` rows.

        Args:
            data: path (or buffer) of the JSON-lines file passed to ``pd.read_json``.
            limit: maximum number of rows kept.
            language: language used for stop words and stemming.
            text_column: name of the text column in the file.
        """
        frame = pd.read_json(data, lines=True)[[text_column, "cal_réponse_signalement"]].iloc[:limit]
        # Explicit copy: the label assignment below must not target a slice of the full frame.
        frame = frame.copy()
        # Labels outside this mapping become NaN (behaviour of Series.map).
        frame["cal_réponse_signalement"] = frame["cal_réponse_signalement"].map({"Pris en compte": 0,"Rejete (hors specs)": 1})
        self.data = frame
        # Pristine copy used to reset the pipeline between combinations.
        self.data_brut = frame.copy()
        self.text_column = text_column
        self.language = language

    def get_data(self):
        """Return the current working DataFrame."""
        return self.data

    def _remplacer_features(self, valeurs, colonnes=None):
        """Replace all feature columns by ``valeurs`` and keep the label as last column."""
        # Reuse the current index so the label column stays aligned row by row.
        features = pd.DataFrame(valeurs, columns=colonnes, index=self.data.index)
        self.data = pd.concat([features, self.data.iloc[:, -1]], axis=1)

    def supprimer_stopwords(self):
        """Strip punctuation and digits, lowercase, tokenise and drop stop words in the text column."""
        # A set gives constant-time membership tests inside the per-token filter.
        stop_words = set(stopwords.words(self.language))

        def nettoyer_texte(texte):
            texte = re.sub(r'[^\w\s]', '', texte)  # remove punctuation
            texte = re.sub(r'\d+', '', texte)      # remove digits
            tokens = nltk.word_tokenize(texte.lower())
            return ' '.join(word for word in tokens if word not in stop_words)

        self.data[self.text_column] = self.data[self.text_column].apply(nettoyer_texte)

    def rien(self, x=None):
        """No-op pre-processing step; ``x`` is accepted and ignored."""
        pass

    def racinisation(self, ignore_stopwords=None):
        """Stem every whitespace-separated word, then remove stop words.

        Args:
            ignore_stopwords: when truthy, the Snowball stemmer leaves stop
                words unstemmed; ``None`` behaves like ``False``.
        """
        stemmer = SnowballStemmer(self.language, ignore_stopwords=bool(ignore_stopwords))
        self.data[self.text_column] = self.data[self.text_column].apply(
            lambda x: ' '.join(stemmer.stem(word) for word in str(x).split())
        )
        self.supprimer_stopwords()

    def lemmatisation(self, mode=None, overwrite=None):
        """Lemmatise every whitespace-separated word, then remove stop words.

        Args:
            mode: stored on the lemmatizer as ``mode`` when truthy.
            overwrite: stored on the lemmatizer as ``overwrite`` when truthy.
        """
        lemmatizer = WordNetLemmatizer()
        # NOTE(review): these two attributes are only stored on the object here;
        # WordNetLemmatizer.lemmatize() presumably does not read them - confirm.
        # NOTE(review): WordNet is presumably English-only while the default
        # language of this class is French - confirm this step is useful.
        if mode:
            lemmatizer.mode = mode
        if overwrite:
            lemmatizer.overwrite = overwrite
        self.data[self.text_column] = self.data[self.text_column].apply(
            lambda x: ' '.join(lemmatizer.lemmatize(word) for word in str(x).split())
        )
        self.supprimer_stopwords()

    @staticmethod
    def _options_vectoriseur(ngram_range=None, max_features=None, norm=None):
        """Build the keyword arguments of a vectorizer, skipping unset options."""
        options = {}
        if ngram_range:
            # Grids loaded from JSON give lists; scikit-learn expects a tuple.
            options["ngram_range"] = tuple(ngram_range)
        if max_features:
            options["max_features"] = max_features
        if norm:
            options["norm"] = norm
        return options

    def vectorisation_simple(self, ngram_range=None, max_features=None):
        """Replace the text column by raw term counts (``CountVectorizer``)."""
        vectorizer = CountVectorizer(**self._options_vectoriseur(ngram_range, max_features))
        vect_data = vectorizer.fit_transform(self.data[self.text_column])
        self._remplacer_features(vect_data.toarray(), vectorizer.get_feature_names_out())

    def vectorisation_ponderee(self, ngram_range=None, max_features=None, norm=None):
        """Replace the text column by TF-IDF weights (``TfidfVectorizer``)."""
        vectorizer = TfidfVectorizer(**self._options_vectoriseur(ngram_range, max_features, norm))
        vect_data = vectorizer.fit_transform(self.data[self.text_column])
        self._remplacer_features(vect_data.toarray(), vectorizer.get_feature_names_out())

    def reduction_svd(self, n_components=None, n_iter=None):
        """Reduce the feature columns with ``TruncatedSVD``.

        Unset (falsy) arguments fall back to the scikit-learn defaults.
        """
        options = {}
        if n_components:
            options["n_components"] = n_components
        if n_iter:
            options["n_iter"] = n_iter
        # Single construction: the previous code rebuilt the estimator and lost n_iter.
        svd = TruncatedSVD(**options)
        self._remplacer_features(svd.fit_transform(self.data.iloc[:, :-1]))

    def reduction_nmf(self, n_components=50, init=None):
        """Reduce the feature columns with ``NMF`` (``init`` is only applied when truthy)."""
        options = {"n_components": n_components}
        if init:
            options["init"] = init
        nmf = NMF(**options)
        self._remplacer_features(nmf.fit_transform(self.data.iloc[:, :-1]))

    def process_and_export(self, n_components=100):
        """Run every pre-processing x vectorisation x reduction combination and export each result.

        Each combination restarts from ``self.data_brut`` and is written to
        ``output_combination_<k>.jsonl`` in the working directory. After the
        call, ``self.data`` holds the result of the last combination.

        Args:
            n_components: number of components given to the reduction step.
        """
        preprocessing_methods = [self.rien, self.racinisation, self.lemmatisation]
        vectorisation_methods = [self.vectorisation_simple, self.vectorisation_ponderee]
        reduction_methods = [self.reduction_svd, self.reduction_nmf]

        combinations = product(preprocessing_methods, vectorisation_methods, reduction_methods)

        for i, (preprocess, vectorize, reduire) in enumerate(combinations):
            # Restart from the raw data: every step rewrites self.data.
            self.data = self.data_brut.copy()

            # The steps read self.text_column themselves; their parameters are
            # hyperparameters, so they are called with their defaults here.
            preprocess()
            vectorize()
            reduire(n_components)

            output_file = f"output_combination_{i+1}.jsonl"
            self.data.to_json(output_file, orient="records", lines=True)
            print(f"Exporté : {output_file}")


Classe globale des classifieurs, servant de classe de base pour l'héritage

In [None]:
class Classifieur:
    """Base class shared by all classifiers: data split, metrics and grid search.

    Subclasses assign a scikit-learn estimator to ``self.classifier`` and
    implement ``entrainement``. The input frame must hold the features in all
    columns but the last, and the label in the last column.
    """

    def __init__(self, data):
        """Store ``data`` and split it into train and test sets."""
        self.chargement(data)
        self.decoupage()

    def chargement(self, data):
        """Split ``data`` into features ``X`` (all columns but the last) and label ``y`` (last column)."""
        self.data = data
        self.X = data.iloc[:, :-1]
        self.y = data.iloc[:, -1]

    def decoupage(self):
        """Create the train/test split with a fixed seed."""
        # NOTE(review): test_size=0.6 keeps only 40% of the rows for training - confirm this is intended.
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(self.X, self.y, test_size=0.6, random_state=0)

    def entrainement(self):
        """Fit the model; implemented by subclasses."""
        pass

    def rien(self):
        """No-op placeholder."""
        pass

    def taux_reussite(self):
        """Return the accuracy on the test set, formatted with three decimals (as a string)."""
        y_pred = self.classifier.predict(self.X_test)
        accuracy = accuracy_score(self.Y_test, y_pred)
        return f"{accuracy:.3f}"

    def f1_score(self):
        """Return the weighted F1-score on the test set, formatted with three decimals (as a string)."""
        from sklearn.metrics import f1_score
        y_pred = self.classifier.predict(self.X_test)
        f1 = f1_score(self.Y_test, y_pred, average='weighted')
        return f"{f1:.3f}"

    def recherche_hyperparametres(self,param_grid):
        """Grid-search ``param_grid`` with 5-fold cross-validation on the training set.

        ``self.classifier`` is replaced by the best estimator found, so the
        metric methods can be called right after the search.

        Args:
            param_grid: dict mapping parameter names to lists of candidate values.

        Returns:
            The dict of best hyperparameters (``GridSearchCV.best_params_``).
        """
        grid_search = GridSearchCV(
            self.classifier,
            param_grid,
            cv=5,
            scoring='f1',
        )
        grid_search.fit(self.X_train, self.Y_train)
        # Keep the best estimator found by the search.
        self.classifier = grid_search.best_estimator_
        print("Meilleurs hyperparamètres :", grid_search.best_params_)
        # Returned so callers can reuse the values instead of parsing the printed line.
        return grid_search.best_params_


In [None]:
from sklearn.naive_bayes import MultinomialNB


class Multinomial(Classifieur):
    """Multinomial naive Bayes classifier plugged into the Classifieur workflow."""

    def __init__(self, data, alpha = None, fit_prior = None):
        """Split ``data`` and build the estimator.

        Args:
            data: frame with the features first and the label in the last column.
            alpha: forwarded to ``MultinomialNB`` when not ``None``.
            fit_prior: forwarded to ``MultinomialNB`` when not ``None``.
        """
        super().__init__(data)
        self.classifier = MultinomialNB()
        # Test against None rather than truthiness so fit_prior=False is honoured.
        overrides = {"alpha": alpha, "fit_prior": fit_prior}
        self.classifier.set_params(**{name: value for name, value in overrides.items() if value is not None})

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)


In [None]:
from sklearn.naive_bayes import BernoulliNB


class Bernoulli(Classifieur):
    """Bernoulli naive Bayes classifier plugged into the Classifieur workflow."""

    def __init__(self, data, alpha = None, fit_prior = None, binarize = None):
        """Split ``data`` and build the estimator.

        Args:
            data: frame with the features first and the label in the last column.
            alpha: forwarded to ``BernoulliNB`` when not ``None``.
            fit_prior: forwarded to ``BernoulliNB`` when not ``None``.
            binarize: forwarded to ``BernoulliNB`` when not ``None``.
        """
        super().__init__(data)

        self.classifier = BernoulliNB()
        # Test against None rather than truthiness so fit_prior=False and
        # binarize=0.0 are honoured.
        overrides = {"alpha": alpha, "fit_prior": fit_prior, "binarize": binarize}
        self.classifier.set_params(**{name: value for name, value in overrides.items() if value is not None})

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)

In [None]:
from sklearn.naive_bayes import GaussianNB


class Gaussian(Classifieur):
    """Gaussian naive Bayes classifier plugged into the Classifieur workflow."""

    def __init__(self, data, var_smoothing = None):
        """Split ``data`` and build the estimator.

        Args:
            data: frame with the features first and the label in the last column.
            var_smoothing: forwarded to ``GaussianNB`` when not ``None``.
        """
        super().__init__(data)
        self.classifier = GaussianNB()
        # Test against None rather than truthiness so an explicit 0.0 is not
        # silently replaced by the estimator default.
        if var_smoothing is not None:
            self.classifier.set_params(var_smoothing=var_smoothing)

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

class RegressionLogistique(Classifieur):
    """Logistic regression classifier plugged into the Classifieur workflow."""

    def __init__(self, data, penalty = None, C = None, solver = None, max_iter = None, l1_ratio = None, random_state = None):
        """Split ``data`` and build the estimator.

        Every hyperparameter is forwarded to ``LogisticRegression`` only when it
        is not ``None``; otherwise the estimator default is kept.
        """
        super().__init__(data)
        self.classifier = LogisticRegression()
        overrides = {
            "penalty": penalty,
            "C": C,
            "solver": solver,
            "max_iter": max_iter,
            # Previously read but never assigned, so l1_ratio had no effect.
            "l1_ratio": l1_ratio,
            "random_state": random_state,
        }
        # Test against None rather than truthiness so random_state=0 is honoured.
        self.classifier.set_params(**{name: value for name, value in overrides.items() if value is not None})

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)


In [None]:
from sklearn.neighbors import KNeighborsClassifier

class KPlusProchesVoisins(Classifieur):
    """k-nearest-neighbours classifier plugged into the Classifieur workflow."""

    def __init__(self, data, n_neighbors = 3, weights = None, algorithm = None, p = None):
        """Split ``data`` and build the estimator; only truthy hyperparameters are applied."""
        super().__init__(data)

        self.classifier = KNeighborsClassifier()
        requested = (
            ("n_neighbors", n_neighbors),
            ("weights", weights),
            ("algorithm", algorithm),
            ("p", p),
        )
        for attribute, value in requested:
            if value:
                setattr(self.classifier, attribute, value)

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)


In [None]:
from sklearn.tree import DecisionTreeClassifier

class ArbreDeDecision(Classifieur):
    """Decision tree classifier plugged into the Classifieur workflow."""

    def __init__(self, data, criterion = None, max_depth = None, min_samples_split = None, min_samples_leaf = None, max_features = None, ccp_alpha = None, random_state = None):
        """Split ``data`` and build the estimator.

        Every hyperparameter is forwarded to ``DecisionTreeClassifier`` only
        when it is not ``None``; otherwise the estimator default is kept.
        """
        super().__init__(data)
        self.classifier = DecisionTreeClassifier()
        overrides = {
            "criterion": criterion,
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "max_features": max_features,
            "ccp_alpha": ccp_alpha,
            "random_state": random_state,
        }
        # Test against None rather than truthiness: random_state=0 is a valid
        # seed and was previously ignored.
        self.classifier.set_params(**{name: value for name, value in overrides.items() if value is not None})

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)


In [None]:
from sklearn.ensemble import RandomForestClassifier

class ForetAleatoire(Classifieur):
    """Random forest classifier plugged into the Classifieur workflow."""

    def __init__(self, data, n_estimators = 100, criterion = None, max_depth = None, min_samples_split = None, min_samples_leaf = None, max_features = None, bootstrap = None, ccp_alpha = None, random_state = 0 ):
        """Split ``data`` and build the estimator.

        Every hyperparameter is forwarded to ``RandomForestClassifier`` only
        when it is not ``None``; otherwise the estimator default is kept.
        Because ``None`` means "not provided" here, ``max_features=None``
        cannot be used to request a non-default value of ``None``.
        """
        super().__init__(data)
        self.classifier = RandomForestClassifier()
        overrides = {
            "n_estimators": n_estimators,
            "criterion": criterion,
            "max_depth": max_depth,
            "min_samples_split": min_samples_split,
            "min_samples_leaf": min_samples_leaf,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "ccp_alpha": ccp_alpha,
            "random_state": random_state,
        }
        # Test against None rather than truthiness: the default random_state=0
        # and bootstrap=False are falsy and were previously never applied.
        self.classifier.set_params(**{name: value for name, value in overrides.items() if value is not None})

    def entrainement(self):
        """Fit the model on the training split."""
        self.classifier.fit(self.X_train, self.Y_train)


In [35]:
class Projet:
    """Project configuration: loads the hyperparameter grids and enumerates their combinations.

    Two JSON files are read from the working directory at construction time:
    one with the classifier grids, one with the data-processing grids. The
    ``sequence_*`` dictionaries switch individual steps and classifiers on or off.
    """

    def __init__(self, ):
        """Load both grids, set the switches and pre-compute every combination."""
        # JSON is read as UTF-8 explicitly so the result does not depend on the platform default.
        with open("hyperpamètres_classifieur.jsonl","r", encoding="utf-8") as file_1:
            self.hyper_classifieur = json.load(file_1)
        with open("hyperparametres_traitement.jsonl", "r", encoding="utf-8") as file_2:
            self.hyper_data = json.load(file_2)

        ###### EDIT THE SWITCHES BELOW TO CHANGE THE SEQUENCES ######

        ## DATA STEPS ##
        # When every method of a category is False (e.g. lemmatisation and
        # racinisation for the preprocessing category), that category
        # contributes a single empty configuration.
        self.sequence_data = {
            "lemmatisation": True,
            "racinisation": True,
            "comptages_ponderes": True,
            "decomposition_tronquee": True,
            "reduction_svd": True,
            "reduction_nmf": True
        }

        ## CLASSIFIERS ##
        self.sequence_classifieur = {
            "MultinomialNB" : True,
            "GaussianNB" : True,
            "BernoulliNB" : True,
            "LogisticRegression" : True,
            "KNeighborsClassifier" : True,
            "DecisionTreeClassifier" : True,
            "RandomForestClassifier" : True
        }

        ###### DATA SOURCE ######
        self.data = "train.jsonl"
        self.limit_data = 100

        ### ALL DATA-PROCESSING COMBINATIONS ###
        self.combo_data = self.generate_data_hyperparam_combinations()
        self.nb_combo_data = len(self.combo_data)
        print("Combo data généré,  nombre : " + str(self.nb_combo_data))

        ### ALL CLASSIFIER COMBINATIONS ###
        self.combo_classifieur = self.generate_classifier_hyperparam_combinations()
        self.nb_combo_classifieur = len(self.combo_classifieur)
        print("Combo classifieur généré, nombre : " + str(self.nb_combo_classifieur))

    def generate_data_hyperparam_combinations(self):
        """Return the cartesian product of the preprocessing, vectorization and reduction configurations.

        Returns:
            A list of dicts with the keys "Preprocessing", "Vectorization" and
            "Dimension Reduction", each holding one parameter dict.
        """
        preprocessing_combos = self._generate_combos_data("preprocessing", single_choice=True)
        vectorization_combos = self._generate_combos_data("vectorization", single_choice=True)
        reduction_combos = self._generate_combos_data("dimension_reduction", single_choice=True)

        return [
            {
                "Preprocessing": preprocessing,
                "Vectorization": vectorization,
                "Dimension Reduction": reduction
            }
            for preprocessing, vectorization, reduction in product(
                preprocessing_combos, vectorization_combos, reduction_combos
            )
        ]

    def _generate_combos_data(self, category, single_choice):
        """Return the parameter dicts of every enabled step of ``category``.

        Args:
            category: "preprocessing", "vectorization" or "dimension_reduction".
            single_choice: when False, a single empty configuration is returned.

        Returns:
            A list of parameter dicts; ``[{}]`` when no step of the category is enabled.
        """
        category_data = self.hyper_data.get(category, {})
        active_steps = [step for step in category_data if self.sequence_data.get(step, False)]

        if not active_steps:
            return [{}]  # nothing enabled in this category

        combos = []
        for step in active_steps:
            hyper_params = category_data.get(step, {})
            combos.extend(self._generate_param_data_combinations(hyper_params, prefix=step))

        # Every caller in this class passes single_choice=True.
        return combos if single_choice else [{}]

    def _generate_param_data_combinations(self, hyper_params, prefix=""):
        """Return every combination of ``hyper_params`` values as a list of dicts.

        ``prefix`` is accepted for the callers' convenience and is not used.
        """
        if not hyper_params:
            return [{}]

        keys, values = zip(*hyper_params.items())
        return [dict(zip(keys, combination)) for combination in product(*values)]

    def generate_classifier_hyperparam_combinations(self):
        """Return one ``{classifier name: [parameter dicts]}`` entry per enabled classifier."""
        all_combinations = []

        for classifier, is_active in self.sequence_classifieur.items():
            if is_active:
                classifier_params = self.hyper_classifieur.get(classifier, {})
                combinations = self._generate_classifier_param_combinations(classifier_params)
                # The key is the classifier name itself.
                all_combinations.append({classifier: combinations})

        return all_combinations

    def _generate_classifier_param_combinations(self, params):
        """Return every combination of ``params`` values as a list of dicts (``[{}]`` when empty)."""
        if not params:
            return [{}]

        keys, values = zip(*params.items())
        return [dict(zip(keys, combination)) for combination in product(*values)]

# Build the project configuration; the constructor reads both hyperparameter
# files and prints how many data and classifier combinations were generated.
projet = Projet()

Combo data généré,  nombre : 1296
Combo classifieur généré, nombre : 7


TEST COMBO DATA

In [24]:
from itertools import product

class DataPipeline:
    """Self-contained check of the data-processing combination generator (grids are hard-coded)."""

    def __init__(self):
        """Define which steps are enabled and the hyperparameter grid of each step."""
        self.sequence_data = {
            "racinisation": True,
            "lemmatisation": True,
            "comptages_simples": True,
            "comptages_ponderes": True,
            "reduction_svd": True,
            "reduction_nmf": True,
        }

        self.hyper_data = {
            "preprocessing": {
                "racinisation": {"ignore_stopwords": [True, False]},
                "lemmatisation": {"mode": ["lookup", "rule"], "overwrite": [True, False]},
            },
            "vectorization": {
                "comptages_simples": {
                    "ngram_range": [[1, 1], [1, 2]],
                    "max_features": [None, 5000],
                },
                "comptages_ponderes": {
                    "ngram_range": [[1, 1], [1, 2], [2, 2]],
                    "max_features": [None, 5000],
                    "norm": ["l2", "l1"],
                },
            },
            "dimension_reduction": {
                "reduction_svd": {"n_components": [50, 100, 200], "n_iter": [5, 10, 15]},
                "reduction_nmf": {
                    "n_components": [50, 100, 200],
                    "init": ["random", "nndsvd", "use_idf"],
                },
            },
        }

    def generate_data_hyperparam_combinations(self):
        """Return every (preprocessing, vectorization, dimension_reduction) choice.

        A stage without any enabled step contributes the single choice ``None``.
        """
        stage_names = ("preprocessing", "vectorization", "dimension_reduction")
        choices_per_stage = []
        for stage in stage_names:
            candidates = self._generate_combos_data(self.hyper_data.get(stage, {}), stage)
            # An empty list means no step of this stage is enabled.
            choices_per_stage.append(candidates if candidates else [None])

        # product() iterates with the first stage outermost, the last innermost.
        return [dict(zip(stage_names, choice)) for choice in product(*choices_per_stage)]

    def _generate_combos_data(self, steps, step_type):
        """Return ``{"method", "parameters"}`` entries for every enabled step in ``steps``."""
        return [
            {"method": step_name, "parameters": combo}
            for step_name, params in steps.items()
            if self.sequence_data.get(step_name, False)
            for combo in self._generate_param_data_combinations(params)
        ]

    def _generate_param_data_combinations(self, params):
        """Return every combination of ``params`` values as a list of dicts (``[{}]`` when empty)."""
        if not params:
            return [{}]

        names = list(params.keys())
        value_lists = [params[name] for name in names]
        return [dict(zip(names, picked)) for picked in product(*value_lists)]

# Example usage: build the pipeline and enumerate every combination of enabled steps.
pipeline = DataPipeline()
result = pipeline.generate_data_hyperparam_combinations()

# Output the whole result as indented JSON (one entry per combination).
import json
print(json.dumps(result, indent=4))


[
    {
        "preprocessing": {
            "method": "lemmatisation",
            "parameters": {
                "mode": "lookup",
                "overwrite": true
            }
        },
        "vectorization": {
            "method": "comptages_simples",
            "parameters": {
                "ngram_range": [
                    1,
                    1
                ],
                "max_features": null
            }
        },
        "dimension_reduction": {
            "method": "reduction_svd",
            "parameters": {
                "n_components": 50,
                "n_iter": 5
            }
        }
    },
    {
        "preprocessing": {
            "method": "lemmatisation",
            "parameters": {
                "mode": "lookup",
                "overwrite": true
            }
        },
        "vectorization": {
            "method": "comptages_simples",
            "parameters": {
                "ngram_range": [
                    1,
      

TEST COMBO CLASSIFIER

In [33]:
from itertools import product

class ClassifierPipeline:
    """Self-contained check of the classifier combination generator (grids are hard-coded)."""

    def __init__(self):
        """Define which classifiers are enabled and the hyperparameter grid of each one."""
        self.sequence_classifieur = {
            "MultinomialNB": True,
            "GaussianNB": True,
            "BernoulliNB": True,
            "LogisticRegression": True,
            "KNeighborsClassifier": False,
            "DecisionTreeClassifier": True,
            "RandomForestClassifier": True,
        }

        self.hyper_classifieur = {
            "MultinomialNB": {
                "alpha": [0.1, 0.5, 1.0, 2.0],
                "fit_prior": [True, False],
            },
            "GaussianNB": {
                "var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6],
            },
            "BernoulliNB": {
                "alpha": [0.1, 0.5, 1.0, 2.0],
                "binarize": [0.0, 0.5, 1.0, 2.0],
                "fit_prior": [True, False],
            },
            "LogisticRegression": {
                "penalty": ["l1", "l2", "elasticnet", "none"],
                "C": [0.01, 0.1, 1.0, 10.0],
                "solver": ["liblinear", "saga", "lbfgs", "newton-cg"],
                "max_iter": [100, 500, 1000],
                "l1_ratio": [0.1, 0.5, 0.9],
            },
            "KNeighborsClassifier": {
                "n_neighbors": [3, 5, 7, 10],
                "weights": ["uniform", "distance"],
                "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
                "leaf_size": [20, 30, 50],
                "p": [1, 2],
            },
            "DecisionTreeClassifier": {
                "criterion": ["gini", "entropy"],
                "max_depth": [None, 10, 20, 30],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 5],
                "max_features": [None, "sqrt", "log2"],
                "ccp_alpha": [0.0, 0.01, 0.05],
            },
            "RandomForestClassifier": {
                "n_estimators": [100, 200, 500],
                "criterion": ["gini", "entropy"],
                "max_depth": [None, 10, 20, 30],
                "min_samples_split": [2, 5, 10],
                "min_samples_leaf": [1, 2, 5],
                "max_features": ["sqrt", "log2", None],
                "bootstrap": [True, False],
                "ccp_alpha": [0.0, 0.01, 0.05],
            },
        }

    def generate_classifier_hyperparam_combinations(self):
        """Return one ``{classifier name: [parameter dicts]}`` entry per enabled classifier."""
        enabled = [name for name, is_active in self.sequence_classifieur.items() if is_active]
        return [
            {name: self._generate_classifier_param_combinations(self.hyper_classifieur.get(name, {}))}
            for name in enabled
        ]

    def _generate_classifier_param_combinations(self, params):
        """Return every combination of ``params`` values as a list of dicts (``[{}]`` when empty)."""
        if not params:
            return [{}]

        names = list(params.keys())
        value_lists = [params[name] for name in names]
        return [dict(zip(names, picked)) for picked in product(*value_lists)]

# Example usage: enumerate the hyperparameter combinations of every enabled classifier.
pipeline = ClassifierPipeline()
result = pipeline.generate_classifier_hyperparam_combinations()

# Print the full structured result, then the number of enabled classifiers.
import json
print(json.dumps(result, indent=4))  # full dump of the combinations for the enabled classifiers
print(len(result))


[
    {
        "MultinomialNB": [
            {
                "alpha": 0.1,
                "fit_prior": true
            },
            {
                "alpha": 0.1,
                "fit_prior": false
            },
            {
                "alpha": 0.5,
                "fit_prior": true
            },
            {
                "alpha": 0.5,
                "fit_prior": false
            },
            {
                "alpha": 1.0,
                "fit_prior": true
            },
            {
                "alpha": 1.0,
                "fit_prior": false
            },
            {
                "alpha": 2.0,
                "fit_prior": true
            },
            {
                "alpha": 2.0,
                "fit_prior": false
            }
        ]
    },
    {
        "GaussianNB": [
            {
                "var_smoothing": 1e-09
            },
            {
                "var_smoothing": 1e-08
            },
            {
                "var_

Fonction naïve (force brute) qui teste tous les classifieurs, pré-traitements, vectorisations et réductions afin d'obtenir le meilleur taux de réussite.

In [None]:
def possibilite():
    """Brute-force comparison of the pre-processing and reduction variants with a tuned random forest.

    For every pre-processing variant (lemmatisation with each mode/overwrite
    pair, stemming with and without ``ignore_stopwords``, and no
    pre-processing) and every reduction (NMF, SVD), the data is reloaded,
    TF-IDF vectorised, reduced to 100 components, and a random forest is
    grid-searched on the "RandomForestClassifier" grid read from
    "hyperpamètres_classifieur.jsonl". Each result is printed as it is
    produced, then all results are printed sorted by decreasing accuracy.

    Result tuples are ``(pre-processing, option 1, option 2, vectorisation,
    reduction, classifier name, accuracy, f1)``; unused options are ``None``.
    """
    with open("hyperpamètres_classifieur.jsonl", "r", encoding="utf-8") as file:
        hyper_classifieur = json.load(file)

    # (pre-processing name, option 1, option 2) for every variant to evaluate.
    pretraitements = []
    for mode in ["lookup", "rule"]:
        for overit in [True, False]:
            pretraitements.append(("lemmatisation", mode, overit))
    for stop_word in [True, False]:
        pretraitements.append(("racinisation", stop_word, None))
    pretraitements.append(("rien", None, None))

    results = []
    for x, option_1, option_2 in pretraitements:
        for z in ["nmf", "svd"]:
            # Reload for every combination: each Data step rewrites the frame
            # and the vectorisation consumes the text column.
            data = Data(limit=100)
            if x == "lemmatisation":
                data.lemmatisation(option_1, option_2)
            elif x == "racinisation":
                data.racinisation(option_1)
            # "rien": the raw text goes straight to the vectoriser.

            y = "vectorisation_ponderee"
            data.vectorisation_ponderee()

            if z == "nmf":
                data.reduction_nmf(n_components = 100)
            else:
                data.reduction_svd(n_components = 100)

            for i in [ForetAleatoire]:
                classifieur = i(data.get_data())
                # The search stores the best estimator on the object, so it can
                # be scored directly without being rebuilt.
                classifieur.recherche_hyperparametres(hyper_classifieur["RandomForestClassifier"])
                taux_reussite = classifieur.taux_reussite()
                score_f1 = classifieur.f1_score()

                results.append((x, option_1, option_2, y, z, i.__name__, taux_reussite, score_f1))
                print(results[-1])

    # The accuracy is a formatted string; compare it numerically.
    results_sorted = sorted(results, key=lambda res: float(res[6]), reverse=True)

    for res in results_sorted:
        print(f"{res[0]} - {res[1]} - {res[2]} - {res[3]} - {res[4]} - {res[5]} : {res[6]}")

# Run the exhaustive search; it needs "hyperpamètres_classifieur.jsonl" in the working directory.
possibilite()

FileNotFoundError: [Errno 2] No such file or directory: 'hyperpamètres_classifieur.jsonl'

Exemple d'utilisation en temps normal pour l'optimisation

In [None]:
# Pipeline: load -> lemmatise -> TF-IDF -> NMF; each Data method rewrites the wrapped frame.
data = Data()
data.lemmatisation()
data.vectorisation_ponderee()
data.reduction_nmf()
# NOTE: `data` is rebound here from the Data wrapper to its underlying DataFrame.
data = data.get_data()

# Train a random forest and print its weighted F1-score on the test split.
foret = ForetAleatoire(data)
foret.entrainement()
print(foret.f1_score())

0.643


Optimisation de la forêt

In [None]:
# Find the best random-forest hyperparameters over this grid.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None]
}

# Small sample (50 rows); the "etape N" prints only trace progress.
data = Data(limit = 50)
print("etape 1")
data.lemmatisation()
print("etape 2")
data.vectorisation_ponderee()
print("etape 3")
data.reduction_nmf()
print("etape 4")
# NOTE: `data` is rebound here from the Data wrapper to its underlying DataFrame.
data = data.get_data()
print("etape 5")

foret = ForetAleatoire(data)
print("etape 6")
# The search prints the best hyperparameters and keeps the best estimator on `foret`.
foret.recherche_hyperparametres(param_grid)
print("etape 7")
# Last expression of the cell: weighted F1-score of the best estimator on the test split.
foret.f1_score()


etape 1
etape 2
etape 3
etape 4
etape 5
etape 6
Meilleurs hyperparamètres : {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
etape 7




'0.521'