In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Extraction des fonctionnalités (features) à partir des phrases
vectorizer = CountVectorizer(vocabulary=mots_cles)
X = vectorizer.fit_transform(data['body_clean'])

# Labels correspondants
y = data['main_tag']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Définition des hyperparamètres à tester pour la recherche en grille
param_grid = {'alpha': [0.1, 0.5, 1.0]}

# Entraînement du modèle de classification avec recherche en grille des hyperparamètres
model = MultinomialNB()
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Meilleur modèle trouvé
best_model = grid_search.best_estimator_

# Prédictions sur l'ensemble de test
y_pred = best_model.predict(X_test)

# Calcul de l'accuracy
accuracy = accuracy_score(y_test, y_pred)

# Affichage de l'accuracy
print('Accuracy:', accuracy)

# Rapport de classification
classification_report = classification_report(y_test, y_pred)
print('Classification Report:\n', classification_report)

# Exemple d'une nouvelle phrase à classifier
nouvelle_phrase = 'I want to create a class in C#'

# Extraction des fonctionnalités (features) de la nouvelle phrase
nouvelle_phrase_features = vectorizer.transform([nouvelle_phrase])

# Classification de la nouvelle phrase avec le meilleur modèle
prediction = best_model.predict(nouvelle_phrase_features)

# Affichage de la prédiction
print('Phrase:', nouvelle_phrase)
print('Prédiction:', prediction)

Accuracy: 0.3160850681443762
Classification Report:
               precision    recall  f1-score   support

        .net       0.00      0.00      0.00       288
     android       0.65      0.23      0.34       872
          c#       0.22      0.97      0.35      2385
         c++       0.00      0.00      0.00      1593
        html       0.00      0.00      0.00       267
         ios       0.10      0.00      0.00       885
        java       0.87      0.36      0.51      2212
  javascript       0.06      0.01      0.02      2108
         php       0.00      0.00      0.00       744
      python       0.90      0.45      0.60      2000

    accuracy                           0.32     13354
   macro avg       0.28      0.20      0.18     13354
weighted avg       0.38      0.32      0.26     13354

Phrase: I want to create a class in C#
Prédiction: ['c#']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Extraction des fonctionnalités (features) à partir des phrases
vectorizer = TfidfVectorizer(vocabulary=mots_cles)
X = vectorizer.fit_transform(data['body_clean'])

# Labels correspondants
y = data['main_tag']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Définition des hyperparamètres à tester pour la recherche en grille
param_grid = {'alpha': [0.1, 0.5, 1.0]}

# Entraînement du modèle de classification avec recherche en grille des hyperparamètres
model = MultinomialNB()
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Meilleur modèle trouvé
best_model = grid_search.best_estimator_

# Prédictions sur l'ensemble de test
y_pred = best_model.predict(X_test)

# Calcul de l'accuracy
accuracy = accuracy_score(y_test, y_pred)

# Affichage de l'accuracy
print('Accuracy:', accuracy)

# Rapport de classification
classification_report = classification_report(y_test, y_pred)
print('Classification Report:\n', classification_report)

# Exemple d'une nouvelle phrase à classifier
nouvelle_phrase = 'I want to create a class in C# with android in visual studio code'

# Extraction des fonctionnalités (features) de la nouvelle phrase
nouvelle_phrase_features = vectorizer.transform([nouvelle_phrase])

# Classification de la nouvelle phrase avec le meilleur modèle
prediction = best_model.predict(nouvelle_phrase_features)

# Affichage de la prédiction
print('Phrase:', nouvelle_phrase)
print('Prédiction:', prediction)

Accuracy: 0.30155758574209973
Classification Report:
               precision    recall  f1-score   support

        .net       0.00      0.00      0.00       288
     android       0.00      0.00      0.00       872
          c#       0.22      0.97      0.35      2385
         c++       0.03      0.00      0.00      1593
        html       0.00      0.00      0.00       267
         ios       0.00      0.00      0.00       885
        java       0.88      0.35      0.50      2212
  javascript       0.06      0.02      0.03      2108
         php       0.00      0.00      0.00       744
      python       0.89      0.45      0.60      2000

    accuracy                           0.30     13354
   macro avg       0.21      0.18      0.15     13354
weighted avg       0.33      0.30      0.24     13354

Phrase: I want to create a class in C# with android in visual studio code
Prédiction: ['javascript']


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# Exemple de données d'entraînement
data = pd.DataFrame({
    'text': ['I love apples', 'I eat bananas', 'The cat is sleeping', 'The dog is barking'],
    'labels': [['fruits'], ['fruits'], ['animals'], ['animals', 'sounds']]
})

# Extraction des fonctionnalités (features) à partir des phrases
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['text'])

# Labels correspondants (conversion en liste de tags)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['labels'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Définition des hyperparamètres à tester pour la recherche en grille
param_grid = {'alpha': [0.1, 0.5, 1.0]}

# Entraînement du modèle de classification avec recherche en grille des hyperparamètres
model = MultinomialNB()
grid_search = GridSearchCV(model, param_grid, cv=2)
grid_search.fit(X_train, y_train)

# Meilleur modèle trouvé
best_model = grid_search.best_estimator_

# Prédictions sur l'ensemble de test
y_pred = best_model.predict(X_test)

# Calcul de l'accuracy
accuracy = accuracy_score(y_test, y_pred)

# Affichage de l'accuracy
print('Accuracy:', accuracy)

# Rapport de classification
classification_report = classification_report(y_test, y_pred, target_names=mlb.classes_)
print('Classification Report:\n', classification_report)

# Exemple d'une nouvelle phrase à classifier
nouvelle_phrase = 'I want to eat an apple'

# Extraction des fonctionnalités (features) de la nouvelle phrase
nouvelle_phrase_features = vectorizer.transform([nouvelle_phrase])

# Classification de la nouvelle phrase avec le meilleur modèle
predictions = best_model.predict(nouvelle_phrase_features)

# Décodage des tags prédits
predicted_tags = mlb.inverse_transform(predictions)

# Affichage des tags prédits
print('Phrase:', nouvelle_phrase)
print('Tags prédits:', predicted_tags)


ValueError: 
All the 6 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 749, in fit
    X, y = self._check_X_y(X, y)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 583, in _check_X_y
    return self._validate_data(X, y, accept_sparse="csr", reset=reset)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\base.py", line 565, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 1122, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 1143, in _check_y
    y = column_or_1d(y, warn=True)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 1202, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (1, 3) instead.

--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 749, in fit
    X, y = self._check_X_y(X, y)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 583, in _check_X_y
    return self._validate_data(X, y, accept_sparse="csr", reset=reset)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\base.py", line 565, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 1122, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 1143, in _check_y
    y = column_or_1d(y, warn=True)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 1202, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (2, 3) instead.


In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

# Séparer les caractéristiques (X) et les étiquettes (y)
X = data['body_clean']
y = data['Tags']

# Vectorisation des textes
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Conversion des étiquettes en format binaire
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Définition des hyperparamètres à ajuster
param_grid = {
    'alpha': [0.1, 1.0, 10.0]
}

# Entraînement du modèle de classification avec recherche en grille des hyperparamètres
model = MultinomialNB()
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, np.argmax(y_train, axis=1))

# Meilleur modèle trouvé
best_model = grid_search.best_estimator_

# Prédiction des probabilités sur l'ensemble de test
y_pred_prob = best_model.predict_proba(X_test)

# Binarisation des probabilités prédites
threshold = 0.5
y_pred = (y_pred_prob >= threshold).astype(int)

# Conversion des étiquettes prédites en format original
y_pred_labels = mlb.inverse_transform(y_pred)
y_test_labels = mlb.inverse_transform(y_test)

# Affichage du rapport de classification
print(classification_report(y_test_labels, y_pred_labels))

ValueError: Expected indicator for 45 classes, but got 1

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Séparer les caractéristiques (X) et les étiquettes (y)
X = data['body_clean']
y = data['Tags']

# Vectorisation des textes
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Définition des hyperparamètres à ajuster
param_grid = {
    'alpha': [0.1, 1.0, 10.0]
}

# Entraînement du modèle de classification avec recherche en grille des hyperparamètres
model = MultinomialNB()
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Meilleur modèle trouvé
best_model = grid_search.best_estimator_

# Prédiction des étiquettes sur l'ensemble de test
y_pred = best_model.predict(X_test)

# Affichage du rapport de classification
print(classification_report(y_test, y_pred))

# Calcul de la précision globale
accuracy = accuracy_score(y_test, y_pred)
print("Précision globale :", accuracy)




ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 776, in fit
    self._count(X, Y)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 899, in _count
    self.feature_count_ += safe_sparse_dot(Y.T, X)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\extmath.py", line 189, in safe_sparse_dot
    ret = a @ b
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 636, in __rmatmul__
    return self._rmul_dispatch(other)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 614, in _rmul_dispatch
    ret = self.transpose()._mul_dispatch(tr)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 532, in _mul_dispatch
    return self._mul_multivector(other)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_compressed.py", line 497, in _mul_multivector
    result = np.zeros((M, n_vecs),
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 18.7 GiB for an array with shape (60101, 41656) and data type int64

--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 776, in fit
    self._count(X, Y)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 899, in _count
    self.feature_count_ += safe_sparse_dot(Y.T, X)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\extmath.py", line 189, in safe_sparse_dot
    ret = a @ b
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 636, in __rmatmul__
    return self._rmul_dispatch(other)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 614, in _rmul_dispatch
    ret = self.transpose()._mul_dispatch(tr)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 532, in _mul_dispatch
    return self._mul_multivector(other)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_compressed.py", line 497, in _mul_multivector
    result = np.zeros((M, n_vecs),
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 18.7 GiB for an array with shape (60101, 41689) and data type int64

--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 776, in fit
    self._count(X, Y)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 899, in _count
    self.feature_count_ += safe_sparse_dot(Y.T, X)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\extmath.py", line 189, in safe_sparse_dot
    ret = a @ b
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 636, in __rmatmul__
    return self._rmul_dispatch(other)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 614, in _rmul_dispatch
    ret = self.transpose()._mul_dispatch(tr)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 532, in _mul_dispatch
    return self._mul_multivector(other)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_compressed.py", line 497, in _mul_multivector
    result = np.zeros((M, n_vecs),
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 18.7 GiB for an array with shape (60101, 41733) and data type int64

--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 776, in fit
    self._count(X, Y)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 899, in _count
    self.feature_count_ += safe_sparse_dot(Y.T, X)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\extmath.py", line 189, in safe_sparse_dot
    ret = a @ b
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 636, in __rmatmul__
    return self._rmul_dispatch(other)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 614, in _rmul_dispatch
    ret = self.transpose()._mul_dispatch(tr)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 532, in _mul_dispatch
    return self._mul_multivector(other)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_compressed.py", line 497, in _mul_multivector
    result = np.zeros((M, n_vecs),
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 18.7 GiB for an array with shape (60101, 41710) and data type int64

--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 776, in fit
    self._count(X, Y)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 899, in _count
    self.feature_count_ += safe_sparse_dot(Y.T, X)
  File "C:\Users\omira\anaconda3\lib\site-packages\sklearn\utils\extmath.py", line 189, in safe_sparse_dot
    ret = a @ b
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 636, in __rmatmul__
    return self._rmul_dispatch(other)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 614, in _rmul_dispatch
    ret = self.transpose()._mul_dispatch(tr)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_base.py", line 532, in _mul_dispatch
    return self._mul_multivector(other)
  File "C:\Users\omira\anaconda3\lib\site-packages\scipy\sparse\_compressed.py", line 497, in _mul_multivector
    result = np.zeros((M, n_vecs),
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 18.7 GiB for an array with shape (60101, 41677) and data type int64


In [7]:
# Vérifier si la colonne "body_clean" contient des valeurs float
float_values = data['body_clean'][data['body_clean'].apply(lambda x: isinstance(x, float))]
print(float_values)

19729    NaN
Name: body_clean, dtype: object


In [3]:
# on supprimme l'échantillon qui contient un float
data.drop(19729, inplace=True)

In [14]:
lignes_manquantes = data[data['body_clean'].isnull()].index

In [15]:
# Supprimer les lignes contenant des valeurs manquantes
data.drop(lignes_manquantes, inplace=True)

# Réindexer le DataFrame
data.reset_index(drop=True, inplace=True)

In [23]:
# Chargement des données depuis un DataFrame
data = pd.read_csv('QueryResultsFinal.csv')

In [25]:
data = data.dropna()

In [4]:
data.isnull().sum()

Title               0
Tags                0
Id                  0
Score               0
ViewCount           0
AnswerCount         0
CreationDate        0
LastActivityDate    0
CommentCount        0
body_clean          0
main_tag            0
ActivityTime        0
dtype: int64

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

# Séparation des données en variables prédictives (X) et variable cible (y)
X = data['body_clean']
y = data['main_tag']

# Extraction des mots clés pré-définis
mots_cles = ['c#', 'java', 'javascript', 'python', 'c++', 'android', 'ios', '.net', 'html', 'php'] 

# Création du vecteur de caractéristiques en utilisant TF-IDF
vectorizer = TfidfVectorizer(vocabulary=mots_cles)

# Transformation du texte en vecteurs
X_vecteurs = vectorizer.fit_transform(X)

# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X_vecteurs, y, test_size=0.2, random_state=42)

# Définition du modèle de classification (SVM)
model = SVC()

# Définition des hyperparamètres à rechercher avec GridSearchCV
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001]}

# Recherche sur grille pour trouver les meilleurs paramètres
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Affichage des meilleurs paramètres trouvés
print("Meilleurs paramètres :", grid_search.best_params_)

# Prédiction sur l'ensemble de test avec les meilleurs paramètres
y_pred = grid_search.predict(X_test)

# Évaluation des performances du modèle
accuracy = (y_pred == y_test).mean()
print("Précision :", accuracy)

Meilleurs paramètres : {'C': 10, 'gamma': 0.1}
Précision : 0.33592930957016626
