In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np # For matrix manipulations.

# Load the training set into a DataFrame. To preview it, put `df.head()` as the
# last expression of its own cell: a bare expression in the middle of a cell is
# evaluated but never displayed.
df = pd.read_csv('../data/train.csv')

# Build one text document per row from the three free-text columns used by the
# models below ('titre', 'ingredients', 'recette').
# Missing values are replaced by '' first: concatenating with `+` would
# otherwise turn the whole document into NaN as soon as one column is missing,
# and the scikit-learn text vectorizers reject NaN documents.
text_columns = ['titre', 'ingredients', 'recette']
text_parts = df[text_columns].fillna('').astype(str)
text_data = text_parts['titre'] + ' ' + text_parts['ingredients'] + ' ' + text_parts['recette']

# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
# NOTE(review): `labels` is not defined in this cell; presumably it is the
# target column extracted from `df` in an earlier cell. Confirm it exists after
# Restart Kernel -> Run All and that it is row-aligned with `text_data`.
x_train, x_test, y_train, y_test = train_test_split(text_data, labels, test_size=0.30, random_state=42)

def train_and_report(x_train_vec, x_test_vec, train_labels, test_labels, name, max_iter=1000):
    """Train a logistic-regression classifier on vectorized text and print its test metrics.

    Parameters
    ----------
    x_train_vec, x_test_vec
        Feature matrices produced by an already-fitted text vectorizer for the
        training and testing samples.
    train_labels, test_labels
        Target labels matching the rows of the two matrices.
    name
        Short label of the representation (e.g. "TF-IDF"), inserted into the
        printed headings.
    max_iter
        Iteration cap passed to LogisticRegression; increase if the solver
        does not converge.

    Returns
    -------
    tuple
        (fitted classifier, test accuracy as a fraction, test predictions).
    """
    model = LogisticRegression(max_iter=max_iter)
    model.fit(x_train_vec, train_labels)

    # Mean accuracy on the held-out sample, printed as a percentage.
    accuracy = model.score(x_test_vec, test_labels)
    print(f"Accuracy ({name}):", accuracy * 100, '%')

    # Classification report and confusion matrix for the testing sample.
    predictions = model.predict(x_test_vec)
    print(f"\nTesting Classification Report ({name}):\n\n", classification_report(test_labels, predictions))
    print(f"\nConfusion Matrix Of Testing ({name}):\n\n", confusion_matrix(test_labels, predictions))

    return model, accuracy, predictions


# --- TF-IDF representation -------------------------------------------------
# The vectorizer is fitted on the training texts only and merely applied to
# the test texts, so no test-set vocabulary/IDF statistics leak into training.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

clf_tfidf, accuracy_tfidf, y_pred_tfidf = train_and_report(
    x_train_tfidf, x_test_tfidf, y_train, y_test, "TF-IDF"
)

# --- Bag of Words (BoW) representation -------------------------------------
bow_vectorizer = CountVectorizer(max_features=1000)
x_train_bow = bow_vectorizer.fit_transform(x_train)
x_test_bow = bow_vectorizer.transform(x_test)

clf_bow, accuracy_bow, y_pred_bow = train_and_report(
    x_train_bow, x_test_bow, y_train, y_test, "BoW"
)

# Backward-compatible aliases: these names previously ended the cell bound to
# the BoW results, so keep them for any later cell that still reads them.
clf, accuracy, y_pred = clf_bow, accuracy_bow, y_pred_bow


Accuracy (TF-IDF): 85.75628006413683 %

Testing Classification Report (TF-IDF):

                 precision    recall  f1-score   support

       Dessert       0.98      0.99      0.99      1092
        Entrée       0.77      0.61      0.68       912
Plat principal       0.82      0.90      0.86      1738

      accuracy                           0.86      3742
     macro avg       0.86      0.84      0.84      3742
  weighted avg       0.85      0.86      0.85      3742


Confusion Matrix Of Testing (TF-IDF):

 [[1086    3    3]
 [  14  556  342]
 [  12  159 1567]]
Accuracy (BoW): 84.12613575628006 %

Testing Classification Report (BoW):

                 precision    recall  f1-score   support

       Dessert       0.98      0.98      0.98      1092
        Entrée       0.70      0.64      0.67       912
Plat principal       0.82      0.86      0.84      1738

      accuracy                           0.84      3742
     macro avg       0.83      0.83      0.83      3742
  weighted av