In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [10]:
# Read the training split from CSV, then preview its first rows
train_csv_path = '../data/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,doc_id,titre,type,difficulte,cout,ingredients,recette
0,recette_221358.xml,"Feuilleté de saumon et de poireau, sauce aux c...",Plat principal,Facile,Moyen,- 1 gros pavé de saumon - 100 g de crevettes d...,Couper finement le blanc et un peu de vert des...
1,recette_48656.xml,Cake poulet/moutarde/amandes,Entrée,Très facile,Bon marché,- 3 œufs - 150 g de farine - 1 sachet de levur...,"Couper finement l'échalote, la faire revenir à..."
2,recette_30049.xml,Bûche à la truite fumée (7ème rencontre),Entrée,Moyennement difficile,Assez Cher,- 800 g de filet de truite saumonnée fumée en ...,Faites blanchir les épinards à l'eau bouillant...
3,recette_71424.xml,Gâteau au yaourt au coco sans huile de laetitia,Dessert,Très facile,Bon marché,- 1 pot de yaourt - 1 pot de lait de coco - 3 ...,Mélanger dans l'ordre tous les ingrédients en ...
4,recette_217204.xml,Crêpes au canard laqué,Entrée,Moyennement difficile,Moyen,- 90 g de farine - 45 g de maïzena - 2 œufs - ...,"Fouetter les œufs avec l'eau, le lait et le su..."


In [11]:
# Read the held-out test split from CSV, then preview its first rows
test_csv_path = '../data/test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.head()

Unnamed: 0,doc_id,titre,type,difficulte,cout,ingredients,recette
0,recette_84191.xml,Roulé à la confiture de lait,Dessert,Moyennement difficile,Bon marché,- Pour la garniture: - 1 boîte de lait concent...,"La veille, préparer de la confiture de lait en..."
1,recette_26585.xml,Croissants aux amandes,Dessert,Moyennement difficile,Moyen,- 250 g de farine - 1 pincée de sel - 1 cuillè...,Croissants : Pétrir les ingrédients pour en fa...
2,recette_176139.xml,Quinoa Phileas (aux légumes croquants et sauci...,Plat principal,Moyennement difficile,Moyen,- 250 g de quinoa - 1 gros oignon blanc - 1 g...,Faites cuire le quinoa pendant 12 minutes dans...
3,recette_14285.xml,Magret de canard à la crème de mûre,Plat principal,Moyennement difficile,Moyen,- 1 magret de canard - 5 cl de crème de mûre -...,"Après avoir confectionné la sauce au vin, lui ..."
4,recette_20895.xml,St-Jacques a la sauce aux huitres et aux asperges,Entrée,Moyennement difficile,Moyen,- 10 cl de vin blanc sec - 18 asperges vertes ...,"Ouvrez, videz et nettoyez les coquilles Saint-..."


In [12]:
def _build_documents(df):
    """Return one text document per row: the `titre` text, a space, then the `ingredients` text.

    Missing values in either column are replaced with an empty string before
    concatenation. Without this, `str + NaN` makes the whole document NaN, and
    scikit-learn's text vectorizers refuse NaN documents.
    """
    return df['titre'].fillna('') + ' ' + df['ingredients'].fillna('')


# Training split: input documents and the `type` column used as the target
train_text_data = _build_documents(train_df)
train_labels = train_df['type']

# Test split: built the same way so both splits share one document format
test_text_data = _build_documents(test_df)
test_labels = test_df['type']

In [13]:
# --- TF-IDF features ---
# The vectorizer (limited to 1000 features) is fitted on the training documents
# only; the test documents are then mapped onto that same fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
x_train_tfidf = tfidf_vectorizer.fit_transform(train_text_data)
x_test_tfidf = tfidf_vectorizer.transform(test_text_data)

# --- Linear-kernel SVM on the TF-IDF features ---
# `fit` returns the estimator itself, so construction and training are chained.
clf = SVC(kernel='linear').fit(x_train_tfidf, train_labels)

# Predicted labels for the test documents
y_pred_tfidf = clf.predict(x_test_tfidf)

In [14]:
# Score the TF-IDF predictions against the true test labels.
# Precision, recall and F1 use the 'weighted' average across classes.
accuracy_tfidf = accuracy_score(y_true=test_labels, y_pred=y_pred_tfidf)
precision_tfidf = precision_score(y_true=test_labels, y_pred=y_pred_tfidf, average='weighted')
recall_tfidf = recall_score(y_true=test_labels, y_pred=y_pred_tfidf, average='weighted')
f1_tfidf = f1_score(y_true=test_labels, y_pred=y_pred_tfidf, average='weighted')
conf_matrix_tfidf = confusion_matrix(y_true=test_labels, y_pred=y_pred_tfidf)
class_report_tfidf = classification_report(y_true=test_labels, y_pred=y_pred_tfidf)

# Emit the report: one item per output line, in the same order as before
report_lines_tfidf = [
    "Evaluation Metrics (TF-IDF):\n",
    f"Accuracy: {accuracy_tfidf}",
    f"Precision: {precision_tfidf}",
    f"Recall: {recall_tfidf}",
    f"F1 Score: {f1_tfidf}",
    "Confusion Matrix (TF-IDF):",
    conf_matrix_tfidf,
    "Classification Report (TF-IDF):",
    class_report_tfidf,
]
print(*report_lines_tfidf, sep="\n")

Evaluation Metrics (TF-IDF):

Accuracy: 0.8378962536023055
Precision: 0.8329809137907512
Recall: 0.8378962536023055
F1 Score: 0.8339419020878843
Confusion Matrix (TF-IDF):
[[406   1   0]
 [  6 200 131]
 [  2  85 557]]
Classification Report (TF-IDF):
                precision    recall  f1-score   support

       Dessert       0.98      1.00      0.99       407
        Entrée       0.70      0.59      0.64       337
Plat principal       0.81      0.86      0.84       644

      accuracy                           0.84      1388
     macro avg       0.83      0.82      0.82      1388
  weighted avg       0.83      0.84      0.83      1388



In [15]:
# --- Bag-of-Words (raw count) features ---
# The vectorizer (limited to 1000 features) is fitted on the training documents
# only; the test documents are then mapped onto that same fitted vocabulary.
bow_vectorizer = CountVectorizer(max_features=1000)
x_train_bow = bow_vectorizer.fit_transform(train_text_data)
x_test_bow = bow_vectorizer.transform(test_text_data)

# --- Linear-kernel SVM on the BoW features ---
# `fit` returns the estimator itself, so construction and training are chained.
clf = SVC(kernel='linear').fit(x_train_bow, train_labels)

# Predicted labels for the test documents
y_pred_bow = clf.predict(x_test_bow)

In [16]:
# Score the BoW predictions against the true test labels.
# Precision, recall and F1 use the 'weighted' average across classes.
accuracy_bow = accuracy_score(y_true=test_labels, y_pred=y_pred_bow)
precision_bow = precision_score(y_true=test_labels, y_pred=y_pred_bow, average='weighted')
recall_bow = recall_score(y_true=test_labels, y_pred=y_pred_bow, average='weighted')
f1_bow = f1_score(y_true=test_labels, y_pred=y_pred_bow, average='weighted')
conf_matrix_bow = confusion_matrix(y_true=test_labels, y_pred=y_pred_bow)
class_report_bow = classification_report(y_true=test_labels, y_pred=y_pred_bow)

# Emit the report: one item per output line, in the same order as before
report_lines_bow = [
    "\nEvaluation Metrics (BoW):",
    f"Accuracy: {accuracy_bow}",
    f"Precision: {precision_bow}",
    f"Recall: {recall_bow}",
    f"F1 Score: {f1_bow}",
    "Confusion Matrix (BoW):",
    conf_matrix_bow,
    "Classification Report (BoW):",
    class_report_bow,
]
print(*report_lines_bow, sep="\n")


Evaluation Metrics (BoW):
Accuracy: 0.8263688760806917
Precision: 0.8245548964706609
Recall: 0.8263688760806917
F1 Score: 0.8253368475458328
Confusion Matrix (BoW):
[[402   3   2]
 [  4 211 122]
 [  2 108 534]]
Classification Report (BoW):
                precision    recall  f1-score   support

       Dessert       0.99      0.99      0.99       407
        Entrée       0.66      0.63      0.64       337
Plat principal       0.81      0.83      0.82       644

      accuracy                           0.83      1388
     macro avg       0.82      0.81      0.82      1388
  weighted avg       0.82      0.83      0.83      1388

