In [92]:
import pandas as pd # For files manipulations.
import numpy as np # For matrex manipulations.

df = pd.read_csv('../data/train.csv') # read the file (df = dataframe).
df.head() # display first 5 rows of the file.

Unnamed: 0,doc_id,titre,type,difficulte,cout,ingredients,recette
0,recette_221358.xml,"Feuilleté de saumon et de poireau, sauce aux c...",Plat principal,Facile,Moyen,- 1 gros pavé de saumon - 100 g de crevettes d...,Couper finement le blanc et un peu de vert des...
1,recette_48656.xml,Cake poulet/moutarde/amandes,Entrée,Très facile,Bon marché,- 3 œufs - 150 g de farine - 1 sachet de levur...,"Couper finement l'échalote, la faire revenir à..."
2,recette_30049.xml,Bûche à la truite fumée (7ème rencontre),Entrée,Moyennement difficile,Assez Cher,- 800 g de filet de truite saumonnée fumée en ...,Faites blanchir les épinards à l'eau bouillant...
3,recette_71424.xml,Gâteau au yaourt au coco sans huile de laetitia,Dessert,Très facile,Bon marché,- 1 pot de yaourt - 1 pot de lait de coco - 3 ...,Mélanger dans l'ordre tous les ingrédients en ...
4,recette_217204.xml,Crêpes au canard laqué,Entrée,Moyennement difficile,Moyen,- 90 g de farine - 45 g de maïzena - 2 œufs - ...,"Fouetter les œufs avec l'eau, le lait et le su..."


In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Preprocessing the textual data
# Assuming `features` contains textual data in the 'titre' and 'ingredients' columns
text_data = df['titre'] + ' ' + df['ingredients']
labels = [['type']]

# Splitting the data
x_train, x_test, y_train, y_test = train_test_split(text_data, labels, test_size=0.30, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [12473, 1]

In [94]:
# Vectorizing the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the max_features as needed
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

# Training the classifier
clf = RandomForestClassifier(n_estimators=10)
clf.fit(x_train_tfidf, y_train)

## Evaluating the classifier
accuracy = clf.score(x_test_tfidf, y_test)
print("Accuracy (TF-IDF) :", accuracy*100, '%')

Accuracy (TF-IDF) : 80.09086050240512 %


In [95]:
# Confusion Matrix & Classification Report for the testing sample
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(x_train_tfidf)
print ("\nTraining Classification Report:\n\n", classification_report(y_train, y_pred))

print ("\nConfusion Matrix Of Training:\n\n", confusion_matrix(y_train, y_pred))


Training Classification Report:

                 precision    recall  f1-score   support

       Dessert       1.00      1.00      1.00      2670
        Entrée       0.99      0.99      0.99      1997
Plat principal       0.99      0.99      0.99      4064

      accuracy                           0.99      8731
     macro avg       0.99      0.99      0.99      8731
  weighted avg       0.99      0.99      0.99      8731


Confusion Matrix Of Training:

 [[2670    0    0]
 [   8 1968   21]
 [   4   20 4040]]


In [96]:
# Vectorizing the text data using Bag of Words (BoW)
bow_vectorizer = CountVectorizer(max_features=1000)  # You can adjust the max_features as needed
x_train_bow = bow_vectorizer.fit_transform(x_train)
x_test_bow = bow_vectorizer.transform(x_test)

# Training the classifier
clf = RandomForestClassifier(n_estimators=10)
clf.fit(x_train_bow, y_train)

# Evaluating the classifier
accuracy = clf.score(x_test_bow, y_test)
print("Accuracy (BoW) :", accuracy*100, '%')

Accuracy (BoW) : 79.87707108498128 %


In [97]:
# Confusion Matrix & Classification Report for the testing sample
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

y_pred = clf.predict(x_train_bow)
print ("\nTraining Classification Report:\n\n", classification_report(y_train, y_pred))

print ("\nConfusion Matrix Of Training:\n\n", confusion_matrix(y_train, y_pred))


Training Classification Report:

                 precision    recall  f1-score   support

       Dessert       1.00      1.00      1.00      2670
        Entrée       0.99      0.99      0.99      1997
Plat principal       1.00      0.99      0.99      4064

      accuracy                           0.99      8731
     macro avg       0.99      0.99      0.99      8731
  weighted avg       0.99      0.99      0.99      8731


Confusion Matrix Of Training:

 [[2668    1    1]
 [   5 1975   17]
 [   2   23 4039]]
