In [5]:
%pwd
%matplotlib inline
# neccasary packages are loaded
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import zipfile
from IPython.display import display, Markdown
from ast import literal_eval
import calendar
from wordcloud import WordCloud


# Load Excel file
file_path = 'full_dataset.csv' 
recipes = pd.read_csv(file_path)
print(recipes.head())
recipes.info()
recipes.head()
recipes.rename(columns={'Unnamed: 0': 'ID','directions': 'procedure', 'title': 'name'},inplace=True, errors='raise')
recipes.head()
recipes = recipes.astype({'ID': int,
                'name': str,
                'ingredients': 'object',
                'procedure': 'object',
                'link': str,
                'source': 'category',
                'NER': 'object'})
recipes['n_NER'] = recipes['NER'].apply(len)
recipes['n_procedures'] = recipes['procedure'].apply(len)
recipes.head()
recipes_old = recipes[:]
recipes = recipes[recipes['n_NER'] > 0]
recipes = recipes[recipes['n_procedures'] > 0]
recipes['source'].unique()
unique_names = len(recipes['name'].unique())
number_entries = len(recipes)
number_entries_old = len(recipes_old)
display(Markdown(f'Number of all entries: {number_entries:.0f} vs Number of all entries before cleaning: {number_entries_old :.0f}'))
display(Markdown(f'Number of unique names: {unique_names :.0f}'))
average_n_ingridients = recipes.groupby('source')['n_NER'].mean()
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title('Distribution of number ingridients by sources')
average_n_ingridients.plot(kind='barh', stacked=False, ax=ax)
plt.show()
average_n_procedures = recipes.groupby('source')['n_procedures'].mean()
fig, ax = plt.subplots(figsize=(6, 4))
ax.set_title('Distribution of number procedure steps by sources')
average_n_procedures.plot(kind='barh', stacked=False, ax=ax)
plt.show()
NER_exploded = recipes.explode('NER')
ingridients = NER_exploded['NER']
unique_NER_values = NER_exploded['NER'].unique()
display(Markdown(f'Number of unique ingridients: {len(unique_NER_values) :.0f}'))
print(unique_NER_values[:200])
wordcloud = WordCloud(width=800, height=800, background_color='white').generate(' '.join(ingridients[:1000]))
plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score 


valid_recipes = recipes[(recipes['ingredients'].notnull())]
valid_recipes['text'] = valid_recipes['ingredients'] 
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf.fit_transform(valid_recipes['text'])

label_encoder = LabelEncoder()
valid_recipes['source_encoded'] = label_encoder.fit_transform(valid_recipes['source'])
y = valid_recipes['source_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1, test_size=0.2, random_state=42)

# Logistic Regression
logistic_model = LogisticRegression(solver='lbfgs', max_iter=500)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
print("Logistic Regression:")
print("Accuracy:", accuracy_score(y_test, y_pred_logistic))
print("Classification Report:")
print(classification_report(y_test, y_pred_logistic, target_names=label_encoder.classes_))

# Naive Bayes Classifier (MultinomialNB for text data)
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
y_pred_naive_bayes = naive_bayes_model.predict(X_test)
print("Naive Bayes:")
print("Accuracy:", accuracy_score(y_test, y_pred_naive_bayes))
print("Classification Report:")
print(classification_report(y_test, y_pred_naive_bayes, target_names=label_encoder.classes_))


# Gradient Boosting Classifier
gradient_boosting_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.2, random_state=42)
gradient_boosting_model.fit(X_train, y_train)
y_pred_gradient_boosting = gradient_boosting_model.predict(X_test)
print("Gradient Boosting:")
print("Accuracy:", accuracy_score(y_test, y_pred_gradient_boosting))
print("Classification Report:")
print(classification_report(y_test, y_pred_gradient_boosting, target_names=label_encoder.classes_))

# Support Vector Machine
svm_model = LinearSVC(max_iter=500,dual='auto')
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("Support Vector Machine:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:")
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))

# Random Forest
random_forest_model = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
random_forest_model.fit(X_train, y_train)
y_pred_random_forest = random_forest_model.predict(X_test)
print("Random Forest:")
print("Accuracy:", accuracy_score(y_test, y_pred_random_forest))
print("Classification Report:")
print(classification_report(y_test, y_pred_random_forest, target_names=label_encoder.classes_))


Logistic Regression:
Accuracy: 0.7563963794374637
Classification Report:
              precision    recall  f1-score   support

    Gathered       0.77      0.95      0.85    328949
   Recipes1M       0.60      0.22      0.32    117280

    accuracy                           0.76    446229
   macro avg       0.69      0.58      0.59    446229
weighted avg       0.73      0.76      0.71    446229

Naive Bayes:
Accuracy: 0.7471858619677341
Classification Report:
              precision    recall  f1-score   support

    Gathered       0.77      0.93      0.84    328949
   Recipes1M       0.55      0.22      0.32    117280

    accuracy                           0.75    446229
   macro avg       0.66      0.58      0.58    446229
weighted avg       0.71      0.75      0.71    446229

Gradient Boosting:
Accuracy: 0.7508073209047372
Classification Report:
              precision    recall  f1-score   support

    Gathered       0.75      0.99      0.85    328949
   Recipes1M       0.70     