## Import libraries

In [1]:
import pandas as pd
import numpy as np
import joblib
import ast
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from itertools import chain
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# ingnore future warnings
import warnings
warnings.filterwarnings('ignore')

## Load data

In [2]:
# Load data
cuisine_df = pd.read_csv('../data/RAW_recipes_cleaned.csv')
cuisine_df.head()

Unnamed: 0,id,name,Cuisine_Tags_str,replaced_ingredients_str,steps_str,tags_str,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,Cuisine_Tags,replaced_ingredients
0,137739,arriba baked winter squash mexican style,mexican,winter squash mexican seasoning mixed spice ho...,make a choice and proceed with recipe dependin...,60-minutes-or-less time-to-make course main-in...,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,['mexican'],"['winter squash', 'mexican seasoning', 'mixed ..."
1,31490,a bit different breakfast pizza,northeastern-united-states,pizza crust sausage egg milk salt and pepper c...,preheat oven to 425 degrees f press dough into...,30-minutes-or-less time-to-make course main-in...,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,['northeastern-united-states'],"['pizza crust', 'sausage', 'egg', 'milk', 'sal..."
2,44061,amish tomato ketchup for canning,northeastern-united-states,tomato juice apple cider vinegar sugar salt pe...,"mix all ingredients& boil for 2 1 / 2 hours , ...",weeknight time-to-make course main-ingredient ...,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,['northeastern-united-states'],"['tomato juice', 'apple cider vinegar', 'sugar..."
3,25274,aww marinated olives,canadian,fennel seed green olive ripe olive garlic pepp...,toast the fennel seeds and lightly crush them ...,15-minutes-or-less time-to-make course main-in...,15,21730,2002-04-14,"['15-minutes-or-less', 'time-to-make', 'course...","[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]",4,['toast the fennel seeds and lightly crush the...,my italian mil was thoroughly impressed by my ...,"['fennel seed', 'green olive', 'ripe olive', '...",9,['canadian'],"['fennel seed', 'green olive', 'ripe olive', '..."
4,43026,chile rellenos,southwestern-united-states,egg roll wrap whole green chili cheese cornsta...,drain green chiles sprinkle cornstarch on shee...,60-minutes-or-less time-to-make course main-in...,45,52268,2002-10-14,"['60-minutes-or-less', 'time-to-make', 'course...","[94.0, 10.0, 0.0, 11.0, 11.0, 21.0, 0.0]",9,"['drain green chiles', 'sprinkle cornstarch on...",a favorite from a local restaurant no longer i...,"['egg roll wrap', 'whole green chili', 'cheese...",5,['southwestern-united-states'],"['egg roll wrap', 'whole green chili', 'cheese..."


## Data preprocessing

In [5]:
# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cuisine_df['replaced_ingredients_str'])
y = cuisine_df['Cuisine_Tags_str']

In [6]:
# Save the TF-IDF Vectorizer
vectorizer_joblib_file = '../model/tfidf_vectorizer_0529.sav'
joblib.dump(vectorizer, vectorizer_joblib_file)
print(f"TF-IDF vectorizer saved to {vectorizer_joblib_file}")

TF-IDF vectorizer saved to ../model/tfidf_vectorizer_0529.sav


## Model Training

In [8]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# LinearSVC Prediction
svc = LinearSVC()
svc.fit(X_train, y_train)

In [10]:
svc.score(X_test, y_test)

0.5566468253968254

In [11]:
# # Hyperparameter tuning and cross-validation but the result is not better than the above model
# # Define parameter grid
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'max_iter': [1000, 5000, 10000, 20000]
# }

# # Perform hyperparameter tuning using GridSearchCV
# grid_search = GridSearchCV(LinearSVC(), param_grid, cv=5)
# grid_search.fit(X_train, y_train)

# # Output best parameters
# print("Best parameters: ", grid_search.best_params_)

# # Use the best model to make predictions
# best_model = grid_search.best_estimator_
# print("Test accuracy: ", best_model.score(X_test, y_test))

# # Evaluate the model using cross-validation
# cv_scores = cross_val_score(best_model, X, y, cv=5)
# print("Cross-validation scores: ", cv_scores)
# print("Mean cross-validation score: ", np.mean(cv_scores))

In [12]:
y_pred = svc.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.5566468253968254
Classification Report:
                            precision    recall  f1-score   support

                australian       0.52      0.50      0.51       452
                  austrian       0.00      0.00      0.00        26
                   belgian       0.00      0.00      0.00        19
                 brazilian       0.41      0.14      0.21        49
                  canadian       0.37      0.35      0.36       810
                   chilean       0.50      0.06      0.10        18
                   chinese       0.61      0.82      0.70       330
                 colombian       0.50      0.11      0.18         9
                     cuban       0.43      0.13      0.20        46
                     czech       0.00      0.00      0.00        18
                    danish       0.20      0.05      0.08        43
                     dutch       0.33      0.06      0.11        48
                  egyptian       0.36      0.16      0.22      

## Test the model

In [13]:
# Prediction function
def cuisine_recommender(ingredients, model):
    store = []
    store.append(ingredients)
    ingredient_vector = vectorizer.transform(store)
    return model.predict(ingredient_vector)

# Test the recommendation system
print(cuisine_recommender('apple cheese grapes sausage', svc))
print(cuisine_recommender('cumin chicken wheat salt', svc))

['italian']
['moroccan']


## Save the model

In [14]:
model_joblib_file = '../model/classification_model_SVC_0529.sav'
joblib.dump(svc, model_joblib_file)

['../model/classification_model_SVC_0529.sav']

In [15]:
loaded_model = joblib.load(model_joblib_file)

y_preds = loaded_model.predict(X_test)
y_preds

array(['mexican', 'italian', 'italian', ..., 'greek',
       'southern-united-states', 'moroccan'], dtype=object)

In [16]:
score = loaded_model.score(X_test, y_test)
print(f'Model Score: {score}')

Model Score: 0.5566468253968254
