## Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.model_selection import KFold

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import json
import umap

# Ignore future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


## Load Data

In [2]:
# Load json file
def load_json(file_path):
    """Read the JSON file at ``file_path`` with pandas and return the result.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file, passed unchanged to ``pd.read_json``.

    Returns
    -------
    The object produced by ``pd.read_json`` (a DataFrame with pandas'
    default options).
    """
    frame = pd.read_json(file_path)
    return frame

# Read the cleaned recipe file (path is relative to the notebook's folder)
data = load_json('../data/cleaned_ingredients.json')

# Turn the frame into a list of per-row dicts and let json_normalize
# rebuild a flat table from those records
cuisine_df = pd.json_normalize(data.to_dict('records'))

In [3]:
# Display the loaded frame to check its columns and row count
# (the notebook renders a truncated preview for a frame this long).
cuisine_df

Unnamed: 0,id,cuisine,ingredients,cleaned_ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ...","[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b...","[zesty italian dressing, purple onion, broccol..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte...","[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli...","[boneless chicken skinless thigh, minced garli..."


## Process Data

In [4]:
# Flatten every recipe's ingredient list into one space-separated string,
# which is the document format fed to the text vectorizer later on.
cuisine_df['cleaned_ingredients_str'] = cuisine_df['cleaned_ingredients'].apply(' '.join)
cuisine_df


Unnamed: 0,id,cuisine,ingredients,cleaned_ingredients,cleaned_ingredients_str
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olives grape tomatoes ga...
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground pepper salt tomatoes ground...
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs pepper salt mayonaise cooking oil green c...
3,22213,indian,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallots, cornflour, cayenne pe...",black pepper shallots cornflour cayenne pepper...
...,...,...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ...","[light brown sugar, granulated sugar, butter, ...",light brown sugar granulated sugar butter warm...
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b...","[zesty italian dressing, purple onion, broccol...",zesty italian dressing purple onion broccoli f...
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte...","[eggs, citrus fruit, raisins, sourdough starte...",eggs citrus fruit raisins sourdough starter fl...
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli...","[boneless chicken skinless thigh, minced garli...",boneless chicken skinless thigh minced garlic ...


In [5]:
# # Download NLTK data files
# nltk.download('punkt')
# nltk.download('stopwords')

# # Preprocess ingredients (tokenize and remove stop words)
# stop_words = set(stopwords.words('english'))

# def preprocess_ingredients(text):
#     tokens = word_tokenize(text.lower())
#     tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
#     return ' '.join(tokens)

# cuisine_df['cleaned_ingredients_str_2'] = cuisine_df['cleaned_ingredients_str'].apply(preprocess_ingredients)

In [6]:
# Target labels: the cuisine name of each recipe
y = cuisine_df['cuisine']

# Feature matrix: TF-IDF weights computed from the joined ingredient strings.
# The fitted `vectorizer` stays at notebook level because cuisine_recommender
# reuses it to transform new ingredient strings.
# NOTE(review): the vectorizer is fitted on every recipe before the train/test
# split, so its IDF weights also reflect the test recipes. Consider fitting it
# on the training portion only.
ingredient_docs = cuisine_df['cleaned_ingredients_str']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(ingredient_docs)

## Split Data

In [7]:
# Split data into training (80%) and test (20%) sets.
# A fixed random_state makes the shuffle deterministic, so the same recipes
# land in each split on every Restart & Run All and scores stay comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Train Linear SVC

In [8]:
# Train a LinearSVC classifier on the TF-IDF training features.
# random_state seeds the estimator's internal random number generation so
# that re-running this cell fits the same model.
svc = LinearSVC(random_state=42)
svc.fit(X_train, y_train)

In [9]:
# Score the trained classifier on the held-out test split.
svc.score(X_test, y_test)

0.7861722187303583

## Define Predictor

In [10]:
def cuisine_recommender(ingredients, model, text_vectorizer=None):
    """Predict the cuisine for one recipe's ingredients.

    Parameters
    ----------
    ingredients : str or iterable of str
        Either one space-separated ingredient string (for example
        'cumin chicken wheat salt') or a collection of ingredient names.
        A collection is joined with spaces, the same way the
        'cleaned_ingredients_str' training column is built.
    model : object
        Fitted estimator exposing ``predict``.
    text_vectorizer : object, optional
        Fitted vectorizer exposing ``transform``. When omitted, the
        notebook-level ``vectorizer`` is used.

    Returns
    -------
    Whatever ``model.predict`` returns for the single vectorized recipe
    (a one-element array of cuisine labels for the LinearSVC in this notebook).
    """
    if not isinstance(ingredients, str):
        # Mirror the training preprocessing: one space-joined document.
        ingredients = ' '.join(ingredients)
    if text_vectorizer is None:
        # Fall back to the vectorizer fitted earlier in the notebook.
        text_vectorizer = vectorizer
    # transform expects a sequence of documents, so wrap the single recipe.
    ingredient_vector = text_vectorizer.transform([ingredients])
    return model.predict(ingredient_vector)

In [11]:
cuisine_recommender('apple cheese grapes sausage', svc)

array(['italian'], dtype=object)

In [12]:
cuisine_recommender('cumin chicken wheat salt', svc)

array(['mexican'], dtype=object)

## Save Model

In [13]:
import joblib

In [14]:
# Persist the trained LinearSVC to disk with joblib.
# NOTE(review): only the classifier is written here. cuisine_recommender also
# needs the fitted `vectorizer`, which this cell does not save. Confirm it is
# persisted elsewhere, or dump it alongside the model.
model_joblib_file = '../model/cuisine_recommender_SVC.sav'
joblib.dump(svc, model_joblib_file)

['../model/cuisine_recommender_SVC.sav']

In [15]:
# Reload the classifier from the file written above and predict on the test
# split to confirm the persisted copy is usable.
loaded_model = joblib.load(model_joblib_file)

y_preds = loaded_model.predict(X_test)
y_preds

array(['chinese', 'filipino', 'italian', ..., 'indian', 'french',
       'mexican'], dtype=object)

In [16]:
# Score the reloaded model on the same test split; the value should equal the
# score of the in-memory `svc` if the save/load round trip was lossless.
score = loaded_model.score(X_test, y_test)
print(f'Model Score: {score}')

Model Score: 0.7861722187303583
