## Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import warnings
from sklearn.exceptions import ConvergenceWarning

# Hide FutureWarning messages for the rest of the kernel session so library
# deprecation notices do not clutter cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Also hide scikit-learn's ConvergenceWarning.
# NOTE(review): with this filter in place a solver run that stops at max_iter
# without converging is no longer reported anywhere in the output -- confirm
# that silencing it is intended rather than raising max_iter.
warnings.filterwarnings('ignore', category=ConvergenceWarning)


## Load Data

In [2]:
# JSON loading helper
def load_json(file_path):
    """Read the JSON document at ``file_path`` and return it as a DataFrame.

    The path is handed straight to ``pandas.read_json``; no extra options
    are applied.
    """
    frame = pd.read_json(file_path)
    return frame

# Read the cleaned recipe dataset from disk
data = load_json('../data/cleaned_ingredients.json')

# Round-trip the frame through a list of per-row dicts so json_normalize
# builds the working DataFrame used by the rest of the notebook
cuisine_df = pd.json_normalize(data.to_dict('records'))

In [3]:
# Show the loaded frame as the cell output (the rendering elides the middle rows).
cuisine_df

Unnamed: 0,id,cuisine,ingredients,cleaned_ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]","[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[black pepper, shallots, cornflour, cayenne pe..."
...,...,...,...,...
39769,29109,irish,"[light brown sugar, granulated sugar, butter, ...","[light brown sugar, granulated sugar, butter, ..."
39770,11462,italian,"[KRAFT Zesty Italian Dressing, purple onion, b...","[zesty italian dressing, purple onion, broccol..."
39771,2238,irish,"[eggs, citrus fruit, raisins, sourdough starte...","[eggs, citrus fruit, raisins, sourdough starte..."
39772,41882,chinese,"[boneless chicken skinless thigh, minced garli...","[boneless chicken skinless thigh, minced garli..."


## Process Data

In [4]:
# Combine each recipe's list of cleaned ingredients into a single
# space-separated string so it can be fed to a text vectorizer.
cuisine_df['cleaned_ingredients_str'] = cuisine_df['cleaned_ingredients'].apply(' '.join)

# Convert text data to TF-IDF features.
# `vectorizer` is kept at notebook level because the prediction helper
# further down reuses the fitted instance to transform new input.
# NOTE(review): the vectorizer is fitted on every row *before* the train/test
# split, so its vocabulary and IDF weights also reflect the held-out recipes.
# Fitting on the training rows only (e.g. TfidfVectorizer + LinearSVC inside a
# Pipeline) would remove that leakage; it needs the split to happen first.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cuisine_df['cleaned_ingredients_str'])
y = cuisine_df['cuisine']

## Split Data

In [5]:
# Split data into training (80%) and test (20%) sets.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split (and therefore the same scores); stratify=y keeps the share of
# each cuisine the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## Model Training with Hyperparameter Tuning & Cross-Validation

In [6]:
# Define parameter grid: regularisation strength C and the solver's
# iteration cap are searched jointly (4 x 4 = 16 candidates).
param_grid = {
    'C': [0.1, 1, 10, 100],
    'max_iter': [1000, 5000, 10000, 20000]
}

# Perform hyperparameter tuning using GridSearchCV (5-fold CV on the training
# split only). random_state is fixed so repeated runs of the search are
# comparable; it only matters when LinearSVC's dual solver shuffles the data.
grid_search = GridSearchCV(LinearSVC(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Output best parameters
print("Best parameters: ", grid_search.best_params_)

# best_estimator_ is the winning configuration refit on all of X_train
# (GridSearchCV's default refit=True); score it once on the held-out split.
best_model = grid_search.best_estimator_
print("Test accuracy: ", best_model.score(X_test, y_test))

# Evaluate the model using cross-validation. cross_val_score fits clones of
# the estimator, so best_model itself is left as trained above.
# NOTE(review): this runs on the full X, y -- including the rows held out as
# the test set -- with hyperparameters chosen on the training split, so treat
# it as a stability check rather than an independent estimate.
cv_scores = cross_val_score(best_model, X, y, cv=5)
print("Cross-validation scores: ", cv_scores)
print("Mean cross-validation score: ", np.mean(cv_scores))

Best parameters:  {'C': 1, 'max_iter': 1000}
Test accuracy:  0.7849151477058454
Cross-validation scores:  [0.78629793 0.79522313 0.78780641 0.78378378 0.79280865]
Mean cross-validation score:  0.7891839801043303


## Define Predictor

In [7]:
# Prediction function
def cuisine_recommender(ingredients, model, text_vectorizer=None):
    """Predict the cuisine for a single recipe's ingredients.

    Parameters
    ----------
    ingredients : str or iterable of str
        Either one whitespace-separated string of ingredients, or a
        list/tuple of ingredient names. An iterable is joined with single
        spaces, the same way the training column was built.
    model : object
        Fitted estimator exposing ``predict``.
    text_vectorizer : object, optional
        Fitted vectorizer exposing ``transform``. When omitted, the
        notebook-level ``vectorizer`` is used, so existing two-argument
        calls behave exactly as before.

    Returns
    -------
    Whatever ``model.predict`` returns for a one-document batch.
    """
    # Accept a list of ingredient names as well as a pre-joined string.
    if not isinstance(ingredients, str):
        ingredients = ' '.join(ingredients)

    # Fall back to the vectorizer fitted earlier in the notebook.
    if text_vectorizer is None:
        text_vectorizer = vectorizer

    # transform() expects an iterable of documents, hence the one-item list.
    ingredient_vector = text_vectorizer.transform([ingredients])
    return model.predict(ingredient_vector)

In [8]:
# Test the recommendation system: pass one free-text ingredient string to the
# tuned model and print the predicted cuisine label.
print(cuisine_recommender('apple cheese grapes sausage', best_model))

['italian']


In [9]:
# Second smoke test of the predictor with a different ingredient string.
print(cuisine_recommender('cumin chicken wheat salt', best_model))

['mexican']


## Save Model

In [10]:
import joblib

In [11]:
# Persist the fitted TF-IDF vectorizer next to the classifier. The classifier
# only accepts vectorized input, so a fresh session needs both artifacts to
# score raw ingredient text.
vectorizer_joblib_file = '../model/cuisine_recommender_tfidf_vectorizer.sav'
joblib.dump(vectorizer, vectorizer_joblib_file)

# Persist the tuned classifier. This stays the last expression so the cell
# still displays the list of written model file(s).
model_joblib_file = '../model/cuisine_recommender_SVC_hycv.sav'
joblib.dump(best_model, model_joblib_file)

['../model/cuisine_recommender_SVC_hycv.sav']

In [12]:
# Round-trip check: read the persisted classifier back from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files you produced or otherwise trust.
loaded_model = joblib.load(model_joblib_file)

# Predict on the held-out features with the reloaded model and display the labels.
y_preds = loaded_model.predict(X_test)
y_preds

array(['southern_us', 'italian', 'cajun_creole', ..., 'mexican', 'french',
       'french'], dtype=object)

In [13]:
# Score the reloaded model on the held-out split; this should equal the
# "Test accuracy" printed after the grid search if the round trip is lossless.
score = loaded_model.score(X_test, y_test)
print(f'Model Score: {score}')

Model Score: 0.7849151477058454
