## Necessary Libraries

In [20]:
import pandas as pd
import numpy as np
import joblib
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.model_selection import KFold

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

## Load Data

In [3]:
# Load the recipes CSV (expected in the working directory) into a DataFrame.
cuisine_df = pd.read_csv('RAW_recipes_with_one_cuisine.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
cuisine_df.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,Cuisine_Tags
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,['mexican']
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,['northeastern-united-states']
2,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,['northeastern-united-states']
3,aww marinated olives,25274,15,21730,2002-04-14,"['15-minutes-or-less', 'time-to-make', 'course...","[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]",4,['toast the fennel seeds and lightly crush the...,my italian mil was thoroughly impressed by my ...,"['fennel seed', 'green olive', 'ripe olive', '...",9,['canadian']
4,chile rellenos,43026,45,52268,2002-10-14,"['60-minutes-or-less', 'time-to-make', 'course...","[94.0, 10.0, 0.0, 11.0, 11.0, 21.0, 0.0]",9,"['drain green chiles', 'sprinkle cornstarch on...",a favorite from a local restaurant no longer i...,"['egg roll wrap', 'whole green chili', 'cheese...",5,['southwestern-united-states']


In [5]:
import ast

def convert_to_list(tags_str):
    """Parse a stringified Python list (e.g. "['a', 'b']") into a real list.

    Parameters
    ----------
    tags_str : str or list
        Cell value to convert. A value that is already a list (or tuple) is
        returned as a list, so re-running the conversion cell on an
        already-converted column does not silently replace every row
        with ``[]``.

    Returns
    -------
    list
        The parsed list, or an empty list when the value is missing, is not
        a valid Python literal, or parses to something other than a
        list/tuple.
    """
    # Already converted: pass through instead of letting literal_eval reject it.
    if isinstance(tags_str, (list, tuple)):
        return list(tags_str)
    try:
        parsed = ast.literal_eval(tags_str)
    except (ValueError, SyntaxError, TypeError):
        return []
    # Callers join the result with ' '.join, so only sequences are acceptable.
    return list(parsed) if isinstance(parsed, (list, tuple)) else []

In [6]:
# Turn the stringified list columns into real Python lists.
for list_col in ('ingredients', 'Cuisine_Tags'):
    cuisine_df[list_col] = cuisine_df[list_col].apply(convert_to_list)

In [7]:
# Flatten each list column into one space-separated string per recipe
cuisine_df['ingredients_str'] = cuisine_df['ingredients'].apply(' '.join)
cuisine_df['Cuisine_Tags_str'] = cuisine_df['Cuisine_Tags'].apply(' '.join)

In [8]:
# Spot-check the joined ingredient text for the row labelled 10.
cuisine_df['ingredients_str'][10]

'fettuccine pasta cheddar cheese soup milk picante sauce black olive jalapeno'

## Process Data

In [9]:
# Features: joined ingredient text; target: joined cuisine tag string.
X, y = cuisine_df['ingredients_str'], cuisine_df['Cuisine_Tags_str']

In [10]:
# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=614,
)

In [79]:
# LinearSVC needs numeric features: fitting it directly on the raw ingredient
# strings raises a ValueError, so vectorize the text with TF-IDF first.
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])
model.fit(X_train, y_train)

In [123]:
# Text classifier: TF-IDF features feeding a linear SVM.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [124]:
# Fit the TF-IDF + LinearSVC pipeline on the training split.
pipeline.fit(X_train, y_train)

In [125]:
# Predict the held-out rows; the accuracy is the cell's displayed value.
y_predict = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_predict)

0.5635912698412698

In [126]:
# Persist the fitted pipeline (vectorizer + classifier) to
# 'cuisine_prediction.joblib'.
joblib.dump(pipeline, 'cuisine_prediction.joblib')

['cuisine_prediction.joblib']

## Deciding on Minimal Clustering of Cuisines

In [13]:
# Load the cuisine clusters stored in cuisine_clusters30.json
with open('cuisine_clusters30.json') as cluster_file:
    clusters30 = json.load(cluster_file)

In [14]:
# Invert {cluster: [cuisines]} into a cuisine -> cluster lookup
cuisine_to_cluster30 = {
    member: label
    for label, members in clusters30.items()
    for member in members
}

In [15]:
# Function to map a recipe's cuisine tag list to its cluster label
def get_cluster_number(cuisine_tags):
    """Return the cluster label for a recipe's cuisine tag list.

    Only the first tag is looked up in ``cuisine_to_cluster30``.

    Returns None when the list is empty (``convert_to_list`` yields ``[]``
    for rows it cannot parse) or when the tag is absent from the mapping.
    """
    if not cuisine_tags:
        # Avoid IndexError on rows whose tags failed to parse.
        return None
    return cuisine_to_cluster30.get(cuisine_tags[0])

In [16]:
# Label every recipe with its cluster via get_cluster_number
cuisine_df['Clusters30'] = cuisine_df['Cuisine_Tags'].map(get_cluster_number)

In [17]:
# Features: joined ingredient text; target: the Clusters30 label.
X, y = cuisine_df['ingredients_str'], cuisine_df['Clusters30']

In [21]:
# Fresh, unfitted TF-IDF + linear SVM pipeline for the Clusters30 target.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [22]:
# Hold out 20% of the rows for evaluation (same seed as the other splits)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=614,
)

In [23]:
# Fit the TF-IDF + LinearSVC pipeline on the training split.
pipeline.fit(X_train, y_train)

In [24]:
# Predict the held-out rows; the accuracy is the cell's displayed value.
y_predict = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_predict)

0.7228174603174603

In [25]:
# Persist the fitted pipeline to 'clusters_30.joblib'.
joblib.dump(pipeline, 'clusters_30.joblib')

['clusters_30.joblib']

In [29]:
# Load the cuisine clusters stored in cuisine_clusters20.json
with open('cuisine_clusters20.json') as cluster_file:
    clusters20 = json.load(cluster_file)

In [30]:
# Invert {cluster: [cuisines]} into a cuisine -> cluster lookup
cuisine_to_cluster20 = {
    member: label
    for label, members in clusters20.items()
    for member in members
}

In [31]:
# Function to map a recipe's cuisine tag list to its cluster label
def get_cluster_number(cuisine_tags):
    """Return the cluster label for a recipe's cuisine tag list.

    Only the first tag is looked up in ``cuisine_to_cluster20``.

    Returns None when the list is empty (``convert_to_list`` yields ``[]``
    for rows it cannot parse) or when the tag is absent from the mapping.
    """
    if not cuisine_tags:
        # Avoid IndexError on rows whose tags failed to parse.
        return None
    return cuisine_to_cluster20.get(cuisine_tags[0])

In [32]:
# Label every recipe with its cluster via get_cluster_number
cuisine_df['Clusters20'] = cuisine_df['Cuisine_Tags'].map(get_cluster_number)

In [33]:
# Features: joined ingredient text; target: the Clusters20 label.
X, y = cuisine_df['ingredients_str'], cuisine_df['Clusters20']

In [34]:
# Fresh, unfitted TF-IDF + linear SVM pipeline for the Clusters20 target.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [35]:
# Hold out 20% of the rows for evaluation (same seed as the other splits)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=614,
)

In [36]:
# Fit the TF-IDF + LinearSVC pipeline on the training split.
pipeline.fit(X_train, y_train)

In [37]:
# Predict the held-out rows; the accuracy is the cell's displayed value.
y_predict = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_predict)

0.7644841269841269

In [38]:
# Persist the fitted pipeline to 'clusters_20.joblib'.
joblib.dump(pipeline, 'clusters_20.joblib')

['clusters_20.joblib']

In [41]:
# Load the cuisine clusters stored in cuisine_clusters15.json
with open('cuisine_clusters15.json') as cluster_file:
    clusters15 = json.load(cluster_file)

In [42]:
# Invert {cluster: [cuisines]} into a cuisine -> cluster lookup
cuisine_to_cluster15 = {
    member: label
    for label, members in clusters15.items()
    for member in members
}

In [43]:
# Function to map a recipe's cuisine tag list to its cluster label
def get_cluster_number(cuisine_tags):
    """Return the cluster label for a recipe's cuisine tag list.

    Only the first tag is looked up in ``cuisine_to_cluster15``.

    Returns None when the list is empty (``convert_to_list`` yields ``[]``
    for rows it cannot parse) or when the tag is absent from the mapping.
    """
    if not cuisine_tags:
        # Avoid IndexError on rows whose tags failed to parse.
        return None
    return cuisine_to_cluster15.get(cuisine_tags[0])

In [44]:
# Label every recipe with its cluster via get_cluster_number
cuisine_df['Clusters15'] = cuisine_df['Cuisine_Tags'].map(get_cluster_number)

In [45]:
# Features: joined ingredient text; target: the Clusters15 label.
X, y = cuisine_df['ingredients_str'], cuisine_df['Clusters15']

In [46]:
# Fresh, unfitted TF-IDF + linear SVM pipeline for the Clusters15 target.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [47]:
# Hold out 20% of the rows for evaluation (same seed as the other splits)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=614,
)

In [48]:
# Fit the TF-IDF + LinearSVC pipeline on the training split.
pipeline.fit(X_train, y_train)

In [49]:
# Predict the held-out rows; the accuracy is the cell's displayed value.
y_predict = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_predict)

0.7883928571428571

In [50]:
# Persist the fitted pipeline to 'clusters_15.joblib'.
joblib.dump(pipeline, 'clusters_15.joblib')

['clusters_15.joblib']

In [53]:
# Load the cuisine clusters stored in cuisine_clusters10.json
with open('cuisine_clusters10.json') as cluster_file:
    clusters10 = json.load(cluster_file)

In [54]:
# Invert {cluster: [cuisines]} into a cuisine -> cluster lookup
cuisine_to_cluster10 = {
    member: label
    for label, members in clusters10.items()
    for member in members
}

In [55]:
# Function to map a recipe's cuisine tag list to its cluster label
def get_cluster_number(cuisine_tags):
    """Return the cluster label for a recipe's cuisine tag list.

    Only the first tag is looked up in ``cuisine_to_cluster10``.

    Returns None when the list is empty (``convert_to_list`` yields ``[]``
    for rows it cannot parse) or when the tag is absent from the mapping.
    """
    if not cuisine_tags:
        # Avoid IndexError on rows whose tags failed to parse.
        return None
    return cuisine_to_cluster10.get(cuisine_tags[0])

In [56]:
# Label every recipe with its cluster via get_cluster_number
cuisine_df['Clusters10'] = cuisine_df['Cuisine_Tags'].map(get_cluster_number)

In [57]:
# Features: joined ingredient text; target: the Clusters10 label.
X, y = cuisine_df['ingredients_str'], cuisine_df['Clusters10']

In [58]:
# Fresh, unfitted TF-IDF + linear SVM pipeline for the Clusters10 target.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [59]:
# Hold out 20% of the rows for evaluation (same seed as the other splits)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=614,
)

In [60]:
# Fit the TF-IDF + LinearSVC pipeline on the training split.
pipeline.fit(X_train, y_train)

In [61]:
# Predict the held-out rows; the accuracy is the cell's displayed value.
y_predict = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_predict)

0.8198412698412698

In [62]:
# Persist the fitted pipeline to 'clusters_10.joblib'.
joblib.dump(pipeline, 'clusters_10.joblib')

['clusters_10.joblib']

In [63]:
# Load the cuisine clusters stored in cuisine_clusters5.json
with open('cuisine_clusters5.json') as cluster_file:
    clusters5 = json.load(cluster_file)

In [64]:
# Invert {cluster: [cuisines]} into a cuisine -> cluster lookup
cuisine_to_cluster5 = {
    member: label
    for label, members in clusters5.items()
    for member in members
}

In [65]:
# Function to map a recipe's cuisine tag list to its cluster label
def get_cluster_number(cuisine_tags):
    """Return the cluster label for a recipe's cuisine tag list.

    Only the first tag is looked up in ``cuisine_to_cluster5``.

    Returns None when the list is empty (``convert_to_list`` yields ``[]``
    for rows it cannot parse) or when the tag is absent from the mapping.
    """
    if not cuisine_tags:
        # Avoid IndexError on rows whose tags failed to parse.
        return None
    return cuisine_to_cluster5.get(cuisine_tags[0])

In [66]:
# Label every recipe with its cluster via get_cluster_number
cuisine_df['Clusters5'] = cuisine_df['Cuisine_Tags'].map(get_cluster_number)

In [67]:
# Features: joined ingredient text; target: the Clusters5 label.
X, y = cuisine_df['ingredients_str'], cuisine_df['Clusters5']

In [68]:
# Fresh, unfitted TF-IDF + linear SVM pipeline for the Clusters5 target.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [69]:
# Hold out 20% of the rows for evaluation (same seed as the other splits)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=614,
)

In [70]:
# Fit the TF-IDF + LinearSVC pipeline on the training split.
pipeline.fit(X_train, y_train)

In [71]:
# Predict the held-out rows; the accuracy is the cell's displayed value.
y_predict = pipeline.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_predict)

0.8426587301587302

In [72]:
# Persist the fitted pipeline to 'clusters_5.joblib'.
joblib.dump(pipeline, 'clusters_5.joblib')

['clusters_5.joblib']