Note: Stage 2, which follows Candidate Generation, is Scoring. This step uses a neural network to score the candidate recipes and keep the top 50.

## **Import Library**

In [1]:
import gensim.downloader as api
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.utils import simple_preprocess
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

In [2]:
# Load the candidate recipe table that this scoring stage works on.
candidates_csv = './dataset/final_candidates.csv'
df = pd.read_csv(candidates_csv)

In [3]:
# Preview the first two rows to sanity-check the loaded columns.
df.head(2)

Unnamed: 0,RecipeId,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,NameClean,RecipeIngredientPartsClean,RecipeInstructionsClean,Combined,cosine_similarity
0,72665,436.4,8.8,1.6,268.0,985.1,12.2,1.2,7.9,74.3,ginger and chilli baked fish,lime fresh ginger fish saue fresh ginger garli...,preheat oven hot 220 degree c make 4 deep slit...,lime fresh ginger fish saue fresh ginger garli...,0.788153
1,512673,179.7,9.7,2.6,82.2,815.2,0.8,0.1,0.2,21.4,fresh fillet of sole in a snap,fillet sole garli salt pepper butter olive oil...,spread garli side fish sprinkle salt pepper la...,fillet sole garli salt pepper butter olive oil...,0.729026


In [4]:
# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    165 non-null    int64  
 1   Calories                    165 non-null    float64
 2   FatContent                  165 non-null    float64
 3   SaturatedFatContent         165 non-null    float64
 4   CholesterolContent          165 non-null    float64
 5   SodiumContent               165 non-null    float64
 6   CarbohydrateContent         165 non-null    float64
 7   FiberContent                165 non-null    float64
 8   SugarContent                165 non-null    float64
 9   ProteinContent              165 non-null    float64
 10  NameClean                   165 non-null    object 
 11  RecipeIngredientPartsClean  165 non-null    object 
 12  RecipeInstructionsClean     165 non-null    object 
 13  Combined                    165 non

# **Tokenizer**

In [5]:
# Tokenize the combined recipe text: each entry of `Combined` is passed through
# gensim's `simple_preprocess`, which yields a list of lowercase tokens with
# punctuation and special characters removed.
df['CombinedTokenized'] = df['Combined'].map(simple_preprocess)

In [6]:
# Load the pre-trained Word2Vec model (Google News, 300-dimensional vectors)
# using gensim's downloader API.
# NOTE: the name `model` is rebound to the Keras network further down in this
# notebook, so the embedding cells must be run before the neural-network cells.
model = api.load('word2vec-google-news-300')

# **Compute Average Word2Vec Embedding**

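This function computes the average Word2Vec embedding for a list of words. It initializes a zero vector, then iterates through the words and accumulates the vectors of those found in the model's vocabulary; words that are not in the vocabulary are skipped. The result is the average vector of the matched words, or the zero vector if none of the words are in the vocabulary.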

In [7]:
# Function to average word vectors for a text
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary words of a text.

    Args:
        words: Iterable of tokens.
        model: Mapping-like object; ``model[word]`` yields that word's vector.
        vocabulary: Container used to test whether a word has a vector.
        num_features: Length of the embedding vectors.

    Returns:
        A float64 array of length ``num_features``: the element-wise mean of
        the vectors of all words found in ``vocabulary``, or all zeros when
        none of the words are found.
    """
    running_sum = np.zeros((num_features,), dtype="float64")

    # Out-of-vocabulary tokens contribute nothing to the average.
    known_words = [token for token in words if token in vocabulary]
    for token in known_words:
        running_sum = running_sum + model[token]

    # Nothing matched: hand back the zero vector rather than dividing by zero.
    if not known_words:
        return running_sum

    return running_sum / float(len(known_words))

Compute the average Word2Vec embeddings for each recipe's tokenized content.

In [8]:
# Compute average word vectors for all recipes.
# `key_to_index` is the model's own key -> index mapping, so membership tests
# are constant-time without first copying every vocabulary key into a new set.
vocabulary = model.key_to_index
# The embedding width is read from the model instead of hard-coding 300.
df['Embedding'] = df['CombinedTokenized'].apply(
    lambda tokens: average_word_vectors(tokens, model, vocabulary, model.vector_size)
)

# **Split Embeddings into Separate Columns**

Split the 300-dimensional embeddings into separate columns to integrate with other features in the dataframe.

In [9]:
# Stack the per-recipe vectors into one 2-D array (one row per recipe) and
# expose each of the 300 dimensions as its own `emb_<i>` column.
embedding_columns = [f'emb_{i}' for i in range(300)]
embeddings = np.vstack(df['Embedding'].tolist())
embedding_df = pd.DataFrame(embeddings, columns=embedding_columns)

In [10]:
# Combine embeddings with the original data.
# The recipe frame gets a fresh 0..n-1 index so its rows line up positionally
# with the rows of `embedding_df`.
recipes_reindexed = df.reset_index(drop=True)
feature_matrix = pd.concat([recipes_reindexed, embedding_df], axis=1)

# **Add User-Specific Features**

Add dummy user-specific features, such as age and dietary preference, to the feature matrix.

In [11]:
# Dummy user-specific features (e.g., age, dietary preferences).
# They are drawn from a locally seeded generator so the placeholder values are
# identical on every run, without touching NumPy's global random state (which
# is only seeded later, when the dummy labels are generated).
user_rng = np.random.default_rng(42)
n_rows = len(feature_matrix)
user_features = pd.DataFrame({
    'age': user_rng.integers(18, 60, size=n_rows),  # upper bound is exclusive: ages 18..59
    'dietary_preference': user_rng.choice([0, 1, 2], size=n_rows)  # Encoding dietary preference as integers
})

In [12]:
# Append the user features column-wise. Any user-feature columns that are
# already present are dropped first, so re-running this cell does not leave
# duplicate `age` / `dietary_preference` columns in the feature matrix.
feature_matrix = pd.concat(
    [feature_matrix.drop(columns=user_features.columns, errors='ignore'), user_features],
    axis=1,
)

# **Generate Dummy Labels**

Generate dummy relevance labels for training. In a real-world scenario, these labels would come from historical user interaction data.

In [13]:
# Placeholder relevance labels: 1 = relevant, 0 = not relevant.
# Real labels should be derived from historical user interaction data.
np.random.seed(42)
n_candidates = len(feature_matrix)
labels = np.random.randint(0, 2, size=n_candidates)

In [14]:
# Identifier, raw-text and intermediate columns are not model inputs; every
# remaining column is used as a feature.
non_feature_columns = [
    'RecipeId',
    'NameClean',
    'RecipeIngredientPartsClean',
    'RecipeInstructionsClean',
    'Combined',
    'Embedding',
    'CombinedTokenized',
]
X = feature_matrix.drop(columns=non_feature_columns)

In [15]:
# Hold out 20% of the candidates for testing; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Standardize the features. The scaler is fitted on the training rows only
# and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# **Define and Train Neural Network**

In [17]:
# Seed Python, NumPy and TensorFlow before the layers are created so that
# weight initialisation, dropout masks and batch shuffling are repeatable;
# without this the relevance scores (and the final ranking) differ between runs.
set_random_seed(42)

# Define a simple neural network model: two ReLU hidden layers with dropout,
# followed by a single sigmoid unit that outputs a relevance probability.
# NOTE: this rebinds `model`, which referred to the Word2Vec vectors until now.
model = Sequential([
    Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [18]:
# Compile the model.
# `learning_rate` is the supported keyword for the optimizer's step size; the
# older `lr` alias is deprecated and is rejected by recent Keras releases.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model, holding out 10% of the training rows for validation.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a54fc77d2a0>

In [19]:
# Score the held-out rows, turn the sigmoid outputs into 0/1 predictions at a
# 0.5 cut-off, and report the resulting accuracy.
decision_threshold = 0.5
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > decision_threshold).astype(int)
test_accuracy = accuracy_score(y_test, y_pred_binary)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.5757575757575758


In [22]:
# Score every candidate recipe (training and test rows alike) with the trained
# network, reusing the scaler that was fitted on the training split.
all_features_scaled = scaler.transform(X)
relevance_scores = model.predict(all_features_scaled)
df['RelevanceScore'] = relevance_scores



In [25]:
# Rank the candidates from highest to lowest predicted relevance.
sorted_df = df.sort_values('RelevanceScore', ascending=False)

# Keep only the best-scoring row for each RecipeId.
unique_sorted_df = sorted_df.drop_duplicates(subset='RecipeId', keep='first')

# Select the 50 highest-ranked recipes.
top_50_recipes = unique_sorted_df.iloc[:50]

# Preview the first five of the selected recipes.
preview_columns = ['RecipeId', 'NameClean', 'RelevanceScore']
top_50_recipes[preview_columns].head(5)

Unnamed: 0,RecipeId,NameClean,RelevanceScore
44,493208,broiled flounder,0.920063
91,40263,spiy garli beef,0.885103
120,29049,steamed fish chinese style,0.868919
65,178948,simple beef stew for two,0.867998
70,407257,slowcooker beef taos,0.865091


# **Export Data Top 50 Recipes**

In [28]:
# Persist the selected recipes as CSV, without writing the DataFrame index.
top_50_csv = './dataset/top_50_recipes.csv'
top_50_recipes.to_csv(top_50_csv, index=False)