In [3]:
import pandas as pd
import re
from collections import Counter
import random

# Load the CSV file into a DataFrame
file_path = r"C:\Users\SuZamii\Desktop\c441\Recipe-Gen final-project\data\RAW_recipes.csv"
recipes_df = pd.read_csv(file_path)


In [4]:
# Get the number of rows and columns
num_rows, num_columns = recipes_df.shape
print(f"The dataset contains {num_rows} rows and {num_columns} columns.")


The dataset contains 231637 rows and 12 columns.


In [5]:
# Define a list of common ingredients
common_ingredients = ['tomato', 'egg', 'potato', 'onion', 'garlic', 'butter', 'milk', 'cheese', 'flour', 'sugar']

# Convert ingredients to lowercase and check for common ingredients
filtered_df = recipes_df[recipes_df['ingredients'].str.lower().apply(lambda x: any(ing in x for ing in common_ingredients))]
# Check the number of rows in the filtered dataset
print(f"The filtered dataset contains {filtered_df.shape[0]} rows and {filtered_df.shape[1]} columns.")

The filtered dataset contains 212401 rows and 12 columns.


In [6]:
# Filter for recipes with 1 <= minutes < 200 and fewer than 15 ingredients
filtered_df = filtered_df[(filtered_df['n_ingredients'] < 15) & (filtered_df['minutes'] > 1) & (filtered_df['n_steps'] > 1) & (filtered_df['minutes'] < 200)]

# Check the number of rows in the filtered dataset
print(f"The filtered dataset contains {filtered_df.shape[0]} rows and {filtered_df.shape[1]} columns.")


The filtered dataset contains 178935 rows and 12 columns.


In [7]:
# Take a random sample of 100,000 recipes from the filtered dataset
filtered_df = filtered_df.sample(100000, random_state=1)

# Check the final dataset size
print(f"The final sampled dataset contains {filtered_df.shape[0]} rows and {filtered_df.shape[1]} columns.")

The final sampled dataset contains 100000 rows and 12 columns.


In [8]:

# Drop the specified columns from the DataFrame
filtered_df = filtered_df.drop(columns=['id', 'contributor_id', 'submitted', 'tags', 'nutrition'])

# Check the remaining columns to confirm
print("Remaining columns:", filtered_df.columns)


Remaining columns: Index(['name', 'minutes', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')


In [9]:
print(f"The final sampled dataset contains {filtered_df.shape[0]} rows and {filtered_df.shape[1]} columns.")


The final sampled dataset contains 100000 rows and 7 columns.


In [10]:
# Save the filtered DataFrame to a CSV file
filtered_df.to_csv("filtered_recipes.csv", index=False)


In [11]:
# Generate a list of unique random IDs between 1 and 100,000
num_rows = filtered_df.shape[0]
random_ids = random.sample(range(1, 100001), num_rows)

# Assign the random IDs to a new 'id' column
filtered_df['id'] = random_ids

# Move 'id' to the first column
cols = ['id'] + [col for col in filtered_df.columns if col != 'id']
filtered_df = filtered_df[cols]

# Check the first few rows to confirm 'id' is the first column
# print(filtered_df.head())


# Tokenization

In [12]:
# Convert each ingredient list into a list of ingredients if it's a string
def convert_to_list(ingredient_str):
    if isinstance(ingredient_str, str):
        try:
            return eval(ingredient_str)  # Convert string to list
        except:
            return []  # Return an empty list if eval fails
    else:
        return ingredient_str  # Return as is if already a list or non-string

filtered_df['ingredients'] = filtered_df['ingredients'].apply(convert_to_list)

# Flatten the list of ingredients across all recipes
flattened_ingredients = [ingredient.lower().strip() for ingredients in filtered_df['ingredients'] for ingredient in ingredients]

# Count occurrences of each ingredient
ingredient_counts = Counter(flattened_ingredients)

# Display the ingredient counts
# print(ingredient_counts)

# Calculate the total number of unique ingredients
total_unique_ingredients = len(ingredient_counts)

print(f"Total unique ingredients: {total_unique_ingredients}")

Total unique ingredients: 11522


In [13]:
# Flattened list of unique ingredients
unique_ingredients = list(ingredient_counts.keys())

# Define a set of stopwords to ignore
stopwords = {'fresh', 'chopped', 'sliced', 'diced', 'organic', 'crushed', 'small', 'large', 'and', 'dried', 
             'free', 'mix', 'low', 'hot', 'of', 'with', 'red', 'white', 'black', 'brown', 'yellow', 'fat', 
             'light', 'style', 'in', 'lean', 'dark', 'french', 'smoked', 'roast'}

# Create a unique token dictionary with numeric identifiers
token_dict = {}
token_id = 1

# Prepare data for each ingredient
data = []
for ingredient in unique_ingredients:
    # Tokenize the ingredient, removing punctuation
    ingredient_tokens = re.findall(r'\b\w+\b', ingredient.lower())
    
    # Remove stopwords
    ingredient_tokens = [token for token in ingredient_tokens if token not in stopwords]
    
    # Convert tokens to numeric identifiers
    token_ids = []
    for token in ingredient_tokens:
        if token not in token_dict:
            token_dict[token] = token_id
            token_id += 1
        token_ids.append(token_dict[token])

    # Get the count of the ingredient from ingredient_counts
    count = ingredient_counts[ingredient]
    
    # Append ingredient, token IDs, and count to the data list
    data.append({
        'ingredient': ingredient,
        'token_ids': ', '.join(map(str, token_ids)),  # Join token IDs as a string for easy viewing
        'count': count
    })

# Convert data to a DataFrame
tokenized_ingredient_df = pd.DataFrame(data)


# Save the token dictionary as well for reference
token_df = pd.DataFrame(list(token_dict.items()), columns=['token', 'token_id'])



# print(tokenized_ingredient_df.head)

In [14]:
# Function to tokenize an ingredient list based on token_dict
def tokenize_ingredient_list(ingredient_list):
    tokenized_list = []
    for ingredient in ingredient_list:
        # Tokenize and clean the ingredient
        ingredient_tokens = re.findall(r'\b\w+\b', ingredient.lower())
        ingredient_tokens = [token for token in ingredient_tokens if token not in stopwords]
        
        # Convert tokens to numeric token IDs based on token_dict
        token_ids = [token_dict[token] for token in ingredient_tokens if token in token_dict]
        tokenized_list.append(token_ids)
    
    return tokenized_list

# Apply the tokenization function to each row in filtered_df and rename to filtered_df_tokenized
filtered_df_tokenized = filtered_df.copy()  # Copy to avoid modifying the original DataFrame
filtered_df_tokenized['tokenized_ingredients'] = filtered_df_tokenized['ingredients'].apply(tokenize_ingredient_list)

# Display the updated DataFrame with the new 'tokenized_ingredients' column
print(filtered_df_tokenized[['ingredients', 'tokenized_ingredients']].head())

                                              ingredients  \
181397  [butter, olive oil, scallion, mushrooms, seaso...   
77568   [margarine, brown sugar, pecans, graham cracke...   
28270   [chicken breasts, onions, red bell peppers, mu...   
206145  [mixed baby lettuces and spring greens, pear, ...   
83001   [white beans, extra virgin olive oil, fresh le...   

                                    tokenized_ingredients  
181397  [[1], [2, 3], [4], [5], [6, 7], [8, 9], [10, 1...  
77568                    [[14], [15], [16], [17, 18, 19]]  
28270   [[20, 21], [22], [23, 24], [5], [25], [26], [2...  
206145  [[30, 31, 32, 33, 34], [35], [36, 20], [37, 38...  
83001   [[48], [49, 50, 2, 3], [51, 43], [52, 53], [7]...  


In [15]:
# Drop the specified columns from the DataFrame
filtered_df_tokenized = filtered_df_tokenized.drop(columns=['name', 'steps', 'description', 'ingredients'])
print(filtered_df_tokenized.head)

<bound method NDFrame.head of            id  minutes  n_steps  n_ingredients  \
181397  76104       45        6              8   
77568   40098       25        8              4   
28270   20898       75       11              9   
206145   8643       20        4             14   
83001   54357       10        5             10   
...       ...      ...      ...            ...   
168444  59432       20        4              5   
227081  87429      155       17              9   
186504  48485       15        6              6   
188200  71769       35        9             11   
83432   38504       70       12              9   

                                    tokenized_ingredients  
181397  [[1], [2, 3], [4], [5], [6, 7], [8, 9], [10, 1...  
77568                    [[14], [15], [16], [17, 18, 19]]  
28270   [[20, 21], [22], [23, 24], [5], [25], [26], [2...  
206145  [[30, 31, 32, 33, 34], [35], [36, 20], [37, 38...  
83001   [[48], [49, 50, 2, 3], [51, 43], [52, 53], [7]...  
...      

In [16]:
# Save the filtered DataFrame to a CSV file
filtered_df_tokenized.to_csv("tokenized_dataset.csv", index=False)

# Embedding


In [17]:
vocab_size = len(token_dict) + 1  # +1 for padding index if needed
embedding_dim = 100  # Dimension of each embedding vector


In [None]:
# # Create the Embedding Layer in Your Model

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, LSTM, Dense

# model = Sequential([
#     Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
#     LSTM(128, return_sequences=True),  # LSTM or any other sequence model
#     Dense(64, activation='relu'),
#     Dense(num_classes, activation='softmax')  # or linear for regression
# ])


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# # Prepare Your Data

# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Assuming tokenized_ingredient_lists contains token IDs for each recipe
# max_length = max(len(x) for x in tokenized_ingredient_lists)  # Define the max length of sequences
# padded_sequences = pad_sequences(tokenized_ingredient_lists, maxlen=max_length, padding='post')


In [None]:
# #Train the Model
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# model.fit(padded_sequences, labels, epochs=10, batch_size=32, validation_split=0.2)


# Rough

In [None]:
# Save to CSV
filtered_df.to_csv('ingredients_with_token_ids.csv', index=False)
token_df.to_csv('token_reference.csv', index=False)
print("Data saved to ingredients_with_token_ids.csv and token_reference.csv")

In [None]:
# Flattened list of unique ingredients
unique_ingredients = list(ingredient_counts.keys())

# Define a set of stopwords to ignore (add more as needed)
stopwords = {'fresh', 'chopped', 'sliced', 'diced', 'organic',  'crushed', 'small', 'large', 'and', 'dried', 
             'free', 'mix', 'low', 'hot', 'of', 'with','red','white','black','brown','yellow','fat','light' ,
              'style', 'in' , 'lean', 'dark', 'french', 'smoked', 'roast'}

# Tokenize each ingredient, remove stopwords, and flatten the result
tokens = []
for ingredient in unique_ingredients:
    # Split by spaces and remove punctuation using regex
    ingredient_tokens = re.findall(r'\b\w+\b', ingredient.lower())
    
    # Remove stopwords
    ingredient_tokens = [token for token in ingredient_tokens if token not in stopwords]
    
    # Add tokens to the list
    tokens.extend(ingredient_tokens)

# Count each token's occurrences
token_counts = Counter(tokens)

# Display token counts
print(f"Total unique tokens: {len(token_counts)}")
print("Most common tokens:", token_counts.most_common(10))


In [None]:
# Separate single-word and multi-word ingredients
single_word_ingredients = set()  # Using a set to avoid duplicates
multi_word_ingredients = []

for ingredient in ingredient_counts.keys():
    if len(ingredient.split()) == 1:
        single_word_ingredients.add(ingredient)
    else:
        multi_word_ingredients.append(ingredient)

# Define common descriptors to identify in multi-word ingredients
descriptors = {"unsalted", "salted", "fresh", "ground", "dried", "chopped", "large", "small"}

# Tokenize single-word ingredients directly
tokenized_single_word = list(single_word_ingredients)  # Convert to list for consistent output

# Tokenize multi-word ingredients by identifying core and descriptors
tokenized_multi_word = []
for ingredient in multi_word_ingredients:
    tokens = ingredient.split()  # Split the ingredient into individual words
    # Identify core and descriptor(s)
    core = tokens[-1] if tokens[-1] not in descriptors else tokens[-2]  # Assume last word is core if not a descriptor
    descriptor = [token for token in tokens if token in descriptors]  # Collect descriptors
    
    # Only add the core if it's not already in the single-word list
    if core not in single_word_ingredients:
        single_word_ingredients.add(core)
    
    # Append the tokenized form as a tuple (core, descriptor(s)) to keep structure
    tokenized_multi_word.append((core, descriptor))

# Display the results
print("Single-word Ingredients (Tokenized):", tokenized_single_word)
print("Multi-word Ingredients (Tokenized as Core + Descriptor):", tokenized_multi_word)
print(f"Total unique ingredients: {len(ingredient_counts)}")