THIS NOTEBOOK CONTAINS THE CODE TO EXTRACT SMALLER DATASET from FOOD101 AND VALIDATION DATASET.

THEN CONVERT THE DATASETs INTO METADATA CSVs.

In [26]:
import pandas as pd

food101_metadata = pd.read_csv('food101_metadata_full.csv', sep="\t")

In [27]:
full_dataset = pd.concat([food101_metadata], ignore_index = True)
print(len(full_dataset))

98000


In [28]:
# Checking unique ingredients in dataset
def checkIng(dataset):
    val_ing = set()
    if len(dataset) == 0: return val_ing
    for i in dataset['Ingredients']:
        l = i.split(",")
        for j in l:
            val_ing.add(j)
    return val_ing

In [30]:
# Food101 original dataset has 878 ingredients and 98 categories
ing_full = checkIng(full_dataset)
print("full dataset ingredients", len(checkIng(full_dataset)))

print("full dataset categories", len(full_dataset['Category'].unique()))

full dataset ingredients 878
full dataset categories 98


CREATING smaller_df

In [31]:
# sort ingredients based on number of records it is present in 
from collections import OrderedDict

def getSortedIngCounts(dataset):    
    ingredient_counts = {}
    if len(dataset) == 0: return ingredient_counts
    for ingredients in dataset['Ingredients']:
        for ingredient in ingredients.split(","):
            if ingredient in ingredient_counts:
                ingredient_counts[ingredient] += 1
            else:
                ingredient_counts[ingredient] = 1
    sorted_ingredient_counts = OrderedDict(sorted(ingredient_counts.items(), key=lambda x: x[1], reverse=True))
    return sorted_ingredient_counts

In [49]:
def getForbiddenIng(sorted_ingredient_counts, dataset):    
    forbidden_ingredients = set()
    sorted_dict_list = list(sorted_ingredient_counts.items())

    for ingredient, count in sorted_dict_list[::-1]:
        if len(forbidden_ingredients) < 28:
            forbidden_ingredients.add(ingredient)
        else:
            break
    return forbidden_ingredients

In [34]:
# Creating a smaller subset of the original dataset with ALL categories and Ingredients
import pandas as pd

smaller_df = pd.DataFrame()
for category, group in full_dataset.groupby('Category'):
    if len(group) >= 500:
        sampled_rows = group.sample(n=500, random_state=42)  
    else:
        sampled_rows = group
    smaller_df = pd.concat([smaller_df, sampled_rows])

smaller_df.reset_index(drop=True, inplace=True)

In [35]:
# checking the unique categories and ingredients in the smaller dataset
ing_small = checkIng(smaller_df)
cat_small = smaller_df['Category'].unique()

print("Smaller df ingr: ", len(ing_small))
print("Smaller df categories: ", len(cat_small))

Smaller df ingr:  878
Smaller df categories:  98


In [50]:
small_ingr_counts = getSortedIngCounts(smaller_df2)
small_forbidden_ing = getForbiddenIng(small_ingr_counts, smaller_df2)

In [44]:
smaller_df2.equals(smaller_df)

False

In [47]:
def dropForbiddenIngr(dataset, forbidden_ing):
    for index, row in dataset.iterrows():
        
        ingredients = row['Ingredients'].split(",")
        if any(ingredient in forbidden_ing for ingredient in ingredients):
            dataset.drop(dataset.index[index], inplace=True)

In [48]:
len(smaller_df2)

48847

In [124]:
def dropForbiddenIngrIndex(dataset, forbidden_ing):
    for index, row in dataset.iterrows():
        ingredients = row['Ingredients'].split(",")
        if any(ingredient in forbidden_ing for ingredient in ingredients):
            dataset.drop(index, inplace=True)

In [125]:
dropForbiddenIngr(smaller_df, small_forbidden_ing)

In [126]:
dropForbiddenIngrIndex(smaller_df2, small_forbidden_ing)

In [129]:
print(set(smaller_df) == set(smaller_df2))

True


In [54]:
ing_small = checkIng(smaller_df)

print("Updated smaller_df ing: ", len(ing_small))
print("Updated smaller_df cat: ", len(smaller_df['Category'].unique()))

Updated smaller_df ing:  850
Updated smaller_df cat:  98


In [58]:
# checking if we missed any ingredients from the original Food101
ing_remain = ing_full - ing_small
print(ing_remain == small_forbidden_ing)

True


CREATING VALIDATION DATASET

In [59]:
# Here extracting the remaining records from Food101 which are NOT present in the 
# smaller dataset which is used for training. 
merged_df = pd.merge(dataset_metadata, smaller_df, on=list(dataset_metadata.columns), how='left', indicator=True)

# Filter the merged DataFrame to keep only the rows that are not in smaller_df
df1_filtered = merged_df.query('_merge != "both"')

df1_filtered.reset_index(drop=True, inplace=True)
df1_filtered = df1_filtered.drop("_merge", axis=1)

In [11]:
# df1_filtered = df1_filtered[df1_filtered.apply(lambda row: row['Category'] in cat and all(ingredient not in ing_remain for ingredient in row['Ingredients'].split(",")), axis=1)]

In [60]:
# checking if the unique categories & ingredients in the filtered dataset are matching with 
# smaller dataset 
ing_fil = checkIng(df1_filtered)

print("remaining ing: ", len(ing_fil))
print("remaining cats: ", len(df1_filtered['Category'].unique()))

remaining ing:  878
remaining cats:  98


In [61]:
len(df1_filtered)

49153

In [75]:
# Dropping forbidden ing rows from filtered df
dropForbiddenIngr(df1_filtered, small_forbidden_ing)

In [76]:
print("Filtered ing", len(checkIng(df1_filtered)))
print("Filtered cat", len(df1_filtered['Category'].unique()))

Filtered ing 850
Filtered cat 98


In [250]:
# CREATING Validation dataset - THIS dataset SHOULD have
# the SAME number of Categories and Ingredients AS the smaller_df 
# which is used for training 

import pandas as pd

# validation_dataset = df1_filtered
# # dataset_metadata[~dataset_metadata.isin(smaller_df)].dropna()

# categories_in_smaller_df = set(smaller_df['Category'].unique())
# ingredients_in_smaller_df = ing_small

# final_validation_dataset = pd.DataFrame()

# while (
#     final_validation_dataset.empty
#     or set(final_validation_dataset['Category'].unique()) != categories_in_smaller_df
#     or ingredients_in_smaller_df != checkIng(final_validation_dataset)
# ):
#     final_validation_dataset = pd.DataFrame()

#     for category in categories_in_smaller_df:
#         category_records = validation_dataset[
#             (validation_dataset['Category'] == category)
#         ].sample(n=100, replace=True)  

#         final_validation_dataset = pd.concat([final_validation_dataset, category_records])

# final_validation_dataset = final_validation_dataset.sample(frac=1).reset_index(drop=True)

# import pandas as pd

# new_val_df = pd.DataFrame()
# for category, group in df1_filtered.groupby('Category'):

#     if len(group) >= 100:
#         sampled_rows = group.sample(n=100, random_state=42) 
#     else:
#         sampled_rows = group
#     new_val_df = pd.concat([new_val_df, sampled_rows])

# new_val_df.reset_index(drop=True, inplace=True)

In [78]:
final_val_df = pd.DataFrame()

while len(checkIng(final_val_df)) != 850:
    print(len(checkIng(final_val_df)))
    final_val_df = pd.DataFrame()
    for category, group in df1_filtered.groupby('Category'):
        if len(group) >= 150:
            sampled_rows = group.sample(n=150)
        else:
            sampled_rows = group
        final_val_df = pd.concat([final_val_df, sampled_rows])

final_val_df.reset_index(drop=True, inplace=True)
# while len(checkIng(final_val_df)) != 850 && getSortedIngCounts(final_val_df).keys() in small_forbidden_ing:
#     print(len(checkIng(final_val_df)))
#     final_val_df = pd.DataFrame()
#     for category, group in df1_filtered.groupby('Category'):
#         if len(group) >= 100:
#             sampled_rows = group.sample(n=100)
#         else:
#             sampled_rows = group
#         final_val_df = pd.concat([final_val_df, sampled_rows])

# final_val_df.reset_index(drop=True, inplace=True)

0
842
839
845
847
842
846
842
846
844
844
845
841
844
841
842
844
844
846
840
845
845
844
845
842
843
843
846
844
839
843
844
841
842
844
842
846
847
845
843
845
843
847
840
844
846
844
844
846
845
845
846
844
846
845
844
843
848
844
844
841
843
842
844
842
839
845
847
845
844
847
845
846
845
843
841
842
847
842
845
847
844
846
839
846
844
844
841
840
845
844
846
842
845
845
843
838
845
843
845
845
847
843
842
844
840
839
842
848
847
844
844
847
847
842
845
843
846
844
842
843
840
841
848
839
843
843
846
843
845
844
841
845
846
839
844
844
843
842
844
845
846
842
844
841
845
841
846
842
842
844
845
846
843
844
842
844
845
843
843
844
842
842
845
841
844
846
844
843
845
842
844
844
843
842
847
845
843
843
841
842
847
843
842
843
844
844
841
845


In [6]:
# CHECKING if unique ingredients & categories are MATCHING with the unique ingredients
# & categories in the TRAINING DATASET!!
ing_val = checkIng(final_val_df)
print("Validation ings: ", len(ing_val))

cat_val = final_val_df['Category'].unique()
print("Validation Cats: ", len(cat_val))

Validation ings:  850
Validation Cats:  98


In [12]:
# CHECKING if unique ingredients & categories are MATCHING with the unique ingredients
# & categories in the TRAINING DATASET!!
ing_small = checkIng(smaller_df)
print("Validation ings: ", len(ing_small))

cat_small = smaller_df['Category'].unique()
print("Validation Cats: ", len(cat_small))

Validation ings:  850
Validation Cats:  98


In [13]:
val_ingredient_counts = getSortedIngCounts(final_val_df)
ingredient_counts = getSortedIngCounts(smaller_df)
for k, v in val_ingredient_counts.items():
    if(k not in ingredient_counts.keys()):
        print(k)

In [14]:
print("INGREDIENTS same in small & validation: ", ing_val == ing_small)
print("CATEGORIES same in small & validation: ", cat_val == cat_small)

INGREDIENTS same in small & validation:  True
CATEGORIES same in small & validation:  [ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True]


In [25]:
for i in ing_small:
    if(i in small_forbidden_ing):
        print("In forbidden", i)

In forbidden citrus vinaigrette
In forbidden baton
In forbidden capon
In forbidden back ribs
In forbidden amaranth
In forbidden deli lunch
In forbidden crema mexicana
In forbidden gewurztraminer
In forbidden epazote
In forbidden pot roast
In forbidden fresno chiles
In forbidden demerara sugar
In forbidden endive
In forbidden indonesian sweet soy sauce
In forbidden tallow
In forbidden yellowfin tuna
In forbidden style corn
In forbidden freeze-dried strawberries
In forbidden spring ragout
In forbidden sauterne
In forbidden frankfurters
In forbidden pig
In forbidden crusty rolls
In forbidden fuyu persimmons
In forbidden natural yogurt
In forbidden xylitol sweetener
In forbidden teriyaki
In forbidden pasilla chiles


In [52]:
print("IF any ingr in validation are in forbidden?:", small_forbidden_ing.issubset(ing_val))
print("IF any ingr in train are in forbidden?:", small_forbidden_ing.issubset(ing_small))

IF any ingr in validation are in forbidden?: True
IF any ingr in train are in forbidden?: True


In [99]:
len(smaller_df)

48847

In [100]:
len(final_val_df)

14700

In [91]:
smaller_df.to_csv('data/metadata/50k_FINAL_DATA.csv',  index = False, sep="\t")

In [92]:
final_val_df.to_csv('data/metadata/10k_FINAL_VAL_DATA.csv',  index = False, sep="\t")

In [37]:
import pandas as pd 
smaller_df2 = pd.read_csv('data/metadata/50k_FINAL_TRAIN_DATA.csv', sep="\t")

smaller_df2.isna().sum()

ID/File Name       0
Category           0
Calorie(kcal)      0
Carbohydrate(g)    0
Protein(g)         0
Fat(g)             0
Ingredients        0
dtype: int64

In [2]:
import pandas as pd

final_val_df = pd.read_csv('data/metadata/15k_FINAL_VAL_DATA.csv', sep="\t")

final_val_df.isna().sum()

ID/File Name       0
Category           0
Calorie(kcal)      0
Carbohydrate(g)    0
Protein(g)         0
Fat(g)             0
Ingredients        0
dtype: int64