In [1]:
#Python Libraries for MongoDB
import pymongo
from pymongo import MongoClient

#Python Library for Dataframe usage
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Serializing to a file
import _pickle as pickle

#Libraries for Graph
import networkx as nx

#etc
import random 
from collections import defaultdict

In [2]:
# Accessing MongoDB. MongoClient() is called with no arguments, so pymongo's
# default connection settings apply.
client = MongoClient()
database = client['food_map']   # Database name (to connect to)
collections = database['flavor_molecules']

# Load every document of the collection into pandas, keep only the columns
# used below, and derive the helper columns in one chained expression.
# Building the frame with .loc + .assign (instead of slicing the columns and
# then assigning new columns onto the slice) yields a fresh frame and avoids
# pandas' SettingWithCopyWarning.
# NOTE: "catgeory" (sic) is spelled the way the column appears in the data;
# do not "correct" it here without renaming the column itself.
flavorDB_pandas = (
    pd.DataFrame(list(collections.find()))
    .loc[:, ["_id", "ingredient", "catgeory", "molecules", "molecule_IDs"]]
    .assign(
        # Set versions of the two list columns.
        set_molecules=lambda frame: frame["molecules"].apply(set),
        set_molecules_ID=lambda frame: frame["molecule_IDs"].apply(set),
        # Length of the molecule_IDs list (duplicates, if any, are counted).
        molecule_quantity=lambda frame: frame["molecule_IDs"].apply(len),
    )
)


In [3]:
# Preview the first five rows of the loaded frame.
flavorDB_pandas.head(n=5)

Unnamed: 0,_id,ingredient,catgeory,molecules,molecule_IDs,set_molecules,set_molecules_ID,molecule_quantity
0,5ca27b8f13218a1eabac63d6,Egg,Animal Product,"[L-arginine, 3-Methylindole, 2,5-Dimethylpyraz...","[6322, 6736, 31252, 7909, 7284, 7501, 9609, 12...","{50-69-1, thiamine, Pyrrole, hydrogen sulfide,...","{5311110, 8094, 11552, 19602, 18635, 11747, 62...",55
1,5ca27b9113218a1eabac63d7,Bakery Products,Bakery,"[2,3-Dimethylpyrazine, 2,5-Dimethylpyrazine, 2...","[22201, 31252, 26331, 27457, 7976, 26808]","{2,3-Dimethylpyrazine, 2,3,5-Trimethylpyrazine...","{27457, 31252, 26808, 22201, 26331, 7976}",6
2,5ca27b9213218a1eabac63d8,Bread,Bakery,"[coumarin, Methyl Benzoate, 3-Hexanone, Gerani...","[323, 7150, 11509, 637566, 439341, 33931, 9261...","{succinic acid, thiamine, Pyrrolidine, Methyl ...","{702, 6054, 284, 15394, 5284639, 107, 798, 644...",129
3,5ca27b9313218a1eabac63d9,Rye Bread,Bakery,"[2-Methylbutyl Acetate, 3-methylthiopropanol, ...","[12209, 10448, 5372954, 8468, 643731, 7344, 18...","{guaiacol, thiamine, 3,4-Dihydroxybenzoic Acid...","{8094, 8369, 18635, 5366074, 19309, 7344, 6561...",30
4,5ca27b9413218a1eabac63da,Wheaten Bread,Bakery,"[Difurfuryl ether, 2-Undecanone, Octyl acetate...","[263034, 8163, 8164, 12170, 228583, 9589, 6375...","{2-Ethyl-3,5-Dimethylpyrazine, Octyl acetate, ...","{19310, 6915, 1146, 11747, 26808, 14286, 22386...",30


In [4]:
# One row per ingredient, so the index length is the ingredient count.
print("total number of ingredients: ", len(flavorDB_pandas.index))

total number of ingredients:  935


In [5]:
# Distinct category labels, in order of first appearance.
flavorDB_pandas.loc[:, "catgeory"].unique()

array(['Animal Product', 'Bakery', 'Beverage', 'Beverage Alcoholic',
       'Beverage Caffeinated', 'Cereal', 'Maize', 'Dairy',
       'Essential Oil', 'Berry', 'Seafood', 'Fish', 'Flower', 'Fruit',
       'Fruit-Berry', 'Fruit Citrus', 'Fruit Essence', 'Fungus', 'Herb',
       'Meat', 'Dish', 'Nut', 'Seed', 'Legume', 'Plant Derivative',
       'Plant', 'Spice', 'Vegetable', 'Cabbage', 'Vegetable Root',
       'Vegetable Fruit', 'Gourd', 'Vegetable Stem', 'Vegetable Tuber',
       'Additive'], dtype=object)

In [6]:
# Show every row whose category is exactly "Animal Product".
flavorDB_pandas.loc[flavorDB_pandas["catgeory"].eq("Animal Product")]

Unnamed: 0,_id,ingredient,catgeory,molecules,molecule_IDs,set_molecules,set_molecules_ID,molecule_quantity
0,5ca27b8f13218a1eabac63d6,Egg,Animal Product,"[L-arginine, 3-Methylindole, 2,5-Dimethylpyraz...","[6322, 6736, 31252, 7909, 7284, 7501, 9609, 12...","{50-69-1, thiamine, Pyrrole, hydrogen sulfide,...","{5311110, 8094, 11552, 19602, 18635, 11747, 62...",55


In [29]:
# For each category (in first-seen order) print its name followed by the
# ingredients filed under it.
for category in flavorDB_pandas["catgeory"].unique():
    in_category = flavorDB_pandas["catgeory"] == category
    category_list = flavorDB_pandas.loc[in_category, "ingredient"]
    print(category)
    print(category_list)

Animal Product
0    Egg
Name: ingredient, dtype: object
Bakery
1          Bakery Products
2                    Bread
3                Rye Bread
4            Wheaten Bread
5              White Bread
6         Wholewheat Bread
372           Fried Potato
475                  Pasta
476                Biscuit
774            Marshmallow
785               Meringue
812            Potato chip
813          Tortilla chip
814              Corn chip
863           Phyllo dough
865              Pie crust
883             Pita bread
884               Focaccia
885                  Bagel
886    Other bread product
887             Piki bread
888           French toast
889              Oat bread
890           Potato bread
893       Multigrain bread
894             Rice bread
895              Pan dulce
896           Raisin bread
897         Wonton wrapper
904       Chocolate mousse
915                  Fudge
916              Candy bar
Name: ingredient, dtype: object
Beverage
7                   Wort
789    

In [40]:
# Plain Python list of the distinct category labels, for membership checks.
list_of_categories = flavorDB_pandas["catgeory"].unique().tolist()

In [36]:
# Categories that should definitely be kept.
keep_these_list = [
    "Animal Product",
    "Beverage Caffeinated",
    "Dairy",
    "Berry",
    "Seafood",
    "Fish",
    "Fruit",
    "Fruit Citrus",
    "Fruit Essence",
    "Fungus",
    "Herb",
    "Meat",
    "Nut",
    "Seed",
    "Legume",
    "Plant Derivative",
    "Spice",
    "Vegetable",
    "Cabbage",
    "Vegetable Root",
    "Vegetable Fruit",
    "Gourd",
    "Vegetable Stem",
    "Vegetable Tuber",
    "Additive",
]

In [37]:
# Categories to definitely take out.
remove_these_list = [
    "Bakery",
    "Beverage",
    "Dish",
]

In [38]:
# Categories that may or may not be taken out.
# NOTE(review): "Additive" is also present in keep_these_list, so it shows up
# twice when the two lists are concatenated -- confirm which list it belongs to.
consider_this_list = [
    "Beverage Alcoholic",
    "Cereal",
    "Maize",
    "Essential Oil",
    "Flower",
    "Fruit-Berry",
    "Plant",
    "Additive",
]

In [41]:
# Sanity check: every "keep" label should be an actual category in the data.
# Prints the label, then True/False for whether it was found.
for item in keep_these_list:
    is_known_category = item in list_of_categories
    print(item)
    print(is_known_category)

Animal Product
True
Beverage Caffeinated
True
Dairy
True
Berry
True
Seafood
True
Fish
True
Fruit
True
Fruit Citrus
True
Fruit Essence
True
Fungus
True
Herb
True
Meat
True
Nut
True
Seed
True
Legume
True
Plant Derivative
True
Spice
True
Vegetable
True
Cabbage
True
Vegetable Root
True
Vegetable Fruit
True
Gourd
True
Vegetable Stem
True
Vegetable Tuber
True
Additive
True


In [42]:
# Sanity check: every "consider" label should be an actual category in the data.
# Prints the label on one line and True/False on the next.
for item in consider_this_list:
    print(item, item in list_of_categories, sep="\n")

Beverage Alcoholic
True
Cereal
True
Maize
True
Essential Oil
True
Flower
True
Fruit-Berry
True
Plant
True
Additive
True


In [43]:
# Categories retained for the analysis: the "keep" labels followed by the
# "consider" labels.
# NOTE(review): "Additive" is in both source lists, so it appears twice here;
# harmless for membership tests, but worth de-duplicating if this list is
# ever iterated.
list_to_use = [*keep_these_list, *consider_this_list]
list_to_use

['Animal Product',
 'Beverage Caffeinated',
 'Dairy',
 'Berry',
 'Seafood',
 'Fish',
 'Fruit',
 'Fruit Citrus',
 'Fruit Essence',
 'Fungus',
 'Herb',
 'Meat',
 'Nut',
 'Seed',
 'Legume',
 'Plant Derivative',
 'Spice',
 'Vegetable',
 'Cabbage',
 'Vegetable Root',
 'Vegetable Fruit',
 'Gourd',
 'Vegetable Stem',
 'Vegetable Tuber',
 'Additive',
 'Beverage Alcoholic',
 'Cereal',
 'Maize',
 'Essential Oil',
 'Flower',
 'Fruit-Berry',
 'Plant',
 'Additive']

In [47]:
# Keep only the rows whose category is on the allow-list built above.
category_is_allowed = flavorDB_pandas["catgeory"].isin(list_to_use)
cleaned_up_df = flavorDB_pandas.loc[category_is_allowed]

In [50]:
# (rows, columns) of the filtered frame.
row_count, column_count = cleaned_up_df.shape
(row_count, column_count)

(813, 8)