# Imports

In [1]:
import json
import csv
from pprint import pprint
from typing import Optional

from tqdm import tqdm

import pandas as pd

# Open datasets as DataFrames

In [2]:
!ls ../dataset

[1;32m det_ingrs.json[0m  [1;32m recipe1M_images_test.tar[0m       [1;32m val_filtered.csv[0m
[1;32m filtered.csv[0m    [1;32m recipe1M_images_val.tar[0m        
[1;34m images[0m          [1;32m recipe1M_layers.tar[0m            
[1;32m layer1.csv[0m      [1;32m test_dataset_with_images.csv[0m   
[1;32m layer1.json[0m     [1;32m test_filtered.csv[0m              
[1;32m layer2.csv[0m      [1;32m train_dataset_with_images.csv[0m  
[1;32m layer2.json[0m     [1;32m train_filtered.csv[0m             
[1;32m README.md[0m       [1;32m val_dataset_with_images.csv[0m    


In [3]:
df_layer1 = pd.read_csv("../dataset/layer1.csv")
df_layer2 = pd.read_csv("../dataset/layer2.csv")

In [4]:
df_layer1.head()

Unnamed: 0,id,title,partition,url,instructions,ingredients
0,000018c8a5,Worlds Best Mac and Cheese,train,http://www.epicurious.com/recipes/food/views/-...,Preheat the oven to 350 F. Butter or oil an 8-...,6 ounces penne;2 cups Beechers Flagship Cheese...
1,000033e39b,Dilly Macaroni Salad Recipe,train,http://cookeatshare.com/recipes/dilly-macaroni...,Cook macaroni according to package directions;...,1 c. elbow macaroni;1 c. cubed American cheese...
2,000035f7ed,Gazpacho,train,http://www.foodnetwork.com/recipes/gazpacho1.html,Add the tomatoes to a food processor with a pi...,"8 tomatoes, quartered;Kosher salt;1 red onion,..."
3,00003a70b1,Crunchy Onion Potato Bake,test,http://www.food.com/recipe/crunchy-onion-potat...,Preheat oven to 350 degrees Fahrenheit.;Spray ...,2 12 cups milk;1 12 cups water;14 cup butter;m...
4,00004320bb,Cool 'n Easy Creamy Watermelon Pie,train,http://www.food.com/recipe/cool-n-easy-creamy-...,Dissolve Jello in boiling water.;Allow to cool...,1 (3 ounce) package watermelon gelatin;14 cup ...


In [5]:
df_layer2.head()

Unnamed: 0,id,image_file_names
0,00003a70b1,3e233001e2.jpg;7f749987f9.jpg;aaf6b2dcd3.jpg
1,000075604a,6bdca6e490.jpg
2,00007bfd16,6409eab844.jpg;f7cb3de295.jpg
3,000095fc1d,a1374cdd98.jpg
4,0000b1e2b5,cb1a684683.jpg


In [6]:
# Check if every recipe has image

len(df_layer1)==len(df_layer2)

False

In [7]:
from pathlib import Path

def get_file(id: str, partition: str) -> Path:
    """
    Get path from the id and partition of the image
    """
    char = [_ for _ in id[:4]]
    return Path(f'../dataset/images/{partition}')/char[0]/char[1]/char[2]/char[3]/f"{id}"

# Create DataFrame with recipes that are supposed to have images

In [8]:
df3 = pd.merge(df_layer1, df_layer2, on="id")
len(df3)

402760

In [9]:
df3.head()

Unnamed: 0,id,title,partition,url,instructions,ingredients,image_file_names
0,00003a70b1,Crunchy Onion Potato Bake,test,http://www.food.com/recipe/crunchy-onion-potat...,Preheat oven to 350 degrees Fahrenheit.;Spray ...,2 12 cups milk;1 12 cups water;14 cup butter;m...,3e233001e2.jpg;7f749987f9.jpg;aaf6b2dcd3.jpg
1,000075604a,Kombu Tea Grilled Chicken Thigh,train,https://cookpad.com/us/recipes/150100-kombu-te...,Pierce the skin of the chicken with a fork or ...,2 Chicken thighs;2 tsp Kombu tea;1 White pepper,6bdca6e490.jpg
2,00007bfd16,Strawberry Rhubarb Dump Cake,train,http://www.food.com/recipe/strawberry-rhubarb-...,Put ingredients in a buttered 9 x 12 x 2-inch ...,"6 -8 cups fresh rhubarb, or;6 -8 cups frozen r...",6409eab844.jpg;f7cb3de295.jpg
3,000095fc1d,Yogurt Parfaits,train,http://tastykitchen.com/recipes/breakfastbrunc...,Layer all ingredients in a serving dish.,"8 ounces, weight Light Fat Free Vanilla Yogurt...",a1374cdd98.jpg
4,0000b1e2b5,Fennel-Rubbed Pork Tenderloin with Roasted Fen...,train,http://www.epicurious.com/recipes/food/views/f...,Preheat oven to 350F with rack in middle.;Crus...,1 teaspoon fennel seeds;1 pound pork tenderloi...,cb1a684683.jpg


In [10]:
df3.to_csv("../dataset/filtered.csv")

In [11]:
!ls -alh ../dataset/filtered.csv

[1;32m.[0m[32mr[33mw[31mx[32mr[33mw[31mx[32mr[33mw[31mx[0m [38;5;230mlucky[0m [38;5;187mlucky[0m [38;5;216m416[0m [38;5;216mMB[0m [38;5;40mFri Aug 20 18:34:49 2021[0m [1;32m ../dataset/filtered.csv[0m


In [12]:
def get_partition_df(df: pd.DataFrame, partition: str) -> pd.DataFrame:
    """
    Get dataframe with only partition rows
    """
    return df.loc[df_layer1["partition"] == partition]

In [13]:
# Seperate dataframe for each partitions and save them

# df_train = get_partition_df(df3, "train")
# df_train.to_csv("../dataset/train_filtered.csv", index=False)

df_val = get_partition_df(df3, "val")
df_val.to_csv("../dataset/val_filtered.csv", index=False)

df_test = get_partition_df(df3, "test")
df_test.to_csv("../dataset/test_filtered.csv", index=False)

In [14]:
len(df_val.index)

60758

In [15]:
def filter_rows_with_image(csv_filename: str, partition: str, length: Optional[int] = None):
    """
    Filter out the rows from csv that doesn't have images
    in image directory
    """
    
    with open(f"../dataset/{csv_filename}.csv", 'r') as infile:
        reader = csv.reader(infile)
        
        with open(f"../dataset/{partition}_dataset_with_images.csv", 'w') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(next(reader))
            
            for row in tqdm(reader, total=length):
                images = row[6].split(';')
                count=0
                valid_images = []

                for image in images:
                    if get_file(image, partition).exists():
                        count+=1
                        valid_images.append(image)
                        
                if count > 0:
                    row[6] =  ';'.join(valid_images)
                    writer.writerow(row)

In [16]:
# filter_rows_with_image("train_filtered", "train", length=len(df_train.index))

filter_rows_with_image("val_filtered", "val", length=len(df_val.index))

filter_rows_with_image("test_filtered", "test", length=len(df_test.index))

100%|███████████████████████████████████████████████████████████████████████| 60758/60758 [03:23<00:00, 299.09it/s]
100%|███████████████████████████████████████████████████████████████████████| 60109/60109 [02:42<00:00, 370.86it/s]


In [17]:
# !ls -alh ../dataset/train_dataset_with_images.csv

In [18]:
!ls -alh ../dataset/val_dataset_with_images.csv

[1;32m.[0m[32mr[33mw[31mx[32mr[33mw[31mx[32mr[33mw[31mx[0m [38;5;230mlucky[0m [38;5;187mlucky[0m [38;5;216m9.4[0m [38;5;216mMB[0m [38;5;40mFri Aug 20 18:38:19 2021[0m [1;32m ../dataset/val_dataset_with_images.csv[0m


In [19]:
!ls -alh ../dataset/test_dataset_with_images.csv

[1;32m.[0m[32mr[33mw[31mx[32mr[33mw[31mx[32mr[33mw[31mx[0m [38;5;230mlucky[0m [38;5;187mlucky[0m [38;5;216m9.3[0m [38;5;216mMB[0m [38;5;40mFri Aug 20 18:41:01 2021[0m [1;32m ../dataset/test_dataset_with_images.csv[0m


In [20]:
# pd.read_csv("../dataset/train_dataset_with_images.csv").head()

In [21]:
pd.read_csv("../dataset/val_dataset_with_images.csv").head()

Unnamed: 0,id,title,partition,url,instructions,ingredients,image_file_names
0,0015b21c43,Crabby Cheese Angel Hair Pasta,val,http://www.food.com/recipe/crabby-cheese-angel...,Cook Pasta according to pkg.;instructions; dra...,2 packages louis kemp Crab Delights Imitation ...,1218a2172d.jpg
1,0016145ed6,Blue Cheese and Citrus Dressing,val,http://www.food.com/recipe/blue-cheese-and-cit...,Blend everything in a blender.;Infuse love and...,"3 12 ounces blue cheese;1 orange, juice of;1 s...",e1c6785e8c.jpg
2,00176e51aa,Baked Tofu and Veggie Stir-fry with Basmati Br...,val,http://tastykitchen.com/recipes/special-dietar...,Preheat oven to 400 F. Cut tofu into thin tria...,"1 package Firm Or Extra Firm Tofu, 14-16 Ounce...",a3b050484a.jpg
3,001bd30cff,Grilled Leg of Lamb with Spiced Mustard and Ro...,val,http://www.epicurious.com/recipes/food/views/g...,Combine first 3 ingredients in small bowl.;Gra...,1 tablespoon dry mustard;1 teaspoon ground car...,a265866e73.jpg
4,002f5d86e5,Curried Sausages,val,http://www.food.com/recipe/curried-sausages-19...,Cover the sausages with cold water; bring to t...,"8 links sausages (beef, chicken, lamb or pork ...",27593103db.jpg


In [22]:
pd.read_csv("../dataset/test_dataset_with_images.csv").head()

Unnamed: 0,id,title,partition,url,instructions,ingredients,image_file_names
0,00183acce9,Italian Vegetarian Patties,test,http://allrecipes.com/recipe/italian-vegetaria...,Heat 2 tablespoons oil in a large saucepan.;St...,2 tablespoons vegetable oil;3/4 cup uncooked b...,4aa71d889a.jpg;89b3fa3935.jpg
1,001ee1021c,"White Beans With Lemon, Garlic and Rosemary",test,http://www.food.com/recipe/white-beans-with-le...,"In a small pan, combine the rosemary sprigs, o...",3 sprigs fresh rosemary (three inch long);14 c...,42893ae184.jpg;b2e2997c3e.jpg
2,00234a6bd1,Ants Climbing a Tree,test,http://tastykitchen.com/recipes/main-courses/a...,"1.;Mix cornstarch and water into a slurry, the...",1 Tablespoon Cornstarch;1 Tablespoon Water;1/4...,89c648b622.jpg
3,0039542254,Vegetarian Japche,test,https://cookpad.com/us/recipes/154555-vegetari...,Boil the konnyaku noodles for 2-3 minutes (boi...,200 grams Konnyaku noodles (or cellophane nood...,e828eadfc3.jpg
4,0039c547a7,Coconut Milk Curry,test,https://cookpad.com/us/recipes/243747-coconut-...,Firstly cook some rice.;Dice the onion and gar...,200 g chicken;2 eggplants;1 carrot;1/2 onion;1...,205decf370.jpg


# Cleanup

In [23]:
# !rm ../dataset/filtered.csv