In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy import stats
import random
import os
import tensorflow as tf
import math
import cv2

In [None]:
# Seed shared by the RNG setup, the train/validation split and the TFRecord file names.
seed = 9527
# Maximum number of examples written into a single TFRecord file.
SIZE = 500

# Edge length (pixels) of the square images stored in the records.
img_size = 256
# Image directory; file names are appended by plain string concatenation,
# so the trailing slash is required.
img_path = '../input/food-ingredients-and-recipe-dataset-with-images/Food Images/Food Images/'

# Allergen group -> ingredient keywords. Keywords are compared
# case-insensitively as substrings of the ingredient text.
common_allergens = {
    'cows milk': {
        'Cheese',
        'Butter',
        'Margarine',
        'Yogurt',
        'Cream',
        'Ice cream',
    },
    'eggs': {'egg'},
    'tree nuts': {
        'Brazil nut',
        'Almond',
        'Cashew',
        'Macadamia nut',
        'Pistachio',
        'Pine nut',
        'Walnut',
    },
    'peanuts': {'peanut'},
    'shellfish': {
        'Shrimp',
        'Prawn',
        'Crayfish',
        'Lobster',
        'Squid',
        'Scallops',
    },
    'wheat': {
        'flour',
        'wheat',
        'pasta',
        'noodle',
        'bread',
        'crust',
    },
    'soy': {'soy', 'tofu', 'soya'},
    'fish': {'fish', 'seafood'},
}

In [None]:
def seed_everything(seed):
    """Seed the Python, NumPy and TensorFlow RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed handed to every generator; its string form is
            stored in the ``PYTHONHASHSEED`` environment variable.
    """
    hash_seed = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    # NOTE(review): the interpreter presumably reads PYTHONHASHSEED only at
    # start-up, so this would affect child processes rather than this kernel - confirm.
    os.environ['PYTHONHASHSEED'] = hash_seed
    tf.random.set_seed(seed)
    
def allergens_mapping(row, types, allergens=None):
    """Flag whether an ingredient text mentions any keyword of an allergen group.

    Matching is a case-insensitive *substring* test, so a keyword such as
    'egg' also matches 'eggplant'.

    Args:
        row: Ingredient text of one recipe. Anything that is not a string
            (e.g. the NaN pandas uses for a missing cell) yields 0.
        types: Allergen group to test; must be a key of the keyword table.
        allergens: Optional mapping of group name -> iterable of keywords.
            Defaults to the module-level ``common_allergens``.

    Returns:
        1 if at least one keyword occurs in ``row``, otherwise 0.

    Raises:
        KeyError: If ``types`` is not a key of the keyword table.
    """
    if allergens is None:
        allergens = common_allergens
    # Look the group up first so an unknown group still fails loudly,
    # even when the row itself is missing.
    keywords = allergens[types]
    if not isinstance(row, str):
        return 0
    # Lower-case the text once instead of once per keyword.
    text = row.lower()
    return int(any(keyword.lower() in text for keyword in keywords))

def image_mapping_check(dataset, image_dir=None, reader=None):
    """Return a copy of ``dataset`` without the rows whose image cannot be loaded.

    Every row is checked, including the last one. A row is dropped when its
    ``Image_Name`` is not a string (e.g. NaN) or when the reader returns
    ``None`` for ``image_dir + Image_Name + '.jpg'``.

    Args:
        dataset: DataFrame with an ``Image_Name`` column (file name without
            extension). It is not modified.
        image_dir: Directory prefix prepended to each file name by string
            concatenation. Defaults to the module-level ``img_path``.
        reader: Callable taking a path and returning the decoded image or
            ``None`` on failure. Defaults to ``cv2.imread``.

    Returns:
        A new DataFrame containing only the loadable rows, re-indexed 0..n-1.
    """
    if image_dir is None:
        image_dir = img_path
    if reader is None:
        reader = cv2.imread

    unreadable = []
    # Iterate over index labels so the check works for any index, not only 0..n-1.
    for label, img_name in dataset['Image_Name'].items():
        # A missing name cannot be turned into a path; treat it as unreadable
        # instead of failing on the string concatenation.
        if not isinstance(img_name, str):
            unreadable.append(label)
            continue
        # The reader signals a missing or undecodable file by returning None.
        if reader(image_dir + img_name + '.jpg') is None:
            unreadable.append(label)

    return dataset.drop(unreadable, axis=0).reset_index(drop=True)

In [None]:
def get_dataset():
    """Load the recipe CSV, attach allergen labels and split into train/validation.

    Rows whose image cannot be loaded are removed by ``image_mapping_check``.
    One 0/1 column is then added per key of ``common_allergens``, computed
    from the ``Cleaned_Ingredients`` text.

    Returns:
        A tuple ``(df, train_df, val_df)``: the full labelled frame, and the
        75% / 25% train and validation frames. The split frames hold
        ``Image_Name`` plus the allergen columns and have a fresh 0..n-1 index.
    """
    path = "../input/food-ingredients-and-recipe-dataset-with-images/Food Ingredients and Recipe Dataset with Image Name Mapping.csv"
    df = pd.read_csv(path)
    df = image_mapping_check(df)

    # One label column per allergen group, in the dict's insertion order.
    allergen_columns = list(common_allergens)
    for allergen in allergen_columns:
        # Series.apply forwards extra keyword arguments to the function.
        df[allergen] = df['Cleaned_Ingredients'].apply(allergens_mapping, types=allergen)

    # Select the labels by name rather than by position, so the split stays
    # correct if the CSV gains or loses a column.
    x_train, x_val, y_train, y_val = train_test_split(
        df[['Image_Name']],
        df[allergen_columns],
        shuffle=True,
        random_state=seed,
        test_size=0.25,
    )
    train_df = pd.concat([x_train, y_train], axis=1).reset_index(drop=True)
    val_df = pd.concat([x_val, y_val], axis=1).reset_index(drop=True)
    return df, train_df, val_df

In [None]:
def _bytes_feature(value):
    """Wrap ``value`` in a tf.train.Feature holding a one-element BytesList."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string from an EagerTensor.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    payload = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=payload)

def _int64_feature(value):
    """Wrap ``value`` (bool / enum / int / uint) in a tf.train.Feature holding a one-element Int64List."""
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

def _float_feature(value):
    """Wrap ``value`` (float / double) in a tf.train.Feature holding a one-element FloatList."""
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)

def serialize_example(feature0, feature1, feature2, feature3, feature4, feature5, feature6, feature7, feature8, feature9):
    """Serialize an image name, image bytes and eight allergen flags into a tf.train.Example.

    Args:
        feature0: Image name, stored under 'image_name' as bytes.
        feature1: Encoded image, stored under 'image' as bytes.
        feature2 .. feature9: Integer flags stored, in order, under
            'cows_milk', 'eggs', 'tree nuts', 'peanuts', 'shellfish',
            'wheat', 'soy' and 'fish'.

    Returns:
        The serialized Example as a byte string.
    """
    feature = {
        'image_name': _bytes_feature(feature0),
        'image': _bytes_feature(feature1),
    }
    # NOTE: 'cows_milk' uses an underscore while 'tree nuts' keeps a space;
    # the keys are left exactly as they are because readers of the records
    # must use the same strings.
    label_keys = ('cows_milk', 'eggs', 'tree nuts', 'peanuts', 'shellfish', 'wheat', 'soy', 'fish')
    label_values = (feature2, feature3, feature4, feature5, feature6, feature7, feature8, feature9)
    for key, flag in zip(label_keys, label_values):
        feature[key] = _int64_feature(flag)

    record = tf.train.Example(features=tf.train.Features(feature=feature))
    return record.SerializeToString()

In [None]:
def _encode_image(img_name):
    """Load ``img_name`` from ``img_path`` and return it as resized JPEG bytes."""
    full_path = img_path + img_name + '.jpg'
    img = cv2.imread(full_path)
    if img is None:
        # cv2.imread reports a missing/undecodable file by returning None;
        # fail with the offending path instead of an opaque cv2 error later.
        raise FileNotFoundError('Could not read image: %s' % full_path)
    # NOTE(review): cv2.imread yields BGR and cv2.imencode presumably expects
    # BGR as well, so this swap may store the JPEG with red/blue exchanged.
    # Kept as-is because existing records and readers depend on it - confirm
    # against the downstream decoder before changing.
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # Fix incorrect colors
    img = cv2.resize(img, (img_size, img_size), interpolation=cv2.INTER_AREA)
    encoded = cv2.imencode('.jpg', img, (cv2.IMWRITE_JPEG_QUALITY, 94))[1]
    # tobytes() replaces the deprecated ndarray.tostring().
    return encoded.tobytes()


def create_recored(dataset, title):
    """Write ``dataset`` to TFRecord files of at most ``SIZE`` examples each.

    Files are created in the current working directory and named
    ``<prefix><seed>-<file index>-<example count>.tfrec`` where the prefix is
    'train' when ``title == "train"`` and 'val' for any other title.

    Args:
        dataset: DataFrame with an ``Image_Name`` column and the eight
            allergen columns ('cows milk', 'eggs', 'tree nuts', 'peanuts',
            'shellfish', 'wheat', 'soy', 'fish'). Rows are read by position.
        title: Split name; selects the file-name prefix.

    Raises:
        FileNotFoundError: If an image listed in ``dataset`` cannot be read.
    """
    prefix = 'train' if title == "train" else 'val'
    # Order must match the positional label arguments of serialize_example.
    label_columns = ('cows milk', 'eggs', 'tree nuts', 'peanuts',
                     'shellfish', 'wheat', 'soy', 'fish')
    total = dataset.shape[0]
    # Number of files: ceil(total / SIZE).
    CT = total//SIZE + int(total%SIZE!=0)
    for j in range(CT):
        print(); print('Writing TFRecord %i of %i...'%(j,CT))
        # The last file may hold fewer than SIZE examples.
        CT2 = min(SIZE,total-j*SIZE)
        with tf.io.TFRecordWriter('%s%.4i-%.2i-%i.tfrec'%(prefix,seed,j,CT2)) as writer:
            for k in range(CT2):
                # Positional access keeps this independent of the index labels.
                row = dataset.iloc[j*SIZE+k]
                img_name = row['Image_Name']
                example = serialize_example(
                    str.encode(img_name),
                    _encode_image(img_name),
                    *(row[column] for column in label_columns)
                )
                writer.write(example)
                if k%100==0: print(k,', ',end='')

In [None]:
def get_record(seed):
    """Seed the RNGs, build the labelled dataset, save it as CSV and write the TFRecords.

    Writes 'food_ingredients_dataset.csv' plus the train and validation
    TFRecord files to the current working directory.

    Args:
        seed: Seed passed to ``seed_everything``. Note that ``get_dataset``
            and ``create_recored`` read the module-level ``seed`` rather than
            this argument.
    """
    seed_everything(seed)
    full_df, train_df, val_df = get_dataset()
    # save df
    full_df.to_csv('food_ingredients_dataset.csv', index = False)
    for split_name, split_df in (("train", train_df), ("val", val_df)):
        create_recored(split_df, title = split_name)

In [None]:
# Create the TFRecord files.
# This code is used to build a model that predicts asthma triggers for my course ALY6980. Liuzhao Tang.
get_record(seed)