In [1]:
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from os import listdir
from os.path import isfile, join
from PIL import Image
%matplotlib inline

# Data loading functions

In [2]:
def get_data():
    """Read the four project CSV files from ``./data`` into DataFrames.

    The files are re-read from disk on every call, so each caller gets its
    own fresh copies.

    Returns:
        tuple: ``(train, test, food_nutrients, nutrients)``. Note that the
        last two are returned in the opposite order to the one they are
        read in.
    """
    csv_paths = {
        "train": "./data/food_train.csv",
        "test": "./data/food_test.csv",
        "nutrients": "./data/nutrients.csv",
        "food_nutrients": "./data/food_nutrients.csv",
    }
    tables = {label: pd.read_csv(path) for label, path in csv_paths.items()}
    return (tables["train"], tables["test"], tables["food_nutrients"], tables["nutrients"])

In [3]:
def merge_train_food_nutirents_nutrients():
    """Attach nutrient rows (and nutrient metadata) to every training food.

    Only food/nutrient rows with a strictly positive ``amount`` are kept.
    Both joins are inner joins: train <-> food_nutrients on ``idx`` and the
    result <-> nutrients on ``nutrient_id``.

    Returns:
        pandas.DataFrame: one row per (training food, nutrient) pair.
    """
    train, _test, food_nutrients, nutrients = get_data()
    positive_amounts = food_nutrients[food_nutrients["amount"] > 0]
    foods_with_amounts = train.merge(positive_amounts, on='idx')
    return foods_with_amounts.merge(nutrients, on="nutrient_id")

# probability functions

In [5]:
def common_words_category(word, col):
    """Percentage of foods in each category whose ``col`` text contains ``word``.

    The match is a case-sensitive substring test on ``str(value)``, so missing
    values are treated as the text ``"nan"``.

    Args:
        word (str): substring to look for.
        col (str): name of the training-table column to search
            (e.g. ``"brand"`` or ``"description"``).

    Returns:
        pandas.DataFrame: columns ``category`` and ``percentage_words``
        (0-100, rounded to 2 decimals), one row per category.
    """
    train, _test, _food_nutrients, _nutrients = get_data()
    train['words'] = train[col].apply(lambda value: 1 if word in str(value) else 0)
    # Aggregate only the indicator column. Summing the whole frame also tries to
    # "sum" the text columns, which raises TypeError (mixed str/NaN) on
    # pandas >= 2.0, where numeric_only no longer defaults to True.
    per_category = train.groupby("category")["words"].agg(["sum", "count"]).reset_index()
    per_category["percentage_words"] = round(100 * per_category["sum"] / per_category["count"], 2)
    return per_category[["category", "percentage_words"]]

In [None]:
def get_top_common_words(number_of_common_words, df_common_words, column, func, col_func_name, other_func, col_other_func_name, words_to_check=50):
    """Collect the top words of every category, ranked by each statistic in turn.

    For each of the six food categories the words are first ranked by ``func``
    (with ``other_func`` as the secondary value), then ranked again by
    ``other_func`` with the output columns flipped back, so ``percentage``
    always holds the ``func`` value and ``percentage_2`` the ``other_func``
    value.

    Args:
        number_of_common_words (int): rows to keep per category and ranking.
        df_common_words (pandas.DataFrame): candidate words in column ``Name``.
        column (str): training-table column passed to both statistic functions.
        func, other_func (callable): ``f(word, column)`` returning a table
            with a ``category`` column and a value column.
        col_func_name, col_other_func_name (str): value column names of
            ``func`` and ``other_func`` respectively.
        words_to_check (int): limit on candidate words, forwarded unchanged.

    Returns:
        pandas.DataFrame: columns ``word``, ``percentage``, ``percentage_2``,
        ``category`` with a fresh 0..n-1 index.
    """
    categories = (
        "cakes_cupcakes_snack_cakes",
        "candy",
        "chips_pretzels_snacks",
        "chocolate",
        "cookies_biscuits",
        "popcorn_peanuts_seeds_related_snacks",
    )
    collected = pd.DataFrame({"word": [], "percentage": [], "percentage_2": [], "category": []})

    # First pass: rank by ``func``.
    for cat in categories:
        ranked = get_top_common_words_of_cat(
            number_of_common_words, df_common_words, column, cat,
            func, col_func_name, other_func, col_other_func_name, words_to_check,
        )
        collected = pd.concat([collected, ranked], axis=0)
    # The un-flipped per-category tables carry a helper 'index' column.
    collected = collected.drop('index', axis=1)

    # Second pass: rank by ``other_func``; flip=True restores the column meaning.
    for cat in categories:
        ranked = get_top_common_words_of_cat(
            number_of_common_words, df_common_words, column, cat,
            other_func, col_other_func_name, func, col_func_name, words_to_check, True,
        )
        collected = pd.concat([collected, ranked], axis=0)
    return collected.reset_index(drop=True)

In [6]:
def category_if_word(word, col):
    """Probability (in percent) of each category given that ``word`` appears in ``col``.

    Computed with Bayes' rule,
    ``P(category | word) = P(word | category) * P(category) / P(word)``,
    where "word appears" is a case-sensitive substring test on ``str(value)``.

    Args:
        word (str): substring to look for.
        col (str): name of the training-table column to search.

    Returns:
        pandas.DataFrame: columns ``category`` and ``category_percentage``
        (0-100, rounded to 2 decimals). If the word never appears, the
        percentages are NaN (0/0).
    """
    train, _test, _food_nutrients, _nutrients = get_data()
    train['words'] = train[col].apply(lambda value: 1 if word in str(value) else 0)
    n_words = train["words"].sum()
    n = len(train)
    # Aggregate only the indicator column. Summing the whole frame also tries to
    # "sum" the text columns, which raises TypeError (mixed str/NaN) on
    # pandas >= 2.0, where numeric_only no longer defaults to True.
    per_category = train.groupby("category")["words"].agg(["sum", "count"]).reset_index()
    word_given_category = per_category["sum"] / per_category["count"]
    category_prior = per_category["count"] / n
    per_category["category_percentage"] = round(
        100 * word_given_category * category_prior / (n_words / n), 2
    )
    return per_category[["category", "category_percentage"]]

In [None]:
def get_top_common_words_of_cat(number_of_common_words, df_common_words, column, cat, func, col_func_name, other_func, col_other_func_name, words_to_check=50, flip=False):
    """Rank candidate words for one category by ``func`` and annotate with ``other_func``.

    Args:
        number_of_common_words (int): how many top-ranked words to return.
        df_common_words (pandas.DataFrame): candidate words in column ``Name``.
        column (str): training-table column passed to both statistic functions.
        cat (str): category whose values are extracted.
        func (callable): ``func(word, column)`` returning a table with a
            ``category`` column and a ``col_func_name`` column; used for ranking.
        col_func_name (str): value column of ``func``'s table.
        other_func (callable): same contract as ``func``; evaluated only for
            the selected top words.
        col_other_func_name (str): value column of ``other_func``'s table.
        words_to_check (int): only the first ``words_to_check`` candidate
            words are scored.
        flip (bool): if True, swap ``percentage`` and ``percentage_2`` in the
            result and omit the helper ``index`` column.

    Returns:
        pandas.DataFrame: columns ``index`` (position of the word among the
        scored candidates), ``word``, ``percentage``, ``percentage_2``,
        ``category``; without ``index`` and with the two value columns
        swapped when ``flip`` is True.

    Raises:
        KeyError: if ``cat`` (or the value column) is missing from a table
            returned by ``func``/``other_func``.
    """

    def _value_for_category(stat_func, value_col, word):
        # Pull the single value that stat_func reports for this word and category.
        table = stat_func(word, column)
        matches = table.loc[table['category'] == cat, value_col]
        if matches.empty:
            raise KeyError(f"category {cat!r} is missing from the table returned for word {word!r}")
        return matches.iloc[0]

    # The limit is checked BEFORE scoring a word, so exactly ``words_to_check``
    # words are considered (checking afterwards scored one word too many).
    scored = []
    for position, word in enumerate(df_common_words['Name']):
        if position >= words_to_check:
            break
        scored.append([word, _value_for_category(func, col_func_name, word), 0.0])
    # Build the frame once instead of growing it row by row.
    table_sum = pd.DataFrame(scored, columns=["word", "percentage", "percentage_2"])

    out_table = (
        table_sum.sort_values(['percentage'], ascending=False)
        .head(number_of_common_words)
        .reset_index()
    )
    # The secondary statistic is only needed for the words that made the cut.
    out_table["percentage_2"] = [
        _value_for_category(other_func, col_other_func_name, word) for word in out_table['word']
    ]
    out_table['category'] = cat
    if flip:
        return pd.DataFrame({
            "word": out_table['word'],
            "percentage": out_table["percentage_2"],
            "percentage_2": out_table["percentage"],
            'category': out_table['category'],
        })
    return out_table

# brand functions

In [8]:
def get_expert_brands(num_of_foods, num_of_categories):
    """Brands with at least ``num_of_foods`` products spread over exactly ``num_of_categories`` categories.

    Args:
        num_of_foods (int): minimum number of training products of the brand
            (inclusive).
        num_of_categories (int): exact number of distinct categories the
            brand's products must belong to.

    Returns:
        pandas.DataFrame: columns ``brand``, ``num_of_foods``,
        ``num_of_categories``, sorted by ``num_of_foods`` descending.
    """
    train, _test, _food_nutrients, _nutrients = get_data()
    brand_stats = (
        train.groupby("brand")
        .agg(num_of_foods=("idx", "count"), num_of_categories=("category", "nunique"))
        .reset_index()
    )
    # Brands without any known category never qualify.
    brand_stats = brand_stats[brand_stats["num_of_categories"] > 0]
    # ">=" implements the documented "at least" contract (and agrees with
    # get_brands_expert_one_category); a strict ">" silently dropped brands
    # with exactly num_of_foods products.
    qualifies = (
        (brand_stats["num_of_foods"] >= num_of_foods)
        & (brand_stats["num_of_categories"] == num_of_categories)
    )
    return brand_stats[qualifies].sort_values(["num_of_foods"], ascending=False)

In [1]:
def get_brands_expert_one_category(num_of_foods):
    """Brands whose products all fall in one category, with at least ``num_of_foods`` products.

    Args:
        num_of_foods (int): minimum number of products (inclusive).

    Returns:
        pandas.DataFrame: columns ``brand``, ``category``, ``num_of_foods``,
        sorted by ``num_of_foods`` descending with a fresh 0..n-1 index.
    """
    train, _test, _food_nutrients, _nutrients = get_data()

    # Number of distinct categories each brand appears in.
    categories_per_brand = train.groupby(["brand"], as_index=False).agg({'category': ['nunique']})
    categories_per_brand.columns = ["brand", "num_categories"]
    single_category_brands = categories_per_brand[categories_per_brand["num_categories"] == 1]

    # Row counts per (brand, category); the 'idx' count is the number of foods.
    foods_per_brand_category = train.groupby(["brand", "category"], as_index=False).count()
    candidates = (
        single_category_brands.merge(foods_per_brand_category, on="brand")[['brand', 'category', 'idx']]
        .rename(columns={'idx': 'num_of_foods'})
    )
    big_enough = candidates[candidates['num_of_foods'] >= num_of_foods]
    return big_enough.sort_values(['num_of_foods'], ascending=False).reset_index(drop=True)

In [9]:
def expert_brands_category(num_of_foods):
    """Return each single-category ("expert") brand together with its category.

    Expert brands are whatever ``get_expert_brands(num_of_foods, 1)`` returns,
    i.e. brands whose products all belong to exactly one category.

    Args:
        num_of_foods (int): product-count threshold forwarded to
            ``get_expert_brands``.

    Returns:
        pandas.DataFrame: columns ``brand``, ``num_of_foods``, ``category``.
    """
    experts = get_expert_brands(num_of_foods, 1)
    train, _test, _food_nutrients, _nutrients = get_data()
    wanted_columns = ["brand", "num_of_foods", "category", "num_of_categories"]
    brand_rows = experts.merge(train, on="brand")[wanted_columns]
    # Collapse the per-product rows to one row per (brand, num_of_foods, category).
    collapsed = brand_rows.groupby(["brand", "num_of_foods", "category"], as_index=False).count()
    return collapsed[["brand", "num_of_foods", "category"]]

In [6]:
def plot_expert_brands(num_of_foods):
    """Bar plot of single-category brands and how many products each makes.

    The x axis shows brand names, the y axis the number of products, and the
    bar colour the brand's category. Brands come from
    ``expert_brands_category(num_of_foods)``.

    Args:
        num_of_foods (int): product-count threshold for a brand to be shown.
    """
    expert_brands = expert_brands_category(num_of_foods)
    plot_data = expert_brands.sort_values(["category", "num_of_foods"], ascending=False)
    plt.figure(figsize=(7, 7))
    ax = sns.barplot(
        x="brand",
        y="num_of_foods",
        hue="category",
        data=plot_data,
        dodge=False,
    )
    # Tilt the brand names so long labels do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    ax.set_title(f"brands that make at least {num_of_foods} products for one category and 0 products for other categories")
    plt.show()

In [None]:
def get_prob_brands(min_num_of_foods, precentage):
    """Brands that concentrate at least ``precentage`` percent of their products in one category.

    Args:
        min_num_of_foods (int): minimum number of products a brand must make
            (inclusive).
        precentage (float): minimum share (0-100) of the brand's products
            that must belong to a single category (inclusive).

    Returns:
        pandas.DataFrame: columns ``category``, ``brand``, ``num_of_foods``,
        ``precentage`` -- one row per qualifying (brand, category) pair,
        sorted by category and share, both descending.
    """
    train, _test, _food_nutrients, _nutrients = get_data()

    # Total products per brand, restricted to sufficiently large brands.
    foods_per_brand = (
        train.groupby(["brand"], as_index=False).count()[["brand", "idx"]]
        .rename(columns={'idx': 'num_of_foods'})
    )
    large_brands = foods_per_brand[foods_per_brand['num_of_foods'] >= min_num_of_foods]

    # Products per (brand, category) pair.
    foods_per_brand_category = (
        train.groupby(["brand", "category"], as_index=False).count()[["brand", "category", "idx"]]
        .rename(columns={'idx': 'num_foods_category'})
    )

    shares = large_brands.merge(foods_per_brand_category, on="brand")
    shares['precentage'] = round(100 * shares.num_foods_category / shares.num_of_foods, 2)
    dominant = shares[shares.precentage >= precentage]
    dominant = dominant[['category', 'brand', 'num_of_foods', 'precentage']]
    return dominant.sort_values(["category", "precentage"], ascending=False).reset_index(drop=True)

# Text transformation functions

In [10]:
def to_english(words):
    """Strip everything except ASCII letters and spaces from each string.

    Cleaned strings shorter than two characters are dropped. Spaces are kept,
    so leading/trailing whitespace of an item survives the cleaning.

    Args:
        words (iterable of str): strings to clean.

    Returns:
        str: the surviving cleaned strings joined with ``','``.
    """
    cleaned = (re.sub(r'[^a-zA-Z ]', '', token) for token in words)
    return ','.join(text for text in cleaned if len(text) >= 2)

In [11]:
def add_words_to_dict(cat, dic, words):
    """Count comma-separated words into the counter dictionary of one category.

    Args:
        cat: category key; it is converted with ``str`` before the lookup.
        dic (dict): maps category name -> ``{word: count}``; the inner
            dictionary for ``cat`` is updated in place and must already exist.
        words (str): comma-separated words; each occurrence adds 1.

    Returns:
        None
    """
    counts = dic[str(cat)]
    for token in words.split(','):
        counts[token] = counts.get(token, 0) + 1
    return None

In [7]:
def find_common_words(words_dict, col, limit, func):
    """For each category, list the words whose ``func`` value reaches ``limit``.

    Args:
        words_dict (iterable): words to evaluate (iterating a dict yields its keys).
        col (str): column name forwarded to ``func``.
        limit (number): inclusive threshold.
        func (callable): ``func(word, col)`` returning a table whose first
            two columns are the category name and the value to compare
            (the value is read by position, not by name).

    Returns:
        dict: category name -> list of words that fulfil the condition.
    """
    category_names = (
        "cakes_cupcakes_snack_cakes",
        "candy",
        "chips_pretzels_snacks",
        "chocolate",
        "cookies_biscuits",
        "popcorn_peanuts_seeds_related_snacks",
    )
    output = {name: [] for name in category_names}
    for word in words_dict:
        table = func(word, col)
        for cat, selected in output.items():
            rows = table[table["category"] == cat].reset_index()
            # After reset_index the columns are (old index, category, value).
            value = rows.iloc[0, 2]
            if value >= limit:
                selected.append(word)
    return output

# ingredients functions

In [12]:
def get_common_ingredients():
    """Count, per category, how many times each cleaned ingredient string occurs.

    The ``ingredients`` text of every training food is split on commas,
    cleaned with ``to_english`` and tallied into the counter of the food's
    category. Only the six categories hard-coded below are supported; any
    other category value raises ``KeyError``.

    Returns:
        pandas.DataFrame: columns ``category``, ``ingredient``,
        ``number_of_uses``, sorted by category and count (both descending)
        with a fresh 0..n-1 index.
    """
    train, _test, _food_nutrients, _nutrients = get_data()
    counts_by_category = {
        name: {}
        for name in (
            "cakes_cupcakes_snack_cakes",
            "candy",
            "chips_pretzels_snacks",
            "chocolate",
            "cookies_biscuits",
            "popcorn_peanuts_seeds_related_snacks",
        )
    }
    train['modify_ing'] = train['ingredients'].apply(lambda raw: to_english(str(raw).split(',')))
    # apply() is used purely for its side effect of filling counts_by_category.
    train.apply(lambda row: add_words_to_dict(row['category'], counts_by_category, row['modify_ing']), axis=1)

    per_category = []
    for name, counts in counts_by_category.items():
        frame = pd.DataFrame.from_dict(counts, orient='index').reset_index()
        frame = frame.rename(columns={'index': 'ingredient', 0: 'number_of_uses'})
        frame['category'] = name
        per_category.append(frame)

    # Outer merge on every column stacks the tables; it also sorts the rows by
    # the join keys, which fixes the order of ties in the final sort.
    combined = per_category[0]
    for frame in per_category[1:]:
        combined = pd.merge(combined, frame, how='outer', on=['ingredient', 'category', 'number_of_uses'])
    combined = combined[['category', 'ingredient', 'number_of_uses']]
    return combined.sort_values(['category', 'number_of_uses'], ascending=False).reset_index(drop=True)

In [13]:
def intersection_common_ing(num_of_ing):
    """Return the ``num_of_ing`` most used ingredients of every category.

    Args:
        num_of_ing (int): number of ingredients to keep per category.

    Returns:
        dict: category name -> list of ingredient strings, taken from the
        top of the (already sorted) table of ``get_common_ingredients``.
    """
    ingredient_counts = get_common_ingredients()
    return {
        name: ingredient_counts[ingredient_counts['category'] == name]
        .head(num_of_ing)['ingredient']
        .to_list()
        for name in ingredient_counts.category.unique()
    }

In [5]:
def check_in_sentence(words_list, sentence):
    """Return 1 if every word of ``words_list`` occurs in ``sentence``, else 0.

    Uses the ``in`` operator, so for a string ``sentence`` this is a
    substring test. An empty ``words_list`` yields 1.
    """
    return int(all(word in sentence for word in words_list))

def plot_common_ing(num_of_ing, plot_size=5):
    """Plot, per category, where foods containing its top ingredients actually belong.

    For every category the ``num_of_ing`` most common ingredients are taken
    from ``intersection_common_ing``. One bar chart is drawn per category,
    showing for each food category the probability (in percent, Bayes' rule)
    that a food belongs to it given that its cleaned ingredient text contains
    all of those ingredients.

    Args:
        num_of_ing (int): number of top ingredients per category.
        plot_size (int): currently unused; kept for interface compatibility.

    Returns:
        None
    """
    common_ing_dict = intersection_common_ing(num_of_ing)
    train, _test, _food_nutrients, _nutrients = get_data()
    cleaned_ingredients = train['ingredients'].apply(lambda raw: to_english(str(raw).split(',')))
    n = len(train)
    for cat, top_ingredients in common_ing_dict.items():
        # 1 when the food's ingredient text contains every top ingredient of `cat`.
        has_all = cleaned_ingredients.apply(lambda text: check_in_sentence(top_ingredients, text))
        n_words = has_all.sum()
        # Aggregate only the indicator. Summing the whole training frame also
        # tries to "sum" its text columns, which raises TypeError (mixed
        # str/NaN) on pandas >= 2.0, where numeric_only defaults to False.
        per_category = has_all.groupby(train["category"]).agg(["sum", "count"]).reset_index()
        share_with_ingredients = per_category["sum"] / per_category["count"]
        category_prior = per_category["count"] / n
        per_category["category_percentage"] = round(
            100 * share_with_ingredients * category_prior / (n_words / n), 1
        )
        plt.figure()
        ax = sns.barplot(x="category", y="category_percentage", data=per_category)
        ax.set_title(f"{cat} with {num_of_ing} most common ing {top_ingredients}")
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
        ax.set_ylim([0, 100])
        ax.bar_label(ax.containers[0])
    return None

# nutrients functions

In [4]:
def get_unq_values_cat():
    """Count, per category, how many other categories lack each of its nutrients.

    A nutrient "appears" in a category when at least one training food of
    that category has a positive amount of it (the positive-amount filter is
    applied by ``merge_train_food_nutirents_nutrients``).

    Returns:
        dict: category name -> ``{nutrient name: k}`` where ``k`` is the
        number of OTHER categories in which the nutrient does not appear.
        Nutrients that appear in every category are omitted.
    """
    # Only the distinct (category, nutrient) pairs matter. Taking a groupby
    # mean over the whole merged frame fails on its text columns with
    # pandas >= 2.0, and the mean itself is never used.
    pairs = (
        merge_train_food_nutirents_nutrients()[["category", "name"]]
        .dropna()
        .drop_duplicates()
    )
    nutrients_by_category = {
        cat: set(names) for cat, names in pairs.groupby("category")["name"]
    }

    category_dict = {}
    for cat, own_nutrients in nutrients_by_category.items():
        missing_counts = {}
        for other_cat, other_nutrients in nutrients_by_category.items():
            if other_cat == cat:
                continue
            # Nutrients present in `cat` but absent from `other_cat`.
            for nutrient in own_nutrients - other_nutrients:
                missing_counts[nutrient] = missing_counts.get(nutrient, 0) + 1
        category_dict[cat] = missing_counts
    return category_dict

In [15]:
def get_top_n_nutrients(n):
    """Return, for each of the six categories, its ``n`` most frequent nutrients.

    Frequency is the number of merged food/nutrient rows per
    (category, nutrient name) pair.

    Args:
        n (int): number of nutrients to keep per category.

    Returns:
        pandas.DataFrame: columns ``category``, ``name``, ``amount`` (the row
        count), with the categories stacked in the fixed order listed below.
    """
    usage = (
        merge_train_food_nutirents_nutrients()
        .groupby(["category", "name"], as_index=False)
        .count()[["category", "name", "amount"]]
        .sort_values(["category", "amount"], ascending=False)
    )
    category_order = (
        "cakes_cupcakes_snack_cakes",
        "candy",
        "chips_pretzels_snacks",
        "chocolate",
        "cookies_biscuits",
        "popcorn_peanuts_seeds_related_snacks",
    )
    top_rows = [usage[usage["category"] == cat].head(n) for cat in category_order]
    return pd.concat(top_rows, axis=0, join='inner')

In [16]:
def get_products_with_top_n_nutrient(n):
    """Products that match all ``n`` top nutrients of their own category.

    Each product's nutrient rows are joined with the top-``n`` nutrient table
    of ``get_top_n_nutrients`` on (category, nutrient name); a product is
    kept when exactly ``n`` rows match.
    NOTE(review): assumes one row per (product, nutrient) -- confirm.

    Args:
        n (int): number of top nutrients per category.

    Returns:
        pandas.DataFrame: columns ``category``, ``idx``, ``amount`` (the
        number of matched nutrients, always equal to ``n``).
    """
    food_nutrients_long = merge_train_food_nutirents_nutrients()
    top_nutrient = get_top_n_nutrients(n)
    hits_per_product = (
        food_nutrients_long.merge(top_nutrient, on=["category", "name"])
        .groupby(["category", "idx"], as_index=False)
        .count()[["category", "idx", "amount_x"]]
        .rename(columns={"amount_x": "amount"})
    )
    return hits_per_product[hits_per_product["amount"] == n]

In [None]:
def probabilty_of_product_to_have_top_n_nutrient_of_his_category(n):
    """Bar plot of the share of products that have all top-``n`` nutrients of their category.

    "Top n nutrients" are the ``n`` nutrients that appear most often in a
    category. For every category the bar shows the percentage of its training
    products that matched all ``n`` of them.

    Args:
        n (int): number of top nutrients per category.

    Returns:
        None
    """
    train, _test, _food_nutrients, _nutrients = get_data()
    matching = get_products_with_top_n_nutrient(n)
    category_order = [
        "cakes_cupcakes_snack_cakes",
        "candy",
        "chips_pretzels_snacks",
        "chocolate",
        "cookies_biscuits",
        "popcorn_peanuts_seeds_related_snacks",
    ]
    # Fraction of each category's products that matched all n nutrients.
    shares = [
        len(matching[matching["category"] == cat]) / len(train[train["category"] == cat])
        for cat in category_order
    ]
    final_table = pd.DataFrame({"category": category_order, "percentage": shares})
    final_table['percentage'] = round(100 * final_table['percentage'], 2)
    ax = sns.barplot(x="category", y="percentage", data=final_table)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    ax.set_title(f"percentage of product to have {n} top nutrients")
    ax.set_ylim([0, 100])
    ax.bar_label(ax.containers[0])
    return None

# Image functions

In [12]:
# Default image folders, used as default arguments by the image plotting
# helpers below. Both paths are relative to the current working directory.
path_to_train_images = "./images/train/"
path_to_test_images = "./images/test/"

In [None]:
# creates table of  that includes for each test image his width and height
def test_images_dims(folder_path):
    onlyfiles = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]
    output= pd.DataFrame({"image":[], "height": [], "width": []})
    for file in onlyfiles:
        filepath = f"{folder_path}/{file}"
        img = Image.open(filepath)
        width = img.width
        height = img.height
        img_name = int(file[:-4])
        output.loc[len(output)] = [img_name, height, width]
    return(output)

In [None]:
def train_images_dims(folder_path):
    """Build a table with the height, width and category of every training image.

    ``folder_path`` is expected to contain one sub-folder per category; the
    sub-folder name is stored as the image's category. The part of each file
    name before the last dot must be an integer image id.

    Args:
        folder_path (str): folder that contains the category sub-folders.

    Returns:
        pandas.DataFrame: columns ``image`` (integer id), ``height``,
        ``width``, ``category``, one row per image file.

    Raises:
        ValueError: if a file name (without its extension) is not an integer.
    """
    records = []
    for dir_name in listdir(folder_path):
        dir_path = join(folder_path, dir_name)
        if isfile(dir_path):
            # Regular files at the top level are not category folders.
            continue
        for file_name in listdir(dir_path):
            file_path = join(dir_path, file_name)
            if not isfile(file_path):
                continue
            # The context manager releases the file handle right away;
            # otherwise handles pile up while looping over many images.
            with Image.open(file_path) as img:
                width, height = img.width, img.height
            # Drop the extension whatever its length (a fixed [:-4] slice
            # only works for three-letter extensions).
            image_id = int(file_name.rsplit(".", 1)[0])
            records.append([image_id, height, width, dir_name])
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=["image", "height", "width", "category"])

In [None]:
def plot_img_attribute(type_of_img, atr, train_path=path_to_train_images, test_path=path_to_test_images):
    """Plot a density histogram of one image dimension for the train or test images.

    Args:
        type_of_img (str): which image group to plot, ``'train'`` or ``'test'``.
        atr (str): dimension column to plot (e.g. ``'height'`` or ``'width'``).
        train_path (str): folder of the training images.
        test_path (str): folder of the test images.

    Returns:
        pandas.DataFrame: the dimension table that was plotted.

    Raises:
        ValueError: if ``type_of_img`` is neither ``'train'`` nor ``'test'``.
    """
    if type_of_img not in ('train', 'test'):
        raise ValueError("type_of_img has to be train or test")
    if type_of_img == 'train':
        dims = train_images_dims(train_path)
    else:
        dims = test_images_dims(test_path)

    plt.hist(dims[atr], density=True)
    plt.xlim([0, 180])  # fixed axis range shared by all dimension plots
    plt.title(f"{type_of_img} data")
    plt.ylabel('density')
    plt.xlabel(atr)
    plt.show()
    return dims

In [None]:
def compre_train_test_img(atr='height', train_path=path_to_train_images, test_path=path_to_test_images):
    """Overlay density histograms of one dimension for train and test images.

    Args:
        atr (str): dimension column to compare (e.g. ``'height'`` or ``'width'``).
        train_path (str): folder of the training images.
        test_path (str): folder of the test images.
    """
    train_table = train_images_dims(train_path)
    test_table = test_images_dims(test_path)
    plt.figure(figsize=(8, 6))
    # Semi-transparent bars so both distributions stay visible where they overlap.
    for label, dims in (("train", train_table), ("test", test_table)):
        plt.hist(dims[atr], alpha=0.5, label=label, density=True)
    plt.xlabel(atr, size=14)
    plt.ylabel("density", size=14)
    plt.title(f"compare train images {atr} to test images {atr}")
    plt.xlim([0, 180])
    plt.legend()

In [11]:
def compare_dims_in_group(type_of_img, train_path=path_to_train_images, test_path=path_to_test_images):
    """Scatter-plot image width against height for the train or test images.

    A black diagonal (width == height) is drawn for reference.

    Args:
        type_of_img (str): which image group to plot, ``'train'`` or ``'test'``.
        train_path (str): folder of the training images.
        test_path (str): folder of the test images.

    Raises:
        ValueError: if ``type_of_img`` is neither ``'train'`` nor ``'test'``.
    """
    if type_of_img not in ('train', 'test'):
        raise ValueError("type_of_img has to be train or test")
    if type_of_img == 'train':
        dims = train_images_dims(train_path)
    else:
        dims = test_images_dims(test_path)

    plt.figure(figsize=(8, 6))
    plt.scatter(dims.width, dims.height)
    plt.xlabel('width', size=14)
    plt.ylabel("height", size=14)
    plt.title(f"compare {type_of_img} width to {type_of_img} height")
    plt.xlim([0, 180])
    plt.ylim([0, 180])
    # Reference line where width equals height.
    diagonal = list(range(180))
    plt.plot(diagonal, diagonal, color='black')
    plt.show()