Table of Contents:
* [1. Introduction](#Introduction)
* [2. Foods by Nutrient](#Foods-by-Nutrient)
* [3. Nutrients by Food](#Nutrients-by-Food)

In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import difflib

In [None]:
import re

In [None]:
food = pd.read_csv('../input/food-and-nutrient-data/food.csv')

In [None]:
food_nutrient = pd.read_csv('../input/food-and-nutrient-data/food_nutrient.csv')

In [None]:
nutrient = pd.read_csv('../input/food-and-nutrient-data/nutrient.csv')

# Introduction

In [None]:
food.head()

In [None]:
food.info()

In [None]:
food['description'] = food['description'].astype(str)

In [None]:
f'number of unique food items {pd.unique(food.iloc[:, 2]).size}'

In [None]:
food_nutrient.head()

In [None]:
food_nutrient.info()

In [None]:
food_nutrient['footnote'].unique()

In [None]:
food_nutrient[food_nutrient['footnote']=='Trace amount']

In [None]:
f'number of unique nutrients {pd.unique(food_nutrient.iloc[:, 2]).size}'

In [None]:
nutrient.head()

In [None]:
nutrient.info()

In [None]:
plt.figure(figsize=(15, 5))
sns.countplot(x='unit_name', data=nutrient, order=nutrient['unit_name'].value_counts().index)

Merging with nutrient dataframe for names of nutrients.

In [None]:
# Join every food-nutrient record to its nutrient definition (nutrient.id == food_nutrient.nutrient_id).
# The record's own `id` column is dropped before the join.
nutrients_with_names = pd.merge(
    nutrient,
    food_nutrient.drop(columns='id'),
    left_on='id',
    right_on='nutrient_id',
)

In [None]:
nutrients_with_names.head()

In [None]:
nutrients_with_names.shape

In [None]:
nutrients_with_names.isnull().sum()

In [None]:
nutrients_with_names = nutrients_with_names[['nutrient_id', 'name', 'amount', 'unit_name', 'fdc_id']]

Merging `food` and `nutrients_with_names` to make a combined dataframe.

In [None]:
df = pd.merge(food, nutrients_with_names, on='fdc_id')
df

# Foods by Nutrient

Most mentioned food items

In [None]:
nutrients = pd.unique(df['name'])
sorted(nutrients)

In [None]:
f'number of unique nutrient names {nutrients.size}'

In [None]:
vitamins = ['Vitamin A, RAE', 'Thiamin', 'Riboflavin', 'Niacin', 'Pantothenic acid', 'Vitamin B-6', 'Biotin', 'Folate, total', 'Vitamin B-12',
 'Vitamin C, total ascorbic acid', 'Vitamin D (D2 + D3)', 'Vitamin D2 (ergocalciferol)', 'Vitamin D3 (cholecalciferol)', 'Vitamin E (alpha-tocopherol)', 
 'Vitamin K (Dihydrophylloquinone)', 'Vitamin K (Menaquinone-4)', 'Vitamin K (phylloquinone)']

In [None]:
minerals = ['Potassium, K', 'Sodium, Na', 'Calcium, Ca', 'Phosphorus, P', 'Magnesium, Mg', 'Iron, Fe', 'Zinc, Zn', 'Manganese, Mn',
             'Copper, Cu', 'Iodine, I', 'Selenium, Se', 'Molybdenum, Mo', 'Cobalt, Co', 'Nickel, Ni', 'Boron, B']

measurement unit of vitamins

In [None]:
# For each vitamin, print the unit recorded on its first matching row in df.
for vitamin_name in vitamins:
    first_unit = df.loc[df['name'] == vitamin_name, 'unit_name'].iloc[0]
    print(first_unit, 'is unit of measurement for', vitamin_name)

measurement unit of minerals

In [None]:
# For each mineral, print the unit recorded on its first matching row in df.
for mineral_name in minerals:
    rows_for_mineral = df[df['name'] == mineral_name]
    print(rows_for_mineral['unit_name'].iloc[0], 'is unit of measurement for', mineral_name)

In [None]:
df.groupby('description').count().sort_values(by='amount', ascending=False).head(10)

Most mentioned nutrients

In [None]:
df.groupby(['name']).count().sort_values(by='amount', ascending=False).head(10)

In [None]:
matches = difflib.get_close_matches('vitamin D', nutrients, n=15, cutoff=.2)
matches

In [None]:
r = re.compile("Choline")

In [None]:
set(filter(r.match, nutrients))

In [None]:
df[df['name']=='Calcium, Ca'].sort_values(by='amount', ascending=False).head(10)

## Finding by Modifying Description

When we sort foods by the amount of a nutrient they contain (protein, for example), many near-identical food items end up next to each other. To list distinct foods, we can split the description into parts and drop the rows whose leading parts are duplicates.

In [None]:
def filter_description(df, unique_words_in_description, pattern=r",|;"):
    """Keep only the first row for each distinct leading part of ``description``.

    Each description is split on ``pattern``; two rows count as duplicates when
    their first ``unique_words_in_description`` split parts are identical. The
    first occurrence (in the frame's current row order) is kept, so sort the
    frame before calling if a particular representative is wanted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``description`` column.
    unique_words_in_description : int
        Number of leading split parts that must be unique (at least 1). Values
        larger than the number of parts produced by the split are clamped.
    pattern : str, default r",|;"
        Regular expression used to split the description.

    Returns
    -------
    pandas.DataFrame
        The retained rows of ``df``, in their original order.

    Raises
    ------
    ValueError
        If ``unique_words_in_description`` is smaller than 1.
    """
    if unique_words_in_description < 1:
        raise ValueError("unique_words_in_description must be at least 1")

    parts = df['description'].str.split(pattern, expand=True)
    if parts.shape[1] == 0:
        # Nothing was split (e.g. an empty frame), so there is nothing to de-duplicate.
        return df.copy()

    # Clamp so that asking for more parts than exist compares all of them
    # instead of failing on a missing column.
    n_parts = min(unique_words_in_description, parts.shape[1])
    first_seen = ~parts.duplicated(subset=list(range(n_parts)))

    # Select by position: a label-based lookup would return every row that
    # shares an index label with a kept row when the index is not unique.
    return df[first_seen.to_numpy()]

In [None]:
filter_description(df[df['name']=='Vitamin K (phylloquinone)'].sort_values(by='amount', ascending=False), 1).head(10)

# Nutrients by Food

## making wide dataframe

Removing duplicated nutrient names

In [None]:
# Nutrient names that appear more than once (every occurrence after the first).
nutrient.loc[nutrient['name'].duplicated(), 'name']

In [None]:
nutrient[nutrient['name']=='Energy']

In [None]:
# Check that the KCAL and kJ 'Energy' rows list the same fdc_ids in the same order.
# Raw arrays are compared (not two Series) so that selections of different length
# yield False instead of raising on the mismatched comparison.
energy_rows = df[df['name'] == 'Energy']
kcal_ids = energy_rows.loc[energy_rows['unit_name'] == 'KCAL', 'fdc_id'].to_numpy()
kj_ids = energy_rows.loc[energy_rows['unit_name'] == 'kJ', 'fdc_id'].to_numpy()
np.array_equal(kcal_ids, kj_ids)

In [None]:
df[df['nutrient_id']==1062].index

In [None]:
df_1 = df.drop(df[df['nutrient_id']==1062].index)
df_1

In [None]:
df_1.columns

In [None]:
# Drop the columns that are not used in the wide table.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError on missing columns.
df_1 = df_1.drop(columns=['data_type', 'food_category_id', 'publication_date'], errors='ignore')

Converting nutrients which are listed in rows to columns

In [None]:
# Pivot the long table to wide form:
#   1. index the rows by (fdc_id, description, name) and keep only `amount`;
#   2. unstack() moves the innermost index level (`name`) into the columns, so each
#      nutrient name becomes its own column of amounts;
#   3. reset_index() turns fdc_id and description back into ordinary columns;
#   4. rename_axis(None, axis=1) clears the leftover name on the columns axis.
# NOTE(review): unstack() needs every (fdc_id, description, name) combination to be
# unique — confirm this holds for df_1 beyond the duplicated 'Energy' name handled above.
wide_df = df_1.set_index(['fdc_id', 'description', 'name'])['amount'].unstack().reset_index().rename_axis(None, axis=1)
wide_df

Number of food items with vitamins

In [None]:
wide_df.count()[vitamins]

In [None]:
wide_df.count()[vitamins].plot(kind='bar', figsize=(10,5))

Number of food items with minerals

In [None]:
wide_df.count()[minerals]

In [None]:
wide_df.count()[minerals].plot(kind='bar', figsize=(10,5))

Top foods ranked by Vitamin B-6 content

In [None]:
wide_df.sort_values(by='Vitamin B-6', ascending=False)[['description', 'Vitamin B-6']].head(10)

Foods with a high protein-to-carbohydrate ratio

In [None]:
wide_df.loc[(wide_df['Protein']/wide_df['Carbohydrate, by summation']).sort_values(ascending=False).index] \
    [['description', 'Protein', 'Carbohydrate, by summation']].head(10)

In [None]:
# Fuzzy-search the food descriptions for entries resembling 'Egg'.
# `df` has one row per food/nutrient pair, so each description is repeated many
# times; searching the de-duplicated descriptions keeps one food from filling
# several of the n result slots (and avoids scoring the same string repeatedly).
matches = difflib.get_close_matches('Egg', df['description'].unique().tolist(), n=15, cutoff=.5)
matches

In [None]:
r = re.compile('Egg')

In [None]:
set(filter(r.match, list(df['description'])))

## high vitamins and minerals

Recommended Dietary Allowance (RDA): the average daily level of intake sufficient to meet the nutrient requirements of nearly all (97%–98%) healthy people. Values are taken from https://ods.od.nih.gov/HealthInformation/Dietary_Reference_Intakes.aspx

In [None]:
# Daily reference amount for each mineral, keyed by the nutrient name used as a
# column label in the wide table. Written out explicitly (rather than zipped
# against a slice of `minerals`) so that reordering that list cannot silently
# pair a mineral with another mineral's value.
# NOTE(review): amounts are presumably expressed in each nutrient's dataset unit
# (see `unit_name`) — confirm against the cited reference.
minerals_rda = {
    'Potassium, K': 3400,
    'Sodium, Na': 1500,
    'Calcium, Ca': 1000,
    'Phosphorus, P': 700,
    'Magnesium, Mg': 400,
    'Iron, Fe': 18,
    'Zinc, Zn': 11,
    'Manganese, Mn': 2.3,
    'Copper, Cu': .9,
    'Iodine, I': 150,
    'Selenium, Se': 55,
    'Molybdenum, Mo': 45,
}

foods by their mineral content as a percentage of Recommended Dietary Allowance

In [None]:
# Express each mineral amount as a percentage of its reference value.
# The reference values are turned into a Series so the division is aligned to the
# columns by nutrient name, instead of relying on the positional order of a
# dict view; the columns are selected with a plain list for the same reason.
wide_df_minerals = wide_df[['description']].join(
    wide_df[list(minerals_rda)].div(pd.Series(minerals_rda), axis='columns') * 100
)
wide_df_minerals

In [None]:
wide_df_minerals.loc[wide_df_minerals.loc[:,minerals_rda.keys()].sum(axis=1).sort_values(ascending=False).index].head(10)

In [None]:
wide_df['description'].str.contains('Salt').tail(10)

In [None]:
wide_df_minerals.loc[wide_df_minerals[~wide_df['description'].str.contains('Salt')].loc[:,minerals_rda.keys()].sum(axis=1).sort_values(ascending=False).index].round(2).head(10)

In [None]:
# Daily reference amount for each vitamin, keyed by the nutrient name used as a
# column label in the wide table. Written out explicitly (rather than zipped
# against index slices of `vitamins`) so that the name/value pairing can be read
# directly and survives any reordering of that list.
# NOTE(review): the vitamin D value is attached to 'Vitamin D3 (cholecalciferol)'
# only, not to 'Vitamin D (D2 + D3)' — confirm that this is intended.
# NOTE(review): amounts are presumably expressed in each nutrient's dataset unit
# (see `unit_name`) — confirm against the cited reference.
vitamins_rda = {
    'Vitamin A, RAE': 900,
    'Thiamin': 1.2,
    'Riboflavin': 1.3,
    'Niacin': 16,
    'Pantothenic acid': 5,
    'Vitamin B-6': 1.3,
    'Biotin': 30,
    'Folate, total': 400,
    'Vitamin B-12': 2.4,
    'Vitamin C, total ascorbic acid': 90,
    'Vitamin D3 (cholecalciferol)': 15,
    'Vitamin E (alpha-tocopherol)': 15,
    'Vitamin K (phylloquinone)': 120,
}

foods by their vitamin content as a percentage of Recommended Dietary Allowance

In [None]:
# Express each vitamin amount as a percentage of its reference value.
# The reference values are turned into a Series so the division is aligned to the
# columns by nutrient name, instead of relying on the positional order of a
# dict view; the columns are selected with a plain list for the same reason.
wide_df_vitamins = wide_df[['description']].join(
    wide_df[list(vitamins_rda)].div(pd.Series(vitamins_rda), axis='columns') * 100
)
wide_df_vitamins

In [None]:
wide_df_vitamins.loc[wide_df_vitamins.loc[:,vitamins_rda.keys()].sum(axis=1).sort_values(ascending=False).index].round(2).head(10)

Filtering description to view different food items

In [None]:
def filter_description(df, unique_words_in_description, pattern=r",|;|-"):
    """Keep only the first row for each distinct leading part of ``description``.

    Each description is split on ``pattern`` (by default commas, semicolons and
    hyphens); two rows count as duplicates when their first
    ``unique_words_in_description`` split parts are identical. The first
    occurrence (in the frame's current row order) is kept, so sort the frame
    before calling if a particular representative is wanted.

    Note: this definition rebinds the name ``filter_description``; cells run
    after it use the hyphen-splitting default unless they pass ``pattern``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``description`` column.
    unique_words_in_description : int
        Number of leading split parts that must be unique (at least 1). Values
        larger than the number of parts produced by the split are clamped.
    pattern : str, default r",|;|-"
        Regular expression used to split the description.

    Returns
    -------
    pandas.DataFrame
        The retained rows of ``df``, in their original order.

    Raises
    ------
    ValueError
        If ``unique_words_in_description`` is smaller than 1.
    """
    if unique_words_in_description < 1:
        raise ValueError("unique_words_in_description must be at least 1")

    parts = df['description'].str.split(pattern, expand=True)
    if parts.shape[1] == 0:
        # Nothing was split (e.g. an empty frame), so there is nothing to de-duplicate.
        return df.copy()

    # Clamp so that asking for more parts than exist compares all of them
    # instead of failing on a missing column.
    n_parts = min(unique_words_in_description, parts.shape[1])
    first_seen = ~parts.duplicated(subset=list(range(n_parts)))

    # Select by position: a label-based lookup would return every row that
    # shares an index label with a kept row when the index is not unique.
    return df[first_seen.to_numpy()]

In [None]:
filter_description(wide_df_minerals.loc[wide_df_minerals.loc[:,minerals_rda.keys()].sum(axis=1).sort_values(ascending=False).index], 1).head(10)

In [None]:
filter_description(wide_df_vitamins.loc[wide_df_vitamins.loc[:,vitamins_rda.keys()].sum(axis=1).sort_values(ascending=False).index], 1).head(10)

In [None]:
# Mineral percentages for the foods whose description starts with 'Egg'.
# The pattern is applied here directly rather than through the global `r`, which
# is rebound between cells ('Choline' earlier, 'Egg' later) and therefore depends
# on run order; this also avoids rescanning every row of the long `df`.
wide_df_minerals.loc[
    wide_df_minerals['description'].str.match('Egg', na=False),
    ['description'] + list(minerals_rda),
]