In [1]:
import pandas as pd
import matplotlib
%matplotlib inline

# Let pandas render up to 100 rows before truncating displayed frames
pd.set_option('display.max_rows', 100)

In [2]:
# Source spreadsheets, keyed by a short name used in later cells
files = dict(
    diet_example='https://regepi.bwh.harvard.edu/health/Oxalate/files/Low%20Oxalate%20Diet%20Example.xls',
    oxalate_content='https://regepi.bwh.harvard.edu/health/Oxalate/files/Oxalate%20Content%20of%20Foods.xls',
    table_of_foods='https://regepi.bwh.harvard.edu/health/Oxalate/files/Oxalate%20Table%20of%20Foods.xls',
)

In [3]:
# Load the "oxalate_content" workbook straight from its URL
raw_data = pd.read_excel(io=files['oxalate_content'])

In [4]:
# Work on a deep copy so raw_data keeps the data exactly as loaded
df = raw_data.copy(deep=True)

# Filter Data

In [5]:
# Keep only the columns used in the rest of the notebook
df = df.loc[:, [
    'Food Group',
    'Food Item',
    'Serving size',
    'OxalateCategory',
    'Oxalate Value',
]]

In [6]:
#drop rows that have no Food Item
df = df.dropna(subset=['Food Item'])

In [7]:
#forward fill Food Groups: a missing group takes the last non-null group above it
#(Series.ffill() replaces the deprecated fillna(method='ffill') spelling)
df['Food Group'] = df['Food Group'].ffill()

In [8]:
#removes the "mg" suffix if applicable
def remove_mg(value):
    """Return ``value`` without a trailing ``'mg'`` suffix.

    Args:
        value: A single cell value, e.g. ``'48mg'``.

    Returns:
        The string with its last two characters removed when it ends with
        ``'mg'``; otherwise ``value`` unchanged. Non-string inputs (numbers,
        NaN, None) are returned unchanged as well.
    """
    # Non-strings have no .endswith(); pass them through instead of raising
    # AttributeError on numeric or missing cells.
    if not isinstance(value, str):
        return value
    if value.endswith('mg'):
        return value[:-2]
    # Without this explicit return the function fell off the end and returned
    # None, silently discarding any value that had no "mg" suffix.
    return value


In [9]:
#Strip the "mg" suffix from every value, then convert Oxalate Value to a numeric dtype.
#pd.to_numeric is called once on the whole Series rather than once per element
#through .apply, which is the vectorized form of the same conversion.
df['Oxalate Value'] = pd.to_numeric(df['Oxalate Value'].apply(remove_mg))

In [10]:
# (Food Group, Food Item) pairs, captured before duplicate items are dropped below
item_to_group_mapping = df.loc[:, ['Food Group', 'Food Item']]

In [11]:
#Some food items are listed under more than one food group; keep only the
#first row seen for each Food Item.
#NOTE(review): the recorded output of the "high oxalate" cell still lists
#"Miso" twice, so either that output is stale or the two names are not
#exactly equal strings (e.g. stray whitespace) -- confirm.
is_repeat = df.duplicated(subset="Food Item", keep="first")
df = df[~is_repeat]


In [12]:
#Distinct Food Groups, in order of first appearance
food_groups = df['Food Group'].unique().tolist()

#Every Food Item, one entry per remaining row
food_items = df['Food Item'].tolist()


In [13]:
#Summary statistics (count, mean, std, min, quartiles, max) for the cleaned data
df.describe()

Unnamed: 0,Oxalate Value
count,503.0
mean,14.94831
std,54.213733
min,0.0
25%,1.0
50%,4.0
75%,13.0
max,755.0


# Interquartile Range and Outliers

In [14]:
#First and third quartiles of the oxalate values
q1, q3 = (df['Oxalate Value'].quantile(p) for p in (.25, .75))

#Interquartile range: spread of the middle half of the data
iqr = q3 - q1

#Values at or above Q3 + 3 * IQR are treated as outliers in the cells below
outlier_threshold = q3 + 3 * iqr
outlier_threshold

49.0

In [15]:
#High in oxalate but not an outlier: q3 <= value < outlier_threshold
high_oxalate = df[
    (df['Oxalate Value'] >= q3) & (df['Oxalate Value'] < outlier_threshold)
]
high_oxalate.sort_values(by='Oxalate Value', ascending=False)

Unnamed: 0,Food Group,Food Item,Serving size,OxalateCategory,Oxalate Value
7,Whole Fruits,Raspberries,1 cup,Very High,48
647,Other Cereal Brands,Nabisco Honey Nut Shredded Wheat Bite Size,1 cup,Very High,47
547,Kellogg's,Raisin Bran,1 cup,Very High,46
648,Other Cereal Brands,Spoonsize Shredded Wheat,1 cup,Very High,45
646,Other Cereal Brands,Nabisco Shredded Wheat,2 biscuits,Very High,42
549,Kellogg's,Raisin Squares Mini-Wheats,3/4 cup,Very High,41
577,Post Cereals,"Fruit & Fiber Dates, Raisins & Walnuts",1 cup,Very High,41
211,"Pastas, Rice & Grains",Miso,1 cup,Very High,40
396,"Spreads, Sauces & Toppings",Miso,1 cup,Very High,40
73,Vegetables,Yams,"1/2 cup, cubed",Very High,40


In [16]:
#Foods whose oxalate value is at or above the outlier threshold
outliers = df.loc[df['Oxalate Value'].ge(outlier_threshold)]

In [17]:
#Show the outliers, highest oxalate value first
outliers.sort_values(by='Oxalate Value', ascending=False)

Unnamed: 0,Food Group,Food Item,Serving size,OxalateCategory,Oxalate Value
69,Vegetables,"Spinach, cooked",1/2 cup,Very High,755
70,Vegetables,"Spinach, raw",1 cup,Very High,656
67,Vegetables,Rhubarb,1/2 cup,Very High,541
212,"Pastas, Rice & Grains",Rice Bran,1 cup,Very High,281
204,"Pastas, Rice & Grains",Buckwheat Groats,1 cup cooked,Very High,133
285,Nuts and Seeds,Almonds,1 oz or 22 kernels,Very High,122
493,Soups,Miso Soup,1 cup,Very High,111
214,"Pastas, Rice & Grains",Wheat Berries,1 cup cooked,Very High,98
120,Potatoes,Baked Potato with Skin,1 medium,Very High,97
206,"Pastas, Rice & Grains",Corn Grits,1 cup,Very High,97
