In [66]:
import pandas as pd
import numpy as np

import altair as alt
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS

In [67]:
# Notebook-wide options: allow up to 250,000 rows in Altair chart data and
# show every column when a DataFrame is displayed.
alt.data_transformers.enable(max_rows=250000)
pd.set_option('display.max_columns', None)

# Load the USDA food table and replace missing values with 0.
df = pd.read_csv(
    filepath_or_buffer="usda-grp-nomiss.csv",
    sep=',',
    header=0,
    index_col=None,
    lineterminator='\n',
).replace(np.nan, 0)

# Meat-based food groups: beef, poultry, pork, sausages and luncheon meats.
protein_groups = [
    "Beef Products",
    "Poultry Products",
    "Pork Products",
    "Sausages and Luncheon Meats",
]
dfProtein = df.loc[df['FdGrp_Desc'].isin(protein_groups)]
# Not rendered: only the last expression of a cell is displayed.
dfProtein['FdGrp_Desc'].value_counts()

# Fruit and vegetable food groups.
veggies_fruits_groups = [
    "Fruits and Fruit Juices",
    "Vegetables and Vegetable Products",
]
dfVeggie_Fruits = df.loc[df['FdGrp_Desc'].isin(veggies_fruits_groups)]
# Row count per fruit/vegetable food group (this is the cell's output).
dfVeggie_Fruits['FdGrp_Desc'].value_counts()

Vegetables and Vegetable Products    254
Fruits and Fruit Juices              120
Name: FdGrp_Desc, dtype: int64

In [68]:
# Nutrient column names of the USDA table, grouped by category.
macros = [
    'Protein', 'Carbohydrt', 'Fiber_TD',
    'Water', 'Cholestrl', 'Lipid_Tot',
]
vitamins = [
    'Vit_C', 'Vit_E', 'Vit_D', 'Vit_D.1',
    'Vit_K', 'Vit_B12', 'Vit_B6', 'Vit_A_IU',
]
vitaminBs = [
    'Thiamin', 'Riboflavin', 'Niacin', 'Panto_Acid', 'Folic_Acid',
]
minerals = [
    'Calcium', 'Iron', 'Magnesium', 'Phosphorus', 'Potassium',
    'Sodium', 'Zinc', 'Copper', 'Manganese', 'Selenium',
]

# Raw value matrices (NumPy arrays) for the protein food groups,
# one per nutrient category.
dfmacros, dfvitamins, dfminerals = (
    dfProtein[nutrient_cols].values
    for nutrient_cols in (macros, vitamins, minerals)
)

# Summary columns of the full table, wrapped in a one-element list.
columns = ["FdGrp_Desc", "Water", "Energ_Kcal", "Protein", "Carbohydrt", "Sugar_Tot"]
data = [df[columns]]

def _mean_by_food_group_chart(value_column, mean_field, title):
    """Return a bar chart of the mean of ``value_column`` per food group.

    The mean is computed by Altair over ``df``, grouped by ``FdGrp_Desc``,
    and stored in a field named ``mean_field`` that drives the y axis.
    """
    return alt.Chart(df).mark_bar().encode(
        x='FdGrp_Desc:N',
        y=f'{mean_field}:Q',
    ).transform_aggregate(
        groupby=["FdGrp_Desc"],
        **{mean_field: f'mean({value_column})'},
    ).properties(
        title=title,
    )


carb = _mean_by_food_group_chart(
    'Carbohydrt', 'AVG_Carbs', 'Average Carbs Across Food Groups'
)
protein = _mean_by_food_group_chart(
    'Protein', 'AVG_Protein', 'Average Proteins Across Food Groups'
)

# Show the two averages side by side.
macrosChart = alt.vconcat(carb | protein)
macrosChart

In [69]:
# Standardize the mineral columns of the protein food groups with
# StandardScaler so they are on a comparable scale before clustering.
scaler = StandardScaler()
scaled_minerals = scaler.fit_transform(dfminerals)

In [70]:
# K-means clustering

# Protein food groups

In [71]:
# Standardize the macronutrient columns so that every nutrient is on a
# comparable scale before distances are computed.
scaler = StandardScaler()
scaled_mac = scaler.fit_transform(dfmacros)

# Cluster the products into 4 groups.
# random_state makes the cluster labels (and therefore the chart colours)
# reproducible across runs, matching the seeded t-SNE below; n_init is pinned
# so the result does not depend on the installed scikit-learn's default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
labels_macros = kmeans.fit(scaled_mac).predict(scaled_mac)

# 2-D t-SNE embedding of the same standardized data, used only for plotting.
result_tsne_mac = TSNE(n_components=2, n_iter=800, random_state=0).fit_transform(scaled_mac)

# One row per product: embedding coordinates, k-means label, food group.
df_macros = pd.DataFrame(data=result_tsne_mac, columns=['x', 'y'])
df_macros['Macros'] = labels_macros
df_macros['food_group'] = dfProtein['FdGrp_Desc'].values

# Left panel: embedding coloured by k-means cluster.
macrosChart_TSNE = alt.Chart(df_macros).mark_circle().encode(
    x='x:Q',
    y='y:Q',
    color='Macros:N'
).properties(title = "TSNE Macronutrients for Protein Groups")

# Right panel: the same embedding coloured by the actual food group.
macrosGroup = alt.Chart(df_macros).mark_circle().encode(
    x='x:Q',
    y='y:Q',
    color=alt.Color('food_group:O', scale=alt.Scale(scheme='dark2'))
).properties(title = "Normalized Macronutrients for Protein Groups")

final1 = alt.vconcat(macrosChart_TSNE | macrosGroup)
final1

# Observation: the outlying products come from Pork Products and
# Sausages and Luncheon Meats.

In [82]:
# Standardize the vitamin columns first, as the macro and mineral cells do.
# The vitamin columns are on very different numeric scales, so unscaled
# distances would be dominated by whichever column has the largest values.
scaled_vitamins = StandardScaler().fit_transform(dfvitamins)

# Cluster the products into 4 groups.
# random_state makes the labels reproducible (consistent with the seeded
# t-SNE below); n_init is pinned so the result does not depend on the
# installed scikit-learn's default.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
labels_vitamins = kmeans.fit(scaled_vitamins).predict(scaled_vitamins)

# 2-D t-SNE embedding of the same standardized data, used only for plotting.
result_tsne_vitamins = TSNE(n_components=2, n_iter=600, random_state=0).fit_transform(scaled_vitamins)

# One row per product: embedding coordinates, k-means label, food group.
df_vitamins = pd.DataFrame(data=result_tsne_vitamins, columns=['x', 'y'])
df_vitamins['vitamins'] = labels_vitamins
df_vitamins['food_group'] = dfProtein['FdGrp_Desc'].values

# Left panel: embedding coloured by k-means cluster.
vitaminsChart_TSNE = alt.Chart(df_vitamins).mark_circle().encode(
    x='x:Q',
    y='y:Q',
    color='vitamins:N'
)

# Right panel: the same embedding coloured by the actual food group.
vitaminsGroup = alt.Chart(df_vitamins).mark_circle().encode(
    x='x:Q',
    y='y:Q',
    color=alt.Color('food_group:O', scale=alt.Scale(scheme='dark2'))
)

final2 = alt.vconcat(vitaminsChart_TSNE | vitaminsGroup).properties(title = "Vitamins for Protein Groups")
final2

In [75]:
# Cluster the standardized mineral values into 4 groups.
# Fit and predict with the estimator created here, not with a `kmeans`
# object left over from another cell, so this cell does not depend on
# hidden kernel state. random_state makes the labels reproducible
# (consistent with the seeded t-SNE below); n_init is pinned so the result
# does not depend on the installed scikit-learn's default.
kmeansMinerals = KMeans(n_clusters=4, n_init=10, random_state=0)
labels_minerals = kmeansMinerals.fit(scaled_minerals).predict(scaled_minerals)

# 2-D t-SNE embedding of the same standardized data, used only for plotting.
result_tsne_minerals = TSNE(n_components=2, random_state=0).fit_transform(scaled_minerals)

# One row per product: embedding coordinates, k-means label, food group.
df_minerals = pd.DataFrame(data=result_tsne_minerals, columns=['x', 'y'])
df_minerals['minerals'] = labels_minerals
df_minerals['food_group'] = dfProtein['FdGrp_Desc'].values

# Left panel: embedding coloured by k-means cluster.
mineralsChart_TSNE = alt.Chart(df_minerals).mark_circle().encode(
    x='x:Q',
    y='y:Q',
    color='minerals:N'
)

# Right panel: the same embedding coloured by the actual food group.
mineralsGroup = alt.Chart(df_minerals).mark_circle().encode(
    x='x:Q',
    y='y:Q',
    color=alt.Color('food_group:O', scale=alt.Scale(scheme='dark2'))
)

final3 = alt.vconcat(mineralsChart_TSNE | mineralsGroup).properties(title = "Minerals for Protein Groups")
final3