**1. Importing Necessary packages**
---

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import accuracy_score

**2. Reading CSVs**
---

In [None]:
# Load the three per-profile CSV tables; they are displayed as-is in section 3.
# NOTE(review): `adult` is rebound to a plain Python list in section 4, so this
# DataFrame is only reachable until that cell has run.
adult = pd.read_csv("avg_adult.csv")
fball_player = pd.read_csv("football_player.csv")
rock = pd.read_csv("rock.csv")

In [None]:
# Food tables used by the visualisation cells in section 5.
data_food = pd.read_csv("food_data.csv")
nutrient_food = pd.read_csv("food_nutrient.csv")
portion_food = pd.read_csv("food_portion.csv")
sample_result = pd.read_csv("sub_sample_result.csv")
# Table used by the scaling / PCA / VAT / k-NN cells; the first row is the header.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
food =  pd.read_csv("food_nutrient_AHS.csv", header=0,low_memory=False)
# Inputs for the DRI comparison at the end of the notebook.
FoodData = pd.read_csv('fooddata (1).csv')
DRI = pd.read_csv('DRI_data (1).csv')

**3. Insight into our data**
---

In [None]:
# Preview the first rows of food_data.csv.
data_food.head()

In [None]:
# Preview the first rows of food_nutrient.csv.
nutrient_food.head()

In [None]:
# Preview the first rows of food_portion.csv.
portion_food.head()

In [None]:
# Show the whole avg_adult.csv table (this is still the DataFrame at this point).
adult

In [None]:
# Show the whole football_player.csv table.
fball_player

In [None]:
# Show the whole rock.csv table.
rock

**4. Additional Data**
--

In [None]:
# Hard-coded per-nutrient values, one list per profile. The seven positions line up
# with the `Nutrients` labels used by the bar charts in section 5.
# NOTE(review): units and the source of these numbers are not stated here — confirm.
# NOTE(review): `adult` rebinds the DataFrame loaded from avg_adult.csv in section 2.
adult = [50,70,24,310,90,2.3,30]
fball = [162,139,24,463,17,2.3,35]
Rock = [507,112,34,571,62.64,2.57,68.9]

In [None]:
# Keep only the portions whose gram_weight exceeds 300, with the two columns
# needed by the gram-weight scatter plot.
heavy_portion_mask = portion_food["gram_weight"] > 300
d2 = portion_food.loc[heavy_portion_mask, ['id', 'gram_weight']]

**5. Data visualisation**
---

# Plotting the nutritional requirement for : Average adult VS Average football player

In [None]:
Nutrients = ["Protein","Fat","Fatty_acids","carbs","Sugars","Salt","Diet_fibre"]
indices = np.arange(len(Nutrients))

width = 0.20

# Use an explicit Axes and label each bar series directly, so the legend cannot
# drift out of sync with the order of the bar() calls.
fig, ax = plt.subplots()
ax.bar(indices, adult, width = width, label = "Average Adult")
ax.bar(indices + width, fball, width = width, label = "Footballer Player")

# Centre each tick between the two bars of its group instead of under the first bar.
ax.set_xticks(indices + width / 2)
ax.set_xticklabels(Nutrients)

ax.set_xlabel("Nutrients")
ax.set_ylabel("Nutritional requirement")
ax.set_title("Average adult vs Football player")
ax.legend()

plt.show()

# Plotting the nutritional requirement for : Average football player VS The Rock

In [None]:
Nutrients = ["Protein","Fat","Fatty_acids","carbs","Sugars","Salt","Diet_fibre"]
indices = np.arange(len(Nutrients))

width = 0.20

# Use an explicit Axes and label each bar series directly, so the legend cannot
# drift out of sync with the order of the bar() calls.
fig, ax = plt.subplots()
ax.bar(indices, fball, width = width, label = "Footballer Player")
ax.bar(indices + width, Rock, width = width, label = "The Rock")

# Centre each tick between the two bars of its group instead of under the first bar.
ax.set_xticks(indices + width / 2)
ax.set_xticklabels(Nutrients)

ax.set_xlabel("Nutrients")
ax.set_ylabel("Nutritional requirement")
ax.set_title("Football player vs The Rock")
ax.legend()

plt.show()

# Plotting the nutritional requirement for : Average adult VS The Rock

In [None]:
Nutrients = ["Protein","Fat","Fatty_acids","carbs","Sugars","Salt","Diet_fibre"]
indices = np.arange(len(Nutrients))

width = 0.20

# Use an explicit Axes and label each bar series directly, so the legend cannot
# drift out of sync with the order of the bar() calls.
fig, ax = plt.subplots()
ax.bar(indices, Rock, width = width, label = "The Rock")
ax.bar(indices + width, adult, width = width, label = "Average Adult")

# Centre each tick between the two bars of its group instead of under the first bar.
ax.set_xticks(indices + width / 2)
ax.set_xticklabels(Nutrients)

ax.set_xlabel("Nutrients")
ax.set_ylabel("Nutritional requirement")
ax.set_title("The Rock vs Average Adult")
ax.legend()

plt.show()

**Gram Weight distribution based on id**
---

In [None]:
# One marker per heavy portion: x is the portion id, y and colour are its gram weight.
heavy_portion_trace = go.Scatter(
    x = d2['id'],
    y = d2['gram_weight'],
    mode = 'markers',
    marker_color = d2['gram_weight'],
)
fig = go.Figure(data = [heavy_portion_trace])
fig.show()

**Count of food-items in different categories**
---

In [None]:
# Count the rows of data_food per Category; the figure is the cell's displayed output.
px.histogram(
    data_frame = data_food,
    x = 'Category',
    barmode = 'group',
)

**Carbohydrate composition of food in different categories**
---

In [None]:
# Carbohydrate value of every food, coloured by the same value.
cols = ['Description', 'Data.Carbohydrate']
d5 = data_food[cols]

carbohydrate_trace = go.Scatter(
    x = d5['Description'],
    y = d5['Data.Carbohydrate'],
    mode = 'markers',
    name = 'Carbohydrate',
    marker = {'color': d5['Data.Carbohydrate'], 'colorscale': "Viridis"},
)
fig = go.Figure(data = [carbohydrate_trace])
fig.update_layout(width = 1200, height = 1200, autosize = False)
fig.show()

**Cholesterol composition of food in different categories**
---

In [None]:
# Cholesterol value of every food, coloured by the same value.
cols = ['Description','Data.Cholesterol']

d5 = data_food.loc[ : , cols ]

fig = go.Figure()
# The trace name was a copy-paste leftover ('Carbohydrate'); it now matches the plotted column.
fig.add_trace(go.Scatter(x = d5['Description'], y = d5['Data.Cholesterol'], mode='markers', name = 'Cholesterol', marker = dict(color = d5['Data.Cholesterol'], colorscale = "Viridis")))
fig.update_layout(width = 1200, height = 1200, autosize = False)
fig.show()

**Kilo-calorie composition of food in different categories**
---

In [None]:
# Kilocalorie value of every food, coloured by the same value.
cols = ['Description','Data.Kilocalories']

d5 = data_food.loc[ : , cols ]

fig = go.Figure()
# The trace name was a copy-paste leftover ('Carbohydrate'); it now matches the plotted column.
fig.add_trace(go.Scatter(x = d5['Description'], y = d5['Data.Kilocalories'], mode='markers', name = 'Kilocalories', marker = dict(color = d5['Data.Kilocalories'], colorscale = "Viridis")))
fig.update_layout(width = 1200, height = 1200, autosize = False)
fig.show()

**Plot of nutrient composition for Milk**
---

In [None]:
col_selected = ['Description','Data.Carbohydrate','Data.MajorMinerals.Calcium','Data.Vitamins.Vitamin_B12', 'Data.MajorMinerals.Zinc', 'Data.Vitamins.Vitamin_K', 'Data.Vitamins.Vitamin_E', 'Data.Vitamins.Vitamin_C', 'Data.MajorMinerals.Sodium', 'Data.MajorMinerals.Potassium', 'Data.MajorMinerals.Phosphorus']
# Every selected column except the description is plotted as its own trace.
col_final = col_selected[1:]

# The original wrapped this assignment in `for i in range(0, 10)`, which only
# repeated the identical filter; a single evaluation gives the same frame.
d4 = data_food.loc[data_food["Category"]=="MILK", col_selected]

d5 = d4[col_final]

fig = go.Figure()

for nutrient in col_final:
    fig.add_trace(go.Scatter(x = d4['Description'], y = d5[nutrient], mode='markers+lines', name = nutrient))

fig.update_layout(width = 1000, height = 1200, autosize = False)
fig.show()


**Plot of nutrient composition for Egg**
---

In [None]:
col_selected = ['Description','Data.Carbohydrate','Data.MajorMinerals.Calcium','Data.Vitamins.Vitamin_B12', 'Data.MajorMinerals.Zinc', 'Data.Vitamins.Vitamin_K', 'Data.Vitamins.Vitamin_E', 'Data.Vitamins.Vitamin_C', 'Data.MajorMinerals.Sodium', 'Data.MajorMinerals.Potassium', 'Data.MajorMinerals.Phosphorus']
# Every selected column except the description is plotted as its own trace.
col_final = col_selected[1:]

# The original wrapped this assignment in `for i in range(0, 10)`, which only
# repeated the identical filter; a single evaluation gives the same frame.
d4 = data_food.loc[data_food["Category"]=="EGG", col_selected]

d5 = d4[col_final]

fig = go.Figure()

for nutrient in col_final:
    fig.add_trace(go.Scatter(x = d4['Description'], y = d5[nutrient], mode='markers+lines', name = nutrient))

fig.update_layout(width = 1000, height = 1200, autosize = False)
fig.show()


**Plot of nutrient composition for Babyfood**
---

In [None]:
col_selected = ['Description','Data.Carbohydrate','Data.MajorMinerals.Calcium','Data.Vitamins.Vitamin_B12', 'Data.MajorMinerals.Zinc', 'Data.Vitamins.Vitamin_K', 'Data.Vitamins.Vitamin_E', 'Data.Vitamins.Vitamin_C', 'Data.MajorMinerals.Sodium', 'Data.MajorMinerals.Potassium', 'Data.MajorMinerals.Phosphorus']
# Every selected column except the description is plotted as its own trace.
col_final = col_selected[1:]

# The original wrapped this assignment in `for i in range(0, 20)`, which only
# repeated the identical filter; a single evaluation gives the same frame.
d4 = data_food.loc[data_food["Category"]=="BABYFOOD", col_selected]

d5 = d4[col_final]

fig = go.Figure()

for nutrient in col_final:
    fig.add_trace(go.Scatter(x = d4['Description'], y = d5[nutrient], mode='markers+lines', name = nutrient))

fig.update_layout(width = 1200, height = 1200, autosize = False)
fig.show()


**Plot of nutrient composition for Chicken**
---

In [None]:
col_selected = ['Description','Data.Carbohydrate','Data.MajorMinerals.Calcium','Data.Vitamins.Vitamin_B12', 'Data.MajorMinerals.Zinc', 'Data.Vitamins.Vitamin_K', 'Data.Vitamins.Vitamin_E', 'Data.Vitamins.Vitamin_C', 'Data.MajorMinerals.Sodium', 'Data.MajorMinerals.Potassium', 'Data.MajorMinerals.Phosphorus']
# Every selected column except the description is plotted as its own trace.
col_final = col_selected[1:]

# The original wrapped this assignment in `for i in range(0, 20)`, which only
# repeated the identical filter; a single evaluation gives the same frame.
d4 = data_food.loc[data_food["Category"]=="CHICKEN", col_selected]

d5 = d4[col_final]

fig = go.Figure()

for nutrient in col_final:
    fig.add_trace(go.Scatter(x = d4['Description'], y = d5[nutrient], mode='markers+lines', name = nutrient))

fig.update_layout(width = 1200, height = 1200, autosize = False)
fig.show()


**Plot of nutrient against its gram/DV**
---

In [None]:
# Keep the nutrient label and its adjusted amount, turn zeros into NaN and drop
# every row that then contains a missing value.
sample_result = (
    sample_result[['nutrient_name', 'adjusted_amount']]
    .replace(0, np.nan)
    .dropna(axis = 0, how = 'any')
)
# One point per nutrient: the last remaining row for each name.
d1 = sample_result.drop_duplicates('nutrient_name', keep='last')

amount_trace = go.Scatter(
    x = d1['nutrient_name'],
    y = d1['adjusted_amount'],
    mode = 'markers',
    marker = {'color': d1['adjusted_amount'], 'colorscale': "Agsunset"},
)
fig = go.Figure(data = [amount_trace])
fig.update_layout(width = 1200, height = 1200, autosize = False)
fig.show()

**ML algorithm on our data**
---

In [None]:
def VAT(R):
    """Reorder a dissimilarity matrix with the VAT (Visual Assessment of cluster Tendency) scheme.

    Parameters
    ----------
    R : array-like
        Either a square dissimilarity matrix, or an (n_objects, n_features)
        array. A non-square input is converted to pairwise distances with
        ``squareform(pdist(R))``. A feature matrix that happens to be square is
        taken as a dissimilarity matrix as-is.

    Returns
    -------
    tuple
        ``(RV, C, I)`` where ``RV`` is the reordered matrix as nested lists,
        ``I`` is the visiting order expressed as original row indices, and
        ``C[k]`` (for k >= 2) is the position inside ``I`` of the already
        ordered object that was closest to object ``I[k]`` when it was added.
        The first two entries of ``C`` are fixed to 1.
        NOTE(review): that looks like a carry-over from a 1-based
        implementation — confirm before relying on ``C[0:2]``.
    """
    R = np.array(R)
    N, M = R.shape
    if N != M:
        R = squareform(pdist(R))
    remaining = list(range(0, N))

    # Seed: take the column holding the overall largest dissimilarity and start
    # from the row at which that maximum occurs.
    column_max = np.max(R, axis = 0)
    row_of_max = np.argmax(R, axis = 0)
    first = row_of_max[np.argmax(column_max)]
    del remaining[first]

    # Second object: the remaining object closest to the seed.
    second = remaining.pop(np.argmin(R[first, remaining], axis = 0))
    order = [first, second]
    links = [1, 1]

    # Repeatedly append the remaining object that is closest to any object
    # already ordered, until exactly one object is left.
    for _ in range(2, N - 1):
        sub = R[order, :][:, remaining]
        nearest_dist = np.min(sub, axis = 0)
        nearest_pos = np.argmin(sub, axis = 0)
        pick = np.argmin(nearest_dist)
        links.extend([nearest_pos[pick]])
        order.extend([remaining.pop(pick)])

    # Attach whatever is left (at most one object for N >= 2).
    last_links = np.argmin(R[order, :][:, remaining], axis = 0)
    order.extend(remaining)
    links.extend(last_links)

    RV = R[order, :][:, order]
    return RV.tolist(), links, order


def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    probs : array-like of float
        Probabilities of each outcome (pandas Series, NumPy array or list).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the strictly positive entries.
    """
    p = np.asarray(probs, dtype = float)
    # Zero-probability outcomes contribute nothing (0 * log 0 is taken as 0);
    # without this filter they would turn the whole result into NaN.
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

def mutual_info(df):
    """Compute entropies and (normalised) mutual information of the first two columns.

    The marginal entropies come from the value frequencies of column 0 and
    column 1; the joint entropy comes from the frequencies of the value
    combinations over all columns of ``df``.

    Returns a dict with keys ``'H_<col0>'``, ``'H_<col1>'``, ``'MI'`` and
    ``'NMI'``, where NMI is MI divided by the smaller marginal entropy.
    """
    first_name, second_name = list(df)[0], list(df)[1]

    # Marginal distributions and their entropies.
    marginal_x = df.iloc[:, 0].value_counts(normalize = True, sort = False)
    marginal_y = df.iloc[:, 1].value_counts(normalize = True, sort = False)
    Hx = entropy(marginal_x)
    Hy = entropy(marginal_y)

    # Joint distribution over the observed value combinations.
    joint_counts = df.groupby(list(df.columns.values)).size()
    joint_probs = joint_counts / joint_counts.values.sum()
    H_xy = entropy(joint_probs)

    shared = Hx + Hy - H_xy
    normalised = shared / min(Hx, Hy)

    return {
        'H_' + first_name: Hx,
        'H_' + second_name: Hy,
        'MI': shared,
        'NMI': normalised,
    }

In [None]:
# Columns at positions 4..56 are used as the continuous feature block.
# NOTE(review): the 4:57 positions are tied to the CSV layout — confirm if the file changes.
scaler = StandardScaler()
continuous_feature = pd.DataFrame(food.iloc[:, 4:57])

# Fit the scaler on these features, then standardise them with the fitted parameters.
scaler.fit(continuous_feature)
foodscaled = scaler.transform(continuous_feature)

In [None]:
# Binary label: 1 when the energy column exceeds 1000, otherwise 0.
is_high_energy = food['Energy, with dietary fibre (kJ)'] > 1000
food['EnergyLevel'] = np.where(is_high_energy, 1, 0)

**PCA based on energy**
--

In [None]:
# Project the standardised features onto their first two principal components.
pca = PCA(n_components = 2)
foodreduced = pca.fit_transform(foodscaled)

reduced = pd.DataFrame({
    "1st principle component": foodreduced[:, 0].tolist(),
    "2nd principle component": foodreduced[:, 1].tolist(),
    'EnergyLevel': food['EnergyLevel'],
})
food_high = reduced[reduced['EnergyLevel'] == 1]
food_low = reduced[reduced['EnergyLevel'] == 0]

fig, ax = plt.subplots()

# High-energy foods are drawn first (red), then low-energy foods (blue).
for subset, colour, caption in (
    (food_high, 'red', 'High energy'),
    (food_low, 'blue', 'Low energy'),
):
    ax.scatter(
        subset["1st principle component"],
        subset["2nd principle component"],
        color = colour,
        label = caption,
        s = 1.5,
    )

leg = ax.legend()
ax.set_xlabel('1st Principal Component')
ax.set_ylabel('2nd Principal Component')
ax.set_title('Scatter plot about 1st PCs vs 2nd PCs based on Energy Level')
plt.show()

**Heatmap on dissimilarities of Categories 13, 20 and 24**
---

In [None]:
# The food category is the first two characters of the survey id.
food['Survey ID'] = food['Survey ID'].astype(str)
food['Food Category'] = food['Survey ID'].str.slice(0,2)

# Pick the scaled rows of categories 13, 20 and 24 with one boolean mask instead
# of a Python loop that indexed the column label-by-label for every row.
in_selected_categories = food['Food Category'].isin(['13', '20', '24']).to_numpy()
foodscaledsample = foodscaled[in_selected_categories]

RV, C, I = VAT(foodscaledsample)

x = sns.heatmap(RV, cmap='gist_stern', xticklabels=False, yticklabels=False)
x.set(xlabel='Objects', ylabel='Objects')
plt.title('Heatmap of foodscaledsample')
plt.show()

**K-NN on Scaled values wrt Category**
---

In [None]:
# Fix the shuffle seed so the 80/20 split, and therefore the accuracies reported
# below, are the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split( foodscaled, food['Food Category'], train_size=0.8, test_size=0.2, random_state=42)

In [None]:
# Fit and score a k-NN classifier for each neighbourhood size.
# The original passed a stray `1` to print() after each score (it looks like a
# misplaced round() argument), so every line ended with " 1". The scores are now
# rounded explicitly to three decimals instead.
for k in (1, 3):
    print(f'k={k}')
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_train)
    print('Train accuracy: ', round(accuracy_score(y_train, y_pred), 3))

    y_pred = knn.predict(X_test)
    print('Test accuracy: ', round(accuracy_score(y_test, y_pred), 3))

In [None]:
# FoodData column names expected for each DRI nutrient (every DRI column except the group label).
neededColumns = [ 'Data.Major Minerals.' + column for column in DRI.drop('Life-Stage Group', axis=1).columns ]

# Keep the two identifying columns plus the nutrients that have a DRI entry, in a
# single selection rather than reassigning FoodData once per dropped column.
# Unlike the drop-based loop, this also works when the cell is re-run after
# 'Description' has already been removed.
keptColumns = [
    column for column in FoodData.columns
    if column in ('Category', 'Description') or column in neededColumns
]
FoodData = FoodData[keptColumns]

In [None]:
# Average the nutrient columns per category. numeric_only=True skips the text
# 'Description' column, on which mean() raises a TypeError in pandas >= 2.0.
FoodData = FoodData.groupby('Category', as_index=False).mean(numeric_only=True)

In [None]:
def DemographicKey(age:float, sex:str):
    """Build the 'Life-Stage Group' label for an age and a sex/subclass.

    Parameters
    ----------
    age : float
        Age in years (e.g. 0.5 for six months).
    sex : str
        One of male/m, female/f, preg/pregnant/p, lact/lactation/l
        (case-insensitive, surrounding whitespace ignored). It is not consulted
        for ages up to 8, whose labels carry no suffix.

    Returns
    -------
    str
        For example ``'0-6 m'``, ``'4-8 y'`` or ``'19-30 y M'``.

    Raises
    ------
    ValueError
        If ``age`` is above 8 and ``sex`` is not one of the recognised values.
        The function previously fell through and returned ``None`` in this case.
    """
    # Up to 8 years the label has no sex suffix.
    for upper, label in ((0.5, '0-6 m'), (1.0, '7-12 m'), (3.0, '1-3 y'), (8.0, '4-8 y')):
        if age <= upper:
            return label

    key = '71+ y '
    for upper, label in ((13.0, '9-13 y '), (18.0, '14-18 y '), (30.0, '19-30 y '),
                         (50.0, '31-50 y '), (70.0, '51-70 y ')):
        if age <= upper:
            key = label
            break

    suffixes = {
        'male': 'M', 'm': 'M',
        'female': 'F', 'f': 'F',
        'preg': 'P', 'pregnant': 'P', 'p': 'P',
        'lact': 'L', 'lactation': 'L', 'l': 'L',
    }
    normalized = sex.strip().lower()
    if normalized not in suffixes:
        raise ValueError(f"Unrecognised sex/subclass {sex!r}; expected one of {sorted(suffixes)}")
    return key + suffixes[normalized]

In [None]:
neededColumns = FoodData.drop('Category', axis=1).columns

# Give the DRI nutrient columns the same prefix as FoodData. Columns that already
# carry the prefix are left alone, so re-running this cell does not prefix twice.
DRI = DRI.rename(
    lambda col: col if col == 'Life-Stage Group' or col.startswith('Data.Major Minerals.')
    else 'Data.Major Minerals.' + col,
    axis=1,
)

# Keep the group label plus the nutrients that also exist in FoodData.
# The original loop called DRI.drop(...) without assigning the result, so nothing
# was ever removed. Had it been assigned, it would also have dropped
# 'Life-Stage Group', which ExpensedData needs to select the reference row.
DRI = DRI[[col for col in DRI.columns if col == 'Life-Stage Group' or col in neededColumns]]

In [None]:
def ExpensedData(specimenPool:pd.DataFrame, standard:pd.DataFrame, age:float, subclass:str, normalize=False):
    """Compare each food's nutrient values with the reference intake of one life-stage group.

    Parameters
    ----------
    specimenPool : pd.DataFrame
        One row per food; needs a 'Category' column plus nutrient columns.
    standard : pd.DataFrame
        Reference table with a 'Life-Stage Group' column and nutrient columns
        named like those of ``specimenPool``.
    age, subclass
        Passed to ``DemographicKey`` to pick the reference row.
    normalize : bool, default False
        When True, every difference is divided by that column's population
        standard deviation (ddof=0) over ``specimenPool``.

    Returns
    -------
    pd.DataFrame, or (pd.DataFrame, pd.Series) when ``normalize`` is True
        The frame has 'Category' plus one column per shared nutrient (with the
        'Data.Major Minerals.' prefix removed) holding ``food - reference``.
        The Series holds the standard deviations of the numeric columns.

    Raises
    ------
    ValueError
        If ``standard`` has no row for the requested life-stage group.
    """
    prefix = 'Data.Major Minerals.'

    groupKey = DemographicKey(age, subclass)
    selected = standard.loc[standard['Life-Stage Group'] == groupKey]
    if selected.empty:
        raise ValueError(f"No 'Life-Stage Group' row matches {groupKey!r}")
    # Work with the reference row as a Series so each lookup yields a scalar
    # rather than a one-element Series that has to be coerced with float().
    reference = selected.iloc[0]

    commonColumns = np.intersect1d(specimenPool.columns, selected.columns, assume_unique=True)
    outputColumns = [ 'Category', *[ column.removeprefix(prefix) for column in commonColumns ] ]

    deviations = None
    if normalize:
        # Only numeric columns have a standard deviation; 'Category' is text.
        deviations = specimenPool.select_dtypes(include='number').std(axis=0, ddof=0)

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called in a loop.
    records = []
    for _, specimen in specimenPool.iterrows():
        record = { 'Category': specimen['Category'] }
        for column in commonColumns:
            difference = specimen[column] - reference[column]
            if normalize:
                difference = difference / deviations[column]
            record[column.removeprefix(prefix)] = float(difference)
        records.append(record)

    expensed = pd.DataFrame(records, columns=outputColumns)

    if normalize:
        return expensed, deviations
    return expensed

In [None]:
# Work on the first 20 rows of the per-category averages only.
Base = FoodData.head(20)

In [None]:
# Differences between each Base food and the DRI row for age 0.2, which
# DemographicKey maps to '0-6 m' (the sex argument is not consulted at that age).
Expensed = ExpensedData(Base, DRI, age=0.2, subclass='Male')

In [None]:
# Display the resulting difference table.
Expensed

In [None]:
def GenerateDataframe(Reference:pd.DataFrame, FoodRatio:dict):
    """Scale the nutrient rows of the chosen foods by their share of the total weight.

    Parameters
    ----------
    Reference : pd.DataFrame
        One row per food, with 'Category' as the first column and nutrient
        columns after it.
    FoodRatio : dict
        Maps a category name to its amount. Every amount counts towards the
        total, including those of categories that are not in ``Reference``.

    Returns
    -------
    pd.DataFrame
        Same columns as ``Reference``; one row per matched category, with every
        nutrient multiplied by ``amount / total amount``. Categories absent
        from ``Reference`` are skipped. ``Reference`` itself is not modified.
    """
    totalWeight = sum(FoodRatio.values())

    mineralColumns = list(Reference.columns)[1:]

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0.
    records = []
    for foodName, grams in FoodRatio.items():
        matches = Reference.loc[Reference.Category == foodName]
        # Only "no such category" is skipped. The former bare `except` also hid
        # unrelated errors such as a missing 'Category' column.
        if matches.empty:
            continue

        # to_dict() gives an independent copy, so scaling it cannot write back
        # into Reference.
        record = matches.iloc[0].to_dict()
        share = grams / totalWeight
        for column in mineralColumns:
            record[column] = record[column] * share

        records.append(record)

    return pd.DataFrame(records, columns=Reference.columns)

In [None]:
def ExpensedAnalysis(Encyclopedia:pd.DataFrame, FoodDist:dict, Ideal:pd.DataFrame, age:int, subclass:str):
    """Sum, column by column, the differences from the reference intake for a food mix.

    The foods in ``FoodDist`` are first weighted with ``GenerateDataframe``, the
    weighted rows are then compared against ``Ideal`` with ``ExpensedData`` for
    the given ``age`` and ``subclass``, and the resulting table is summed along
    axis 0.
    """
    weightedFoods = GenerateDataframe(Encyclopedia, FoodDist)
    perFoodDifference = ExpensedData(weightedFoods, Ideal, age, subclass)
    return perFoodDifference.sum(axis=0)

In [None]:
# Example: a 2000 / 1000 mix of ABALONE and AGAVE for a 21-year-old male
# (DemographicKey maps this to '19-30 y M').
# NOTE(review): GenerateDataframe skips categories that are not in Base without
# any warning — confirm both names are among its 20 rows.
ExpensedAnalysis(Base, { 'ABALONE' : 2000, 'AGAVE' : 1000 }, DRI, age=21, subclass='male')