# SD212: Graph mining

This notebook shows how to save your dataset as a ``Bunch`` object.

## Making the adjacency matrix

In [20]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Open the file
data = pd.read_csv('food.csv')
data.head()

Unnamed: 0,Category,Description,Nutrient Data Bank Number,Data.Alpha Carotene,Data.Ash,Data.Beta Carotene,Data.Beta Cryptoxanthin,Data.Carbohydrate,Data.Cholesterol,Data.Choline,...,Data.Major Minerals.Potassium,Data.Major Minerals.Sodium,Data.Major Minerals.Zinc,Data.Vitamins.Vitamin A - IU,Data.Vitamins.Vitamin A - RAE,Data.Vitamins.Vitamin B12,Data.Vitamins.Vitamin B6,Data.Vitamins.Vitamin C,Data.Vitamins.Vitamin E,Data.Vitamins.Vitamin K
0,BUTTER,"BUTTER,WITH SALT",1001,0,2.11,158,0,0.06,215,19,...,24,576,0.09,2499,684,0.17,0.003,0.0,2.32,7.0
1,BUTTER,"BUTTER,WHIPPED,WITH SALT",1002,0,2.11,158,0,0.06,219,19,...,26,827,0.05,2499,684,0.13,0.003,0.0,2.32,7.0
2,BUTTER OIL,"BUTTER OIL,ANHYDROUS",1003,0,0.0,193,0,0.0,256,22,...,5,2,0.01,3069,840,0.01,0.001,0.0,2.8,8.6
3,CHEESE,"CHEESE,BLUE",1004,0,5.11,74,0,2.34,75,15,...,256,1395,2.66,763,198,1.22,0.166,0.0,0.25,2.4
4,CHEESE,"CHEESE,BRICK",1005,0,3.18,76,0,2.79,94,15,...,136,560,2.6,1080,292,1.26,0.065,0.0,0.26,2.5


In [21]:
# print(np.array(data.Category.unique()).tolist())

In [22]:
data.columns

Index(['Category', 'Description', 'Nutrient Data Bank Number',
       'Data.Alpha Carotene', 'Data.Ash', 'Data.Beta Carotene',
       'Data.Beta Cryptoxanthin', 'Data.Carbohydrate', 'Data.Cholesterol',
       'Data.Choline', 'Data.Fiber', 'Data.Kilocalories',
       'Data.Lutein and Zeaxanthin', 'Data.Lycopene', 'Data.Manganese',
       'Data.Niacin', 'Data.Pantothenic Acid', 'Data.Protein',
       'Data.Refuse Percentage', 'Data.Retinol', 'Data.Riboflavin',
       'Data.Selenium', 'Data.Sugar Total', 'Data.Thiamin', 'Data.Water',
       'Data.Fat.Monosaturated Fat', 'Data.Fat.Polysaturated Fat',
       'Data.Fat.Saturated Fat', 'Data.Fat.Total Lipid',
       'Data.Household Weights.1st Household Weight',
       'Data.Household Weights.1st Household Weight Description',
       'Data.Household Weights.2nd Household Weight',
       'Data.Household Weights.2nd Household Weight Description',
       'Data.Major Minerals.Calcium', 'Data.Major Minerals.Copper',
       'Data.Major Minerals.Iro

In [23]:
data.loc[data.Category == 'WHALE']

Unnamed: 0,Category,Description,Nutrient Data Bank Number,Data.Alpha Carotene,Data.Ash,Data.Beta Carotene,Data.Beta Cryptoxanthin,Data.Carbohydrate,Data.Cholesterol,Data.Choline,...,Data.Major Minerals.Potassium,Data.Major Minerals.Sodium,Data.Major Minerals.Zinc,Data.Vitamins.Vitamin A - IU,Data.Vitamins.Vitamin A - RAE,Data.Vitamins.Vitamin B12,Data.Vitamins.Vitamin B6,Data.Vitamins.Vitamin C,Data.Vitamins.Vitamin E,Data.Vitamins.Vitamin K
7022,WHALE,"WHALE,BELUGA,MEAT,DRIED (ALASKA NATIVE)",35009,0,2.69,0,0,0.0,122,132,...,800,220,7.8,0,0,7.31,0.131,0.0,0.27,0.0
7023,WHALE,"WHALE,BELUGA,EYES (ALASKA NATIVE)",35010,0,1.3,0,0,0.0,0,0,...,0,0,0.0,1870,561,0.0,0.0,0.0,0.0,0.0
7025,WHALE,"WHALE,BELUGA,FLIPPER,RAW (ALASKA NATIVE)",35012,0,1.8,0,0,0.0,0,0,...,0,0,0.0,930,279,0.0,0.0,0.0,0.0,0.0
7026,WHALE,"WHALE,BELUGA,LIVER,RAW (ALASKA NATIVE)",35013,0,1.1,0,0,2.0,0,0,...,0,0,0.0,22100,0,0.0,0.0,0.0,0.0,0.0
7082,WHALE,"WHALE,BOWHEAD,SUBCUTANEOUS FAT (BLUBBER) (ALAS...",35085,0,0.2,0,0,0.0,150,0,...,0,0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
7083,WHALE,"WHALE,BOWHEAD,SKN & SUBCUTANEOUS FATMUKTUK(ALA...",35086,0,0.1,0,0,0.0,54,0,...,0,0,0.0,750,0,0.0,0.0,0.0,0.0,0.0


In [24]:
data.describe()

Unnamed: 0,Nutrient Data Bank Number,Data.Alpha Carotene,Data.Ash,Data.Beta Carotene,Data.Beta Cryptoxanthin,Data.Carbohydrate,Data.Cholesterol,Data.Choline,Data.Fiber,Data.Kilocalories,...,Data.Major Minerals.Potassium,Data.Major Minerals.Sodium,Data.Major Minerals.Zinc,Data.Vitamins.Vitamin A - IU,Data.Vitamins.Vitamin A - RAE,Data.Vitamins.Vitamin B12,Data.Vitamins.Vitamin B6,Data.Vitamins.Vitamin C,Data.Vitamins.Vitamin E,Data.Vitamins.Vitamin K
count,7413.0,7413.0,7413.0,7413.0,7413.0,7413.0,7413.0,7413.0,7413.0,7413.0,...,7413.0,7413.0,7413.0,7413.0,7413.0,7413.0,7413.0,7413.0,7413.0,7413.0
mean,14116.44368,21.210711,1.852459,159.043437,8.776744,21.785381,37.162822,20.673546,1.993147,219.655875,...,268.348172,331.590719,1.875125,767.568191,99.43707,1.172903,0.269547,9.075651,0.842837,9.448604
std,8767.416214,269.714183,2.993228,1126.285026,154.18486,27.123491,119.738438,45.48199,4.292873,171.668713,...,404.91622,977.046544,4.193682,3871.307652,761.653061,4.512816,0.565116,63.443284,4.169756,66.067619
min,1001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8121.0,0.0,0.83,0.0,0.0,0.49,0.0,0.0,0.0,82.0,...,103.0,31.0,0.19,0.0,0.0,0.0,0.03,0.0,0.0,0.0
50%,12539.0,0.0,1.24,0.0,0.0,9.29,2.0,0.0,0.3,181.0,...,210.0,86.0,0.77,33.0,0.0,0.01,0.11,0.0,0.05,0.0
75%,18424.0,0.0,2.2,1.0,0.0,30.59,60.0,20.0,2.3,331.0,...,328.0,428.0,2.46,280.0,24.0,0.83,0.33,3.5,0.39,1.7
max,93600.0,14251.0,99.8,42891.0,7923.0,100.0,3100.0,1388.0,79.0,902.0,...,16500.0,38758.0,181.61,100000.0,30000.0,98.89,12.0,2400.0,149.4,1714.5


In [25]:
# Get the categories and nutrients names
categories = data['Category'].unique()
nutrients_full = np.hstack((np.array(data.columns[3:30]), np.array(data.columns[34:])))
weights = data.columns[30:33]
nutrients = [nutri[5:] for nutri in nutrients_full]
nutrients_mean = data[nutrients_full].mean()
nutrients_std = data[nutrients_full].std()

for i in range(len(nutrients_full)):
    print(nutrients[i], len(data[data[nutrients_full[i]] > nutrients_mean[i]]))

Alpha Carotene 284
Ash 2315
Beta Carotene 536
Beta Cryptoxanthin 307
Carbohydrate 2314
Cholesterol 2396
Choline 1844
Fiber 2126
Kilocalories 3134
Lutein and Zeaxanthin 530
Lycopene 157
Manganese 1357
Niacin 2715
Pantothenic Acid 2453
Protein 2978
Refuse Percentage 1261
Retinol 646
Riboflavin 1960
Selenium 2599
Sugar Total 1613
Thiamin 1788
Water 4649
Fat.Monosaturated Fat 2177
Fat.Polysaturated Fat 1581
Fat.Saturated Fat 2160
Fat.Total Lipid 2266
Household Weights.1st Household Weight 2868
Major Minerals.Copper 1626
Major Minerals.Iron 1572
Major Minerals.Magnesium 1611
Major Minerals.Phosphorus 3199
Major Minerals.Potassium 2807
Major Minerals.Sodium 2438
Major Minerals.Zinc 2345
Vitamins.Vitamin A - IU 1105
Vitamins.Vitamin A - RAE 880
Vitamins.Vitamin B12 1628
Vitamins.Vitamin B6 2257
Vitamins.Vitamin C 1292
Vitamins.Vitamin E 1017
Vitamins.Vitamin K 695


In [26]:
data.columns = [
'Category',
'Description',
'Nutrient Data Bank Number',
'Alpha Carotene',
'Ash',
'B-Carot.',
'Beta Cryptoxanthin',
'Carbohydrate',
'Cholesterol',
'Choline',
'Fiber',
'Kilocalories',
'Lutein and Zeaxanthin',
'Lycopene',
'Manganese',
'Niacin',
'Pantothenic Acid',
'Protein',
'Refuse Percentage',
'Retinol',
'Riboflavin',
'Selenium',
'Sugar Total',
'Thiamin',
'Water',
'Monosaturated Fat',
'Polysaturated Fat',
'Saturated Fat',
'Total Lipid',
'1st Household Weight',
'1st Household Weight Description',
'2nd Household Weight',
'2nd Household Weight Description',
'Calcium',
'Copper',
'Iron',
'Magnesium',
'Phosphorus',
'Potassium',
'Sodium',
'Zinc',
'Vitamin A - IU',
'Vitamin A - RAE',
'Vit. B12',
'Vitamin B6',
'Vit. C',
'Vitamin E',
'Vitamin K']

A first attempt to build the adjacency matrix was to use two for loops to iterate over the data rows. This is of course a very bad idea and was indeed very slow.

Since numpy is very efficient with vector and matrix operations, vectorizing the operations made the computations significantly faster.

In [27]:
# nutrients_to_search = ['Data.Vitamins.Vitamin B12',
#                         'Data.Vitamins.Vitamin C',
#                         'Data.Vitamins.Vitamin E']
nutrients_to_search = ['Vit. B12',
                      'Calcium',
                      'Iron',
                      'Zinc',
                      'Fiber',
                      'Vit. C',
                      'B-Carot.',
                      ]
                      
distance_threshold = 0.005 # the maximum distance between two nutrients vector

# Vectorized method for a vector of nutrients
indptr = [0]
indices = []
csr_data = []
node_nutrient_vector = [] # The nutrient vector of each food item
node_bank_number = data['Nutrient Data Bank Number'] # The bank number of the food item (we keep all the food items)
node_category = data['Category'] # The category of the food item
node_description = data['Description'] # The description of the food item

# Standardize all nutrients
standardized_data = pd.concat([data[data.columns[:3]],(data[data.columns[3:]]-data[data.columns[3:]].mean())/data[data.columns[3:]].std()], axis=1)
# Normalize all nutrients by max value
normalized_data = pd.concat([data[data.columns[:3]],(data[data.columns[3:]]-data[data.columns[3:]].min())/(data[data.columns[3:]].max()-data[data.columns[3:]].min())], axis=1)

used_data = normalized_data # standardized_data or data

for i in range(len(used_data)):
    nutrient_vector = used_data.iloc[i][nutrients_to_search]
    # Save the nutrient vector of the food item
    node_nutrient_vector.append(nutrient_vector)
    if np.linalg.norm(nutrient_vector) != 0: 
        # If the nutrient vector is not null
        # Compute the distance between the nutrient vector of the food item and all other food items
        nutrient_diff = nutrient_vector - used_data[nutrients_to_search]
        nutrient_norms = np.linalg.norm(nutrient_diff.values.astype(np.float64), axis=1)
        # Find the indices of the food items that are within specified distance threshold
        food_indices = np.where(nutrient_norms < distance_threshold)[0]
        # Update CSR lists with items that are within that range
        indptr.append(indptr[-1] + len(food_indices))
        indices.extend(food_indices)
        csr_data.extend(np.ones(len(food_indices)))
        # inv_norms = [1/norm for norm in nutrient_norms if norm !=0 else np.Inf]
        # csr_data.extend(nutrient_norms[food_indices])
    else:
        # Else if the vector is null, add a null row to the CSR matrix,
        # to indicate that the food item is not connected to any other food item
        indptr.append(indptr[-1])
        
csr_data = np.array(csr_data)

In [28]:
# Build the adjacency matrix from the CSR data vectors
adjacency = csr_matrix((csr_data, indices, indptr), dtype=int)

In [29]:
adjacency

<7413x7413 sparse matrix of type '<class 'numpy.int32'>'
	with 491168 stored elements in Compressed Sparse Row format>

## Saving data as Bunch object

In [30]:
from sknetwork.data import Bunch, save
import pickle
# your names
name = 'tang_tourtois'

In [31]:
adjacency

<7413x7413 sparse matrix of type '<class 'numpy.int32'>'
	with 491168 stored elements in Compressed Sparse Row format>

In [32]:
# check
10**3 < adjacency.nnz < 10**6

True

In [33]:
# Build the bunch object
dataset = Bunch()
dataset.adjacency = adjacency
dataset.node_feature = pd.DataFrame(node_nutrient_vector)
dataset.category = pd.Series(node_category)
dataset.description = pd.Series(node_description)
dataset.bank_number = pd.Series(node_bank_number)

### Details about the source

In [34]:
meta = Bunch()

In [35]:
meta.name = 'Nutrient availability of food items with links depending on two items similarity'
meta.description = 'Links connecting food items if their nutrient content vector is similar within ' + str(distance_threshold) + ' (euclidian distance) of the nutrient in the whole dataset.'
meta.source = 'https://www.kaggle.com/datasets/shrutisaxena/food-nutrition-dataset'
meta.orginal_source = 'https://ndb.nal.usda.gov/'
meta.date = 'June 2022'

In [36]:
dataset.meta = meta

In [37]:
with open(name, 'bw') as f:
    pickle.dump(dataset, f)

## Old unused code

In [38]:
# nutrient_to_search = 'Data.Protein'
# fraction_of_std = 0.005


# # Vectorized method for one nutrient
# indptr = [0]
# indices = []
# csr_data = []
# node_feature = [] # The amount of nutrients in the food item
# node_label = [] # The name of the food category

# for i in range(len(data)):
#     if data[nutrient_to_search].iloc[i] != 0:
#         # Save the amount of the nutrient in the food item
#         node_feature.append(data[nutrient_to_search].iloc[i])
#         # Save the category of the food item
#         node_label.append(data['Category'].iloc[i])
#         # Create a vector of the nutrient value of the food item
#         food_nutrient = data[nutrient_to_search].iloc[i] * np.ones(len(data))
#         nutrient_diff = abs(food_nutrient - data[nutrient_to_search])
#         # Find the indices of the food items that are within the range of the nutrient standard deviation (times an arbitrary coefficient)
#         food_indices = np.where(nutrient_diff < nutrients_std[nutrient_to_search] * fraction_of_std)[0]
#         # Update CSR lists with items that are within that range
#         indptr.append(indptr[-1] + len(food_indices))
#         indices.extend(food_indices)
#         csr_data.extend(nutrient_diff[food_indices])
#     else:
#         indptr.append(indptr[-1])
#         node_feature.append(data[nutrient_to_search].iloc[i])

# csr_data = np.array(csr_data) * 1000
# adjacency = csr_matrix((csr_data, indices, indptr), dtype=int)