In [49]:
import sys
sys.path.append('../webscraper/ingredient_parser/')
from load_ingredients import load_ingredients

ingredient_map, alias_map = load_ingredients()
NUM_INGREDIENTS = len(alias_map.values())
NUM_INGREDIENTS

831

In [50]:
import numpy as np

ordered_ingredients = sorted(alias_map.values())
ingredient_index_map = {k: v for v, k in enumerate(ordered_ingredients)}

def ingredients_to_hot(ingredients):
    hot_vector = np.zeros(NUM_INGREDIENTS)
    for i in ingredients:
        hot_vector[ingredient_index_map[i]] = 1
    return hot_vector

In [51]:
import csv

INPUT_CSV = 'labeled_parsed.csv'

CSV_COLUMNS = [
    'id',
    'url',
    'title',
    'img_url',
    'score',
    'servings',
    'prep_time',
    'rating',
    'reviews',
    'made_it_count',
    'calories',
    'total_fat',
    'saturated_fat',
    'cholesterol',
    'sodium',
    'potassium',
    'carbs',
    'dietary_fiber',
    'protein',
    'sugars',
    'vitamin_a',
    'vitamin_c',
    'calcium',
    'iron',
    'thiamin',
    'niacin',
    'vitamin_b6',
    'magnesium',
    'folate',
    'ingredients'
]

CSV_COLUMN_TYPES = dict({
    'id': 'string',
    'url': 'string',
    'title': 'string',
    'img_url': 'string',
    'score': 'int32',
    'servings': 'int32',
    'prep_time': 'int32',
    'rating': 'float32',
    'reviews': 'int32',
    'made_it_count': 'int32',
    'calories': 'int32',
    'total_fat': 'float32',
    'saturated_fat': 'float32',
    'cholesterol': 'float32',
    'sodium': 'float32',
    'potassium': 'float32',
    'carbs': 'float32',
    'dietary_fiber': 'float32',
    'protein': 'float32',
    'sugars': 'float32',
    'vitamin_a': 'float32',
    'vitamin_c': 'float32',
    'calcium': 'float32',
    'iron': 'float32',
    'thiamin': 'float32',
    'niacin': 'float32',
    'vitamin_b6': 'float32',
    'magnesium': 'float32',
    'folate': 'float32',
})

def parse_data(data_csv, max_rows=None):
    with open(data_csv, 'r') as f:
        reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_ALL)
        
        col_len = len(CSV_COLUMNS)-1
        row_count = 0
        
        for row in reader:
            row_arr = [col.replace(r'"', r'\"') for col in row]
            
            yield row_arr[:col_len] + [ingredients_to_hot(filter(lambda i: i != "", row_arr[col_len:]))]
            row_count += 1
            
            if max_rows is not None and row_count >= max_rows:
                break

In [52]:
import pandas as pd

df = pd.DataFrame(data=parse_data(INPUT_CSV), columns=(CSV_COLUMNS))
df = df[df['score'] != ""]

In [53]:
DAILY_VALUES = dict({
    'calories': 2000,
    'total_fat': 65,
    'saturated_fat': 20,
    'cholesterol': 300,
    'sodium': 2400,
    'potassium': 4700,
    'carbs': 300,
    'dietary_fiber': 27.5,
    'protein': 50,
#     'sugars': 'float32',
    'vitamin_a': 3000,
    'vitamin_c': 90,
    'calcium': 1300,
    'iron': 18,
    'thiamin': 1.2,
    'niacin': 16,
    'vitamin_b6': 1.7,
    'magnesium': 420,
    'folate': 400,
    # Everything above this index is a <contains> relationship
})

def dv_col(c):
    return '{}_dv'.format(c)

DV_COLS = sorted([dv_col(c) for c in DAILY_VALUES.keys()])

In [54]:
BASE_FEATURES = [
    'rating',
    'reviews',
    'made_it_count',
    'calories',
    'total_fat',
    'saturated_fat',
    'cholesterol',
    'sodium',
    'potassium',
    'carbs',
    'dietary_fiber',
    'protein',
    'sugars',
    'vitamin_a',
    'vitamin_c',
    'calcium',
    'iron',
    'thiamin',
    'niacin',
    'vitamin_b6',
    'magnesium',
    'folate',
]

# Augmented columns?
# Calorie percentage from protein
# Calorie percentage from fat
# Calorie percentage from carbs
# Normalize all categories to daily values

AUG_FEATURES = [
    'cal_protein',
    'cal_fat',
    'cal_carbs'
]

def compute_aug_cols(row):
    protein_cals = row['protein'] * 4.0
    fat_cals = row['total_fat'] * 9.0
    carb_cals = row['carbs'] * 4.0
    
    total_cals = sum([protein_cals, fat_cals, carb_cals])
    row['cal_protein'] = protein_cals / total_cals
    row['cal_fat'] = fat_cals / total_cals
    row['cal_carbs'] = carb_cals / total_cals
    
    for c in DAILY_VALUES.keys():
        row[dv_col(c)] = row[c] / DAILY_VALUES[c]
    
    return row


FEATURES = BASE_FEATURES + AUG_FEATURES
DF_COLUMNS = BASE_FEATURES + ['score']

df_data = df[DF_COLUMNS]

type_dict = {col:CSV_COLUMN_TYPES[col] for col in DF_COLUMNS}
df_data = df_data.astype(type_dict)

df_data = df_data[df_data['score'] >= 0]

df_data = df_data.apply(compute_aug_cols, axis=1)
df_data

Unnamed: 0,rating,reviews,made_it_count,calories,total_fat,saturated_fat,cholesterol,sodium,potassium,carbs,...,protein_dv,vitamin_a_dv,vitamin_c_dv,calcium_dv,iron_dv,thiamin_dv,niacin_dv,vitamin_b6_dv,magnesium_dv,folate_dv
0,4.66,102.0,152.0,500.0,21.700001,16.0,5.0,291.0,368.0,72.500000,...,0.138,0.347333,0.333333,0.116154,0.055556,0.000000,0.1250,0.000000,0.073810,0.0350
1,4.33,5.0,7.0,308.0,18.900000,10.0,73.0,482.0,214.0,26.100000,...,0.168,0.203333,0.022222,0.084615,0.277778,0.000000,0.2500,0.000000,0.040476,0.1350
2,4.52,83.0,119.0,339.0,12.300000,2.0,0.0,122.0,117.0,56.200001,...,0.068,0.011667,0.088889,0.020000,0.055556,0.000000,0.1250,0.000000,0.045238,0.0950
3,3.14,5.0,8.0,275.0,12.200000,7.0,57.0,399.0,185.0,29.400000,...,0.236,0.119333,0.011111,0.076923,0.111111,0.000000,0.3125,0.000000,0.069048,0.2375
4,3.62,13.0,14.0,694.0,26.100000,11.0,139.0,528.0,1042.0,79.900002,...,0.728,0.437333,0.355556,0.194615,0.444444,0.833333,1.1875,0.588235,0.238095,0.5500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786,3.95,11.0,18.0,351.0,20.400000,7.0,46.0,1590.0,818.0,15.500000,...,0.542,0.269667,0.644444,0.072308,0.166667,0.000000,0.5000,0.588235,0.076190,0.0800
788,4.44,227.0,256.0,153.0,9.400000,5.0,29.0,529.0,222.0,10.100000,...,0.150,0.197333,0.288889,0.163846,0.055556,0.000000,0.1250,0.000000,0.045238,0.0625
789,3.68,22.0,37.0,935.0,58.200001,21.0,161.0,2274.0,822.0,51.500000,...,1.022,0.517333,0.144444,0.431538,0.500000,1.666667,1.4375,0.588235,0.219048,0.4925
790,4.00,7.0,17.0,508.0,35.099998,13.0,97.0,1096.0,997.0,25.500000,...,0.492,0.872333,0.855556,0.055385,0.222222,0.000000,0.6250,0.000000,0.071429,0.0500


In [74]:
import numpy as np

ing_df = df[['score', 'ingredients']]
ing_df = ing_df.astype({'score': CSV_COLUMN_TYPES['score']})
ing_df = ing_df[ing_df['score'] >= 0]
ing_data = np.stack(ing_df['ingredients'].to_numpy(), axis=0)

X = np.concatenate((df_data[FEATURES].to_numpy(), ing_data), axis=1)
y = df_data['score'].to_numpy()
print(X.shape, y.shape)

(663, 856) (663,)


In [75]:
def train_val_test_split(data, train_frac, val_frac, test_frac):
    assert(train_frac + val_frac + test_frac == 1)
    M = data.shape[0]
    np.random.seed(628)
    np.random.shuffle(data)
    return np.split(data, [int(train_frac*M), int((train_frac+val_frac)*M)])

# We use a 3:1:1 split for test train and validation
X_train, X_val, X_test = train_val_test_split(X, 0.6, 0.2, 0.2)
y_train, y_val, y_test = train_val_test_split(y, 0.6, 0.2, 0.2)

X_label = np.concatenate((X_train, X_val), axis=0)
y_label = np.concatenate((y_train, y_val), axis=0)

print(X_train.shape, X_val.shape, X_test.shape)

(397, 856) (133, 856) (133, 856)


In [76]:
# The risk is the average absolute difference
# between the predicted and true nutritional score
def risk(model, X_, y_):
    M_1 = 1.0 / X_.shape[0]
    y_hat = model.predict(X_)
    abs_error = np.abs(y_hat - y_)
    return M_1 * np.sum(abs_error), np.std(abs_error)

In [77]:
from sklearn.model_selection import cross_val_score

# Perform kfold analysis on the data
def kfold_analysis(model, X_, y_, cv=5):
    return cross_val_score(model, X_, y_, cv=cv)

In [78]:
# Find the data samples with the largest absolute error
# We are doing this because the data was manually labeled and may contain errors
def outlier_analysis(model, X_, y_):
    # We will only analyze data we 
    model.fit(X_, y_)
    y_hat = model.predict(X_)
    abs_error = np.abs(y_hat - y_)
    # abs_error, y, calories
    zipped = list(zip(abs_error, y_, X_[:, FEATURES.index('calories')]))
    return sorted(zipped, key=lambda x: x[0], reverse=True)

In [79]:
outlier_analysis(LinearRegression(), X, y)[:20]

NameError: name 'LinearRegression' is not defined

In [84]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import TheilSenRegressor

print(kfold_analysis(LinearRegression(), X_label, y_label))
print(kfold_analysis(TheilSenRegressor(), X_label, y_label))

[0.64362631 0.63993017 0.62505006 0.5822936  0.60630005]
[0.45642798 0.62359426 0.36511304 0.24369595 0.29144962]


In [80]:
# Implement trivial estimator (guesses mean) to get upper bound on error
class TrivialEstimator:
    def __init__(self):
        self.mean = 0
    
    def fit(self, X, y):
        self.mean = np.mean(y)
    
    def predict(self, _):
        return self.mean
    
est = TrivialEstimator()
est.fit(X_label, y_label)
print('Training Risk', risk(est, X_label, y_label))
print('Test Risk', risk(est, X_test, y_test))

Training Risk (16.191441794232823, 10.434500826469424)
Test Risk (15.356844942545044, 11.440404154038516)


In [82]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(X_train, y_train)
print('No normalization')
print('Training Risk', risk(reg, X_train, y_train))
print('Validation Risk', risk(reg, X_val, y_val))
print()

reg_norm = LinearRegression(normalize=True).fit(X_train, y_train)
print('Normalization')
print('Training Risk', risk(reg_norm, X_train, y_train))
print('Validation Risk', risk(reg_norm, X_val, y_val))
print()

No normalization
Training Risk (2.672754952946369, 3.903145637963223)
Validation Risk (37.18387194847048, 43.02706438587043)

Normalization
Training Risk (2.689153022670025, 3.892408692002486)
Validation Risk (3783121111290.4214, 8024935265452.38)



In [88]:
from sklearn.linear_model import TheilSenRegressor

ts_reg = TheilSenRegressor()
ts_reg.fit(X_train, y_train)
print('Training Risk', risk(ts_reg, X_train, y_train))
print('Validation Risk', risk(ts_reg, X_val, y_val))

Training Risk (9.910167138879574, 9.380405338813935)
Validation Risk (10.77121242393858, 11.391851625999523)


In [83]:
reg_final = LinearRegression(normalize=True).fit(X_label, y_label)
print('Test Score: ', risk(reg, X_test, y_test))
print(reg_final.coef_)
print(reg_final.intercept_)

Test Score:  (35.21597481641078, 47.768841453829374)
[ 4.81824913e-01  1.93122332e-02 -8.81618121e-03 -3.11204790e-01
  3.34183334e+00 -5.96072677e-01 -1.48247691e-01 -2.42408838e-02
  2.77364613e-02  1.39317905e+00 -1.60778937e-01  2.39711664e+00
 -6.05854447e-01 -4.25487942e-04 -1.05493214e-01  1.16127445e-02
 -1.72207536e+00  1.02555231e+00 -2.46633498e+00  3.06734950e-01
  2.17989667e-01  2.77229921e-02  1.09387735e+15  1.09387735e+15
  1.09387735e+15 -6.24990766e+14  3.71705515e+14  4.06337838e+15
  6.62271854e+14 -9.10528206e+14  1.05845986e+15 -1.02153152e+15
 -2.09131239e+01  5.15476128e+00 -1.03271012e+15  1.11521263e+15
 -1.10223905e+15  6.47562764e-01 -1.71194586e+14  5.09194652e+00
  1.95949179e+14  1.16220649e+01  3.22984534e+02  3.99305661e+14
  9.14961750e+13 -3.07786028e+01  3.92066556e+14  8.79801748e+00
  4.38695309e+00 -3.88709074e+14 -2.30962916e+14 -6.33019750e+14
  2.07884251e+01 -9.52089584e+14  1.43040799e+01 -1.62895340e+01
  1.14404855e+01  4.89627583e+14  8.4

In [127]:
import pickle

# Save the model to a file
fname = 'nutriscore_lr.model'
pickle.dump(reg_final, open(fname, 'wb'))