In [73]:
import csv

INPUT_CSV = 'data_labeled.csv'

CSV_COLUMNS = [
    'id',
    'url',
    'title',
    'img_url',
    'score',
    'servings',
    'prep_time',
    'rating',
    'reviews',
    'made_it_count',
    'calories',
    'total_fat',
    'saturated_fat',
    'cholesterol',
    'sodium',
    'potassium',
    'carbs',
    'dietary_fiber',
    'protein',
    'sugars',
    'vitamin_a',
    'vitamin_c',
    'calcium',
    'iron',
    'thiamin',
    'niacin',
    'vitamin_b6',
    'magnesium',
    'folate',
    # Everything above this index is a <contains> relationship
]

CSV_COLUMN_TYPES = dict({
    'id': 'string',
    'url': 'string',
    'title': 'string',
    'img_url': 'string',
    'score': 'int32',
    'servings': 'int32',
    'prep_time': 'int32',
    'rating': 'float32',
    'reviews': 'int32',
    'made_it_count': 'int32',
    'calories': 'int32',
    'total_fat': 'float32',
    'saturated_fat': 'float32',
    'cholesterol': 'float32',
    'sodium': 'float32',
    'potassium': 'float32',
    'carbs': 'float32',
    'dietary_fiber': 'float32',
    'protein': 'float32',
    'sugars': 'float32',
    'vitamin_a': 'float32',
    'vitamin_c': 'float32',
    'calcium': 'float32',
    'iron': 'float32',
    'thiamin': 'float32',
    'niacin': 'float32',
    'vitamin_b6': 'float32',
    'magnesium': 'float32',
    'folate': 'float32',
    # Everything above this index is a <contains> relationship
})

def parse_data(data_csv, max_rows=None):
    with open(data_csv, 'r') as f:
        reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_ALL)
        
        col_len = len(CSV_COLUMNS)
        row_count = 0
        
        for row in reader:
            row_arr = [col.replace(r'"', r'\"') for col in row]
            yield row_arr[:col_len]
            row_count += 1
            
            if max_rows is not None and row_count >= max_rows:
                break
            

In [75]:
import pandas as pd

MAX_ROWS = 831
df = pd.DataFrame(data=parse_data(INPUT_CSV, max_rows=MAX_ROWS), columns=(CSV_COLUMNS))

In [76]:
DAILY_VALUES = dict({
    'calories': 2000,
    'total_fat': 65,
    'saturated_fat': 20,
    'cholesterol': 300,
    'sodium': 2400,
    'potassium': 4700,
    'carbs': 300,
    'dietary_fiber': 27.5,
    'protein': 50,
#     'sugars': 'float32',
    'vitamin_a': 3000,
    'vitamin_c': 90,
    'calcium': 1300,
    'iron': 18,
    'thiamin': 1.2,
    'niacin': 16,
    'vitamin_b6': 1.7,
    'magnesium': 420,
    'folate': 400,
    # Everything above this index is a <contains> relationship
})

def dv_col(c):
    return '{}_dv'.format(c)

DV_COLS = sorted([dv_col(c) for c in DAILY_VALUES.keys()])

In [77]:
BASE_FEATURES = [
    'rating',
    'reviews',
    'made_it_count',
    'calories',
    'total_fat',
    'saturated_fat',
    'cholesterol',
    'sodium',
    'potassium',
    'carbs',
    'dietary_fiber',
    'protein',
    'sugars',
    'vitamin_a',
    'vitamin_c',
    'calcium',
    'iron',
    'thiamin',
    'niacin',
    'vitamin_b6',
    'magnesium',
    'folate',
]

# Augmented columns?
# Calorie percentage from protein
# Calorie percentage from fat
# Calorie percentage from carbs
# Normalize all categories to daily values

AUG_FEATURES = [
    'cal_protein',
    'cal_fat',
    'cal_carbs'
]

def compute_aug_cols(row):
    protein_cals = row['protein'] * 4.0
    fat_cals = row['total_fat'] * 9.0
    carb_cals = row['carbs'] * 4.0
    
    total_cals = sum([protein_cals, fat_cals, carb_cals])
    row['cal_protein'] = protein_cals / total_cals
    row['cal_fat'] = fat_cals / total_cals
    row['cal_carbs'] = carb_cals / total_cals
    
    for c in DAILY_VALUES.keys():
        row[dv_col(c)] = row[c] / DAILY_VALUES[c]
    
    return row


FEATURES = BASE_FEATURES + AUG_FEATURES

DF_COLUMNS = BASE_FEATURES + ['score']
df = df[DF_COLUMNS]

type_dict = {col:CSV_COLUMN_TYPES[col] for col in DF_COLUMNS}
df = df.astype(type_dict)

df = df[df['score'] >= 0]

df = df.apply(compute_aug_cols, axis=1)
df

Unnamed: 0,rating,reviews,made_it_count,calories,total_fat,saturated_fat,cholesterol,sodium,potassium,carbs,...,protein_dv,vitamin_a_dv,vitamin_c_dv,calcium_dv,iron_dv,thiamin_dv,niacin_dv,vitamin_b6_dv,magnesium_dv,folate_dv
0,4.66,102.0,152.0,500.0,21.700001,16.0,5.0,291.0,368.0,72.500000,...,0.138,0.347333,0.333333,0.116154,0.055556,0.000000,0.1250,0.000000,0.073810,0.0350
1,4.33,5.0,7.0,308.0,18.900000,10.0,73.0,482.0,214.0,26.100000,...,0.168,0.203333,0.022222,0.084615,0.277778,0.000000,0.2500,0.000000,0.040476,0.1350
2,4.52,83.0,119.0,339.0,12.300000,2.0,0.0,122.0,117.0,56.200001,...,0.068,0.011667,0.088889,0.020000,0.055556,0.000000,0.1250,0.000000,0.045238,0.0950
3,3.14,5.0,8.0,275.0,12.200000,7.0,57.0,399.0,185.0,29.400000,...,0.236,0.119333,0.011111,0.076923,0.111111,0.000000,0.3125,0.000000,0.069048,0.2375
4,3.00,3.0,3.0,276.0,15.700000,8.0,73.0,60.0,120.0,31.900000,...,0.094,0.091000,0.000000,0.024615,0.000000,0.000000,0.0625,0.000000,0.009524,0.0175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
823,3.95,11.0,18.0,351.0,20.400000,7.0,46.0,1590.0,818.0,15.500000,...,0.542,0.269667,0.644444,0.072308,0.166667,0.000000,0.5000,0.588235,0.076190,0.0800
825,4.44,227.0,256.0,153.0,9.400000,5.0,29.0,529.0,222.0,10.100000,...,0.150,0.197333,0.288889,0.163846,0.055556,0.000000,0.1250,0.000000,0.045238,0.0625
826,3.68,22.0,37.0,935.0,58.200001,21.0,161.0,2274.0,822.0,51.500000,...,1.022,0.517333,0.144444,0.431538,0.500000,1.666667,1.4375,0.588235,0.219048,0.4925
827,4.00,7.0,17.0,508.0,35.099998,13.0,97.0,1096.0,997.0,25.500000,...,0.492,0.872333,0.855556,0.055385,0.222222,0.000000,0.6250,0.000000,0.071429,0.0500


In [78]:
X, y = df[FEATURES].to_numpy(), df['score'].to_numpy()
print(X.shape, y.shape)

(690, 25) (690,)


In [79]:
import numpy as np

def train_val_test_split(data, train_frac, val_frac, test_frac):
    assert(train_frac + val_frac + test_frac == 1)
    M = data.shape[0]
    np.random.seed(628)
    np.random.shuffle(data)
    return np.split(data, [int(train_frac*M), int((train_frac+val_frac)*M)])

# We use a 3:1:1 split for test train and validation
X_train, X_val, X_test = train_val_test_split(X, 0.6, 0.2, 0.2)
y_train, y_val, y_test = train_val_test_split(y, 0.6, 0.2, 0.2)

X_label = np.concatenate((X_train, X_val), axis=0)
y_label = np.concatenate((y_train, y_val), axis=0)

print(X_train.shape, X_val.shape, X_test.shape)

(414, 25) (138, 25) (138, 25)


In [80]:
# The risk is the average absolute difference
# between the predicted and true nutritional score
def risk(model, X_, y_):
    M_1 = 1.0 / X_.shape[0]
    y_hat = model.predict(X_)
    abs_error = np.abs(y_hat - y_)
    return M_1 * np.sum(abs_error), np.std(abs_error)

In [81]:
from sklearn.model_selection import cross_val_score

# Perform kfold analysis on the data
def kfold_analysis(model, X_, y_, cv=5):
    return cross_val_score(model, X_, y_, cv=cv)

In [82]:
# Find the data samples with the largest absolute error
# We are doing this because the data was manually labeled and may contain errors
def outlier_analysis(model, X_, y_):
    # We will only analyze data we 
    model.fit(X_, y_)
    y_hat = model.predict(X_)
    abs_error = np.abs(y_hat - y_)
    # abs_error, y, calories
    zipped = list(zip(abs_error, y_, X_[:, FEATURES.index('calories')]))
    return sorted(zipped, key=lambda x: x[0], reverse=True)

In [83]:
outlier_analysis(LinearRegression(), X, y)[:20]

[(38.73125407346327, 7.0, 182.0),
 (32.01781601424358, 12.0, 133.0),
 (31.553189708886435, 11.0, 407.0),
 (30.935009068014267, 78.0, 137.0),
 (30.1809368503407, 96.0, 39.0),
 (29.133712622642626, 22.0, 53.0),
 (28.95454968072547, 33.0, 505.0),
 (28.782278007852362, 12.0, 367.0),
 (28.4015519977131, 25.0, 831.0),
 (28.23595488224879, 95.0, 113.0),
 (26.58521476923866, 40.0, 422.0),
 (26.325570022551346, 16.0, 80.0),
 (26.313293789123094, 28.0, 221.0),
 (26.20850006393139, 32.0, 282.0),
 (26.184064935880443, 77.0, 81.0),
 (26.174068742959115, 27.0, 271.0),
 (26.057172287326978, 24.0, 84.0),
 (25.784116214686527, 23.0, 163.0),
 (25.277891771473087, 21.0, 143.0),
 (25.22734395228403, 77.0, 207.0)]

In [84]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import TheilSenRegressor

print(kfold_analysis(LinearRegression(), X_label, y_label))
print(kfold_analysis(TheilSenRegressor(), X_label, y_label))

[0.64362631 0.63993017 0.62505006 0.5822936  0.60630005]
[0.45642798 0.62359426 0.36511304 0.24369595 0.29144962]


In [85]:
# Implement trivial estimator (guesses mean) to get upper bound on error
class TrivialEstimator:
    def __init__(self):
        self.mean = 0
    
    def fit(self, X, y):
        self.mean = np.mean(y)
    
    def predict(self, _):
        return self.mean
    
est = TrivialEstimator()
est.fit(X_label, y_label)
print('Training Risk', risk(est, X_label, y_label))
print('Test Risk', risk(est, X_test, y_test))

Training Risk (16.27775152278933, 10.67756087204537)
Test Risk (15.229232304137787, 10.136516567339482)


In [87]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression().fit(X_train, y_train)
print('No normalization')
print('Training Risk', risk(reg, X_train, y_train))
print('Validation Risk', risk(reg, X_val, y_val))
print()

reg_norm = LinearRegression(normalize=True).fit(X_train, y_train)
print('No normalization')
print('Training Risk', risk(reg_norm, X_train, y_train))
print('Validation Risk', risk(reg_norm, X_val, y_val))
print()

No normalization
Training Risk (8.835219432302043, 6.729969873809595)
Validation Risk (9.056691648367353, 7.295344007004641)

No normalization
Training Risk (8.835219432302004, 6.7299698738096465)
Validation Risk (9.0566916483674, 7.295344007004525)



In [88]:
from sklearn.linear_model import TheilSenRegressor

ts_reg = TheilSenRegressor()
ts_reg.fit(X_train, y_train)
print('Training Risk', risk(ts_reg, X_train, y_train))
print('Validation Risk', risk(ts_reg, X_val, y_val))

Training Risk (9.910167138879574, 9.380405338813935)
Validation Risk (10.77121242393858, 11.391851625999523)


In [89]:
reg_final = LinearRegression(normalize=True).fit(X_label, y_label)
print('Test Score: ', risk(reg, X_test, y_test))
print(reg_final.coef_)
print(reg_final.intercept_)

Test Score:  (9.490236495764062, 7.744582571151072)
[ 7.01282659e-01  5.37407926e-03 -3.40451782e-03 -2.29270976e-01
  3.20236682e+00 -2.26887592e+00 -1.33372221e-01 -9.57137353e-03
  8.87493250e-03  1.04237918e+00  1.72307562e+00  1.02511970e+00
 -5.19890298e-01  6.26877812e-04  1.13236817e-01  6.10643906e-03
 -1.68452280e-01  1.43866781e+00  3.66136014e-01 -9.13932173e+00
 -2.52252541e-02  2.08087150e-03  7.04405845e+00 -1.44955899e+01
  8.41613458e+00]
45.33133077427652


In [127]:
import pickle

# Save the model to a file
fname = 'nutriscore_lr.model'
pickle.dump(reg_final, open(fname, 'wb'))