In [33]:
import csv

INPUT_CSV = 'data_labeled.csv'

CSV_COLUMNS = [
    'id',
    'url',
    'title',
    'img_url',
    'score',
    'servings',
    'prep_time',
    'rating',
    'reviews',
    'made_it_count',
    'calories',
    'total_fat',
    'saturated_fat',
    'cholesterol',
    'sodium',
    'potassium',
    'carbs',
    'dietary_fiber',
    'protein',
    'sugars',
    'vitamin_a',
    'vitamin_c',
    'calcium',
    'iron',
    'thiamin',
    'niacin',
    'vitamin_b6',
    'magnesium',
    'folate',
    # Everything above this index is a <contains> relationship
]

CSV_COLUMN_TYPES = dict({
    'id': 'string',
    'url': 'string',
    'title': 'string',
    'img_url': 'string',
    'score': 'int32',
    'servings': 'int32',
    'prep_time': 'int32',
    'rating': 'float32',
    'reviews': 'int32',
    'made_it_count': 'int32',
    'calories': 'int32',
    'total_fat': 'float32',
    'saturated_fat': 'float32',
    'cholesterol': 'float32',
    'sodium': 'float32',
    'potassium': 'float32',
    'carbs': 'float32',
    'dietary_fiber': 'float32',
    'protein': 'float32',
    'sugars': 'float32',
    'vitamin_a': 'float32',
    'vitamin_c': 'float32',
    'calcium': 'float32',
    'iron': 'float32',
    'thiamin': 'float32',
    'niacin': 'float32',
    'vitamin_b6': 'float32',
    'magnesium': 'float32',
    'folate': 'float32',
    # Everything above this index is a <contains> relationship
})

def parse_data(data_csv, max_rows=None):
    with open(data_csv, 'r') as f:
        reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_ALL)
        
        col_len = len(CSV_COLUMNS)
        row_count = 0
        
        for row in reader:
            row_arr = [col.replace(r'"', r'\"') for col in row]
            yield row_arr[:col_len]
            row_count += 1
            
            if max_rows is not None and row_count >= max_rows:
                break
            

In [90]:
import pandas as pd

df = pd.DataFrame(data=parse_data(INPUT_CSV, max_rows=400), columns=(CSV_COLUMNS))

In [109]:
BASE_FEATURES = [
    'rating',
    'reviews',
    'made_it_count',
    'calories',
    'total_fat',
    'saturated_fat',
    'cholesterol',
    'sodium',
    'potassium',
    'carbs',
    'dietary_fiber',
    'protein',
    'sugars',
    'vitamin_a',
    'vitamin_c',
    'calcium',
    'iron',
    'thiamin',
    'niacin',
    'vitamin_b6',
    'magnesium',
    'folate',
]

# Augmented columns?
# Calorie percentage from protein
# Calorie percentage from fat
# Calorie percentage from carbs
# Normalize all categories to daily values

AUG_FEATURES = [
    'cal_protein',
    'cal_fat',
    'cal_carbs'
]

def compute_aug_cols(row):
    protein_cals = row['protein'] * 4.0
    fat_cals = row['total_fat'] * 9.0
    carb_cals = row['carbs'] * 4.0
    
    total_cals = sum([protein_cals, fat_cals, carb_cals])
    row['cal_protein'] = protein_cals / total_cals
    row['cal_fat'] = fat_cals / total_cals
    row['cal_carbs'] = carb_cals / total_cals
    return row


FEATURES = BASE_FEATURES + AUG_FEATURES

DF_COLUMNS = BASE_FEATURES + ['score']
df = df[DF_COLUMNS]

type_dict = {col:CSV_COLUMN_TYPES[col] for col in DF_COLUMNS}
df = df.astype(type_dict)

df = df[df['score'] >= 0]

df = df.apply(compute_aug_cols, axis=1)
df

Unnamed: 0,rating,reviews,made_it_count,calories,total_fat,saturated_fat,cholesterol,sodium,potassium,carbs,...,iron,thiamin,niacin,vitamin_b6,magnesium,folate,score,cal_protein,cal_fat,cal_carbs
0,4.66,102.0,152.0,500.0,21.700001,16.0,5.0,291.0,368.0,72.500000,...,1.0,0.0,2.0,0.0,31.0,14.0,43.0,0.053812,0.380776,0.565412
1,4.33,5.0,7.0,308.0,18.900000,10.0,73.0,482.0,214.0,26.100000,...,5.0,0.0,4.0,0.0,17.0,54.0,33.0,0.109055,0.552093,0.338851
2,4.52,83.0,119.0,339.0,12.300000,2.0,0.0,122.0,117.0,56.200001,...,1.0,0.0,2.0,0.0,19.0,38.0,36.0,0.038957,0.317101,0.643942
3,3.14,5.0,8.0,275.0,12.200000,7.0,57.0,399.0,185.0,29.400000,...,2.0,0.0,5.0,0.0,29.0,95.0,41.0,0.171886,0.399854,0.428259
4,3.00,3.0,3.0,276.0,15.700000,8.0,73.0,60.0,120.0,31.900000,...,0.0,0.0,1.0,0.0,4.0,7.0,18.0,0.065346,0.491137,0.443518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,4.49,47.0,81.0,153.0,2.300000,0.0,0.0,235.0,105.0,28.299999,...,2.0,0.0,4.0,0.0,24.0,72.0,77.0,0.129955,0.134503,0.735543
396,3.89,14.0,17.0,121.0,7.300000,1.0,0.0,7.0,237.0,5.800000,...,2.0,0.0,3.0,0.0,46.0,13.0,32.0,0.320856,0.501910,0.177235
397,4.71,19.0,25.0,410.0,24.600000,15.0,92.0,735.0,876.0,33.799999,...,2.0,0.0,3.0,0.0,106.0,108.0,23.0,0.100857,0.558245,0.340898
398,4.19,26.0,36.0,216.0,11.900000,7.0,57.0,138.0,44.0,24.200001,...,1.0,0.0,1.0,0.0,5.0,29.0,17.0,0.064250,0.491510,0.444240


In [110]:
X, y = df[FEATURES].to_numpy(), df['score'].to_numpy()
print(X.shape, y.shape)

(372, 25) (372,)


In [111]:
import numpy as np

def train_val_test_split(data, train_frac, val_frac, test_frac):
    assert(train_frac + val_frac + test_frac == 1)
    M = data.shape[0]
    np.random.seed(628)
    np.random.shuffle(data)
    return np.split(data, [int(train_frac*M), int((train_frac+val_frac)*M)])

# We use a 3:1:1 split for test train and validation
X_train, X_val, X_test = train_val_test_split(X, 0.6, 0.2, 0.2)
y_train, y_val, y_test = train_val_test_split(y, 0.6, 0.2, 0.2)
print(X_train.shape, X_val.shape, X_test.shape)

(223, 25) (74, 25) (75, 25)


In [116]:
from sklearn.linear_model import LinearRegression

# The risk is the average absolute difference
# between the predicted and true nutritional score
def risk(model, X_, y_):
    M_1 = 1.0 / X.shape[0]
    y_hat = model.predict(X_)
    return M_1 * np.sum(np.abs(y_hat - y_))

reg = LinearRegression().fit(X_train, y_train)
print('No normalization: ', risk(reg, X_val, y_val))

reg_norm = LinearRegression(normalize=True).fit(X_train, y_train)
print('Normalized: ', risk(reg_norm, X_val, y_val))

No normalization:  1.9562126068598806
Normalized:  1.9562126068598806


In [114]:
X_label = np.concatenate((X_train, X_val), axis=0)
y_label = np.concatenate((y_train, y_val), axis=0)

reg_final = LinearRegression(normalize=True).fit(X_label, y_label)
print('Test Score: ', risk(reg, X_test, y_test))
print(reg_final.coef_)

Test Score:  2.2014742869399493
[-1.16261862e+00  6.56460198e-03 -2.47378582e-03 -7.44684507e-02
  1.63974346e+00 -1.95172101e+00 -1.26213252e-01 -8.92856101e-03
  4.56926988e-03  2.77324964e-01  2.51591842e+00  1.56804788e-01
 -3.05998463e-01  5.40323475e-04  1.16682161e-01  7.90753895e-03
  1.59047056e-01  2.36776007e-01  5.48559616e-01 -5.22387877e+00
 -3.84787124e-02  2.36014834e-02  1.52952886e+01 -1.27052948e+01
  5.64817948e+00]


In [117]:
import pickle

# Save the model to a file
fname = 'nutriscore_lr.model'
pickle.dump(reg_final, open(fname, 'wb'))