In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%load_ext autoreload
%autoreload 2
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# releases dropped the old alias, so prefer the new name whenever it is available.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# !head data/mfp-diaries.tsv

## Exploratory Data Analysis

In [2]:
# The diary file is tab-separated and read with header=None, so the column
# names are supplied explicitly.
diary_columns = ['userId', 'diary_date', 'food_entries', 'daily_goal']
df = pd.read_csv('data/mfp-diaries.tsv', sep='\t', header=None, names=diary_columns)

In [3]:
df.head()

Unnamed: 0,userId,diary_date,food_entries,daily_goal
0,1,2014-09-14,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2924}..."
1,1,2014-09-15,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2430}..."
2,1,2014-09-16,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 1862}..."
3,1,2014-09-17,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2251}..."
4,1,2014-09-18,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2001}..."


In [4]:
# Number of distinct users in the diary data (a missing id would count once, as with unique()).
df['userId'].nunique(dropna=False)

9896

## UserId, Entry_Date, Daily_Goal  (Need to Parse Data)

In [7]:
# Keep only the identifying columns and the daily goal JSON; food entries are not used below.
df_goal = df.drop(columns=['food_entries'])

In [8]:
df_goal.head()

Unnamed: 0,userId,diary_date,daily_goal
0,1,2014-09-14,"{""total"": [{""name"": ""Calories"", ""value"": 2924}..."
1,1,2014-09-15,"{""total"": [{""name"": ""Calories"", ""value"": 2430}..."
2,1,2014-09-16,"{""total"": [{""name"": ""Calories"", ""value"": 1862}..."
3,1,2014-09-17,"{""total"": [{""name"": ""Calories"", ""value"": 2251}..."
4,1,2014-09-18,"{""total"": [{""name"": ""Calories"", ""value"": 2001}..."


In [10]:
# Isolate the raw JSON column that the parse functions below iterate over.
parse_df = df_goal.drop(columns=['userId', 'diary_date'])
parse_df.head()

Unnamed: 0,daily_goal
0,"{""total"": [{""name"": ""Calories"", ""value"": 2924}..."
1,"{""total"": [{""name"": ""Calories"", ""value"": 2430}..."
2,"{""total"": [{""name"": ""Calories"", ""value"": 1862}..."
3,"{""total"": [{""name"": ""Calories"", ""value"": 2251}..."
4,"{""total"": [{""name"": ""Calories"", ""value"": 2001}..."


In [11]:
# Decode the first row's JSON string to inspect the structure of a single record.
first_goal_json = parse_df['daily_goal'][0]
tg_dict = json.loads(first_goal_json)
tg_dict

{'total': [{'name': 'Calories', 'value': 2924},
  {'name': 'Carbs', 'value': 340},
  {'name': 'Fat', 'value': 114},
  {'name': 'Protein', 'value': 186},
  {'name': 'Sodium', 'value': 3658},
  {'name': 'Sugar', 'value': 109}],
 'goal': [{'name': 'Calories', 'value': 3173},
  {'name': 'Carbs', 'value': 396},
  {'name': 'Fat', 'value': 105},
  {'name': 'Protein', 'value': 160},
  {'name': 'Sodium', 'value': 2300},
  {'name': 'Sugar', 'value': 119}]}

## Parse Function

In [12]:
"""Creates list of keys and values from the first dictionary, and zips them together"""
def daily_totals(tg_dict):
    """Flatten the ``'total'`` entries of one parsed diary record into prefixed columns.

    Args:
        tg_dict: Parsed ``daily_goal`` JSON. ``tg_dict['total']`` must be an
            iterable of ``{'name': ..., 'value': ...}`` dicts.

    Returns:
        dict mapping ``'total_<lowercased name>'`` to that entry's value.

    Raises:
        KeyError: if ``'total'`` is missing, or an entry lacks ``'name'``/``'value'``.
    """
    # The prefix is spelled out instead of being read from the dict's first key,
    # so a record whose keys arrive in a different order is not mislabeled.
    return {"total_" + entry['name'].lower(): entry['value']
            for entry in tg_dict['total']}

"""Creates list of keys and values from the second dictionary, and zips them together"""
def daily_goals(tg_dict):
    """Flatten the ``'goal'`` entries of one parsed diary record into prefixed columns.

    Args:
        tg_dict: Parsed ``daily_goal`` JSON. ``tg_dict['goal']`` must be an
            iterable of ``{'name': ..., 'value': ...}`` dicts.

    Returns:
        dict mapping ``'goal_<lowercased name>'`` to that entry's value.

    Raises:
        KeyError: if ``'goal'`` is missing, or an entry lacks ``'name'``/``'value'``.
    """
    # The prefix is spelled out instead of being read from the dict's second key,
    # so key order (or a record holding only 'goal') cannot mislabel or break it.
    return {"goal_" + entry['name'].lower(): entry['value']
            for entry in tg_dict['goal']}

"""Combines new keys and values for 'Total' and 'Goals' into single dictionary"""
def Merge(dict1, dict2):
    """Return a new dict holding the items of ``dict1`` followed by those of ``dict2``.

    A key present in both takes its value from ``dict2``; neither input is modified.
    """
    res = dict(dict1)
    res.update(dict2)
    return res

"""Iterrates through entire parsed dataframe and appends merged rows"""
def final_parsed_dict(rows=None):
    """Parse every raw ``daily_goal`` JSON string into one flat dict per row.

    Args:
        rows: Optional iterable of JSON strings. When omitted, the values of the
            notebook-level ``parse_df['daily_goal']`` column are used, which is
            what the original zero-argument call did.

    Returns:
        list of dicts, one per input row, combining the ``goal_*`` keys with the
        ``total_*`` keys of that row.
    """
    if rows is None:
        # Fall back to the global frame so existing calls keep working.
        rows = parse_df['daily_goal'].values
    return [Merge(daily_goals(record), daily_totals(record))
            for record in map(json.loads, rows)]

In [13]:
# Build one row per parsed record; keys absent from a record's dict show up as NaN.
f = pd.DataFrame(final_parsed_dict())

In [14]:
f.head()

Unnamed: 0,goal_calcium,goal_calories,goal_carbs,goal_chol,goal_fat,goal_fiber,goal_iron,goal_mon fat,goal_ply fat,goal_potass.,...,total_mon fat,total_ply fat,total_potass.,total_protein,total_sat fat,total_sodium,total_sugar,total_trn fat,total_vit a,total_vit c
0,,3173.0,396.0,,105.0,,,,,,...,,,,186.0,,3658.0,109.0,,,
1,,1572.0,196.0,,52.0,,,,,,...,,,,50.0,,855.0,63.0,,,
2,,1832.0,229.0,,61.0,,,,,,...,,,,114.0,,2215.0,100.0,,,
3,,1685.0,210.0,,56.0,,,,,,...,,,,98.0,,1765.0,105.0,,,
4,,1597.0,199.0,,53.0,,,,,,...,,,,202.0,,1101.0,71.0,,,


In [15]:
# Place the parsed nutrient columns beside the id/date columns (aligned on the
# row index), then discard the raw JSON column they were parsed from.
df_concat = pd.concat((df_goal, f), axis='columns')
final_df = df_concat.drop(columns='daily_goal')

In [16]:
# Remove the nutrient columns that are not analysed further, for goals and totals alike.
goal_cols_to_drop = [
    'goal_calcium', 'goal_chol', 'goal_iron', 'goal_mon fat', 'goal_ply fat',
    'goal_potass.', 'goal_sat fat', 'goal_trn fat', 'goal_vit a', 'goal_vit c',
]
total_cols_to_drop = [
    'total_calcium', 'total_chol', 'total_iron', 'total_mon fat', 'total_ply fat',
    'total_potass.', 'total_sat fat', 'total_trn fat', 'total_vit a', 'total_vit c',
]
final_df = final_df.drop(columns=goal_cols_to_drop + total_cols_to_drop)

In [17]:
final_df.isna().sum()

userId                 0
diary_date             0
goal_calories          7
goal_carbs         14145
goal_fat           17859
goal_fiber        344347
goal_protein        7321
goal_sodium       196659
goal_sugar        218272
total_calories        26
total_carbs        14146
total_fat          17866
total_fiber       344349
total_protein       7331
total_sodium      196660
total_sugar       218270
dtype: int64

In [18]:
# Treat a nutrient that is missing from a record as 0 (rebinding instead of mutating in place).
final_df = final_df.fillna(0)

In [22]:
final_df

Unnamed: 0,userId,diary_date,goal_calories,goal_carbs,goal_fat,goal_fiber,goal_protein,goal_sodium,goal_sugar,total_calories,total_carbs,total_fat,total_fiber,total_protein,total_sodium,total_sugar
0,1,2014-09-14,3173.0,396.0,105.0,0.0,160.0,2300.0,119.0,2924.0,340.0,114.0,0.0,186.0,3658.0,109.0
1,1,2014-09-15,1572.0,196.0,52.0,0.0,79.0,2300.0,59.0,2430.0,96.0,37.0,0.0,50.0,855.0,63.0
2,1,2014-09-16,1832.0,229.0,61.0,0.0,92.0,2300.0,69.0,1862.0,158.0,54.0,0.0,114.0,2215.0,100.0
3,1,2014-09-17,1685.0,210.0,56.0,0.0,85.0,2300.0,63.0,2251.0,187.0,60.0,0.0,98.0,1765.0,105.0
4,1,2014-09-18,1597.0,199.0,53.0,0.0,80.0,2300.0,60.0,2001.0,113.0,81.0,0.0,202.0,1101.0,71.0
5,1,2014-09-19,1589.0,198.0,53.0,0.0,80.0,2300.0,60.0,2158.0,180.0,89.0,0.0,115.0,1998.0,84.0
6,1,2014-09-20,2823.0,352.0,93.0,0.0,142.0,2300.0,106.0,2691.0,282.0,92.0,0.0,216.0,2623.0,134.0
7,1,2014-09-21,2168.0,271.0,72.0,0.0,109.0,2300.0,82.0,2524.0,224.0,62.0,0.0,133.0,2602.0,110.0
8,1,2014-09-22,2153.0,269.0,71.0,0.0,108.0,2300.0,81.0,2182.0,195.0,74.0,0.0,180.0,1507.0,129.0
9,1,2014-09-23,2587.0,323.0,86.0,0.0,130.0,2300.0,97.0,2443.0,214.0,128.0,0.0,147.0,3222.0,105.0


In [None]:
# final_df['diary_date'] = pd.to_datetime(final_df['diary_date'])

In [None]:
# for col in final_df.columns:
#     final_df[col] = final_df[col].fillna(0)

In [25]:
# Create sum column of all nutritional values inputted for the day.
# DataFrame.sum(axis=1) is the vectorized equivalent of apply(np.sum, axis=1):
# both skip NaN, so a nutrient missing from a day's record contributes 0.
# NOTE(review): this adds calories, macro-nutrients and sodium together as raw
# numbers, presumably in mixed units; confirm the sum is meaningful.
total_nutrient_cols = ['total_calcium', 'total_calories', 'total_carbs', 'total_chol',
                       'total_fat', 'total_fiber', 'total_iron', 'total_mon fat',
                       'total_ply fat', 'total_potass.', 'total_protein', 'total_sat fat',
                       'total_sodium', 'total_sugar', 'total_trn fat', 'total_vit a',
                       'total_vit c']
final_df['total_sum'] = f[total_nutrient_cols].sum(axis=1)

In [26]:
# Create sum column of all nutritional goal values for the day.
# DataFrame.sum(axis=1) is the vectorized equivalent of apply(np.sum, axis=1):
# both skip NaN, so a goal missing from a day's record contributes 0.
# NOTE(review): this adds calorie, macro-nutrient and sodium goals together as
# raw numbers, presumably in mixed units; confirm the sum is meaningful.
goal_nutrient_cols = ['goal_calcium', 'goal_calories', 'goal_carbs', 'goal_chol',
                      'goal_fat', 'goal_fiber', 'goal_iron', 'goal_mon fat',
                      'goal_ply fat', 'goal_potass.', 'goal_protein', 'goal_sat fat',
                      'goal_sodium', 'goal_sugar', 'goal_trn fat', 'goal_vit a',
                      'goal_vit c']
final_df['goal_sum'] = f[goal_nutrient_cols].sum(axis=1)

In [28]:
final_df.head().T

Unnamed: 0,0,1,2,3,4
userId,1,1,1,1,1
diary_date,2014-09-14,2014-09-15,2014-09-16,2014-09-17,2014-09-18
goal_calories,3173,1572,1832,1685,1597
goal_carbs,396,196,229,210,199
goal_fat,105,52,61,56,53
goal_fiber,0,0,0,0,0
goal_protein,160,79,92,85,80
goal_sodium,2300,2300,2300,2300,2300
goal_sugar,119,59,69,63,60
total_calories,2924,2430,1862,2251,2001


In [49]:
final_df.head()

Unnamed: 0,userId,goal_calories,goal_carbs,goal_fat,goal_fiber,goal_protein,goal_sodium,goal_sugar,total_calories,total_carbs,total_fat,total_fiber,total_protein,total_sodium,total_sugar,total_sum,goal_sum
0,1,3173.0,396.0,105.0,0.0,160.0,2300.0,119.0,2924.0,340.0,114.0,0.0,186.0,3658.0,109.0,7331.0,6253.0
1,1,1572.0,196.0,52.0,0.0,79.0,2300.0,59.0,2430.0,96.0,37.0,0.0,50.0,855.0,63.0,3531.0,4258.0
2,1,1832.0,229.0,61.0,0.0,92.0,2300.0,69.0,1862.0,158.0,54.0,0.0,114.0,2215.0,100.0,4503.0,4583.0
3,1,1685.0,210.0,56.0,0.0,85.0,2300.0,63.0,2251.0,187.0,60.0,0.0,98.0,1765.0,105.0,4466.0,4399.0
4,1,1597.0,199.0,53.0,0.0,80.0,2300.0,60.0,2001.0,113.0,81.0,0.0,202.0,1101.0,71.0,3569.0,4289.0


In [111]:
# Number of diary rows logged per user, indexed by userId.
# The Series is renamed directly rather than via rename(columns={'userId': ...}):
# pandas >= 2.0 names the value_counts() result 'count', which made that column
# rename a silent no-op and left the column mislabeled.
days = df['userId'].value_counts().sort_index().rename('days').to_frame()
days.head()

Unnamed: 0,days
1,174
2,60
3,7
4,27
5,179


In [None]:
# Columns to aggregate per user: everything except the id and the date.
# The labels must be passed as one list; as separate positional arguments the
# second label collides with `axis` and drop() raises TypeError.
cols = final_df.drop(columns=['userId', 'diary_date']).columns

In [99]:
# Sum every selected column over all of a user's diary rows.
per_user = final_df.groupby('userId')
ff = per_user[cols].sum()
ff.head()

Unnamed: 0_level_0,goal_calories,goal_carbs,goal_fat,goal_fiber,goal_protein,goal_sodium,goal_sugar,total_calories,total_carbs,total_fat,total_fiber,total_protein,total_sodium,total_sugar,total_sum,goal_sum
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,375085.0,46633.0,12472.0,0.0,19029.0,400200.0,14076.0,401750.0,32265.0,13562.0,0.0,22408.0,360487.0,14227.0,844699.0,867495.0
2,90925.0,11387.0,2955.0,0.0,4714.0,138000.0,3422.0,87430.0,10872.0,2537.0,0.0,2550.0,100217.0,4573.0,208179.0,251403.0
3,9438.0,1178.0,312.0,196.0,469.0,16100.0,0.0,11969.0,811.0,267.0,120.0,323.0,10156.0,0.0,23646.0,27693.0
4,44211.0,5524.0,1464.0,0.0,2223.0,62100.0,0.0,24636.0,2682.0,1001.0,0.0,1153.0,40061.0,0.0,86279.0,210022.0
5,572414.0,0.0,15948.0,7564.0,35677.0,411700.0,21485.0,536631.0,0.0,13818.0,6305.0,33720.0,358700.0,21707.0,970881.0,1064788.0


In [114]:
# Join the per-user day counts with the per-user sums on their shared userId index.
combine = days.merge(ff, left_index=True, right_index=True)
combine

Unnamed: 0,days,goal_calories,goal_carbs,goal_fat,goal_fiber,goal_protein,goal_sodium,goal_sugar,total_calories,total_carbs,total_fat,total_fiber,total_protein,total_sodium,total_sugar,total_sum,goal_sum
1,174,375085.0,46633.0,12472.0,0.0,19029.0,400200.0,14076.0,401750.0,32265.0,13562.0,0.0,22408.0,360487.0,14227.0,844699.0,867495.0
2,60,90925.0,11387.0,2955.0,0.0,4714.0,138000.0,3422.0,87430.0,10872.0,2537.0,0.0,2550.0,100217.0,4573.0,208179.0,251403.0
3,7,9438.0,1178.0,312.0,196.0,469.0,16100.0,0.0,11969.0,811.0,267.0,120.0,323.0,10156.0,0.0,23646.0,27693.0
4,27,44211.0,5524.0,1464.0,0.0,2223.0,62100.0,0.0,24636.0,2682.0,1001.0,0.0,1153.0,40061.0,0.0,86279.0,210022.0
5,179,572414.0,0.0,15948.0,7564.0,35677.0,411700.0,21485.0,536631.0,0.0,13818.0,6305.0,33720.0,358700.0,21707.0,970881.0,1064788.0
6,115,181432.0,18144.0,6071.0,0.0,13616.0,0.0,0.0,156058.0,16249.0,6267.0,0.0,7745.0,0.0,0.0,186319.0,219263.0
7,15,19761.0,1852.0,549.0,383.0,1852.0,0.0,1179.0,12786.0,1513.0,383.0,133.0,891.0,0.0,514.0,16220.0,25576.0
8,85,171949.0,0.0,0.0,2157.0,13005.0,0.0,0.0,167892.0,0.0,0.0,2330.0,10931.0,0.0,0.0,186062.0,195611.0
9,1,1656.0,208.0,55.0,0.0,83.0,2300.0,62.0,1495.0,119.0,38.0,0.0,25.0,729.0,56.0,2462.0,4364.0
10,59,103887.0,13110.0,3516.0,0.0,5274.0,135700.0,3930.0,110924.0,12861.0,3506.0,0.0,4604.0,101906.0,6669.0,240470.0,265417.0


## Labeling:
- Achievement Column: Indicates whether or not a user has met their daily goal within a tolerance of 10%.
    - 1 = Succeed
    - 0 = Fail

In [115]:
# Label a row 1 when its summed total is within 10% of its summed goal, else 0.
relative_gap = (combine['total_sum'] - combine['goal_sum']) / combine['goal_sum']
within_tolerance = relative_gap.abs() <= 0.1
combine['achievement'] = np.where(within_tolerance, 1, 0)

In [118]:
combine.head()

Unnamed: 0,days,goal_calories,goal_carbs,goal_fat,goal_fiber,goal_protein,goal_sodium,goal_sugar,total_calories,total_carbs,total_fat,total_fiber,total_protein,total_sodium,total_sugar,total_sum,goal_sum,achievement
1,174,375085.0,46633.0,12472.0,0.0,19029.0,400200.0,14076.0,401750.0,32265.0,13562.0,0.0,22408.0,360487.0,14227.0,844699.0,867495.0,1
2,60,90925.0,11387.0,2955.0,0.0,4714.0,138000.0,3422.0,87430.0,10872.0,2537.0,0.0,2550.0,100217.0,4573.0,208179.0,251403.0,0
3,7,9438.0,1178.0,312.0,196.0,469.0,16100.0,0.0,11969.0,811.0,267.0,120.0,323.0,10156.0,0.0,23646.0,27693.0,0
4,27,44211.0,5524.0,1464.0,0.0,2223.0,62100.0,0.0,24636.0,2682.0,1001.0,0.0,1153.0,40061.0,0.0,86279.0,210022.0,0
5,179,572414.0,0.0,15948.0,7564.0,35677.0,411700.0,21485.0,536631.0,0.0,13818.0,6305.0,33720.0,358700.0,21707.0,970881.0,1064788.0,1


In [None]:
# Write the per-day frame to disk as JSON Lines: one record (row) per line.
final_df.to_json('data/mfp-goals.json', orient='records', lines=True)

In [None]:
# Reload the per-day frame from the JSON Lines file; this rebinds `final_df`.
final_df = pd.read_json('data/mfp-goals.json', lines=True)

In [None]:
# Convert the diary_date column to datetime values.
final_df = final_df.assign(diary_date=pd.to_datetime(final_df['diary_date']))

In [None]:
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score, f1_score

In [None]:
# Setting X and y variables, then Train Test Split.
# `achievement` is only ever created on the per-user `combine` frame, never on
# `final_df`, so dropping/reading it here raised KeyError on a fresh run. When the
# column is absent, derive the per-day label with the same rule used for
# `combine`: 1 if the day's total_sum is within 10% of its goal_sum, else 0.
if 'achievement' in final_df.columns:
    labelled_df = final_df
else:
    relative_gap = ((final_df['total_sum'] - final_df['goal_sum'])
                    / final_df['goal_sum']).abs()
    # A goal_sum of 0 yields inf/NaN, which compares False and is labelled 0.
    labelled_df = final_df.assign(achievement=np.where(relative_gap <= 0.1, 1, 0))

# Features are the logged total_* columns only; ids, dates, goals and the two
# sums used to build the label are excluded.
X = labelled_df.drop(['achievement','diary_date','userId','goal_sum','total_sum',
                      'goal_calories','goal_carbs','goal_fat','goal_fiber',
                      'goal_protein','goal_sodium','goal_sugar'], axis=1)
y = labelled_df.achievement

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [None]:
# Fit a logistic regression on the training split, then predict the held-out split.
loreg = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train, y_train)
loreg_preds = loreg.predict(X_test)

In [None]:
def print_metrics(y, loreg_preds):
    """Print precision, recall, accuracy and F1 for predictions against true labels.

    Args:
        y: True labels.
        loreg_preds: Predicted labels, in the same order as ``y``.
    """
    report = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    for template, metric in report:
        print(template.format(metric(y, loreg_preds)))


print_metrics(y_test, loreg_preds)

In [None]:
# json_dict.keys()[0]
# Gives Error: 'dict_keys' object does not support indexing, so the keys must be converted to a list before they can be indexed.

### Goal:
Create one column per nutrient entry under both the `total` key and the `goal` key:

- total
    + total_calories
    + total_carbs
    + total_fat
    + total_protein
    + total_sodium
    + total_sugar
- goal
    + goal_calories
    + goal_carbs
    + goal_fat
    + goal_protein
    + goal_sodium
    + goal_sugar

In [None]:
# df = pd.DataFrame([[1, 4],
#                    [2, 5],
#                    [3, 6]],
#                   columns=["a", "b"])
# df

In [None]:
# df["c"] = df.apply(np.sum, axis=1)
# df