In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('seaborn')

In [None]:
# !head data/mfp-diaries.tsv

## Exploratory Data Analysis

In [2]:
df = pd.read_csv('data/mfp-diaries.tsv',
                  sep='\t',
                  header=None,
                  names=['userId','diary_date','food_entries','daily_goal'])

In [4]:
df.head()

Unnamed: 0,userId,diary_date,food_entries,daily_goal
0,1,2014-09-14,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2924}..."
1,1,2014-09-15,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2430}..."
2,1,2014-09-16,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 1862}..."
3,1,2014-09-17,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2251}..."
4,1,2014-09-18,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2001}..."


In [7]:
len(df['userId'].unique())

9896

In [None]:
# entry = json.loads(df.iloc[2, 2])
# entry

In [None]:
# entry[0].keys()

## UserId, Entry_Date, Daily_Goal  (Need to Parse Data)

In [8]:
df_goal = df.drop(['food_entries'], axis=1)

In [9]:
df_goal.head()

Unnamed: 0,userId,diary_date,daily_goal
0,1,2014-09-14,"{""total"": [{""name"": ""Calories"", ""value"": 2924}..."
1,1,2014-09-15,"{""total"": [{""name"": ""Calories"", ""value"": 2430}..."
2,1,2014-09-16,"{""total"": [{""name"": ""Calories"", ""value"": 1862}..."
3,1,2014-09-17,"{""total"": [{""name"": ""Calories"", ""value"": 2251}..."
4,1,2014-09-18,"{""total"": [{""name"": ""Calories"", ""value"": 2001}..."


In [10]:
parse_df = df_goal.drop(['userId', 'diary_date'], axis=1)
parse_df.head()

Unnamed: 0,daily_goal
0,"{""total"": [{""name"": ""Calories"", ""value"": 2924}..."
1,"{""total"": [{""name"": ""Calories"", ""value"": 2430}..."
2,"{""total"": [{""name"": ""Calories"", ""value"": 1862}..."
3,"{""total"": [{""name"": ""Calories"", ""value"": 2251}..."
4,"{""total"": [{""name"": ""Calories"", ""value"": 2001}..."


In [11]:
tg_dict = json.loads(parse_df['daily_goal'][0])
tg_dict

{'total': [{'name': 'Calories', 'value': 2924},
  {'name': 'Carbs', 'value': 340},
  {'name': 'Fat', 'value': 114},
  {'name': 'Protein', 'value': 186},
  {'name': 'Sodium', 'value': 3658},
  {'name': 'Sugar', 'value': 109}],
 'goal': [{'name': 'Calories', 'value': 3173},
  {'name': 'Carbs', 'value': 396},
  {'name': 'Fat', 'value': 105},
  {'name': 'Protein', 'value': 160},
  {'name': 'Sodium', 'value': 2300},
  {'name': 'Sugar', 'value': 119}]}

## Parse Function

In [12]:
"""Creates list of keys and values from the first dictionary, and zips them together"""
def daily_totals(tg_dict):
    """Flatten the 'total' entries of one parsed diary record into a single dict.

    Parameters
    ----------
    tg_dict : dict
        Parsed ``daily_goal`` JSON. Must contain a ``'total'`` key holding a list
        of ``{'name': ..., 'value': ...}`` dictionaries.

    Returns
    -------
    dict
        Maps ``'total_<name in lower case>'`` to the corresponding value. If a
        name occurs more than once, the last occurrence wins.

    Raises
    ------
    KeyError
        If ``'total'`` is missing, or an entry lacks ``'name'`` or ``'value'``.
    """
    # The prefix is spelled out rather than read from the dict's first key:
    # nothing guarantees that 'total' is the first key of the record, and a
    # record ordered goal-then-total would otherwise label totals as 'goal_*'.
    return {"total_" + entry['name'].lower(): entry['value']
            for entry in tg_dict['total']}

"""Creates list of keys and values from the second dictionary, and zips them together"""
def daily_goals(tg_dict):
    """Flatten the 'goal' entries of one parsed diary record into a single dict.

    Parameters
    ----------
    tg_dict : dict
        Parsed ``daily_goal`` JSON. Must contain a ``'goal'`` key holding a list
        of ``{'name': ..., 'value': ...}`` dictionaries.

    Returns
    -------
    dict
        Maps ``'goal_<name in lower case>'`` to the corresponding value. If a
        name occurs more than once, the last occurrence wins.

    Raises
    ------
    KeyError
        If ``'goal'`` is missing, or an entry lacks ``'name'`` or ``'value'``.
    """
    # The prefix is spelled out rather than read from the dict's second key:
    # relying on key position mislabels the columns when the record's keys come
    # in a different order, and fails outright when 'goal' is the only key.
    return {"goal_" + entry['name'].lower(): entry['value']
            for entry in tg_dict['goal']}

"""Combines new keys and values for 'Total' and 'Goals' into single dictionary"""
def Merge(dict1, dict2):
    """Return a new dict with the items of ``dict1`` followed by those of ``dict2``.

    When both inputs share a key, the value from ``dict2`` is kept. Neither
    input is modified.
    """
    combined = dict(dict1)
    combined.update(dict2)
    return combined

"""Iterrates through entire parsed dataframe and appends merged rows"""
def final_parsed_dict():
    """Parse every ``daily_goal`` JSON string in ``parse_df`` into a flat dict.

    Reads the module-level ``parse_df`` frame. Each row's JSON is decoded once
    and flattened with ``daily_goals`` and ``daily_totals``; the goal entries
    are placed before the total entries in the merged dict.

    Returns
    -------
    list of dict
        One merged dict per row, in row order.
    """
    decoded_rows = (json.loads(raw) for raw in parse_df['daily_goal'].values)
    return [Merge(daily_goals(decoded), daily_totals(decoded))
            for decoded in decoded_rows]

In [13]:
f = pd.DataFrame(final_parsed_dict())

In [14]:
df_concat = pd.concat([df_goal, f], axis=1)
final_df = df_concat.drop(['daily_goal'], axis=1)

In [16]:
final_df = final_df.drop(['goal_calcium','goal_chol','goal_iron','goal_mon fat',
                          'goal_ply fat','goal_potass.','goal_sat fat','goal_trn fat',
                          'goal_vit a','goal_vit c','total_calcium','total_chol',
                          'total_iron','total_mon fat','total_ply fat','total_potass.',
                          'total_sat fat','total_trn fat','total_vit a','total_vit c'], axis=1)

In [17]:
final_df.isna().sum()

userId                 0
diary_date             0
goal_calories          7
goal_carbs         14145
goal_fat           17859
goal_fiber        344347
goal_protein        7321
goal_sodium       196659
goal_sugar        218272
total_calories        26
total_carbs        14146
total_fat          17866
total_fiber       344349
total_protein       7331
total_sodium      196660
total_sugar       218270
dtype: int64

In [18]:
final_df.fillna(0, inplace=True)

In [19]:
final_df.isna().sum()

userId            0
diary_date        0
goal_calories     0
goal_carbs        0
goal_fat          0
goal_fiber        0
goal_protein      0
goal_sodium       0
goal_sugar        0
total_calories    0
total_carbs       0
total_fat         0
total_fiber       0
total_protein     0
total_sodium      0
total_sugar       0
dtype: int64

In [20]:
final_df.head()

Unnamed: 0,userId,diary_date,goal_calories,goal_carbs,goal_fat,goal_fiber,goal_protein,goal_sodium,goal_sugar,total_calories,total_carbs,total_fat,total_fiber,total_protein,total_sodium,total_sugar
0,1,2014-09-14,3173.0,396.0,105.0,0.0,160.0,2300.0,119.0,2924.0,340.0,114.0,0.0,186.0,3658.0,109.0
1,1,2014-09-15,1572.0,196.0,52.0,0.0,79.0,2300.0,59.0,2430.0,96.0,37.0,0.0,50.0,855.0,63.0
2,1,2014-09-16,1832.0,229.0,61.0,0.0,92.0,2300.0,69.0,1862.0,158.0,54.0,0.0,114.0,2215.0,100.0
3,1,2014-09-17,1685.0,210.0,56.0,0.0,85.0,2300.0,63.0,2251.0,187.0,60.0,0.0,98.0,1765.0,105.0
4,1,2014-09-18,1597.0,199.0,53.0,0.0,80.0,2300.0,60.0,2001.0,113.0,81.0,0.0,202.0,1101.0,71.0


In [21]:
final_df['diary_date'] = pd.to_datetime(final_df['diary_date'])

In [None]:
# for col in final_df.columns:
#     final_df[col] = final_df[col].fillna(0)

In [23]:
# Create sum column of all nutritional values inputted for the day.
# The columns are read from `f` (the full parsed frame) because most of them
# were already dropped from `final_df`.
# DataFrame.sum(axis=1) is vectorised and skips NaN exactly like the previous
# per-row np.sum did, without a Python-level call for each row.
# NOTE(review): this adds all nutrient values together regardless of their
# units — confirm that a unit-less grand total is what is intended.
final_df['total_sum'] = f[['total_calcium', 'total_calories', 'total_carbs', 'total_chol',
                           'total_fat', 'total_fiber', 'total_iron', 'total_mon fat',
                           'total_ply fat', 'total_potass.', 'total_protein', 'total_sat fat',
                           'total_sodium', 'total_sugar', 'total_trn fat', 'total_vit a',
                           'total_vit c']
                          ].sum(axis=1)

In [24]:
# Create sum column of all nutritional goal values for the day.
# The columns are read from `f` (the full parsed frame) because most of them
# were already dropped from `final_df`.
# DataFrame.sum(axis=1) is vectorised and skips NaN exactly like the previous
# per-row np.sum did, without a Python-level call for each row.
# NOTE(review): this adds all nutrient goals together regardless of their
# units — confirm that a unit-less grand total is what is intended.
final_df['goal_sum'] = f[['goal_calcium', 'goal_calories', 'goal_carbs', 'goal_chol',
                          'goal_fat', 'goal_fiber', 'goal_iron', 'goal_mon fat',
                          'goal_ply fat', 'goal_potass.', 'goal_protein', 'goal_sat fat',
                          'goal_sodium', 'goal_sugar', 'goal_trn fat', 'goal_vit a', 'goal_vit c']
                         ].sum(axis=1)

In [None]:
# final_df['achievement'] = final_df['total_sum'].apply(lambda x: 'succeed' 
#                                                       if x <= 6000 
#                                                       else 'fail')

## Labeling:
- Achievement Column: Indicates whether the sum of a user's logged nutrient totals for the day (`total_sum`) is within 10% of the sum of their nutrient goals (`goal_sum`).
    - 1 = Succeed
    - 0 = Fail

In [25]:
# Relative distance between what was logged and what was planned for the day.
# Rows whose goal_sum is 0 yield inf or NaN here, fail the comparison below and
# are therefore labelled 0.
goal_sum = final_df['goal_sum']
relative_gap = ((final_df['total_sum'] - goal_sum) / goal_sum).abs()

# 1 = within the 10% tolerance (succeed), 0 = outside it (fail).
final_df['achievement'] = np.where(relative_gap <= 0.1, 1, 0)

In [26]:
final_df.head().T

Unnamed: 0,0,1,2,3,4
userId,1,1,1,1,1
diary_date,2014-09-14 00:00:00,2014-09-15 00:00:00,2014-09-16 00:00:00,2014-09-17 00:00:00,2014-09-18 00:00:00
goal_calories,3173,1572,1832,1685,1597
goal_carbs,396,196,229,210,199
goal_fat,105,52,61,56,53
goal_fiber,0,0,0,0,0
goal_protein,160,79,92,85,80
goal_sodium,2300,2300,2300,2300,2300
goal_sugar,119,59,69,63,60
total_calories,2924,2430,1862,2251,2001


In [27]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587187 entries, 0 to 587186
Data columns (total 19 columns):
userId            587187 non-null int64
diary_date        587187 non-null datetime64[ns]
goal_calories     587187 non-null float64
goal_carbs        587187 non-null float64
goal_fat          587187 non-null float64
goal_fiber        587187 non-null float64
goal_protein      587187 non-null float64
goal_sodium       587187 non-null float64
goal_sugar        587187 non-null float64
total_calories    587187 non-null float64
total_carbs       587187 non-null float64
total_fat         587187 non-null float64
total_fiber       587187 non-null float64
total_protein     587187 non-null float64
total_sodium      587187 non-null float64
total_sugar       587187 non-null float64
total_sum         587187 non-null float64
goal_sum          587187 non-null float64
achievement       587187 non-null int64
dtypes: datetime64[ns](1), float64(16), int64(2)
memory usage: 85.1 MB


In [33]:
final_df.to_json('data/mfp-goals.json', orient='records', lines=True)

In [36]:
df = pd.read_json('data/mfp-goals.json', lines=True)

In [37]:
df.head()

Unnamed: 0,achievement,diary_date,goal_calories,goal_carbs,goal_fat,goal_fiber,goal_protein,goal_sodium,goal_sugar,goal_sum,total_calories,total_carbs,total_fat,total_fiber,total_protein,total_sodium,total_sugar,total_sum,userId
0,0,1410652800000,3173,396,105,0,160,2300,119,6253,2924,340,114,0,186,3658,109,7331,1
1,0,1410739200000,1572,196,52,0,79,2300,59,4258,2430,96,37,0,50,855,63,3531,1
2,1,1410825600000,1832,229,61,0,92,2300,69,4583,1862,158,54,0,114,2215,100,4503,1
3,1,1410912000000,1685,210,56,0,85,2300,63,4399,2251,187,60,0,98,1765,105,4466,1
4,0,1410998400000,1597,199,53,0,80,2300,60,4289,2001,113,81,0,202,1101,71,3569,1


In [39]:
# The reloaded diary_date column holds integer epoch milliseconds (the default
# date encoding written by to_json). pd.to_datetime interprets bare integers as
# nanoseconds unless told otherwise, which would place every date in
# January 1970, so the unit has to be stated explicitly.
df['diary_date'] = pd.to_datetime(df['diary_date'], unit='ms')

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587187 entries, 0 to 587186
Data columns (total 19 columns):
achievement       587187 non-null int64
diary_date        587187 non-null datetime64[ns]
goal_calories     587187 non-null int64
goal_carbs        587187 non-null int64
goal_fat          587187 non-null int64
goal_fiber        587187 non-null int64
goal_protein      587187 non-null int64
goal_sodium       587187 non-null int64
goal_sugar        587187 non-null int64
goal_sum          587187 non-null int64
total_calories    587187 non-null int64
total_carbs       587187 non-null int64
total_fat         587187 non-null int64
total_fiber       587187 non-null int64
total_protein     587187 non-null int64
total_sodium      587187 non-null int64
total_sugar       587187 non-null int64
total_sum         587187 non-null int64
userId            587187 non-null int64
dtypes: datetime64[ns](1), int64(18)
memory usage: 85.1 MB


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score, f1_score

In [29]:
# Build the feature matrix and the label vector, then split them.
# Features are the per-nutrient daily totals: the label, the identifiers, both
# aggregate sums and every goal column are removed.
X = final_df.drop(columns=[
    'achievement', 'diary_date', 'userId',
    'goal_sum', 'total_sum',
    'goal_calories', 'goal_carbs', 'goal_fat', 'goal_fiber',
    'goal_protein', 'goal_sodium', 'goal_sugar',
])
y = final_df['achievement']

# Hold out 25% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [30]:
# Fit a logistic-regression classifier on the training split, then predict
# labels for the held-out split.
# NOTE(review): StandardScaler is imported earlier in the notebook but the
# features are passed in unscaled here — confirm whether scaling was intended.
loreg = LogisticRegression(solver='lbfgs', max_iter=1000)
loreg.fit(X_train, y_train)
loreg_preds = loreg.predict(X_test)

In [31]:
def print_metrics(y, loreg_preds):
    """Print precision, recall, accuracy and F1 for a set of predictions.

    Parameters
    ----------
    y : array-like
        True labels.
    loreg_preds : array-like
        Predicted labels, aligned with ``y``.

    Returns
    -------
    None
        The four scores are written to standard output, one per line.
    """
    report = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    for template, scorer in report:
        print(template.format(scorer(y, loreg_preds)))
    
print_metrics(y_test, loreg_preds)

Precision Score: 0.2971922246220302
Recall Score: 0.0221778092966282
Accuracy Score: 0.7822775669802516
F1 Score: 0.04127545969943306


In [None]:
# json_dict.keys()[0]
# Gives Error: 'dict_keys' object does not support indexing, so the keys must be converted to a list before they can be indexed.

### Goal:
Create one column for each nutrient dictionary listed under the `total` key and under the `goal` key:

- total
    + total_calories
    + total_carbs
    + total_fat
    + total_protein
    + total_sodium
    + total_sugar
- goal
    + goal_calories
    + goal_carbs
    + goal_fat
    + goal_protein
    + goal_sodium
    + goal_sugar

In [None]:
# df = pd.DataFrame([[1, 4],
#                    [2, 5],
#                    [3, 6]],
#                   columns=["a", "b"])
# df

In [None]:
# df["c"] = df.apply(np.sum, axis=1)
# df