# Fitness Tracker Survival Analysis
Created by: Andy Luc

## Business case:
1. What is the average amount of time that MyFitnessPal users use the app, and do they use it consistently throughout?
2. Do users keep using the app after meeting, or failing to meet, their goal?

In [1]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%load_ext autoreload
%autoreload 2
plt.style.use('seaborn')

## Exploratory Data Analysis

In [2]:
# Load the raw diary export: tab-separated with no header row, so the four
# column names are supplied explicitly.
raw_df = pd.read_csv(
    'data/mfp-diaries.tsv',
    names=['userId', 'diary_date', 'food_entries', 'daily_goal'],
    header=None,
    sep='\t',
)

In [3]:
raw_df.head()

Unnamed: 0,userId,diary_date,food_entries,daily_goal
0,1,2014-09-14,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2924}..."
1,1,2014-09-15,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2430}..."
2,1,2014-09-16,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 1862}..."
3,1,2014-09-17,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2251}..."
4,1,2014-09-18,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2001}..."


In [4]:
# Number of distinct users (dropna=False counts NaN as a value, exactly like len(unique()))
raw_df['userId'].nunique(dropna=False)

9896

### Parsing lists of dictionaries within the *daily_goal* column

In [5]:
# Keep only the JSON column that holds the daily totals and goals
parse_df = raw_df.drop(columns=['userId', 'diary_date', 'food_entries'])

In [6]:
parse_df.head()

Unnamed: 0,daily_goal
0,"{""total"": [{""name"": ""Calories"", ""value"": 2924}..."
1,"{""total"": [{""name"": ""Calories"", ""value"": 2430}..."
2,"{""total"": [{""name"": ""Calories"", ""value"": 1862}..."
3,"{""total"": [{""name"": ""Calories"", ""value"": 2251}..."
4,"{""total"": [{""name"": ""Calories"", ""value"": 2001}..."


In [7]:
# Decode the JSON string of the row labelled 0 to inspect its structure
tg_dict = json.loads(parse_df.loc[0, 'daily_goal'])
tg_dict

{'total': [{'name': 'Calories', 'value': 2924},
  {'name': 'Carbs', 'value': 340},
  {'name': 'Fat', 'value': 114},
  {'name': 'Protein', 'value': 186},
  {'name': 'Sodium', 'value': 3658},
  {'name': 'Sugar', 'value': 109}],
 'goal': [{'name': 'Calories', 'value': 3173},
  {'name': 'Carbs', 'value': 396},
  {'name': 'Fat', 'value': 105},
  {'name': 'Protein', 'value': 160},
  {'name': 'Sodium', 'value': 2300},
  {'name': 'Sugar', 'value': 119}]}

### Parse Functions

In [8]:
"""Creates list of keys and values from the first dictionary, and zips them together"""
def daily_totals(tg_dict):
    """Flatten the 'total' entries of one parsed diary record into a flat dict.

    Parameters
    ----------
    tg_dict : dict
        Parsed ``daily_goal`` JSON. Must contain a ``'total'`` key holding a
        list of ``{'name': ..., 'value': ...}`` dictionaries.

    Returns
    -------
    dict
        Maps ``'total_<lower-cased name>'`` to the entry's value, in list order.

    Raises
    ------
    KeyError
        If ``'total'`` is missing, or an entry lacks ``'name'`` or ``'value'``.
    """
    # The prefix is spelled out instead of being read from the dict's first key,
    # so the labels stay correct whatever order the JSON lists its keys in.
    return {'total_' + entry['name'].lower(): entry['value']
            for entry in tg_dict['total']}

"""Creates list of keys and values from the second dictionary, and zips them together"""
def daily_goals(tg_dict):
    """Flatten the 'goal' entries of one parsed diary record into a flat dict.

    Parameters
    ----------
    tg_dict : dict
        Parsed ``daily_goal`` JSON. Must contain a ``'goal'`` key holding a
        list of ``{'name': ..., 'value': ...}`` dictionaries.

    Returns
    -------
    dict
        Maps ``'goal_<lower-cased name>'`` to the entry's value, in list order.

    Raises
    ------
    KeyError
        If ``'goal'`` is missing, or an entry lacks ``'name'`` or ``'value'``.
    """
    # The prefix is spelled out instead of being read from the dict's second key,
    # so the labels stay correct whatever order the JSON lists its keys in.
    return {'goal_' + entry['name'].lower(): entry['value']
            for entry in tg_dict['goal']}

"""Combines new keys and values for 'Total' and 'Goals' into single dictionary"""
def merge(dict1, dict2):
    """Return a new dict with the entries of `dict1` followed by those of `dict2`.

    When a key appears in both, the value from `dict2` wins. Neither input is
    modified.
    """
    return {key: value
            for source in (dict1, dict2)
            for key, value in source.items()}

"""Iterrates through entire parsed dataframe and appends merged rows"""
def final_parsed_dict(rows=None):
    """Parse ``daily_goal`` JSON strings into flat dicts of goals and totals.

    Parameters
    ----------
    rows : iterable of str, optional
        JSON strings to parse. When omitted, the ``daily_goal`` column of the
        notebook-level ``parse_df`` is used, which is what the zero-argument
        call has always done.

    Returns
    -------
    list of dict
        One merged dict per input row: goal entries first, then total entries.
    """
    if rows is None:
        # Fall back to the notebook global so existing calls keep working.
        rows = parse_df['daily_goal'].values
    results = []
    for row in rows:
        tg_dict = json.loads(row)
        results.append(merge(daily_goals(tg_dict), daily_totals(tg_dict)))
    return results

In [9]:
tg_parse = pd.DataFrame(final_parsed_dict())

In [10]:
tg_parse.head()

Unnamed: 0,goal_calcium,goal_calories,goal_carbs,goal_chol,goal_fat,goal_fiber,goal_iron,goal_mon fat,goal_ply fat,goal_potass.,...,total_mon fat,total_ply fat,total_potass.,total_protein,total_sat fat,total_sodium,total_sugar,total_trn fat,total_vit a,total_vit c
0,,3173.0,396.0,,105.0,,,,,,...,,,,186.0,,3658.0,109.0,,,
1,,1572.0,196.0,,52.0,,,,,,...,,,,50.0,,855.0,63.0,,,
2,,1832.0,229.0,,61.0,,,,,,...,,,,114.0,,2215.0,100.0,,,
3,,1685.0,210.0,,56.0,,,,,,...,,,,98.0,,1765.0,105.0,,,
4,,1597.0,199.0,,53.0,,,,,,...,,,,202.0,,1101.0,71.0,,,


In [11]:
concat_df = pd.concat([raw_df, tg_parse], axis=1)

In [12]:
concat_df.isna().sum()

userId                 0
diary_date             0
food_entries           0
daily_goal             0
goal_calcium      564262
goal_calories          7
goal_carbs         14145
goal_chol         568935
goal_fat           17859
goal_fiber        344347
goal_iron         558171
goal_mon fat      587096
goal_ply fat      586693
goal_potass.      567771
goal_protein        7321
goal_sat fat      570611
goal_sodium       196659
goal_sugar        218272
goal_trn fat      585359
goal_vit a        586413
goal_vit c        584701
total_calcium     564262
total_calories        26
total_carbs        14146
total_chol        568935
total_fat          17866
total_fiber       344349
total_iron        558171
total_mon fat     586759
total_ply fat     586426
total_potass.     567771
total_protein       7331
total_sat fat     570611
total_sodium      196660
total_sugar       218270
total_trn fat     585359
total_vit a       586413
total_vit c       584701
dtype: int64

In [13]:
# Drop the raw JSON columns and, for both the goal_ and total_ prefixes, the
# nutrients that are null on nearly every row (see the null counts above).
total_goal_df = concat_df.drop(
    columns=['food_entries', 'daily_goal'] + [
        prefix + '_' + nutrient
        for prefix in ('goal', 'total')
        for nutrient in ('calcium', 'chol', 'iron', 'mon fat', 'ply fat',
                         'potass.', 'sat fat', 'trn fat', 'vit a', 'vit c')
    ]
)

In [14]:
# Replace every remaining NaN with 0
total_goal_df = total_goal_df.fillna(0)

In [15]:
# for col in final_df.columns:
#     final_df[col] = final_df[col].fillna(0)

In [16]:
total_goal_df['diary_date'] = pd.to_datetime(total_goal_df['diary_date'])

In [17]:
total_goal_df.isna().sum()

userId            0
diary_date        0
goal_calories     0
goal_carbs        0
goal_fat          0
goal_fiber        0
goal_protein      0
goal_sodium       0
goal_sugar        0
total_calories    0
total_carbs       0
total_fat         0
total_fiber       0
total_protein     0
total_sodium      0
total_sugar       0
dtype: int64

In [18]:
total_goal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587187 entries, 0 to 587186
Data columns (total 16 columns):
userId            587187 non-null int64
diary_date        587187 non-null datetime64[ns]
goal_calories     587187 non-null float64
goal_carbs        587187 non-null float64
goal_fat          587187 non-null float64
goal_fiber        587187 non-null float64
goal_protein      587187 non-null float64
goal_sodium       587187 non-null float64
goal_sugar        587187 non-null float64
total_calories    587187 non-null float64
total_carbs       587187 non-null float64
total_fat         587187 non-null float64
total_fiber       587187 non-null float64
total_protein     587187 non-null float64
total_sodium      587187 non-null float64
total_sugar       587187 non-null float64
dtypes: datetime64[ns](1), float64(14), int64(1)
memory usage: 71.7 MB


## Labeling:
- Achievement column: indicates whether the sum of a user's logged nutrient totals for the day is within 10% of the sum of their nutrient goals.
    - 1 = Succeed
    - 0 = Fail

In [19]:
# Create sum column of all nutritional values inputted for the day
totals = ['total_calories','total_carbs','total_fat','total_fiber',
          'total_protein','total_sodium','total_sugar']

total_goal_df['total_sum'] = total_goal_df[totals].apply(np.sum, axis=1)

In [20]:
# Create sum column of all nutritional goal values for the day
goals = ['goal_calories','goal_carbs','goal_fat','goal_fiber',
          'goal_protein','goal_sodium','goal_sugar']

total_goal_df['goal_sum'] = total_goal_df[goals].apply(np.sum, axis=1)

In [21]:
# Flag a day as achieved (1) when the logged sum is within 10% of the goal sum, else 0
total_goal_df['achievement'] = np.where(
    total_goal_df['total_sum']
    .sub(total_goal_df['goal_sum'])
    .div(total_goal_df['goal_sum'])
    .abs()
    .le(0.1),
    1,
    0,
)

In [22]:
total_goal_df.head()

Unnamed: 0,userId,diary_date,goal_calories,goal_carbs,goal_fat,goal_fiber,goal_protein,goal_sodium,goal_sugar,total_calories,total_carbs,total_fat,total_fiber,total_protein,total_sodium,total_sugar,total_sum,goal_sum,achievement
0,1,2014-09-14,3173.0,396.0,105.0,0.0,160.0,2300.0,119.0,2924.0,340.0,114.0,0.0,186.0,3658.0,109.0,7331.0,6253.0,0
1,1,2014-09-15,1572.0,196.0,52.0,0.0,79.0,2300.0,59.0,2430.0,96.0,37.0,0.0,50.0,855.0,63.0,3531.0,4258.0,0
2,1,2014-09-16,1832.0,229.0,61.0,0.0,92.0,2300.0,69.0,1862.0,158.0,54.0,0.0,114.0,2215.0,100.0,4503.0,4583.0,1
3,1,2014-09-17,1685.0,210.0,56.0,0.0,85.0,2300.0,63.0,2251.0,187.0,60.0,0.0,98.0,1765.0,105.0,4466.0,4399.0,1
4,1,2014-09-18,1597.0,199.0,53.0,0.0,80.0,2300.0,60.0,2001.0,113.0,81.0,0.0,202.0,1101.0,71.0,3569.0,4289.0,0


## Survival Analysis

In [23]:
# Number of diary rows logged per user, ordered by ascending userId.
# The column is named through to_frame(name=...) rather than by renaming
# 'userId': pandas 2.0 changed the name of the Series returned by
# value_counts to 'count', which turned the old rename into a silent no-op.
days = total_goal_df['userId'].value_counts().sort_index().to_frame(name='days')
days.tail()

Unnamed: 0,days
9893,22
9894,3
9895,169
9896,15
9897,90


In [24]:
cols = total_goal_df.drop(['userId','diary_date','achievement'], axis=1).columns

In [25]:
# Per-user aggregate: sum each selected column over all of that user's diary rows
condense_df = total_goal_df.groupby('userId')[cols].agg('sum')
condense_df.head()

Unnamed: 0_level_0,goal_calories,goal_carbs,goal_fat,goal_fiber,goal_protein,goal_sodium,goal_sugar,total_calories,total_carbs,total_fat,total_fiber,total_protein,total_sodium,total_sugar,total_sum,goal_sum
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,375085.0,46633.0,12472.0,0.0,19029.0,400200.0,14076.0,401750.0,32265.0,13562.0,0.0,22408.0,360487.0,14227.0,844699.0,867495.0
2,90925.0,11387.0,2955.0,0.0,4714.0,138000.0,3422.0,87430.0,10872.0,2537.0,0.0,2550.0,100217.0,4573.0,208179.0,251403.0
3,9438.0,1178.0,312.0,196.0,469.0,16100.0,0.0,11969.0,811.0,267.0,120.0,323.0,10156.0,0.0,23646.0,27693.0
4,44211.0,5524.0,1464.0,0.0,2223.0,62100.0,0.0,24636.0,2682.0,1001.0,0.0,1153.0,40061.0,0.0,69533.0,115522.0
5,572414.0,0.0,15948.0,7564.0,35677.0,411700.0,21485.0,536631.0,0.0,13818.0,6305.0,33720.0,358700.0,21707.0,970881.0,1064788.0


In [33]:
# Join the per-user row counts with the per-user sums on their shared index
survival = days.merge(condense_df, left_index=True, right_index=True)

In [37]:
# Flag a user as achieved (1) when their summed totals are within 10% of their summed goals, else 0
survival['achievement'] = np.where(
    survival['total_sum']
    .sub(survival['goal_sum'])
    .div(survival['goal_sum'])
    .abs()
    .le(0.1),
    1,
    0,
)

In [41]:
survival.head()

Unnamed: 0,days,goal_calories,goal_carbs,goal_fat,goal_fiber,goal_protein,goal_sodium,goal_sugar,total_calories,total_carbs,total_fat,total_fiber,total_protein,total_sodium,total_sugar,total_sum,goal_sum,achievement
1,174,375085.0,46633.0,12472.0,0.0,19029.0,400200.0,14076.0,401750.0,32265.0,13562.0,0.0,22408.0,360487.0,14227.0,844699.0,867495.0,1
2,60,90925.0,11387.0,2955.0,0.0,4714.0,138000.0,3422.0,87430.0,10872.0,2537.0,0.0,2550.0,100217.0,4573.0,208179.0,251403.0,0
3,7,9438.0,1178.0,312.0,196.0,469.0,16100.0,0.0,11969.0,811.0,267.0,120.0,323.0,10156.0,0.0,23646.0,27693.0,0
4,27,44211.0,5524.0,1464.0,0.0,2223.0,62100.0,0.0,24636.0,2682.0,1001.0,0.0,1153.0,40061.0,0.0,69533.0,115522.0,0
5,179,572414.0,0.0,15948.0,7564.0,35677.0,411700.0,21485.0,536631.0,0.0,13818.0,6305.0,33720.0,358700.0,21707.0,970881.0,1064788.0,1


### Saving and reading from json:

In [28]:
# total_goal_df.to_json('data/mfp-goals.json', orient='records', lines=True)

In [32]:
# total_goal_df = pd.read_json('data/mfp-goals.json', lines=True)

# total_goal_df['diary_date'] = pd.to_datetime(total_goal_df['diary_date'])

# total_goal_df.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score, f1_score

In [None]:
#Setting X and y variables, then Train Test Split
X = final_df.drop(['achievement','diary_date','userId','goal_sum','total_sum',
                   'goal_calories','goal_carbs','goal_fat','goal_fiber',
                   'goal_protein','goal_sodium','goal_sugar'], axis=1)
y = final_df.achievement

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [None]:
# Fit a logistic regression on the training split, then predict labels for the held-out rows
loreg = LogisticRegression(max_iter=1000, solver='lbfgs').fit(X_train, y_train)
loreg_preds = loreg.predict(X_test)

In [None]:
def print_metrics(y, loreg_preds):
    """Print precision, recall, accuracy and F1 for predictions against true labels.

    Parameters
    ----------
    y : array-like
        True labels.
    loreg_preds : array-like
        Predicted labels, passed as the second argument to each scorer.
    """
    scorers = (
        ("Precision Score: {}", precision_score),
        ("Recall Score: {}", recall_score),
        ("Accuracy Score: {}", accuracy_score),
        ("F1 Score: {}", f1_score),
    )
    # Each metric is computed and printed in turn, in the order listed above
    for template, scorer in scorers:
        print(template.format(scorer(y, loreg_preds)))
    
print_metrics(y_test, loreg_preds)

In [None]:
# json_dict.keys()[0]
# Raises TypeError: 'dict_keys' object does not support indexing, so the keys must be converted to a list before indexing.

### Goal:
Create one column per nutrient dictionary found under both the `total` key and the `goal` key:

- total
    + total_calories
    + total_carbs
    + total_fat
    + total_protein
    + total_sodium
    + total_sugar
- goal
    + goal_calories
    + goal_carbs
    + goal_fat
    + goal_protein
    + goal_sodium
    + goal_sugar

In [None]:
# df = pd.DataFrame([[1, 4],
#                    [2, 5],
#                    [3, 6]],
#                   columns=["a", "b"])
# df

In [None]:
# df["c"] = df.apply(np.sum, axis=1)
# df