# Objective Binge Classifier #

Import dependencies

In [1]:
!pip install openpyxl
import pandas as pd
import json

You should consider upgrading via the '/Users/sanderputs/IdeaProjects/nlp/venv/bin/python -m pip install --upgrade pip' command.[0m


Set paths and response columns

In [2]:
work_dir = '../resources/data_ignored/'
input_file = work_dir + 'merged_unfiltered.xlsx'
output_file = work_dir + 'binge_classifier_out.xlsx'
output_file_raters = work_dir + 'binge_classifier_rates_out.xlsx'
output_file_food_intake = work_dir + 'binge_classifier_food_intake.xlsx'

df = pd.read_excel(input_file)

col_name_post_instance_key = 'POST_instance_key'
col_name_post_loc = 'POST_LOC' # Self-reported Loss Of Control
col_name_pre_type = 'PRE_TYPE'
col_post_sub_binge = 'POST_SUB_BINGE'

text_export = False # is data exported in text representation?

# 0 no
# 1 yes
# 2 na
loc_true = 1 # Self-reported Loss Of Control required value

# 0 no
# 1 yes
# 2 na
sub_binge_true = 1 # Subjective Binge required value

# 1 ontbijt
# 2 lunch
# 3 diner
# 4 anders
pre_type_category_other = ['Anders']
if not text_export:
    pre_type_category_other = [4]


columns_type_counts = df[col_name_pre_type].value_counts()
print('counts', columns_type_counts)

counts 4.0    1175
1.0     886
3.0     879
2.0     822
Name: PRE_TYPE, dtype: int64


Set paths and columns of config document which included portion sizes

In [3]:
path_config = '../resources/config/binge_config.xlsx'

df_config = pd.read_excel(path_config)
print(df_config.columns)

Index(['#', 'Category', '1 portion refers to', 'Std “meals’', 'Std “other”',
       'meals_int', 'other_int', 'meals_factor', 'other_factor', 'calories',
       'comment'],
      dtype='object')


Classification methods

In [3]:
# helper methods

def is_nan(num):
    return num != num

def get_food_cat_config_value(col_name, category_number):
    return df_config.loc[df_config['#'] == category_number, col_name].item()

def is_value_exceeding_threshold(row, col_name, threshold):
    value = row[col_name]
    if isinstance(value, str):
        return 0
    if value > threshold:
        return 1
    else:
        return 0
    
def is_meal(row):
    if row[col_name_pre_type] in pre_type_category_other:
        return 0
    else:
        return 1

Methods to determine valid binge instance
To be a valid instance, a row needs a post instance, loss of control, subjective binge and portion size

In [4]:
def is_subj_binge_instance(row):
    if is_nan(row[col_name_post_instance_key]):
        return "POST_INSTANCE_MISSING"
    if row[col_name_post_loc] != loc_true:
        return "POST_LOC_FALSE"
    if row[col_post_sub_binge] != sub_binge_true:
        return "POST_SUB_BINGE_FALSE"
    for x in range(24):
        category_number = x + 1
        col_name_post_intake_por = 'POST_INTAKE_POR_'+str(category_number)
        if not is_nan(row[col_name_post_intake_por]):
            return 'SUBJ_BINGE'
    return "NO_PORTION"

Count valid subjective binge instances

In [5]:
print('total instances', df['instance_key_x'].count())

df['subj_binge_instance'] = df.apply(is_subj_binge_instance, axis=1)
print('subj_binge_instance value count')
print(df['subj_binge_instance'].value_counts())


Method 1: determine objective binge by single category portions

In [6]:
def get_binge_value(row, category_number):
    if row[col_name_pre_type] in pre_type_category_other:
        col_name_bing_int = 'other_int'
    else:
        col_name_bing_int = 'meals_int'
    return df_config.loc[df_config['#'] == category_number, col_name_bing_int].item()

def is_objective_binge_by_single_category(row):
    if row['subj_binge_instance'] != 'SUBJ_BINGE':
        return 0
    for x in range(24):
        category_number = x + 1
        col_name_post_intake_por = 'POST_INTAKE_POR_'+str(category_number)
        if not is_nan(row[col_name_post_intake_por]):
            value = row[col_name_post_intake_por]
            binge_value = get_binge_value(row, category_number)
            if value > binge_value:
                return 0
            else:
                return 1

total instances 3782
subj_binge_instance value count
POST_LOC_FALSE           2513
SUBJ_BINGE                658
POST_INSTANCE_MISSING     493
POST_SUB_BINGE_FALSE      118
Name: subj_binge_instance, dtype: int64


Method 2: apply object binge by automatically normalize portion size,
where 1 equals the max normal portion size

In [7]:
def get_objective_binge_normalized_value(row):
    normalized_value = 0
    if row['subj_binge_instance'] != 'SUBJ_BINGE':
        return ''
    for x in range(24):
        category_number = x + 1
        col_name_post_intake_por = 'POST_INTAKE_POR_'+str(category_number)
        if not is_nan(row[col_name_post_intake_por]):
            portions = row[col_name_post_intake_por]
            max_normal_portion_size = get_binge_value(row, category_number)
            if max_normal_portion_size == 0:
                 # when 0 allowed it should exceed threshold
                return 1001
            else:
                normalized_portion_factor = 1 / max_normal_portion_size
                current = portions * normalized_portion_factor
            normalized_value = normalized_value + current
    return normalized_value

Method 3: apply object binge by automatically normalize portion size,
where 1 equals the max normal portion size

In [8]:
def get_factor_value(row, category_number):
    if row[col_name_pre_type] in pre_type_category_other:
        col_name_bing_int = 'other_factor'
    else:
        col_name_bing_int = 'meals_factor'
    return df_config.loc[df_config['#'] == category_number, col_name_bing_int].item()

def get_objective_binge_normalized_manual_value(row):
    normalized_value = 0
    if row['subj_binge_instance'] != 'SUBJ_BINGE':
        return ''
    for x in range(24):
        category_number = x + 1
        col_name_post_intake_por = 'POST_INTAKE_POR_'+str(category_number)
        if not is_nan(row[col_name_post_intake_por]):
            portions = row[col_name_post_intake_por]
            factor_value = get_factor_value(row, category_number)
            current = portions * factor_value
            normalized_value = normalized_value + current
    return normalized_value

Method 4: calories

In [9]:
def get_objective_binge_calories(row):
    total_calories = 0
    if row['subj_binge_instance'] != 'SUBJ_BINGE':
        return ''
    for x in range(24):
        category_number = x + 1
        col_name_post_intake_por = 'POST_INTAKE_POR_'+str(category_number)
        if not is_nan(row[col_name_post_intake_por]):
            portions = row[col_name_post_intake_por]
            portion_calories = get_food_cat_config_value('calories', category_number)
            current = portions * portion_calories
            total_calories = total_calories + current
    return total_calories

Preform classifications

In [10]:
# method 1 single category portions
df['objective_binge_single_category'] = df.apply(is_objective_binge_by_single_category, axis=1)

# method 2 normalized portions
df['objective_binge_normalized_value'] = df.apply(get_objective_binge_normalized_value, axis=1)
df['objective_binge_normalized'] = df.apply(is_value_exceeding_threshold, args=('objective_binge_normalized_value', 1), axis=1)

# method 3 manually normalized portions
df['objective_binge_factor_normalized_value'] = df.apply(get_objective_binge_normalized_manual_value, axis=1)
df['objective_binge_factor_normalized'] = df.apply(is_value_exceeding_threshold, args=('objective_binge_factor_normalized_value', 1), axis=1)

# method 4 calories
df['objective_binge_calories_value'] = df.apply(get_objective_binge_calories, axis=1)
df['objective_binge_calories'] = df.apply(is_value_exceeding_threshold, args=('objective_binge_calories_value', 1000), axis=1)

Objective binge classification results

In [11]:
print('subjective binges classified as objective binges')
print('objective_binge_single_category')
print(df['objective_binge_single_category'].value_counts())

print('objective_binge_normalized')
print(df['objective_binge_normalized'].value_counts())

print('objective_binge_factor_normalized')
print(df['objective_binge_factor_normalized'].value_counts())

print('objective_binge_calories')
print(df['objective_binge_calories'].value_counts())

Method to write food intake to JSON format

In [12]:
def get_food_intake(row):
    intake_list = []
    for x in range(24):
        category_number = x + 1
        col_name_post_intake_por = 'POST_INTAKE_POR_'+str(category_number)
        if not is_nan(row[col_name_post_intake_por]):
            value = row[col_name_post_intake_por]
            intake = { "category_number": category_number, "intake_por": value }
            intake_list.append(intake)
        else:
            continue
    return json.dumps(intake_list)

df['meal'] = df.apply(is_meal, axis=1)
df['food_intake'] = df.apply(get_food_intake, axis=1)

subjective binges classified as objective binges
objective_binge_single_category
0    3479
1     303
Name: objective_binge_single_category, dtype: int64
objective_binge_normalized
0    3236
1     546
Name: objective_binge_normalized, dtype: int64
objective_binge_factor_normalized
0    3236
1     546
Name: objective_binge_factor_normalized, dtype: int64
objective_binge_calories
0    3542
1     240
Name: objective_binge_calories, dtype: int64


Prepare a dataset for labelling of objective binges and normal eating moments
Select which classification result to use for objective binges

In [13]:
col_objective_binge = 'objective_binge_single_category'
objective_df = df[df[col_objective_binge] == 1]
no_binge_df = df[df[col_post_sub_binge] == 0]

obj_binge_count = len(objective_df.index)
print('objective binge entries', obj_binge_count)
print('no binge', len(no_binge_df.index))

no_binge_sample = no_binge_df.sample(n=obj_binge_count) #equal sample size

df_to_label = pd.concat([objective_df, no_binge_sample])
df_to_label = df_to_label[['instance_key_x','PRE_COGN']].sample(frac=1) #shuffle

Write dataframes to excel

In [14]:
print('Writing dataframe and raters file')
df.to_excel(output_file)
df_to_label.to_excel(output_file_raters)

objective binge entries 303
no binge 2373


Write food intake for subjective binges incl obj binge classificiation

In [15]:
print('writing food intake for subjective binges')
sub_df = df[df['subj_binge_instance'] == 'SUBJ_BINGE']
print('subjective binges', len(sub_df.index))

grouped = sub_df.groupby(['food_intake','meal']).sum()[[
                      'objective_binge_single_category',
                      'objective_binge_normalized',
                      'objective_binge_factor_normalized',
                      'objective_binge_calories', 'POST_SUB_BINGE']]
grouped.to_excel(output_file_food_intake)

writing result
