# Estimation of Obesity Levels 
### based on Eating Habits&Physical Cond.

- https://www.sciencedirect.com/science/article/pii/S2352340919306985

This paper presents data for the estimation of obesity levels in individuals from the countries of Mexico, Peru and Colombia, based on their eating habits and physical condition. The data contains 17 attributes and 2111 records, the records are labeled with the class variable NObesity (Obesity Level), that allows classification of the data using the values of Insufficient Weight, Normal Weight, Overweight Level I, Overweight Level II, Obesity Type I, Obesity Type II and Obesity Type III.

77% of the data was generated synthetically using the Weka tool and the SMOTE filter, 23% of the data was collected directly from users through a web platform. This data can be used to generate intelligent computational tools to identify the obesity level of an individual and to build recommender systems that monitor obesity levels. 

eating habits attr:
- FAVC => Frequent consumption of high caloric food

- FCVC => Frequency of consumption of vegetables

- NCP => Number of main meals

- CAEC => Consumption of food between meals

- CH2O => Consumption of water daily

- CALC => Consumption of alcohol

physical attr:
- SCC => Calories consumption monitoring

- FAF => Physical activity frequency

- TUE => Time using technology devices

- MTRANS => Transportation used

other attr:

- GENDER

- AGE

- HEIGHT

- WEIGHT

In [None]:
# import packages
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import altair as alt

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the raw obesity dataset into `df` and display it.
# NOTE(review): the relative path looks like the Kaggle input layout — confirm before running elsewhere.
df = pd.read_csv('../input/obesity-levels/ObesityDataSet_raw_and_data_sinthetic.csv')
df

In [None]:
# Summary of the raw frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Rename the demographic, family-history and target columns to upper-case names
# so they follow the same convention as the remaining attribute columns.
column_renames = {
    "Gender": "GENDER",
    "Age": "AGE",
    "Height": "HEIGHT",
    "Weight": "WEIGHT",
    "family_history_with_overweight": "HIST_OVERWEIGHT",
    "NObeyesdad": "OBESITY_LEVEL",
}
df = df.rename(columns=column_renames)

In [None]:
# Report how many distinct values each column holds.
print("   # of unique values for each column")
print("***************************************")
for column, n_unique in df.nunique().items():
    print(f"{column} --> {n_unique}")
    print("-------------------------")

In [None]:
# Convert AGE from float to int (the cast discards any fractional part of the age).
df["AGE"] = df["AGE"].astype(int)

In [None]:
# Class distribution of the target attribute, used to judge how balanced the classes are.
# Style and figure size are applied in ONE call: sns.set() re-applies seaborn's default
# style, so calling it after sns.set_style("dark") would undo the "dark" style.
sns.set(style="dark", rc={'figure.figsize': (16, 8)})
sns.countplot(x="OBESITY_LEVEL", data=df,
              # request one colour per class so that no two obesity levels share a colour
              palette=sns.cubehelix_palette(df["OBESITY_LEVEL"].nunique()),
              saturation=10).set(title='Obesity Levels Distribution')

==> 350 instances belong to "obesity_type_1"; more than 250 instances belong to the "insufficient_weight" class.

==> We can say that the dataset is "balanced", which is really important for the learning phase later :)

In [None]:
# Number of participants per (integer) age.
sns.set_style("dark")
age_palette = sns.cubehelix_palette(start=.5, rot=-.75)
age_ax = sns.countplot(data=df, x="AGE", palette=age_palette, saturation=1)
age_ax.set(title='Age distribution of the dataset')

==> We can see that mostly young people took part in the survey.

==> Top 3 ages: 21, 18, 19

In [None]:
# Obesity-level counts split by gender; the legend is moved outside the axes so it
# does not cover the bars.
sns.set_style("dark")
gender_palette = sns.dark_palette("#88d", 8)
gender_ax = sns.countplot(data=df, x="GENDER", hue="OBESITY_LEVEL",
                          palette=gender_palette, saturation=10)
gender_ax.set(title='Gender distribution of Obesity Levels')

plt.legend(loc='upper right', bbox_to_anchor=(1.2, 1.))

==> Females do not have "obesity_type_2" ; males do not have "obesity_type_3"

In [None]:
# One panel per SMOKE value: obesity-level counts for each gender.
sns.catplot(
    data=df,
    kind="count",
    x="GENDER",
    hue="OBESITY_LEVEL",
    col="SMOKE",
    palette="rocket",
    height=6,
    aspect=.8,
)


In [None]:
# One panel per alcohol-consumption (CALC) category: obesity-level counts for each gender.
sns.catplot(
    data=df,
    kind="count",
    x="GENDER",
    hue="OBESITY_LEVEL",
    col="CALC",
    palette="dark:salmon_r",
    height=6,
    aspect=.8,
)

In [None]:
# Overall usage of each transportation mode.
# The theme is configured in a single call. Only the figure size is overridden:
# sns.set() re-applies label sizes, colours and grid settings from its context/style
# presets, so overriding those in a separate earlier sns.set(rc=...) call has no effect.
sns.set(style="dark", font_scale=1, rc={'figure.figsize': (12.0, 6)})
sns.countplot(x="MTRANS", data=df, palette=sns.light_palette("salmon"),
              saturation=10, edgecolor=(0,0,0), linewidth=2).set(title='Transportation Used')

In [None]:
# Transportation mode split by gender.
# The theme is configured in a single call. Only the figure size is overridden:
# sns.set() re-applies label sizes, colours and grid settings from its context/style
# presets, so overriding those in a separate earlier sns.set(rc=...) call has no effect.
sns.set(style="dark", font_scale=1, rc={'figure.figsize': (12, 6)})
sns.countplot(x="MTRANS", data=df, palette=sns.diverging_palette(260, 20),
              saturation=10, edgecolor=(0,0,0), linewidth=2, hue="GENDER").set(title='Transportation Used ~ Gender')

In [None]:
# Display the frame after renaming the columns and casting AGE.
df

In [None]:
# Family history of overweight split by gender.
# The theme is configured in a single call. Only the figure size is overridden:
# sns.set() re-applies label sizes, colours and grid settings from its context/style
# presets, so overriding those in a separate earlier sns.set(rc=...) call has no effect.
sns.set(style="dark", font_scale=1, rc={'figure.figsize': (7, 7)})
sns.countplot(x="HIST_OVERWEIGHT", data=df, palette=sns.diverging_palette(360, 10),
              saturation=10, edgecolor=(0,0,0), linewidth=2, hue="GENDER").set(title='Family History ~ Gender')

In [None]:
# One panel per obesity level: family-history counts for each gender.
sns.catplot(
    data=df,
    kind="count",
    x="GENDER",
    hue="HIST_OVERWEIGHT",
    col="OBESITY_LEVEL",
    palette="ch:s= -3.89, r= -9.2",
    height=6,
    aspect=.8,
)

In [None]:
# Frequent consumption of high-caloric food (FAVC) split by gender.
# The theme is configured in a single call. Only the figure size is overridden:
# sns.set() re-applies label sizes, colours and grid settings from its context/style
# presets, so overriding those in a separate earlier sns.set(rc=...) call has no effect.
sns.set(style="dark", font_scale=1, rc={'figure.figsize': (12, 6)})
sns.countplot(x="FAVC", data=df, palette=sns.color_palette("YlOrBr", 3),
              saturation=10, edgecolor=(0,0,0), linewidth=2, hue="GENDER").set(title='Frequent consumption of High Caloric Food ~ Gender')

In [None]:
# library
import matplotlib.pyplot as plt
from palettable.colorbrewer.qualitative import Pastel2_4

# Donut chart of the share of each "food between meals" (CAEC) category.
names = list(df["CAEC"].unique())
# Percentage per category, computed once and then looked up in the order of `names`.
caec_share = df["CAEC"].value_counts(normalize=True) * 100
sizes = [caec_share[unique_class] for unique_class in names]
colors = Pastel2_4.hex_colors
# One (non-exploded) entry per slice, so the tuple always matches the number of categories.
explode = (0,) * len(names)

plt.pie(sizes, explode=explode, labels=names, colors=colors,
        autopct='%1.1f%%', shadow=True)

# draw a white circle at the centre of the pie to make it look like a donut
centre_circle = plt.Circle((0,0), 0.75, color='grey', fc='white',linewidth=1.25)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

# Set aspect ratio to be equal so that pie is drawn as a circle.
plt.axis('equal')
plt.show()

In [None]:
#Codes from Gabriel Preda

def plot_count(feature, title, df, size=1):
    """Draw a count plot of the 20 most frequent values of ``df[feature]``.

    Each bar is annotated with its share of ALL rows of ``df`` (not only of the
    20 plotted values), formatted as a percentage.

    Parameters
    ----------
    feature : str
        Name of the column to count.
    title : str
        Accepted for interface compatibility; no title is drawn.
    df : pandas.DataFrame
        Frame holding ``feature``.
    size : int or float, default 1
        Width multiplier: the figure is ``4 * size`` wide and 4 high. For
        ``size > 2`` the x tick labels are rotated and shrunk.
    """
    f, ax = plt.subplots(1, 1, figsize=(4*size, 4))
    total = float(len(df))
    # Pass the column by keyword and bind the plot to `ax`: a Series passed positionally
    # is read as `data` (not `x`) by newer seaborn releases.
    sns.countplot(x=feature, data=df, order=df[feature].value_counts().index[:20],
                  palette='crest', ax=ax)
    if size > 2:
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # label centred horizontally on the bar, slightly above its top
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 3,
                '{:1.2f}%'.format(100 * height / total),
                ha="center")
    plt.show()

In [None]:
# Most frequent HEIGHT values with their share of all rows (size=4 gives a 16-wide figure).
plot_count("HEIGHT", "HEIGHT", df, 4)

In [None]:
# Interval (brush) selection shared by the two charts below.
# NOTE(review): `.properties(selection=...)` may not be accepted by newer Altair releases — confirm against the installed version.
interval = alt.selection_interval()

# Point chart of alcohol consumption (CALC) against obesity level; points inside the
# selection are coloured by SMOKE, all others are drawn light gray.
points = alt.Chart(df).mark_point().encode(
  x='OBESITY_LEVEL',
  y='CALC',
  color=alt.condition(interval, 'SMOKE', alt.value('lightgray'))
).properties(
  selection=interval
)

# Bar chart counting SMOKE values, restricted to the rows inside the selection.
histogram = alt.Chart(df).mark_bar().encode(
  x='count()',
  y='SMOKE',
  color='SMOKE'
).transform_filter(interval)

# Stack the two charts vertically.
points & histogram

In [None]:
# Pivot table: mean of every numeric column per (obesity level, transport, smoker) group.
# The numeric columns are selected explicitly because averaging the remaining string
# columns is not possible and raises an error on recent pandas versions.
numeric_columns = df.select_dtypes(include="number").columns.tolist()
df.pivot_table(index=['OBESITY_LEVEL', 'MTRANS', 'SMOKE'], values=numeric_columns, aggfunc="mean")

In [None]:
# Distribution of each categorical feature for two contrasting target classes
import matplotlib

plt.figure(figsize=(32, 32))
matplotlib.rc('axes', titlesize=24)  # size of the per-panel titles

cat_feature_col = ["FAVC", "CAEC", "CALC", "SCC", "MTRANS", "GENDER"]
# (target class, bar colour, legend label), drawn in this order on every panel
compared_levels = (
    ("Obesity_Type_III", 'darkkhaki', 'OBESITY_LEVEL = Obesity_Type_III'),
    ("Normal_Weight", 'olive', 'OBESITY_LEVEL = Normal_Weight'),
)
for panel, column in enumerate(cat_feature_col, 1):
    plt.subplot(3, 3, panel)
    for level, colour, legend_label in compared_levels:
        level_rows = df[df["OBESITY_LEVEL"] == level]
        level_rows[column].hist(bins=20, color=colour, label=legend_label, alpha=1)
    plt.legend(fontsize='medium')
    plt.title(column)

In [None]:
# Heatmap of the pairwise correlation coefficients.

# Correlation is only defined for numeric columns; select them explicitly because
# calling .corr() on a frame that still contains string columns raises an error on
# recent pandas versions.
df_corr = df.select_dtypes(include="number").corr()

# correlation matrix
sns.set(font_scale=0.8)
plt.figure(figsize=(12,8))
sns.heatmap(df_corr, annot=True, fmt=".4f", vmin=-1, vmax=1, linewidths=.5,
            cmap=sns.color_palette("vlag", as_cmap=True))

plt.show()

In [None]:
!pip install ppscore

In [None]:
import seaborn as sns
import ppscore as pps

# Score for every ordered pair of columns, reshaped so that the 'x' column of the
# result spans the heatmap columns and the 'y' column spans its rows.
pps_long = pps.matrix(df)
matrix_df = pps_long.pivot(columns='x', index='y', values='ppscore')

sns.set(font_scale=0.8)
plt.figure(figsize=(16, 12))
pps_cmap = sns.color_palette("YlOrBr", as_cmap=True)
sns.heatmap(matrix_df, annot=True, cmap=pps_cmap)
plt.show()

# ML 

In [None]:
# Outlier detection on the continuous features (1.5 * IQR rule).
cont_feature_col = ["FCVC", "NCP", "CH2O", "FAF", "TUE", "HEIGHT", "WEIGHT", "AGE"]
cont_df = df[cont_feature_col]

# inter-quartile range of every continuous column
q1 = cont_df.quantile(.25)
q3 = cont_df.quantile(.75)
IQR = q3 - q1

# Boolean frame, True where a value lies outside [q1 - 1.5*IQR, q3 + 1.5*IQR]
outliers_df = (cont_df < (q1 - 1.5 * IQR)) | (cont_df > (q3 + 1.5 * IQR))

# Count and percentage of outliers per column. Summing / averaging the boolean frame
# yields 0 for columns without outliers, so no exception handling is needed.
total_outlier = outliers_df.sum().tolist()
outlier_list = (outliers_df.mean() * 100).tolist()

# summary table, one row per continuous feature
outlier_df = pd.DataFrame(
    {'total': total_outlier, 'outlier(%)': outlier_list},
    index=pd.Index(outliers_df.columns, name='name of the column'),
)
outlier_df

In [None]:
# Keep the continuous values that are not flagged as outliers; flagged cells become NaN.
df_cont = df.loc[:, cont_feature_col]
out_nan_df = df_cont.where(~outliers_df)
out_nan_df

In [None]:
# Replace the NaN placeholders left by the outlier mask with the mean of the
# corresponding column of `df` (a mean computed over all rows of that column).
column_means = df[cont_feature_col].mean()
out_nan_df = out_nan_df.fillna(column_means)

In [None]:
# Put the outlier-treated continuous columns back next to the untouched remaining columns.
remaining_columns = [name for name in df.columns if name not in cont_feature_col]
df_only_cat = df[remaining_columns]
df_final = pd.concat([out_nan_df, df_only_cat], axis=1)
df_final

In [None]:
# Summary statistics of the numeric columns after the outlier treatment.
df_final.describe()

In [None]:
# Set aside a 400-row test set; all remaining rows form the training set.
# random_state makes the sample (and therefore every downstream result) reproducible
# across kernel restarts.
df_test = df_final.sample(n=400, random_state=42)
ind = df_test.index

ind_list = ind.to_list()
df_train = df_final.drop(ind_list)

print("train ==> ", df_train.shape)
print("test ==> ", df_test.shape)

In [None]:
# Renumber the rows of both splits from 0; the previous row labels are kept in a new
# 'index' column.
df_train = df_train.reset_index()
df_test = df_test.reset_index()

In [None]:
# Give both splits a clean 0..n-1 index and remove the helper columns ('index',
# 'level_0') that reset_index() calls can leave behind. drop=True avoids creating yet
# another helper column and errors='ignore' tolerates columns that are already gone,
# so this cell can be re-run without raising a KeyError.
df_train = df_train.reset_index(drop=True).drop(columns=['index', 'level_0'], errors='ignore')
df_test = df_test.reset_index(drop=True).drop(columns=['index', 'level_0'], errors='ignore')

In [None]:
# Display the training split.
df_train

In [None]:
# Display the 400-row test split.
df_test

In [None]:
# Encode the categorical columns (target included) of the training split as ordinal codes.
ord_feature_list = ["GENDER", "HIST_OVERWEIGHT", "SMOKE", "MTRANS", "SCC", "CALC", "CAEC", "FAVC", "OBESITY_LEVEL"]
df_ord = df_train.loc[:, ord_feature_list]
col_names_list = df_ord.columns

# fit the encoder and transform in one step
enc = OrdinalEncoder()
df_ord_arr = enc.fit_transform(df_ord)

# wrap the encoded array in a frame that carries the original column names
encoded_cat_df = pd.DataFrame(data=df_ord_arr, columns=col_names_list)

In [None]:
# Combine the encoded categorical columns with the continuous columns of the training split.
cont_feature_list = ["FCVC", "NCP", "CH2O", "FAF", "TUE", "HEIGHT", "WEIGHT", "AGE"]
df_cont = df_train.loc[:, cont_feature_list]

training_parts = [encoded_cat_df, df_cont]
train_df_final = pd.concat(training_parts, axis=1)

In [None]:
# Separate the target from the features
y_imp = train_df_final.loc[:, 'OBESITY_LEVEL'].values
X_imp = train_df_final.drop('OBESITY_LEVEL', axis=1)


# Feature importances from a shallow random forest.
# random_state is fixed so that the importances (and the chart) are identical on every run.
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
rf_clf.fit(X_imp, y_imp)

# pie chart of the importances, largest first
pd.Series(rf_clf.feature_importances_, index = X_imp.columns).nlargest(24).plot(kind = 'pie',
                                                                                figsize = (8, 8),
                                                                                title = 'Feature importance from RandomForest', colormap='twilight', fontsize=10)

In [None]:
# Separate features and target, then create a stratified 80/20 split
from sklearn.model_selection import train_test_split

target_column = 'OBESITY_LEVEL'
y = train_df_final.loc[:, target_column].values
X = train_df_final.drop(target_column, axis=1)

# stratify on y so both parts keep the class proportions; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=100,
)

In [None]:
# Min-max normalisation: the scaler is fitted on X_train only and then applied to both parts.

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# fit on the training part and transform it in one step
X_train_normalized_arr = scaler.fit_transform(X_train)
X_train_normalized_df = pd.DataFrame(X_train_normalized_arr, columns=list(X_train.columns))

# the test part reuses the ranges learned from the training part
X_test_normalized_arr = scaler.transform(X_test)
X_test_normalized_df = pd.DataFrame(X_test_normalized_arr, columns=list(X_test.columns))

In [None]:
# Display the normalised training features.
X_train_normalized_df

In [None]:
# Display the normalised test features.
X_test_normalized_df

In [None]:
# Sanity check: features and labels must have the same number of rows in each part.
# (The second label reads "y_test": the value printed next to it is len(y_test).)
print("x_train: ", len(X_train_normalized_df), " ---  y_train: ", len(y_train))
print("x_test: ", len(X_test_normalized_df), " ---  y_test: ", len(y_test))

In [None]:
#import necessary libraries
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, classification_report, roc_auc_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, CategoricalNB
from sklearn.svm import LinearSVC,SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Cross-validation splitter passed to every grid search: 10 stratified shuffle splits,
# each with test_size=0.2; random_state fixes the splits.
cv = StratifiedShuffleSplit(n_splits=10, random_state = 42, test_size=0.2)

In [None]:
# display test scores and return result string and indexes of false samples
def display_test_scores(test, pred):
    """Build a text report of classification scores for a set of predictions.

    The report contains the accuracy, the confusion matrix, the false positives
    and false negatives of EVERY class (so it is valid for multi-class targets,
    not only binary ones) and sklearn's classification report.

    Parameters
    ----------
    test : array-like
        True labels.
    pred : array-like
        Predicted labels, same length as ``test``.

    Returns
    -------
    str_out : str
        The formatted report.
    false_indexes : tuple of numpy.ndarray
        Result of ``np.where(test != pred)``: positions of the misclassified samples.
    """
    def _percentage(part, whole):
        # share of `part` in `whole` as a percentage; 0 when the class never occurs
        return 100.0 * part / whole if whole else 0.0

    # arrays make the element-wise comparison below work for plain lists as well
    test = np.asarray(test)
    pred = np.asarray(pred)
    rule = "---------------------------------------------------\n"

    parts = ["\n\n   TEST SCORES\n",
             "===================================================================\n"]

    # accuracy
    accuracy = accuracy_score(test, pred)
    parts.append("ACCURACY: {:.4f}\n".format(accuracy))
    parts.append("\n")
    parts.append(rule)

    # confusion matrix (rows: true class, columns: predicted class)
    conf_mat = confusion_matrix(test, pred)
    parts.append("CONFUSION MATRIX:\n")
    parts.append("{}".format(conf_mat))
    parts.append("\n\n")
    parts.append(rule)

    # Per-class error counts. confusion_matrix orders its rows/columns by the sorted
    # labels that occur in either array, which is what np.unique reproduces here.
    labels = np.unique(np.concatenate([test, pred]))
    correct = np.diag(conf_mat)
    predicted_totals = conf_mat.sum(axis=0)  # column sums: samples predicted as each class
    actual_totals = conf_mat.sum(axis=1)     # row sums: samples truly in each class

    parts.append("FALSE POSITIVES (per class):\n")
    for label, fp, total in zip(labels, predicted_totals - correct, predicted_totals):
        parts.append("class {}: {} out of {} samples predicted as this class ({:.4f}%)\n".format(
            label, fp, total, _percentage(fp, total)))
    parts.append("\n")
    parts.append(rule)

    parts.append("FALSE NEGATIVES (per class):\n")
    for label, fn, total in zip(labels, actual_totals - correct, actual_totals):
        parts.append("class {}: {} out of {} samples of this class ({:.4f}%)\n".format(
            label, fn, total, _percentage(fn, total)))
    parts.append("\n")
    parts.append(rule)

    # precision / recall / F1 per class
    parts.append("PRECISION, RECALL, F1 scores:\n\n")
    parts.append("{}".format(classification_report(test, pred)))

    str_out = "".join(parts)
    false_indexes = np.where(test != pred)
    return str_out, false_indexes

# Classifier-1: Decision Tree CART

In [None]:
# Decision tree (CART) tuned with a cross-validated grid search
cart = DecisionTreeClassifier(random_state=0)

# candidate hyper-parameter values
parameters = dict(
    criterion=["gini", "entropy"],
    splitter=["best", "random"],
    class_weight=[None, "balanced"],
)

# exhaustive search over the grid; fit() returns the fitted search object
grid_1 = GridSearchCV(estimator=cart, param_grid=parameters, cv=cv, n_jobs=-1).fit(
    X_train_normalized_df, y_train
)

# best combination and its mean cross-validation score
print("The best parameters are %s with a score of %0.4f"
      % (grid_1.best_params_, grid_1.best_score_))

# predictions of the refitted best model on the held-out part
y_pred = grid_1.predict(X_test_normalized_df)

# text report of the scores on the held-out part
results, false = display_test_scores(y_test, y_pred)
print(results)

# Classifier-2: Naive-Bayes

In [None]:
# Three Naive Bayes variants, each evaluated with the same protocol
nb_list = [GaussianNB(), MultinomialNB(), ComplementNB()]

for nb in nb_list:
    print("-----------", str(nb), "--------------")

    # nothing is tuned: the empty grid means the estimator is used with its own settings
    parameters = dict()

    # cross-validated fit; fit() returns the fitted search object
    grid_2 = GridSearchCV(estimator=nb, param_grid=parameters, cv=cv, n_jobs=-1).fit(
        X_train_normalized_df, y_train
    )

    # mean cross-validation score
    print("The best parameters are %s with a score of %0.4f\n"
          % (grid_2.best_params_, grid_2.best_score_))

    # predictions on the held-out part
    y_pred = grid_2.predict(X_test_normalized_df)

    # text report of the scores on the held-out part
    results, false = display_test_scores(y_test, y_pred)
    print(results)

# Classifier-3: SVM

In [None]:
# Support vector classifier tuned with a cross-validated grid search
svm = SVC(random_state=0, tol=1e-5)

# candidate hyper-parameter values
parameters = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[0.01, 0.1, 1, 10, 100],
    max_iter=[100, 1000, 5000],
)

# exhaustive search over the grid; fit() returns the fitted search object
grid_3 = GridSearchCV(estimator=svm, param_grid=parameters, cv=cv, n_jobs=-1).fit(
    X_train_normalized_df, y_train
)

# best combination and its mean cross-validation score
print("The best parameters are %s with a score of %0.4f\n"
      % (grid_3.best_params_, grid_3.best_score_))

# predictions of the refitted best model on the held-out part
y_pred = grid_3.predict(X_test_normalized_df)

# text report of the scores on the held-out part
results, false = display_test_scores(y_test, y_pred)
print(results)

# Classifier-4: kNN

In [None]:
# k-nearest-neighbours classifier tuned with a cross-validated grid search
knn = KNeighborsClassifier()

# candidate hyper-parameter values
parameters = dict(
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    n_neighbors=[5, 15, 25],
)

# exhaustive search over the grid; fit() returns the fitted search object
grid_4 = GridSearchCV(estimator=knn, param_grid=parameters, cv=cv, n_jobs=-1).fit(
    X_train_normalized_df, y_train
)

# best combination and its mean cross-validation score
print("The best parameters are %s with a score of %0.4f\n"
      % (grid_4.best_params_, grid_4.best_score_))

# predictions of the refitted best model on the held-out part
y_pred = grid_4.predict(X_test_normalized_df)

# text report of the scores on the held-out part
results, false = display_test_scores(y_test, y_pred)
print(results)

# Classifier-5: Logistic Regression

In [None]:
# Logistic regression tuned with a cross-validated grid search
logit = LogisticRegression(random_state=0)

# Candidate hyper-parameter values. The grid is split by penalty because the default
# 'lbfgs' solver does not support the L1 penalty; the L1 candidates therefore use the
# 'saga' solver, which supports it.
shared_grid = {
                "C": [0.01, 0.1, 1, 10, 100],
                "max_iter": [100,1000,5000],
             }
parameters = [
                {"penalty": ['l2'], **shared_grid},
                {"penalty": ['l1'], "solver": ['saga'], **shared_grid},
             ]

# grid search for parameters
grid_5 = GridSearchCV(estimator=logit, param_grid=parameters, cv=cv, n_jobs=-1)
grid_5.fit(X_train_normalized_df, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f\n"
      % (grid_5.best_params_, grid_5.best_score_))

# prediction results
y_pred = grid_5.predict(X_test_normalized_df)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

# Classifier-6:RF

In [None]:
# Random forest tuned with a cross-validated grid search
rf = RandomForestClassifier(random_state=0)

# Candidate hyper-parameter values.
# `bootstrap` takes real booleans: the strings "True"/"False" are both truthy and are
# rejected by scikit-learn releases that validate parameters.
# `max_samples` only applies to bootstrap sampling, so the no-bootstrap candidates
# keep it at None.
shared_grid = {
                "max_features": [None, "sqrt", "log2"],
                "class_weight": [None, "balanced", "balanced_subsample"],
                "n_estimators": [10, 100, 200]
}
parameters = [
                {"bootstrap": [True], "max_samples": [None, 0.3, 0.5, 0.7, 0.9], **shared_grid},
                {"bootstrap": [False], "max_samples": [None], **shared_grid},
]

# grid search for parameters
grid_6 = GridSearchCV(estimator=rf, param_grid=parameters, cv=cv, n_jobs=-1)
grid_6.fit(X_train_normalized_df, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid_6.best_params_, grid_6.best_score_))

# prediction results
y_pred = grid_6.predict(X_test_normalized_df)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

# Classifier-7: Bagging Meta Estimator

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging meta-estimator tuned with a cross-validated grid search
bag = BaggingClassifier(random_state=0)

# Candidate hyper-parameter values.
# `bootstrap` takes real booleans: the strings "True"/"False" are both truthy and are
# rejected by scikit-learn releases that validate parameters.
# `max_features` ends with the fraction 1.0 (all features); the integer 1 would mean
# "exactly one feature", which does not fit the 0.3 .. 0.9 series of fractions.
parameters = {
                "bootstrap": [True, False],
                "max_features": [0.3, 0.5, 0.7, 0.9, 1.0],
                "max_samples": [0.3, 0.5, 0.7, 0.9],
                "n_estimators": [10, 100, 200]
}

# grid search for parameters
grid_7 = GridSearchCV(estimator=bag, param_grid=parameters, cv=cv, n_jobs=-1)
grid_7.fit(X_train_normalized_df, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid_7.best_params_, grid_7.best_score_))

# prediction results
y_pred = grid_7.predict(X_test_normalized_df)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

# Classifier-8: ExtraTreesClassifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extremely randomised trees tuned with a cross-validated grid search
extra = ExtraTreesClassifier(random_state=0)

# Candidate hyper-parameter values.
# `bootstrap` takes real booleans: the strings "True"/"False" are both truthy and are
# rejected by scikit-learn releases that validate parameters.
# `max_samples` only applies to bootstrap sampling, so the no-bootstrap candidates
# keep it at None.
shared_grid = {
                "max_features": [None, "sqrt", "log2"],
                "class_weight": [None, "balanced", "balanced_subsample"],
                "n_estimators": [10, 100, 200]
}
parameters = [
                {"bootstrap": [True], "max_samples": [None, 0.3, 0.5, 0.7, 0.9], **shared_grid},
                {"bootstrap": [False], "max_samples": [None], **shared_grid},
]

# grid search for parameters
grid_8 = GridSearchCV(estimator=extra, param_grid=parameters, cv=cv, n_jobs=-1)
grid_8.fit(X_train_normalized_df, y_train)

# print best scores
print("The best parameters are %s with a score of %0.4f"
      % (grid_8.best_params_, grid_8.best_score_))

# prediction results
y_pred = grid_8.predict(X_test_normalized_df)

# print accuracy metrics
results, false = display_test_scores(y_test, y_pred)
print(results)

# 