In [None]:
# import library
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Keep the notebook output free of library warning noise
warnings.filterwarnings(action='ignore')

# Let text cells show up to 250 characters before pandas truncates them
pd.options.display.max_colwidth = 250

_________________________

# Dataset

In [None]:
# Import Dataset
# Read the obesity CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer="obesity_data.csv")

# Preview the first two rows, two randomly sampled rows and the last two rows
for preview in (df.head(2), df.sample(2), df.tail(2)):
    display(preview)

In [None]:
# Dataset Information
# Per-column overview: dtype, cardinality, missing values and the distinct values seen.
# The duplicate-row count is a frame-wide figure, so it is computed once and
# broadcast to every row instead of being recomputed per column.
n_duplicate_rows = df.duplicated().sum()
column_overview = pd.DataFrame({
    'feature': df.columns.values,
    'dtypes': df.dtypes.values,
    'n_unique': df.nunique().values,
    'n_nan': df.isna().sum().values,
    'n_dupe': n_duplicate_rows,
    'sample_unique': [df[col].unique() for col in df.columns]
})
display(df.describe(), column_overview)

# Missing-value summary
missing_mask = df.isna()
columns_with_missing = df.columns[missing_mask.any()].tolist()
clean_columns = df.columns[~missing_mask.any()].tolist()
n_missing_cells = missing_mask.sum().sum()
# A row counts once no matter how many of its cells are missing
n_rows_with_missing = missing_mask.any(axis=1).sum()
# df.size is rows * columns; it replaces np.product, which NumPy 2.0 removed
pct_missing = (n_missing_cells / df.size) * 100 if df.size else 0.0

print(f'''
      Columns that having missing value\t= {len(columns_with_missing)} : {columns_with_missing}
      Columns that are clean\t\t= {len(clean_columns)} : {clean_columns}
      Columns\t\t\t\t= {df.shape[1]}
      Rows that having missing value\t= {n_rows_with_missing}
      Rows\t\t\t\t= {df.shape[0]}
      Percentage of missing value\t= {pct_missing}
      ''')

In [None]:
# Obesity Distribution
# One KDE panel per numeric column, with one curve (and a dashed median line)
# per obesity category.
numeric_columns = df.select_dtypes(include=np.number).columns.tolist()

# Grid layout: at most three panels per row; ceiling division yields exactly
# the number of rows needed (the max(1, ...) guards keep subplots() valid
# even when there are no numeric columns).
num_cols = max(1, min(len(numeric_columns), 3))
num_rows = max(1, -(-len(numeric_columns) // num_cols))

# squeeze=False always returns a 2-D array, so flatten() also works for a 1x1 grid
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(15, 4 * num_rows), squeeze=False)
axes = axes.flatten()

# Hoisted out of the loops: the categories and their colours do not change per panel.
# n_colors makes the palette cycle instead of raising if there are many categories.
obesity_levels = df["ObesityCategory"].unique()
palette = sns.color_palette(n_colors=len(obesity_levels))

for ax, col in zip(axes, numeric_columns):
    for obesity_level, color in zip(obesity_levels, palette):
        subset_data = df[df["ObesityCategory"] == obesity_level]
        # The label is what gives ax.legend() something to show
        sns.kdeplot(data=subset_data, x=col, fill=True, ax=ax, color=color, label=str(obesity_level))

        middle_value = subset_data[col].median()
        ax.axvline(middle_value, linestyle='dashed', linewidth=2, color=color)

    ax.set_title(f"{col} Distribution")
    ax.legend()

# Remove empty subplots if any (the trailing panels that received no column)
for unused_ax in axes[len(numeric_columns):]:
    fig.delaxes(unused_ax)

plt.suptitle("Obesity Distribution", y=1.02, fontsize=15)
plt.tight_layout()
plt.show()


In [None]:
# Outliers
def find_anomalies(data, column_name):
    """Summarise the IQR-based outliers of one numeric series.

    Values strictly below ``Q1 - 1.5 * IQR`` or strictly above
    ``Q3 + 1.5 * IQR`` are counted as outliers.

    Parameters
    ----------
    data : pandas.Series
        The values to inspect.
    column_name : str
        Name stored in the ``Column`` field of the result.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``Column``, ``IQR``,
        ``Lower Bound``, ``Upper Bound`` and ``Outliers``.
    """
    lower_quartile = data.quantile(0.25)
    upper_quartile = data.quantile(0.75)
    iqr = upper_quartile - lower_quartile
    whisker = iqr * 1.5
    low_fence = lower_quartile - whisker
    high_fence = upper_quartile + whisker

    # Strict comparisons: values sitting exactly on a fence are not outliers
    is_outlier = data.lt(low_fence) | data.gt(high_fence)

    summary = {
        'Column': [column_name],
        'IQR': [iqr],
        'Lower Bound': [low_fence],
        'Upper Bound': [high_fence],
        'Outliers': [is_outlier.sum()],
    }
    return pd.DataFrame(summary)

# Build the per-column outlier summary in one concat instead of growing a
# frame inside the loop.
outlier_summaries = [find_anomalies(df[column], column) for column in numeric_columns]
df_outliers = (
    pd.concat(outlier_summaries, ignore_index=True)
    if outlier_summaries
    else pd.DataFrame(columns=['Column', 'IQR', 'Lower Bound', 'Upper Bound', 'Outliers'])
)

# Remove Outliers
# Iterate the summary row by row so each column stays paired with its own
# bounds. Calling .unique() on the bound columns would drop repeated values
# and shift the pairing whenever two columns share a bound.
# NOTE: df is rebound to the filtered frame, so re-running this cell computes
# new bounds on already-filtered data.
rows_before = len(df)
for column, low, up in zip(df_outliers['Column'], df_outliers['Lower Bound'], df_outliers['Upper Bound']):
    df = df[(df[column] >= low) & (df[column] <= up)]
rows_removed = rows_before - len(df)

display(df_outliers)
# Report the rows actually dropped; summing the per-column counts would count
# a row twice when it is an outlier in more than one column.
print(f"Numbers of Outliers Removed : {rows_removed}")

In [None]:
# Gender Obesity Distribution
# Two pies (one per gender) showing the share of each obesity category.
male_counts = df[df['Gender'] == 'Male']['ObesityCategory'].value_counts()
female_counts = df[df['Gender'] == 'Female']['ObesityCategory'].value_counts()
gender_totals = df['Gender'].value_counts()
gen_count = gender_totals.reset_index(name="counts")
data = [male_counts, female_counts]
titles = ['Male', 'Female']

# Create Pie Plot
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

for ax, counts, title in zip(axes, data, titles):
    # explode must have one entry per wedge, so size it from the data rather
    # than assuming exactly four categories
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=90, explode=[0.05] * len(counts),
           wedgeprops={'edgecolor': 'black', 'linewidth': 1, 'antialiased': True})
    ax.set_title(title)

# Create a legend outside the subplots
# Take label and count from the same value_counts result: unique() is in
# order of appearance while value_counts() is sorted by frequency, so zipping
# the two could attach a count to the wrong gender.
legend_labels = [f'{gender}: {count}' for gender, count in gender_totals.items()]
legend = plt.legend(legend_labels, title='Gender Counts', bbox_to_anchor=(1, 1), labelspacing=1, handlelength=0, handleheight=0)

plt.suptitle("Gender Obesity Categories", fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# Obesity Category Comparison
# Single pie with the share of every obesity category in the cleaned frame.
Category_counts = df['ObesityCategory'].value_counts()

# explode needs one offset per wedge; derive it from the data so the plot
# does not raise when the number of categories is not exactly four
plt.pie(Category_counts, labels=Category_counts.index, autopct='%1.2f%%',
        explode=[0.05] * len(Category_counts), startangle=90,
        wedgeprops={'edgecolor': 'black', 'linewidth': 1, 'antialiased': True})

legend_labels = [f'{category}: {count}' for category, count in Category_counts.items()]
plt.legend(legend_labels, title='Category Counts', bbox_to_anchor=(1, 1))

plt.suptitle("Obesity Category Comparation", fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# Encode
from sklearn.preprocessing import LabelEncoder

# One encoder instance is re-fitted for each column in turn, so after the
# loop it holds the classes of the last column ("ObesityCategory").
label_encoder = LabelEncoder()
for categorical_col in ("Gender", "ObesityCategory"):
    df[categorical_col] = label_encoder.fit_transform(df[categorical_col])

df.sample(5)

In [None]:
# Correlation
# Pairwise correlation of all columns, drawn on an explicit axes
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", fmt=".2f", linewidths=.5, ax=corr_ax)
corr_ax.set_title("Correlation Heatmap")
plt.show()

________________

# Train Dataset

In [None]:
# Seen & Unseen
from sklearn.model_selection import train_test_split

# Set aside 5% of the rows as "unseen"; the remaining "seen" rows are used
# for training and validation.
seen, unseen = train_test_split(df, test_size=0.05, random_state=42)
print("", f"Seen = {seen.shape}", f"Unseen = {unseen.shape}", sep="\n")

In [None]:
# Train Test Split
# Separate the target from the features of the "seen" rows
target_col = "ObesityCategory"
y = seen[target_col]
X = seen.drop(columns=[target_col])

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("", f"X_train = {X_train.shape}", f"X_test = {X_test.shape}", sep="\n")

In [None]:
# Import Model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier

# Single seed shared by every estimator that accepts random_state
SEED = 42

# Linear models
lr = LogisticRegression(random_state=SEED)
ridge = RidgeClassifier(random_state=SEED)
perceptron = Perceptron(random_state=SEED)
sgd = SGDClassifier(random_state=SEED)

# Instance-based, kernel, probabilistic and discriminant models
knn = KNeighborsClassifier()
svm = SVC(random_state=SEED)
nb = GaussianNB()
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
gp = GaussianProcessClassifier(random_state=SEED)

# Trees and ensembles
dt = DecisionTreeClassifier(random_state=SEED)
rf = RandomForestClassifier(random_state=SEED)
adaboost = AdaBoostClassifier(random_state=SEED)
gbr = GradientBoostingClassifier(random_state=SEED)
xgb = XGBClassifier(random_state=SEED)
lgbm = LGBMClassifier(random_state=SEED)
cb = CatBoostClassifier(random_state=SEED, verbose=0)

# Neural network
mlp = MLPClassifier(random_state=SEED)

# Benchmark order; the display names in the benchmarking cell follow this order
models = [lr, knn, svm, dt, rf, adaboost, nb, mlp, xgb, lgbm, lda, qda, gp, ridge, perceptron, gbr, sgd, cb]

In [None]:
# Model Benchmarking
# Scores every candidate in `models` with the same 5-fold split and collects
# one result row per model in df_eval.
from sklearn.model_selection import cross_val_score, cross_validate, KFold, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score, matthews_corrcoef
from sklearn.preprocessing import label_binarize

# Display names, in the same order as `models`
model_names = [
    'Logistic Regression',
    'K-Nearest Neighbors',
    'SVC',
    'Decision Tree Classifier',
    'Random Forest Classifier',
    'AdaBoost Classifier',
    'GaussianNB',
    'MLP Classifier',
    'XGBoost Classifier',
    'LGBM Classifier',
    'Linear Discriminant Analysis',
    'Quadratic Discriminant Analysis',
    'Gaussian Process Classifier',
    'Ridge Classifier',
    'Perceptron',
    'Gradient Boosting Classifier',
    'SGD Classifier',
    'CatBoost Classifier',
]
# Fail loudly if the two lists drift apart; zip() would otherwise silently
# truncate and mislabel the results.
if len(model_names) != len(models):
    raise ValueError(f"{len(model_names)} model names given for {len(models)} models")

crossval = KFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): X / y are the full "seen" data, which includes the rows later
# held out as X_test - confirm this is intended for model selection.
benchmark_rows = []
for model_name, model in zip(model_names, models):
    # Accuracy and weighted F1 are fold means; one cross_validate call scores
    # both in a single pass, so each model is fitted once for the two metrics
    # instead of once per metric.
    fold_scores = cross_validate(
        model,
        X,
        y,
        cv=crossval,
        scoring=['accuracy', 'f1_weighted'],
        error_score='raise'
    )

    # The remaining metrics are computed on the pooled out-of-fold predictions
    y_pred = cross_val_predict(model, X, y, cv=crossval)

    benchmark_rows.append({
        'Model': model_name,
        'Accuracy': fold_scores['test_accuracy'].mean(),
        'Prec.': precision_score(y, y_pred, average="weighted"),
        'Recall': recall_score(y, y_pred, average="weighted"),
        'F1': fold_scores['test_f1_weighted'].mean(),
        'Kappa': cohen_kappa_score(y, y_pred),
        'MCC': matthews_corrcoef(y, y_pred),
    })

df_eval = pd.DataFrame(
    benchmark_rows,
    columns=['Model', 'Accuracy', 'Prec.', 'Recall', 'F1', 'Kappa', 'MCC']
)
df_eval.sort_values(by="Accuracy", ascending=False)

In [None]:
# Compare ROC-AUC Score & Classification Report
# classification_report is not imported anywhere else in the notebook, so it
# is imported here to keep the cell runnable on a fresh kernel.
from sklearn.metrics import classification_report, roc_auc_score

lgbm.fit(X_train, y_train)

y_pred_default = lgbm.predict(X_test)
# predict_proba already returns class probabilities, which is what
# roc_auc_score expects for multi_class='ovr'. A second softmax on top of
# them would distort the scores, so they are used as they are.
y_pred_proba_default = lgbm.predict_proba(X_test)

roc_auc_default = roc_auc_score(y_test, y_pred_proba_default, multi_class='ovr')
report_default = classification_report(y_test, y_pred_default)

print('ROC AUC Score Default LGBM Classifier : ', roc_auc_default)
print('\nClassification Report Default LGBM Classifier : \n', report_default)

In [None]:
# Final Model
# Keep only the LGBM row and attach its test-split ROC-AUC. assign() returns
# a new frame, so no column is written into a filtered slice of df_eval.
df_eval = (
    df_eval.loc[df_eval["Model"] == "LGBM Classifier"]
    .assign(**{'ROC-AUC': roc_auc_default})
)
df_eval.reset_index(drop=True)

___________________

# Evaluate

In [None]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix

# label_encoder was last fitted on "ObesityCategory", so classes_[i] is the
# original name of encoded class i. Reading the names from it keeps the tick
# labels tied to the real encoding instead of a hand-written list.
class_labels = list(label_encoder.classes_)

# Passing labels= fixes the row/column order and keeps the matrix square even
# if a class does not occur in y_test.
conf_matrix = confusion_matrix(y_test, y_pred_default, labels=list(range(len(class_labels))))

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='coolwarm', cbar=False,
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)

ax.set_title('Confusion Matrix - LGBM Classifier')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')

plt.tight_layout()
plt.show()

In [None]:
# Testing Model
# Build one synthetic row per class and statistic (mean / median of every
# column within that class) and check what the fitted model predicts for it.

# One subset of the encoded frame per class code 0-3
Normal_Weight, Obese, Over_Weight, Under_Weight = (
    df[df['ObesityCategory'] == class_code] for class_code in range(4)
)
labelled_subsets = [
    ('Normal', Normal_Weight),
    ('Obese', Obese),
    ('Over', Over_Weight),
    ('Under', Under_Weight),
]
metrics = ['mean', 'median']

# For each statistic: one summary row per class, tagged "<class>_<statistic>"
result_frames = [
    pd.concat(
        [
            getattr(subset, metric)().to_frame().T.assign(label=f'{prefix}_{metric}')
            for prefix, subset in labelled_subsets
        ],
        ignore_index=True,
    )
    for metric in metrics
]

dfval = pd.concat(result_frames, ignore_index=True)
# Move the tag column to the front
dfval.insert(0, 'label', dfval.pop('label'))

print("Before (Using mean & median of every columns based on ObesityCategory) :")
display(dfval)

# Result Testing Model
summary_features = dfval.drop(columns=["ObesityCategory", "label"])
dfval["Predicted ObesityCategory"] = lgbm.predict(summary_features)
print("After :")
display(dfval)

In [None]:
# Testing Model on Unseen
# Select the training feature columns explicitly and rebind `unseen` to a new
# frame. This avoids writing into a slice produced by train_test_split, and
# the cell can be re-run: the prediction column added here is never fed back
# to the model as a feature.
unseen = unseen.assign(**{"Predicted ObesityCategory": lgbm.predict(unseen[X.columns])})
unseen

In [None]:
# Feature Importance
# Importances as reported by the fitted LightGBM model, one value per feature column
input_layer_weights = lgbm.feature_importances_
feature_weights = (
    pd.DataFrame({'Feature': X.columns, 'Importance': input_layer_weights})
    .sort_values(by='Importance', ascending=True)
)

# Horizontal bars, smallest importance first in the sorted frame
importance_ax = feature_weights.plot.barh(x='Feature', y='Importance', legend=False)
importance_ax.set_title('Feature Importance in LightGBM Model')
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')

plt.tight_layout()
plt.show()

# Conclusion

**1. Based on the modeling that has been conducted, classification analysis has been successfully performed using a machine learning approach. In this analysis, the feature "BMI" has been identified as the most influential one for classifying 'ObesityCategory'.**

**2. Based on the evaluation results of the Light Gradient Boosting Machine (LGBM) Classifier model, it is evident that the model exhibits exceptional performance in classifying the given dataset:**

- **Accuracy:** The LGBM Classifier achieves an outstanding accuracy of 99.68%, indicating that it assigns the correct class to almost every instance.

- **Precision:** With a weighted precision of 99.68%, on average (weighted by class size) 99.68% of the instances the model assigns to a class truly belong to that class, highlighting the reliability of its predictions.

- **Recall:** The model demonstrates an impressive weighted recall of 99.68%, meaning that on average (weighted by class size) it recovers 99.68% of the instances that actually belong to each class.

- **F1 Score:** The F1 Score, standing at 99.68%, emphasizes the harmonious balance between precision and recall, reinforcing the model's robustness in handling both false positives and false negatives.

- **Kappa Coefficient:** The Kappa coefficient attains a high value of 99.54%, indicating an exceptional agreement between the model's predictions and the actual classes.

- **Matthews Correlation Coefficient (MCC):** With an MCC value of 99.54%, the model exhibits a strong and reliable relationship between its predictions and the actual classes.

- **ROC-AUC:** The one-vs-rest Area Under the ROC Curve (ROC-AUC) achieves a perfect value of 100%, showcasing the model's exceptional ability to distinguish each obesity category from all the others.

**In conclusion, based on these evaluation metrics, it can be confidently asserted that the LGBM Classifier model excels in classifying the provided dataset. The exceptionally high values across accuracy, precision, recall, F1 Score, Kappa, MCC, and ROC-AUC collectively demonstrate the model's reliability and outstanding performance in handling the classification task for this specific dataset.**
