## Importing Libraries

In [1]:
from pprint import pprint
import pandas as pd
import boto3
import yaml
import os
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's "darkgrid" theme for the plots drawn after this call
sns.set_style("darkgrid")

In [None]:
def read_yaml_file(path, file):
    """Parse the YAML file ``file`` located in directory ``path``.

    Args:
        path: Directory that contains the file.
        file: Name of the YAML file inside ``path``.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Pin the encoding: without it open() falls back to the locale default,
    # which makes non-ASCII settings platform dependent.
    with open(os.path.join(path, file), encoding="utf-8") as f:
        # safe_load only builds plain Python objects; parse errors propagate
        # to the caller unchanged.
        return yaml.safe_load(f)


# Relative directory from which the credentials and settings YAML files are read
CONFIG_PATH = os.path.join("..", "src", "config")

In [None]:
# Credentials (S3 bucket / AWS keys) and general project settings
credentials_config = read_yaml_file(path=CONFIG_PATH, file="credentials.yaml")
general_settings = read_yaml_file(path=CONFIG_PATH, file="settings.yaml")

SEED = 42

# Output locations taken straight from the settings file
ARTIFACTS_OUTPUT_PATH = general_settings["ARTIFACTS_PATH"]
FEATURES_OUTPUT_PATH = general_settings["FEATURES_PATH"]

# Raw dataset location
RAW_FILE_PATH = os.path.join(
    general_settings["DATA_PATH"],
    general_settings["RAW_FILE_NAME"],
)

# The preprocessed file lives next to the raw one, with a "Preprocessed_" prefix
PROCESSED_RAW_FILE = "Preprocessed_" + general_settings["RAW_FILE_NAME"]
PROCESSED_RAW_FILE_PATH = os.path.join(
    general_settings["DATA_PATH"],
    PROCESSED_RAW_FILE,
)

## Exploratory Data Analysis (EDA)

In [None]:
# The placeholder value means no bucket was configured; in that case the raw
# file is expected to be available locally already.
s3_is_configured = credentials_config["S3"] != "YOUR_S3_BUCKET_URL"

if s3_is_configured:
    s3 = boto3.client(
        "s3",
        aws_access_key_id=credentials_config["AWS_ACCESS_KEY"],
        aws_secret_access_key=credentials_config["AWS_SECRET_KEY"],
    )

    # Fetch the original file from the S3 bucket only when no local copy exists
    raw_file_is_missing = not os.path.exists(RAW_FILE_PATH)
    if raw_file_is_missing:
        s3.download_file(
            credentials_config["S3"],
            general_settings["RAW_FILE_NAME"],
            RAW_FILE_PATH,
        )

df = pd.read_csv(RAW_FILE_PATH, sep=",")
pprint(f"Dataset shape: {df.shape}")

In [None]:
# Print the column dtypes, non-null counts and memory usage of the dataset
df.info()

In [6]:
# The "id" column is removed before any analysis
df = df.drop(labels=["id"], axis="columns")

### Check Duplicated Rows

In [None]:
# keep=False flags every member of a duplicated group, not just the repeats
duplicated_rows_count = df.duplicated(keep=False).sum()
pprint(f"Number of duplicated rows: {duplicated_rows_count}")

In [None]:
# Show all rows that have at least one exact duplicate, grouped for readability
duplicated_mask = df.duplicated(keep=False)
df.loc[duplicated_mask].sort_values(by=["Gender", "Age", "Height", "Weight"])

### Check Label Distribution

In [None]:
# One colour per distinct label
label_palette = sns.color_palette(
    "ch:start=.5,rot=-.1",
    n_colors=df["NObeyesdad"].nunique(),
)

plt.figure(figsize=(6, 6))
ax = sns.countplot(data=df, x="NObeyesdad", palette=label_palette)

# Annotate every bar with its count
for bar_container in ax.containers:
    ax.bar_label(bar_container)

plt.title("Label Distribution")
plt.xlabel("Label")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Share of the dataset's rows that falls in each label
df["NObeyesdad"].value_counts().div(df.shape[0])

### Checking Columns Type

In [None]:
target_column = "NObeyesdad"

# Object-typed columns are treated as categorical; the target is one of them
# and is excluded from the feature list.
categorical_columns = df.select_dtypes("object").columns.tolist()
categorical_columns.remove(target_column)

# Everything that is not object-typed is treated as numerical
numerical_columns = df.select_dtypes(exclude="object").columns.tolist()

column_type_messages = (
    f"Target column: {target_column}",
    f"{len(categorical_columns)} categorical columns are found. Their names: {categorical_columns}",
    f"{len(numerical_columns)} numerical columns are found. Their names: {numerical_columns}",
)

# Each message is followed by a blank line
for message in column_type_messages:
    pprint(message)
    print()

### Checking NaN Values

In [None]:
# Per-column count of missing values, keeping only columns that have any,
# ordered from most to fewest.
null_values_df = (
    df.isnull()
    .sum()
    .rename_axis("Variable")
    .reset_index(name="Count")
    .sort_values("Count", ascending=False)
    .loc[lambda frame: frame["Count"] > 0]
    .reset_index(drop=True)
)

if null_values_df.empty:
    pprint("No NAN values were found!\n")
else:
    plt.figure(figsize=(5, 4))
    ax = sns.barplot(data=null_values_df, x="Variable", y="Count")

    # Annotate every bar with its count
    for bar_container in ax.containers:
        ax.bar_label(bar_container)

    plt.title("Amount of NAN Values per Variable")
    plt.xlabel("Variable's Name")
    plt.ylabel("NAN Count")
    plt.xticks(rotation=15)
    plt.show()

### Checking Numerical Columns Granularity

In [None]:
# Number of distinct values observed in each numerical column
numerical_granularity_df = (
    df[numerical_columns]
    .nunique()
    .rename_axis("Variable")
    .reset_index(name="Count")
)
numerical_granularity_df

### Checking Categorical Columns Granularity

In [None]:
# Number of distinct values per categorical column, highest first
granularity_df = (
    df[categorical_columns]
    .nunique()
    .rename_axis("Variable")
    .reset_index(name="Count")
    .sort_values("Count", ascending=False)
    .loc[lambda frame: frame["Count"] > 0]
    .reset_index(drop=True)
)

granularity_palette = sns.dark_palette(
    "#69d", reverse=False, n_colors=len(categorical_columns)
)

plt.figure(figsize=(10, 4))
ax = sns.barplot(
    data=granularity_df, x="Variable", y="Count", palette=granularity_palette
)

# Annotate every bar with its count
for bar_container in ax.containers:
    ax.bar_label(bar_container)

plt.title("Granularity per Variable")
plt.xlabel("Variable's Name")
plt.ylabel("Unique Values")
plt.xticks(rotation=90)
plt.show()

### Check Categorical Columns Values Distribution

In [None]:
ncols = 2
# Size the grid from the data instead of hard-coding 4 rows, so a different
# number of categorical columns neither overflows the grid nor leaves it short.
nrows = max(1, -(-len(categorical_columns) // ncols))  # ceiling division

# squeeze=False keeps axs two-dimensional even for a single row
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
fig.set_figwidth(10)
fig.set_figheight(2.5 * nrows)  # 10 inches for a 4-row grid

panels = axs.flatten()

for idx, (cc, ax) in enumerate(zip(categorical_columns, panels)):
    # Frequency of every value taken by the column
    temp = df[cc].value_counts().reset_index()
    temp.columns = ["Value", "Count"]

    sns.barplot(
        data=temp,
        y="Value",
        x="Count",
        palette=sns.dark_palette("#69d", reverse=True, n_colors=temp.shape[0]),
        ax=ax,
        orient="h",
    )

    # Annotate every bar with its count
    for bar_container in ax.containers:
        ax.bar_label(bar_container)

    ax.set_title(cc)

    # Bars are horizontal: the y axis holds the values and the x axis the
    # counts. Only the first column and the bottom-most panel of each column
    # carry axis labels.
    ax.set_ylabel("Value" if idx % ncols == 0 else "")
    ax.set_xlabel("Count" if idx + ncols >= len(categorical_columns) else "")

# Hide panels that have no column to show
for unused_ax in panels[len(categorical_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

### Check Numerical Columns Distributions

In [None]:
ncols = 2
# Size the grid from the data instead of hard-coding 4 rows, so a different
# number of numerical columns neither overflows the grid nor leaves it short.
nrows = max(1, -(-len(numerical_columns) // ncols))  # ceiling division

# squeeze=False keeps axs two-dimensional even for a single row
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
fig.set_figwidth(10)
fig.set_figheight(2.5 * nrows)  # 10 inches for a 4-row grid

panels = axs.flatten()

for idx, (nc, ax) in enumerate(zip(numerical_columns, panels)):
    # Histogram with a kernel density estimate overlaid
    sns.histplot(data=df[nc], ax=ax, kde=True)

    ax.set_title(nc)

    # Only the first column and the bottom-most panel of each column carry
    # axis labels.
    ax.set_ylabel("Count" if idx % ncols == 0 else "")
    ax.set_xlabel("Value" if idx + ncols >= len(numerical_columns) else "")

# Hide panels that have no column to show
for unused_ax in panels[len(numerical_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Skewness, variance and standard deviation of every numerical column.
# DataFrame.var() returns the variance, so it is labelled as such.
numerical_summaries = (
    ("skewness", df[numerical_columns].skew()),
    ("variance", df[numerical_columns].var()),
    ("standard deviation", df[numerical_columns].std()),
)

for statistic_name, statistic_values in numerical_summaries:
    print(f"Numerical Columns {statistic_name}:")
    pprint(statistic_values)
    print()

### Check Numerical Columns Correlations Between Each Other

In [None]:
# Spearman rank correlation between every pair of numerical columns
spearman_correlations = df[numerical_columns].corr(method="spearman")
sns.heatmap(data=spearman_correlations, annot=True, fmt=".2f")

In [None]:
# Pairwise scatter plots of the numerical columns, coloured by target class
sns.pairplot(data=df[[*numerical_columns, target_column]], hue=target_column)

In [None]:
# Collapse the obesity and overweight sub-levels into two coarse classes;
# labels that are not listed keep their original value.
lower_target_granularity = {
    "Obesity_Type_I": "Obesity",
    "Obesity_Type_II": "Obesity",
    "Obesity_Type_III": "Obesity",
    "Overweight_Level_I": "Overweight",
    "Overweight_Level_II": "Overweight",
}
temp = df[numerical_columns + [target_column]].copy()
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form operates on an intermediate object and
# leaves `temp` untouched when pandas Copy-on-Write is active.
temp[target_column] = temp[target_column].replace(lower_target_granularity)

sns.pairplot(data=temp, hue=target_column)

del temp

### Check Numerical Columns Outliers

In [None]:
# Descriptive statistics of the dataset, used to eyeball value ranges and outliers
df.describe()

In [None]:
ncols = 2
# Size the grid from the data instead of hard-coding 4 rows, so a different
# number of numerical columns neither overflows the grid nor leaves it short.
nrows = max(1, -(-len(numerical_columns) // ncols))  # ceiling division

# squeeze=False keeps axs two-dimensional even for a single row
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
fig.set_figwidth(10)
fig.set_figheight(2.5 * nrows)  # 10 inches for a 4-row grid

panels = axs.flatten()

for idx, (nc, ax) in enumerate(zip(numerical_columns, panels)):
    sns.boxplot(data=df[nc], ax=ax)

    ax.set_title(nc)
    ax.set_xlabel("")
    # Only the first column of panels carries the y-axis label
    ax.set_ylabel("Value" if idx % ncols == 0 else "")

# Hide panels that have no column to show
for unused_ax in panels[len(numerical_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()
plt.show()

### Check Numerical Columns Correlations with the Target Column

In [None]:
def plot_numerical_column_per_target(column: str, df: pd.DataFrame) -> None:
    """Plot how a numerical column is distributed within each target class.

    Draws two side-by-side panels: a violin plot of ``column`` per target class
    and a bar chart of the per-class mean of ``column``. The target column is
    taken from the module-level ``target_column`` variable.

    Args:
        column: Name of the numerical column to plot.
        df: Data containing ``column`` and the target column.
    """
    # Per-class summary statistics; the bar chart uses the "mean" column
    stats = (
        df.groupby(target_column)
        .agg(mean=(column, "mean"), median=(column, "median"), std=(column, "std"))
        .reset_index()
    )

    fig, axs = plt.subplots(1, 2)
    # The original set the width twice (8, then 12); only the last call took
    # effect, so a single call is kept.
    fig.set_figwidth(12)
    violin_ax, mean_ax = axs

    sns.violinplot(
        data=df,
        x=column,
        y=target_column,
        hue=target_column,
        orient="h",
        gridsize=10,
        width=0.9,
        ax=violin_ax,
    )
    # The classes are already named on the y axis, so the legend is suppressed
    violin_ax.legend([], [], frameon=False)
    violin_ax.set_xlabel(column)
    violin_ax.set_title(f"{column} Violin Plot per Target Class")

    sns.barplot(
        data=stats,
        x="mean",
        y=target_column,
        orient="h",
        palette=sns.dark_palette("#69d", reverse=False, n_colors=stats.shape[0]),
        errorbar="sd",
        ax=mean_ax,
    )
    mean_ax.legend([], [], frameon=False)
    # The right panel hides its class labels and relies on the left panel's
    mean_ax.set_yticklabels([])
    mean_ax.set_ylabel("")
    mean_ax.set_xlabel(column)
    mean_ax.set_title(f"Mean {column} per Target Class")

    # Annotate every bar with its mean value
    for bar_container in mean_ax.containers:
        mean_ax.bar_label(bar_container)

    # tight_layout() replaces set_tight_layout("w_pad"), whose argument is
    # documented as a bool or a dict of paddings, not a string.
    fig.tight_layout()
    plt.show()


def plot_categorical_column_per_target(column: str, df: pd.DataFrame) -> None:
    """Draw a heatmap of value counts of a categorical column per target class.

    Rows of the heatmap are the target classes (read from the module-level
    ``target_column`` variable) and columns are the distinct values of
    ``column``; each cell holds the number of matching rows.

    Args:
        column: Name of the categorical column to cross with the target.
        df: Data containing ``column`` and the target column.
    """
    # Name the count column explicitly: its default name differs between
    # pandas versions (0 before 2.0, "count" afterwards), so pivoting on the
    # literal 0 fails on recent releases.
    counts = (
        df[[column, target_column]]
        .groupby(target_column)
        .value_counts()
        .reset_index(name="count")
    )
    counts_per_class = counts.pivot(
        index=target_column, columns=column, values="count"
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        counts_per_class, annot=True, fmt="g", cmap=sns.color_palette("rocket")
    )
    plt.show()

#### Numerical Columns

In [None]:
# Sort by target so the classes appear in the same order in every figure
temp = df.sort_values(by=target_column).reset_index(drop=True).copy()

for numerical_column in numerical_columns:
    plot_numerical_column_per_target(df=temp, column=numerical_column)

#### Categorical Columns

In [None]:
# One count heatmap per categorical column
for categorical_column in categorical_columns:
    plot_categorical_column_per_target(df=df, column=categorical_column)

### Feature Engineering

#### Body Mass Index (BMI)

In [None]:
# BMI = weight divided by squared height
df["BMI"] = df["Weight"].div(df["Height"].pow(2))

# Sort by target so the classes appear in a stable order in the figure
temp = df.sort_values(by=target_column).reset_index(drop=True).copy()

plot_numerical_column_per_target(df=temp, column="BMI")

#### Physical Activity Level (PAL)

In [None]:
# PAL = FAF minus TUE
df["PAL"] = df["FAF"].sub(df["TUE"])

# Sort by target so the classes appear in a stable order in the figure
temp = df.sort_values(by=target_column).reset_index(drop=True).copy()

plot_numerical_column_per_target(df=temp, column="PAL")

#### Body Surface Area (BSA)

In [None]:
def calculate_bsa(gender: str, height: float, weight: float) -> float:
    """Estimate body surface area with the Schlich formula.

    The result is ``coefficient * weight**a * height**b``, where the
    coefficient and exponents depend on the gender.

    Args:
        gender: ``"Female"`` selects the female coefficients; any other value
            uses the second (male) set.
        height: Body height. NOTE(review): the Schlich coefficients are
            published for centimetres; confirm callers pass that unit.
        weight: Body weight. NOTE(review): presumably kilograms; confirm.

    Returns:
        The estimated body surface area.
    """
    if gender == "Female":
        coefficient, weight_exponent, height_exponent = 0.000975482, 0.46, 1.08
    else:
        coefficient, weight_exponent, height_exponent = 0.000579479, 0.38, 1.24

    return coefficient * (weight**weight_exponent) * (height**height_exponent)


# Height appears to be stored in metres (the BMI feature divides Weight by
# Height ** 2), while the Schlich coefficients used by calculate_bsa are
# defined for centimetres, so the height is converted before the call.
df["BSA"] = df.apply(
    lambda x: calculate_bsa(x["Gender"], x["Height"] * 100, x["Weight"]), axis=1
)

# Sort by target so the classes appear in a stable order in the figure
temp = df.sort_values(target_column, ascending=True).reset_index(drop=True).copy()

plot_numerical_column_per_target(column="BSA", df=temp)

#### Ideal Body Weight (IBW)

In [None]:
def calculate_ibw(gender: str, height: float) -> float:
    """Estimate ideal body weight with the B. J. Devine formula.

    Args:
        gender: ``"Female"`` uses a base of 45.5; any other value uses 50.
        height: Body height. The formula adds 0.9 per unit above 152, which
            presumes centimetres (152 cm is roughly 5 ft); confirm callers
            pass that unit.

    Returns:
        ``base + 0.9 * (height - 152)``.
    """
    base_weight = 45.5 if gender == "Female" else 50
    height_above_baseline = height - 152

    return base_weight + 0.9 * height_above_baseline


# Height appears to be stored in metres (the BMI feature divides Weight by
# Height ** 2), while calculate_ibw measures height against a 152 cm
# baseline, so the height is converted to centimetres before the call.
# Without the conversion every ideal weight comes out strongly negative.
df["IBW"] = df.apply(
    lambda x: calculate_ibw(x["Gender"], x["Height"] * 100), axis=1
)
# Difference between the actual and the ideal body weight
df["diff_W_IBW"] = df["Weight"] - df["IBW"]

# Sort by target so the classes appear in a stable order in the figure
temp = df.sort_values(target_column, ascending=True).reset_index(drop=True).copy()

plot_numerical_column_per_target(column="diff_W_IBW", df=temp)

#### Basal Metabolic Rate (BMR)

In [None]:
def calculate_bmr(age: int, gender: str, height: float, weight: float) -> float:
    """Estimate the basal metabolic rate.

    Computes ``10 * weight + 6.25 * height - 5 * age + s`` where ``s`` is -161
    for ``"Female"`` and 5 otherwise. These coefficients match the
    Mifflin-St Jeor equation.

    Args:
        age: Age of the person.
        gender: ``"Female"`` selects the -161 offset; any other value uses 5.
        height: Body height. NOTE(review): the Mifflin-St Jeor coefficients
            are defined for centimetres; confirm callers pass that unit.
        weight: Body weight. NOTE(review): presumably kilograms; confirm.

    Returns:
        The estimated basal metabolic rate.
    """
    if gender == "Female":
        gender_offset = -161
    else:
        gender_offset = 5

    weight_term = 10 * weight
    height_term = 6.25 * height
    age_term = 5 * age

    return weight_term + height_term - age_term + gender_offset


# Height appears to be stored in metres (the BMI feature divides Weight by
# Height ** 2), while the 6.25 * height term of calculate_bmr is defined for
# centimetres, so the height is converted before the call.
df["BMR"] = df.apply(
    lambda x: calculate_bmr(
        x["Age"], x["Gender"], x["Height"] * 100, x["Weight"]
    ),
    axis=1,
)

# Sort by target so the classes appear in a stable order in the figure
temp = df.sort_values(target_column, ascending=True).reset_index(drop=True).copy()

plot_numerical_column_per_target(column="BMR", df=temp)

#### Total Daily Energy Expenditure (TDEE)

In [None]:
def calculate_tdee(bmr: float, activity: float) -> float:
    """Scale a basal metabolic rate by an activity-dependent multiplier.

    Multipliers by activity level:
        * ``activity == 0``      -> 1.2
        * ``activity <= 1``      -> 1.55
        * ``1 < activity <= 2``  -> 1.725
        * otherwise              -> 1.9

    Args:
        bmr: Basal metabolic rate.
        activity: Activity level; the caller passes the ``FAF`` column.

    Returns:
        ``bmr`` multiplied by the factor of the matching activity band.
    """
    if activity == 0:
        factor = 1.2
    # Upper bounds are inclusive. The original tested `activity < 1` followed
    # by `activity > 1`, so an activity of exactly 1 matched neither band and
    # fell through to the highest multiplier.
    elif activity <= 1:
        factor = 1.55
    elif activity <= 2:
        factor = 1.725
    else:
        factor = 1.9

    return bmr * factor


# TDEE scales each row's BMR by a multiplier chosen from its FAF value
df["TDEE"] = df.apply(
    lambda row: calculate_tdee(bmr=row["BMR"], activity=row["FAF"]),
    axis=1,
)

# Sort by target so the classes appear in a stable order in the figure
temp = df.sort_values(by=target_column).reset_index(drop=True).copy()

plot_numerical_column_per_target(df=temp, column="TDEE")

#### Sufficient Water Consumption (SWC)

In [None]:
# True when CH2O exceeds half the body weight scaled by 0.0295735.
# NOTE(review): 0.0295735 looks like the litres-per-US-fluid-ounce factor of
# the "half your weight in ounces" rule, which is stated for pounds; confirm
# the units of Weight and CH2O.
df["SWC"] = df["CH2O"].gt((df["Weight"] / 2) * 0.0295735)

plot_categorical_column_per_target(df=df, column="SWC")

#### Is Sedentary? (IS)

In [None]:
# A person is flagged as sedentary when FAF is at most 1
df["IS"] = df["FAF"].le(1)

plot_categorical_column_per_target(df=df, column="IS")

#### Healthy Habits (HH)

In [None]:
def calculate_healthy_habits(row: pd.Series) -> int:
    """Score one person's habits as the sum of seven +1 / -1 indicators.

    Each indicator contributes -1 when its unhealthy condition holds and +1
    otherwise, so the score ranges from -7 to 7.

    Args:
        row: One dataset row; must provide ``FCVC``, ``NCP``, ``FAF``,
            ``SMOKE``, ``CH2O``, ``Weight``, ``CALC``, ``TUE`` and ``IBW``.

    Returns:
        The sum of the seven indicators.
    """
    # -1 when FCVC * NCP is below 3
    diet_score = -1 if (row["FCVC"] * row["NCP"]) < 3 else 1
    # -1 when FAF is at most 1 (sedentary)
    activity_score = -1 if row["FAF"] <= 1 else 1
    # -1 for smokers
    smoking_score = -1 if row["SMOKE"] == "yes" else 1
    # -1 when CH2O is below the weight-based water threshold
    water_score = -1 if (row["CH2O"] < ((row["Weight"] / 2) * 0.0295735)) else 1
    # -1 when CALC is "Always" or "Frequently"
    alcohol_score = (
        -1 if (row["CALC"] == "Always" or row["CALC"] == "Frequently") else 1
    )
    # -1 when TUE exceeds FAF
    screen_vs_activity_score = -1 if (row["TUE"] - row["FAF"]) > 0 else 1
    # -1 when the actual weight exceeds the ideal body weight. The original
    # compared Height with IBW, which mixes a length with a weight and never
    # reflects being overweight.
    weight_score = -1 if (row["Weight"] - row["IBW"]) > 0 else 1

    return (
        diet_score
        + activity_score
        + smoking_score
        + water_score
        + alcohol_score
        + screen_vs_activity_score
        + weight_score
    )


# Row-wise healthy-habits score
df["HH"] = df.apply(calculate_healthy_habits, axis=1)

# Sort by target so the classes appear in a stable order in the figure
temp = df.sort_values(by=target_column).reset_index(drop=True).copy()

plot_numerical_column_per_target(df=temp, column="HH")

#### Ideal Number of Main Meals? (INMM)

In [None]:
# True when NCP equals exactly 3
df["INMM"] = df["NCP"].eq(3)

plot_categorical_column_per_target(df=df, column="INMM")

#### Eat Vegetables Every Main Meal? (EVEMM)

In [None]:
# True when FCVC is at least as large as NCP
df["EVEMM"] = df["FCVC"].ge(df["NCP"])

plot_categorical_column_per_target(df=df, column="EVEMM")

In [37]:
## removing downloaded dataset from local
# Only attempt the removal when the file is still there, so re-running this
# cell does not fail with FileNotFoundError.
if credentials_config["S3"] != "YOUR_S3_BUCKET_URL" and os.path.exists(
    RAW_FILE_PATH
):
    os.remove(RAW_FILE_PATH)