## Import Libraries

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer
from pprint import pprint
import os
import joblib
import yaml
import boto3
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings

# Use seaborn's "darkgrid" theme for every plot drawn below.
sns.set_style("darkgrid")
# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")


def read_yaml_file(path, file):
    """Load a YAML file and return its parsed content.

    Args:
        path: Directory that contains the file.
        file: Name of the YAML file inside ``path``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.

    Raises:
        yaml.YAMLError: Propagated unchanged when the content cannot be parsed.
    """
    yaml_path = os.path.join(path, file)
    with open(yaml_path) as stream:
        return yaml.safe_load(stream)


# Relative directory that holds the YAML files read by read_yaml_file below.
CONFIG_PATH = os.path.join("src", "config")

In [None]:
# Credentials (S3 bucket and AWS keys) and general settings (paths, file names).
credentials_config = read_yaml_file(path=CONFIG_PATH, file="credentials.yaml")

general_settings = read_yaml_file(path=CONFIG_PATH, file="settings.yaml")

# Random state passed to train_test_split.
SEED = 42
# Output folders for the fitted transformers and for the feature matrices.
ARTIFACTS_OUTPUT_PATH = general_settings["ARTIFACTS_PATH"]
FEATURES_OUTPUT_PATH = general_settings["FEATURES_PATH"]
# Local location of the raw CSV.
RAW_FILE_PATH = os.path.join(
    general_settings["DATA_PATH"], general_settings["RAW_FILE_NAME"]
)
# Name and local location of the preprocessed CSV written at the end.
PROCESSED_RAW_FILE = "Preprocessed_" + general_settings["RAW_FILE_NAME"]
PROCESSED_RAW_FILE_PATH = os.path.join(
    general_settings["DATA_PATH"], PROCESSED_RAW_FILE
)

## Data Cleaning

### Loading Dataset

In [None]:
# "YOUR_S3_BUCKET_URL" is treated as the "no bucket configured" placeholder;
# in that case the raw file must already exist at RAW_FILE_PATH.
if credentials_config["S3"] != "YOUR_S3_BUCKET_URL":
    # `s3` is a module-level client; the upload code at the end reuses it.
    s3 = boto3.client(
        "s3",
        aws_access_key_id=credentials_config["AWS_ACCESS_KEY"],
        aws_secret_access_key=credentials_config["AWS_SECRET_KEY"],
    )

    # downloading the original file from the aws s3 bucket
    # (skipped when a local copy is already present)
    if not os.path.exists(RAW_FILE_PATH):
        s3.download_file(
            credentials_config["S3"], general_settings["RAW_FILE_NAME"], RAW_FILE_PATH
        )

df = pd.read_csv(RAW_FILE_PATH, sep=",")
# The "id" column is dropped before any processing.
df = df.drop(columns=["id"])
print(f"Dataset shape: {df.shape}")

### Removing Duplicates

In [None]:
# Drop fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")
# pprint on a string prints its repr, so the message is shown with quotes.
pprint(f"Dataset shape: {df.shape}")

### Transforming Height Units to Centimeters

In [None]:
# Scale Height by 100 (meters -> centimeters, per this section's title).
# NOTE(review): assumes the raw column is in meters — confirm against the dataset.
df["Height"] *= 100
# Bare expression: displays the frame when run as a notebook cell.
df

### Removing Outliers

In [None]:
# Tukey-style fences on Age: a row is an outlier when its Age lies at or
# beyond `threshold` inter-quartile ranges outside the first/third quartile.
Q1, Q3 = df["Age"].quantile([0.25, 0.75])
threshold = 3.5
IQR = Q3 - Q1
lower_limit = Q1 - threshold * IQR
upper_limit = Q3 + threshold * IQR

pprint(f"Dataset shape before removing the outliers: {df.shape}")

# Keep every row that is not flagged; rows with a missing Age compare False
# on both sides and are therefore kept.
is_outlier = (df["Age"] >= upper_limit) | (df["Age"] <= lower_limit)
df = df[~is_outlier]
pprint(f"Dataset shape after removing the outliers: {df.shape}")

### Creating New Features

#### Body Mass Index (BMI)

In [None]:
# Weight divided by squared Height.
# NOTE(review): Height was multiplied by 100 earlier in this file, so this is
# 1e-4 times the conventional kg/m^2 BMI. The later log transform followed by
# standard scaling removes almost all of that constant factor, but the scaler
# fitted on this column is saved as an artifact — confirm what its consumers
# expect before changing the scale here.
df["BMI"] = df["Weight"] / (df["Height"] ** 2)

#### Physical Activity Level (PAL)

In [None]:
# PAL is the difference FAF - TUE and can therefore be negative.
# NOTE(review): presumably FAF is physical-activity frequency and TUE is time
# using technology devices — confirm against the dataset description.
df["PAL"] = df["FAF"] - df["TUE"]

#### Body Surface Area (BSA)

In [None]:
def calculate_bsa(gender: str, height: float, weight: float) -> float:
    """Return the body surface area using the Schlich formula.

    Args:
        gender: ``"Female"`` selects the female coefficients; any other value
            uses the male coefficients.
        height: Body height (the callers in this file pass centimeters).
        weight: Body weight.

    Returns:
        ``coefficient * weight**a * height**b`` with gender-specific constants.
    """
    if gender == "Female":
        coefficient, weight_exponent, height_exponent = 0.000975482, 0.46, 1.08
    else:
        coefficient, weight_exponent, height_exponent = 0.000579479, 0.38, 1.24

    return coefficient * (weight**weight_exponent) * (height**height_exponent)


# One BSA value per row, computed from that row's Gender, Height and Weight.
df["BSA"] = [
    calculate_bsa(gender, height, weight)
    for gender, height, weight in zip(df["Gender"], df["Height"], df["Weight"])
]

#### Ideal Body Weight (IBW)

In [None]:
def calculate_ibw(gender: str, height: float) -> float:
    """Return the ideal body weight (B. J. Devine formula).

    Args:
        gender: ``"Female"`` uses a 45.5 base; any other value uses 50.
        height: Body height; 0.9 is added per unit of height above 152.

    Returns:
        The base weight plus ``0.9 * (height - 152)``.
    """
    base_weight = 45.5 if gender == "Female" else 50
    return base_weight + 0.9 * (height - 152)


# Ideal body weight per row, then the signed gap between actual and ideal
# weight (positive when the person weighs more than the ideal).
df["IBW"] = [
    calculate_ibw(gender, height) for gender, height in zip(df["Gender"], df["Height"])
]
df["diff_W_IBW"] = df["Weight"] - df["IBW"]

#### Basal Metabolic Rate (BMR)

In [None]:
def calculate_bmr(age: int, gender: str, height: float, weight: float) -> float:
    """Return the basal metabolic rate.

    The coefficients (10, 6.25, 5 and the +5 / -161 offset) match the
    Mifflin-St Jeor equation.

    Args:
        age: Age of the person.
        gender: ``"Female"`` applies the -161 offset; any other value applies +5.
        height: Body height (the callers in this file pass centimeters).
        weight: Body weight.

    Returns:
        ``10 * weight + 6.25 * height - 5 * age + offset``.
    """
    if gender == "Female":
        gender_offset = -161
    else:
        gender_offset = 5

    return (10 * weight) + (6.25 * height) - (5 * age) + gender_offset


# One BMR value per row from that row's Age, Gender, Height and Weight.
df["BMR"] = [
    calculate_bmr(age, gender, height, weight)
    for age, gender, height, weight in zip(
        df["Age"], df["Gender"], df["Height"], df["Weight"]
    )
]

#### Total Daily Energy Expenditure (TDEE)

In [None]:
def calculate_tdee(bmr: float, activity: float) -> float:
    """Return the total daily energy expenditure for an activity level.

    The basal metabolic rate is scaled by a multiplier chosen from the
    activity value:

    * ``activity == 0``        -> 1.2
    * ``activity <= 1``        -> 1.55
    * ``1 < activity <= 2``    -> 1.725
    * anything else            -> 1.9

    An activity of exactly 1 previously matched neither ``< 1`` nor ``> 1``
    and fell through to the highest multiplier; it now belongs to the
    ``<= 1`` band, consistent with the upper-inclusive ``<= 2`` band.

    Args:
        bmr: Basal metabolic rate.
        activity: Activity level (the caller in this file passes ``FAF``).

    Returns:
        ``bmr`` multiplied by the selected activity multiplier.
    """
    if activity == 0:
        multiplier = 1.2
    elif activity <= 1:
        multiplier = 1.55
    elif activity <= 2:
        multiplier = 1.725
    else:
        multiplier = 1.9

    return bmr * multiplier


# TDEE per row: the row's BMR scaled according to its FAF value.
df["TDEE"] = [calculate_tdee(bmr, faf) for bmr, faf in zip(df["BMR"], df["FAF"])]

#### Sufficient Water Consumption

In [None]:
# Water target: half the body weight, converted from ounces to liters with the
# 0.0295735 factor. SWC is 1 when CH2O is strictly above that target, else 0.
water_target_liters = (df["Weight"] / 2) * 0.0295735
df["SWC"] = (df["CH2O"] > water_target_liters).astype(int)

#### Is Sedentary? (IS)

In [None]:
# IS (is sedentary) is 1 when FAF is at most 1, else 0.
df["IS"] = (df["FAF"] <= 1).astype(int)

#### Healthy Habits

In [None]:
def calculate_healthy_habits(row: pd.Series) -> int:
    """Return a healthy-habits score for one dataset row.

    Seven habits each contribute +1 (healthy) or -1 (unhealthy), so the
    result is an odd integer between -7 and 7.

    The weight term compares ``Weight`` with the ideal body weight computed
    from ``Gender`` and ``Height``. It previously passed ``Age`` as the gender
    and compared ``Height`` with the ideal weight, which made the term -1 for
    practically every row.
    NOTE(review): if this feature is re-implemented elsewhere (e.g. at
    inference time), keep that copy in sync with this fix.

    Args:
        row: A row holding ``FCVC``, ``NCP``, ``FAF``, ``SMOKE``, ``CH2O``,
            ``Weight``, ``CALC``, ``TUE``, ``Gender`` and ``Height``.

    Returns:
        The sum of the seven +1 / -1 habit scores.
    """
    # Same water target as the SWC feature: half the weight, ounces -> liters.
    water_target_liters = (row["Weight"] / 2) * 0.0295735
    ideal_weight = calculate_ibw(row["Gender"], row["Height"])

    eating_score = -1 if (row["FCVC"] * row["NCP"]) < 3 else 1
    sedentary_score = -1 if row["FAF"] <= 1 else 1
    smoking_score = -1 if row["SMOKE"] == "yes" else 1
    water_score = -1 if row["CH2O"] < water_target_liters else 1
    alcohol_score = -1 if row["CALC"] in ("Always", "Frequently") else 1
    # Unhealthy when TUE exceeds FAF.
    activity_score = -1 if (row["TUE"] - row["FAF"]) > 0 else 1
    weight_score = -1 if (row["Weight"] - ideal_weight) > 0 else 1

    return (
        eating_score
        + sedentary_score
        + smoking_score
        + water_score
        + alcohol_score
        + activity_score
        + weight_score
    )


# Row-wise healthy-habits score; the function already takes the row directly.
df["HH"] = df.apply(calculate_healthy_habits, axis=1)

#### Ideal Number of Main Meals? (INMM)

In [None]:
# INMM is 1 when NCP equals exactly 3, else 0.
df["INMM"] = (df["NCP"] == 3).astype(int)

#### Eat Vegetables Every Main Meal? (EVEMM)

In [None]:
# EVEMM is 1 when FCVC is at least NCP, else 0.
df["EVEMM"] = (df["FCVC"] >= df["NCP"]).astype(int)

### Feature Engineering

In [None]:
# plotting numerical columns distributions
numerical_columns = df.select_dtypes(exclude="object").columns.tolist()

# Size the grid from the number of columns so that a change in the feature
# set cannot index past the last axis (the grid used to be fixed at 10 x 2).
ncols = 2
nrows = max(1, (len(numerical_columns) + ncols - 1) // ncols)

# squeeze=False keeps `axs` two-dimensional even for a single row.
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
fig.set_figwidth(12)
fig.set_figheight(15)

for idx, nc in enumerate(numerical_columns):
    r, c = divmod(idx, ncols)
    ax = axs[r, c]

    sns.histplot(data=df[nc], ax=ax, kde=True)

    ax.set_title(nc)
    # Axis labels only on the outer edge of the grid.
    ax.set_ylabel("Count" if c == 0 else "")
    ax.set_xlabel("Value" if r == nrows - 1 else "")

# Hide axes left over when the column count is not a multiple of ncols.
for unused_ax in axs.ravel()[len(numerical_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()
fig.show()

#### Transforming `Age` Column Into a Categorical Column

In [None]:
# Bin Age into four quantile-based groups and keep the bin edges.
age_groups, bins = pd.qcut(
    x=df["Age"], q=4, retbins=True, labels=["q1", "q2", "q3", "q4"]
)
# Replace the outermost edges with -inf/+inf so the stored edges cover every
# possible age; `bins` is saved with the other artifacts further down.
bins = np.concatenate(([-np.inf], bins[1:-1], [np.inf]))

# Store the group labels as plain objects so Age counts as categorical.
df["Age"] = age_groups.astype("object")
df.head()

#### Transforming `IS`, `SWC`, `EVEMM`, `INMM` into Categorical Columns

In [None]:
# Cast the 0/1 flag columns to object so the dtype-based selections below
# treat them as categorical instead of numerical.
for flag_column in ("SWC", "IS", "EVEMM", "INMM"):
    df[flag_column] = df[flag_column].astype("object")
df.head()

#### Transforming `HH` Column Into a Categorical Column

In [None]:
# Bin the integer healthy-habits score into three quantile-based groups and
# store the labels as plain objects.
hh_scores = df["HH"].astype(int)
df["HH"] = hh_scores
df["HH"] = pd.qcut(x=hh_scores, q=3, labels=["bad", "ok", "good"]).astype("object")
df.head()

#### Splitting the Data into Training and Validation Sets

In [None]:
# Separate the target column from the features.
y = df["NObeyesdad"].values
X = df.drop(columns=["NObeyesdad"])

# Stratified 85/15 split, reproducible through SEED.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=SEED
)

# Fresh 0..n-1 indexes so later positional concatenations line up.
X_train, X_valid = (
    features.reset_index(drop=True) for features in (X_train, X_valid)
)

pprint(f"Train set shape: {X_train.shape} and {y_train.shape}")
pprint(f"Validation set shape: {X_valid.shape} and {y_valid.shape}")

#### Transforming the Numerical Columns (Log Transformation)

In [None]:
numerical_columns = df.select_dtypes(exclude="object").columns.tolist()
# Small offset so that zero values do not produce log(0).
epsilon = 1e-10
# These two columns are differences and can be negative, so they are left
# untransformed.
skipped_columns = ("diff_W_IBW", "PAL")

for nc in numerical_columns:
    if nc in skipped_columns:
        continue

    for features in (X_train, X_valid):
        features[nc] = np.log(features[nc].values + epsilon)

#### Scaling the Numerical Columns

In [None]:
# Report the per-column skewness of both splits before the scaling step.
pprint("Training set skewness before scaling:")
pprint(X_train[numerical_columns].skew())
# Blank line between the two reports.
print()
pprint("Validation set skewness before scaling:")
pprint(X_valid[numerical_columns].skew())

In [None]:
# One StandardScaler per numerical column, fitted on the training split only
# and then applied to the validation split. The fitted scalers are kept by
# column name so they can be saved with the other artifacts.
scalers = {}

for nc in numerical_columns:
    scaler = StandardScaler()
    # scikit-learn expects a 2-D (n_samples, 1) array for a single feature.
    train_values = X_train[nc].values.reshape(-1, 1)
    valid_values = X_valid[nc].values.reshape(-1, 1)

    X_train[nc] = scaler.fit_transform(train_values)
    X_valid[nc] = scaler.transform(valid_values)
    scalers[nc] = scaler

In [None]:
# Build one long frame with a "set" column so seaborn can colour by split.
temp_train = X_train.copy()
temp_train["set"] = ["train"] * temp_train.shape[0]

temp_valid = X_valid.copy()
temp_valid["set"] = ["valid"] * temp_valid.shape[0]

# The validation rows are included four times.
# NOTE(review): presumably this makes the much smaller validation histogram
# visually comparable to the training one — the counts shown for "valid" are
# therefore inflated by a factor of four.
temp = pd.concat(
    [temp_train, temp_valid, temp_valid, temp_valid, temp_valid],
    axis=0,
    ignore_index=True,
)

# Size the grid from the number of columns so that a change in the feature
# set cannot index past the last axis (the grid used to be fixed at 7 x 2).
ncols = 2
nrows = max(1, (len(numerical_columns) + ncols - 1) // ncols)

# squeeze=False keeps `axs` two-dimensional even for a single row.
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
fig.set_figwidth(12)
fig.set_figheight(15)

for idx, nc in enumerate(numerical_columns):
    r, c = divmod(idx, ncols)
    ax = axs[r, c]

    sns.histplot(data=temp[[nc, "set"]], x=nc, hue="set", ax=ax, kde=True)

    ax.set_title(nc)
    # Axis labels only on the outer edge of the grid.
    ax.set_ylabel("Count" if c == 0 else "")
    ax.set_xlabel("Value" if r == nrows - 1 else "")

# Hide axes left over when the column count is not a multiple of ncols.
for unused_ax in axs.ravel()[len(numerical_columns):]:
    unused_ax.set_visible(False)

# The helper frames are only needed for plotting.
del temp, temp_train, temp_valid

fig.tight_layout()
fig.show()

In [None]:
# Report the per-column skewness of both splits again, after scaling.
pprint("Training set skewness after scaling:")
pprint(X_train[numerical_columns].skew())
# Blank line between the two reports.
print()
pprint("Validation set skewness after scaling:")
pprint(X_valid[numerical_columns].skew())

#### Encoding the Categorical Columns

In [None]:
# plotting categorical columns distributions
categorical_columns = df.select_dtypes(include="object").columns.tolist()
target_column = "NObeyesdad"
# The target is an object column too, but it is not a feature.
categorical_columns.remove(target_column)

# Size the grid from the number of columns so that a change in the feature
# set cannot index past the last axis (the grid used to be fixed at 7 x 2).
ncols = 2
nrows = max(1, (len(categorical_columns) + ncols - 1) // ncols)

# squeeze=False keeps `axs` two-dimensional even for a single row.
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
fig.set_figwidth(10)
fig.set_figheight(10)

for idx, cc in enumerate(categorical_columns):
    r, c = divmod(idx, ncols)
    ax = axs[r, c]

    # Frequency table of the column over the full dataset.
    temp = df[cc].value_counts().reset_index()
    temp.columns = ["Value", "Count"]

    sns.barplot(
        data=temp,
        y="Value",
        x="Count",
        palette=sns.dark_palette("#69d", reverse=True, n_colors=temp.shape[0]),
        ax=ax,
        orient="h",
    )

    # Print the count next to each bar.
    for container in ax.containers:
        ax.bar_label(container)

    ax.set_title(cc)
    # Horizontal bars: categories run along y and counts along x, so the
    # outer labels are "Value" on the left and "Count" at the bottom.
    ax.set_ylabel("Value" if c == 0 else "")
    ax.set_xlabel("Count" if r == nrows - 1 else "")

# Hide axes left over when the column count is not a multiple of ncols.
for unused_ax in axs.ravel()[len(categorical_columns):]:
    unused_ax.set_visible(False)

fig.tight_layout()
fig.show()

In [None]:
# One OneHotEncoder per categorical column, fitted on the training split and
# applied to the validation split. The fitted encoders are kept by column
# name so they can be saved with the other artifacts.
encoders = {}
train_parts = []
valid_parts = []

for cc in categorical_columns:
    encoder = OneHotEncoder(
        drop="first",
        sparse_output=False,
        handle_unknown="infrequent_if_exist",
        min_frequency=20,
    )

    encoded_train = encoder.fit_transform(X_train[cc].values.reshape(-1, 1))
    encoded_valid = encoder.transform(X_valid[cc].values.reshape(-1, 1))
    feature_names = encoder.get_feature_names_out()

    # Prefix the generated names with the source column to keep them unique.
    train_parts.append(
        pd.DataFrame(encoded_train, columns=feature_names).add_prefix(cc + "_")
    )
    valid_parts.append(
        pd.DataFrame(encoded_valid, columns=feature_names).add_prefix(cc + "_")
    )

    encoders[cc] = encoder

# Encoded columns first, then the remaining (numerical) columns.
train_parts.append(X_train.drop(columns=categorical_columns))
valid_parts.append(X_valid.drop(columns=categorical_columns))

new_train_df = pd.concat(train_parts, axis=1)
new_valid_df = pd.concat(valid_parts, axis=1)

# From here on X_train / X_valid are plain arrays.
X_train = new_train_df.values.copy()
X_valid = new_valid_df.values.copy()

#### Encoding the Labels

In [None]:
# Despite its name, `ohe_label` is a LabelBinarizer returning a dense array.
ohe_label = LabelBinarizer(sparse_output=False)

# Keep the untransformed labels; they are written to the preprocessed CSV.
original_y_train = y_train.copy()
original_y_valid = y_valid.copy()

# Fit on the training labels only, then apply to the validation labels.
y_train = ohe_label.fit_transform(y_train.reshape(-1, 1))
y_valid = ohe_label.transform(y_valid.reshape(-1, 1))

In [None]:
# Shapes of the feature and label arrays after encoding.
pprint(f"Train set shape: {X_train.shape} and {y_train.shape}")
pprint(f"Validation set shape: {X_valid.shape} and {y_valid.shape}")

### Saving the Artifacts

In [None]:
# saving the artifacts locally
for output_folder in (ARTIFACTS_OUTPUT_PATH, FEATURES_OUTPUT_PATH):
    os.makedirs(output_folder, exist_ok=True)

# Fitted transformers and the Age bin edges.
artifacts = {
    "features_sc.pkl": scalers,
    "features_ohe.pkl": encoders,
    "label_ohe.pkl": ohe_label,
    "qcut_bins.pkl": bins,
}
for file_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(ARTIFACTS_OUTPUT_PATH, file_name))

# Feature matrices and encoded labels of both splits.
feature_sets = {
    "X_train.pkl": X_train,
    "y_train.pkl": y_train,
    "X_valid.pkl": X_valid,
    "y_valid.pkl": y_valid,
}
for file_name, feature_set in feature_sets.items():
    joblib.dump(feature_set, os.path.join(FEATURES_OUTPUT_PATH, file_name))

In [None]:
# saving the preprocessed dataset locally
# Attach the original (not binarized) labels to the transformed features.
labelled_splits = (
    (new_train_df, original_y_train),
    (new_valid_df, original_y_valid),
)
for split_df, split_labels in labelled_splits:
    split_df[target_column] = split_labels

# Training rows first, then validation rows, written without the index.
preprocessed_data = pd.concat([new_train_df, new_valid_df])
preprocessed_data.to_csv(PROCESSED_RAW_FILE_PATH, index=False, sep=",")

In [None]:
# sending the artifacts to the aws s3 bucket
def upload_folder_s3(root_path: str):
    """Upload every file found below ``root_path`` to the configured S3 bucket.

    Each file is stored under the key ``<folder name>/<file name>``, where the
    folder name is the last component of ``root_path``. It is derived with
    ``os.path`` so the result is the same with or without a trailing
    separator (the previous ``path.split("/")[-2]`` only produced the folder
    name when the path ended in "/"). The key always uses "/" regardless of
    the operating system.

    Args:
        root_path: Local folder whose files are uploaded.

    Raises:
        Exception: Any error raised while walking or uploading is printed and
            re-raised. The code that calls this function deletes the local
            files afterwards, so a failed upload must not pass silently.
    """
    directory_name = os.path.basename(os.path.normpath(root_path))

    try:
        for path, _, files in os.walk(root_path):
            for file in files:
                s3.upload_file(
                    os.path.join(path, file),
                    credentials_config["S3"],
                    f"{directory_name}/{file}",
                )

    except Exception as err:
        print(err)
        raise


# Upload and clean up only when a real bucket is configured.
if credentials_config["S3"] != "YOUR_S3_BUCKET_URL":
    output_folders = (ARTIFACTS_OUTPUT_PATH, FEATURES_OUTPUT_PATH)

    for output_folder in output_folders:
        if os.path.exists(output_folder):
            upload_folder_s3(output_folder)

    # sending preprocessed dataset saved locally to the aws s3 bucket
    s3.upload_file(
        PROCESSED_RAW_FILE_PATH, credentials_config["S3"], PROCESSED_RAW_FILE
    )

    # removing downloaded dataset from local
    for local_file in (RAW_FILE_PATH, PROCESSED_RAW_FILE_PATH):
        os.remove(local_file)

    # removing the local artifacts and features
    for output_folder in output_folders:
        shutil.rmtree(output_folder)