# BMI Predictor 🧮
---

## ETL Process 🖥️

In [1]:
# Import necessary libraries
import os
import sys 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Import logging info
import logging

# Configure the root logger: INFO and above, rendered as "[LEVEL] message"
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')
logger = logging.getLogger(__name__)

# --- Get the current working directory ---
# NOTE(review): os.getcwd() is the kernel's working directory, not necessarily the
# folder that holds this notebook — confirm the notebook is launched from there.
current_dir = os.getcwd()

# Navigate to the project root (the parent of the working directory)
project_root = os.path.abspath(os.path.join(current_dir, '..'))

# Put the project root on the import path. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry each time.
if project_root not in sys.path:
    sys.path.append(project_root)

## Load Data 🗳️

In [2]:
# Define the path to the CSV file: <project_root>/data/bmi.csv
file_path = os.path.join(project_root, 'data', 'bmi.csv')

# Load the CSV into a DataFrame
try:
    df_bmi = pd.read_csv(file_path)
except Exception as e:
    logger.error(f"‚ùå Error loading training data: {e}")
    # Re-raise after logging: without df_bmi the following cells would fail with a
    # NameError, or silently reuse a stale df_bmi left in the kernel by an earlier run.
    raise
else:
    # Only reached when read_csv succeeded
    logger.info(f"‚úÖ Data successfully loaded: {df_bmi.shape[0]} rows, {df_bmi.shape[1]} columns.")

[INFO] ‚úÖ Data successfully loaded: 500 rows, 4 columns.


## Preview 🔍

In [3]:
# Rich (HTML) rendering of the first five records
display(df_bmi.head(5))

# Column names, non-null counts, dtypes and memory footprint
df_bmi.info()

Unnamed: 0,Gender,Height,Weight,Index
0,Male,174,96,4
1,Male,189,87,2
2,Female,185,110,4
3,Female,195,104,3
4,Male,149,61,3


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Gender  500 non-null    object
 1   Height  500 non-null    int64 
 2   Weight  500 non-null    int64 
 3   Index   500 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 15.8+ KB


### Check 🔎

In [4]:
# Missing values: count per column, then show only the columns that have any
nulls = df_bmi.isna().sum()
columns_with_nulls = nulls[nulls.gt(0)]
print("\nNull values per column:\n", columns_with_nulls)

# Fully duplicated rows (the first occurrence of each row is not counted)
duplicates = df_bmi.duplicated(keep='first').sum()
print(f"\nDuplicate rows: {duplicates}")


Null values per column:
 Series([], dtype: int64)

Duplicate rows: 11


### Cleaning 🧼

In [5]:
# --- Remove exact duplicate rows, reporting the shape before and after ---
shape_before = df_bmi.shape
print(f"‚û°Ô∏è Before drop_duplicates: {shape_before}")

# Keeps the first occurrence of each duplicated row; modifies df_bmi in place
df_bmi.drop_duplicates(keep='first', inplace=True)

shape_after = df_bmi.shape
print(f"‚úÖ Duplicates removed. New shape: {shape_after}")

‚û°Ô∏è Before drop_duplicates: (500, 4)
‚úÖ Duplicates removed. New shape: (489, 4)


In [6]:
# --- Frequency of each Gender value, most frequent first ---
df_bmi['Gender'].value_counts(sort=True, ascending=False)

Gender
Female    250
Male      239
Name: count, dtype: int64

In [7]:
# --- Frequency of each Index class, most frequent first ---
df_bmi['Index'].value_counts(sort=True, ascending=False)

Index
5    195
4    128
2     69
3     64
1     20
0     13
Name: count, dtype: int64

## Creating the BMI Column 🗒️
- The _"Index"_ column is a categorical class label, not the actual numeric BMI, so we compute the BMI value from height and weight.

In [8]:
# Height is stored in centimetres; the BMI formula needs metres
height_m = df_bmi['Height'] / 100

# BMI = Weight (kg) / (Height (m) ** 2), rounded to two decimals for clarity.
# The metre conversion lives in a local Series, so no helper column is ever
# added to (and then removed from) the DataFrame.
df_bmi['BMI_Value'] = (df_bmi['Weight'] / height_m ** 2).round(2)

# Verify the first rows with the new column
print("DataFrame after calculating BMI:")
print(df_bmi.head())

DataFrame after calculating BMI:
   Gender  Height  Weight  Index  BMI_Value
0    Male     174      96      4      31.71
1    Male     189      87      2      24.36
2  Female     185     110      4      32.14
3  Female     195     104      3      27.35
4    Male     149      61      3      27.48


## Binary Codification of Gender
- Encode Male as 0 and Female as 1.

In [9]:
# Binary encoding: 'Male' -> 0, 'Female' -> 1
# (which label gets 0 or 1 is arbitrary; it only has to be applied consistently)
gender_mapping = {'Male': 0, 'Female': 1}

# Translate the labels, then swap the text column for the encoded one in a single chain
encoded_gender = df_bmi['Gender'].map(gender_mapping)
df_bmi = df_bmi.drop(columns=['Gender']).assign(Gender_Encoded=encoded_gender)

# Verify first rows and dtypes after encoding
print("\nDataFrame after encoding Gender:")
print(df_bmi.head())
print("\nData types verification:")
df_bmi.info()


DataFrame after encoding Gender:
   Height  Weight  Index  BMI_Value  Gender_Encoded
0     174      96      4      31.71               0
1     189      87      2      24.36               0
2     185     110      4      32.14               1
3     195     104      3      27.35               1
4     149      61      3      27.48               0

Data types verification:
<class 'pandas.core.frame.DataFrame'>
Index: 489 entries, 0 to 499
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Height          489 non-null    int64  
 1   Weight          489 non-null    int64  
 2   Index           489 non-null    int64  
 3   BMI_Value       489 non-null    float64
 4   Gender_Encoded  489 non-null    int64  
dtypes: float64(1), int64(4)
memory usage: 22.9 KB


## Outliers Detection 🪤

In [10]:
# --- Outlier detection: configuration and accumulator ---

# Numeric columns that will be screened for outliers
cols_to_check = [
    'Height',
    'Weight',
    'BMI_Value',
]

# Row labels flagged in at least one column; a set, so a row flagged in
# several columns is stored only once
outliers_indices = set()

In [11]:
for col in cols_to_check:
    column = df_bmi[col]

    # Quartiles and interquartile range of this column
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1

    # Fences at 1.5 * IQR beyond each quartile
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Row labels whose value falls strictly outside the fences
    outlier_mask = column.lt(lower_bound) | column.gt(upper_bound)
    col_outliers = column.loc[outlier_mask].index

    # Merge this column's flagged rows into the overall set
    outliers_indices.update(col_outliers)

    # Per-column summary: fences and number of flagged rows
    print(f"--- Columna: {col} ---")
    print(f"L√≠mites: ({lower_bound:.2f}, {upper_bound:.2f})")
    print(f"Outliers encontrados: {len(col_outliers)}")

--- Columna: Height ---
L√≠mites: (114.00, 226.00)
Outliers encontrados: 0
--- Columna: Weight ---
L√≠mites: (-4.00, 220.00)
Outliers encontrados: 0
--- Columna: BMI_Value ---
L√≠mites: (-1.82, 75.97)
Outliers encontrados: 3


In [12]:
# Number of distinct rows flagged as an outlier in at least one checked column
n_outlier_rows = len(outliers_indices)
print(f"\nTotal number of rows with at least one detected outlier: {n_outlier_rows}")


Total number of rows with at least one detected outlier: 3


In [13]:
# --- Remove the flagged rows ---
shape_before_outliers = df_bmi.shape
print(f"‚û°Ô∏è Before dropping outliers: {shape_before_outliers}")

# Build a separate frame (df_bmi itself is not modified here): drop the flagged
# row labels, then renumber the index so it is consecutive (0, 1, 2, ...)
df_bmi_cleaned = (
    df_bmi
    .drop(index=list(outliers_indices))
    .reset_index(drop=True)
)

‚û°Ô∏è Before dropping outliers: (489, 5)


In [14]:
# Summary of the outlier-removal step. "Original Shape" is the shape of df_bmi
# as it stands at this point, i.e. the frame the outliers were dropped from.
report_lines = [
    "--- Cleaning Report ---",
    f"‚û°Ô∏è Original Shape: {df_bmi.shape}",
    f"üöÆ Eliminated rows (outliers): {len(outliers_indices)}",
    f"‚úÖ Cleaned DataFrame Shape: {df_bmi_cleaned.shape}",
]
print(*report_lines, sep="\n")

--- Cleaning Report ---
‚û°Ô∏è Original Shape: (489, 5)
üöÆ Eliminated rows (outliers): 3
‚úÖ Cleaned DataFrame Shape: (486, 5)


## Export the new DataSet ❇️

In [15]:
# -- Point df_bmi at an independent (deep) copy of the cleaned frame
df_bmi = df_bmi_cleaned.copy(deep=True)

# Confirm row count, columns and dtypes of the frame that will be exported
df_bmi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 486 entries, 0 to 485
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Height          486 non-null    int64  
 1   Weight          486 non-null    int64  
 2   Index           486 non-null    int64  
 3   BMI_Value       486 non-null    float64
 4   Gender_Encoded  486 non-null    int64  
dtypes: float64(1), int64(4)
memory usage: 19.1 KB


In [16]:
# --- Write the cleaned frame to <project_root>/data/bmi_cleaned.csv ---
output_file_path = os.path.join(project_root, 'data', 'bmi_cleaned.csv')
try:
    # index=False: the row index is not written as a column
    df_bmi.to_csv(output_file_path, index=False)
except Exception as e:
    # Failure is logged and the notebook continues
    logger.error(f"‚ùå Error exporting cleaned data: {e}")
else:
    logger.info(f"‚úÖ Cleaned data successfully exported to: {output_file_path}")

[INFO] ‚úÖ Cleaned data successfully exported to: f:\Files\PythonProjects\bmi-project\data\bmi_cleaned.csv
