# Obesity Level Analysis

## Data Import

In [1]:
import pandas as pd

# Location of the obesity-levels CSV inside the Kaggle input directory
DATA_PATH = '/kaggle/input/estimation-of-obesity-levels-uci-dataset/ObesityDataSet_raw_and_data_sinthetic.csv'

# Read the raw dataset into a DataFrame
df = pd.read_csv(DATA_PATH)

# Preview the first rows of the loaded frame
df.head()

## Data Cleaning

In [2]:
def _unwrap_bytes_repr(value):
    """Remove a textual bytes-literal wrapper from a single cell value.

    Turns ``"b'Male'"`` into ``"Male"`` and ``"'Male'"`` into ``"Male"``.
    Only a matching wrapper is removed: unlike ``str.strip("b'")``, which
    strips any run of the characters ``b`` and ``'`` from both ends, a value
    such as ``"bob"`` is left intact.

    Args:
        value: Any cell value; non-strings are passed through untouched.

    Returns:
        The unwrapped string, or ``value`` unchanged when there is no wrapper.
    """
    if not isinstance(value, str):
        return value
    if len(value) >= 3 and value.startswith("b'") and value.endswith("'"):
        return value[2:-1]
    if len(value) >= 2 and value.startswith("'") and value.endswith("'"):
        return value[1:-1]
    return value


# Remove byte prefixes and decode values properly.
# Series.map is applied column by column instead of DataFrame.applymap,
# which is deprecated since pandas 2.1.
df = df.apply(lambda column: column.map(_unwrap_bytes_repr))

# Check for duplicates
if df.duplicated().sum() > 0:
    df = df.drop_duplicates()

# Check for missing values and fill them.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill'); rebinding
# instead of inplace=True keeps the cell free of in-place mutation.
# Note: forward fill cannot fill missing values in the very first row.
missing_values = df.isnull().sum()
if missing_values.any():
    df = df.ffill()

# Display the cleaned data
df.head()

## Feature Engineering

In [3]:
def categorize_weight(row):
    """Return the BMI category label for a single record.

    BMI is computed as ``row['Weight'] / row['Height'] ** 2``
    (presumably kilograms and metres -- confirm against the dataset docs).

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'Weight' and 'Height' entries.

    Returns:
        The label of the first band whose upper bound the BMI falls strictly
        below: 'Underweight', 'Normal', 'Overweight', 'Obesity I' or
        'Obesity II'; otherwise 'Obesity III'.
    """
    # (exclusive upper BMI bound, label) pairs in ascending order
    bmi_bands = (
        (18.5, 'Underweight'),
        (25, 'Normal'),
        (30, 'Overweight'),
        (35, 'Obesity I'),
        (40, 'Obesity II'),
    )

    body_mass_index = row['Weight'] / (row['Height'] ** 2)
    for upper_bound, label in bmi_bands:
        if body_mass_index < upper_bound:
            return label

    # BMI of 40 or more, or a value that compares below no bound (e.g. NaN)
    return 'Obesity III'

# Compute the BMI-derived label for every row, then attach it as a new column
weight_labels = df.apply(categorize_weight, axis=1)
df['Weight_Category'] = weight_labels

# Preview the frame to verify the new feature is present
df.head()

## Exploratory Data Analysis

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix
# Correlation is only defined for numeric columns. The frame also holds text
# columns (at least Weight_Category), and since pandas 2.0 DataFrame.corr()
# no longer drops those silently but raises instead. Selecting the numeric
# columns explicitly works on old and new pandas versions alike.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

## Visualizations

In [5]:
# Distribution plots
# sns.displot is a figure-level function: it creates its own figure, sized
# through height/aspect, and returns a FacetGrid. Calling plt.figure() first
# would only leave an extra, empty figure behind on every iteration, so the
# title is set through the returned grid instead.
for column in ['Age', 'Height', 'Weight']:
    grid = sns.displot(df, x=column, kind='kde', hue='Weight_Category', height=6, aspect=1.5)
    grid.set(title=f'Distribution of {column}')
    plt.show()

In [6]:
# Count plots of the categorical features, split by weight category
categorical_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'MTRANS']
for col in categorical_cols:
    # One dedicated figure/axes pair per feature
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=col, hue='Weight_Category', ax=ax)
    ax.set_title(f'Count Plot of {col}')
    # Tilt the category labels on the x axis
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.show()