In [40]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [41]:
def load_data(file_path):
    """Read the comma-separated file at ``file_path`` into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with the first line of the file used as the header.
    """
    frame = pd.read_csv(file_path, delimiter=",")
    return frame


# Load the obesity dataset from the CSV file in the current working directory
df = load_data("ObesityDataSet.csv")

# Print the column / non-null count / dtype summary, then show the first rows.
# df.head() is the last expression of the cell, so the notebook renders it as a table.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [42]:
def compute_statistics(df):
    """Summarize the numerical and categorical columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to describe. Columns with a numeric dtype are summarized as
        numerical features; every other column is summarized as categorical.

    Returns
    -------
    dict
        ``'numerical'``: DataFrame indexed by column name with the columns
        ``mean``, ``median``, ``min``, ``max``, ``std``, ``5th_percentile``,
        ``95th_percentile`` and ``missing_values``.

        ``'categorical'``: DataFrame indexed by column name with the columns
        ``unique_classes``, ``missing_values`` and ``class_proportions``.
        Each ``class_proportions`` cell holds a dict mapping a class label to
        its relative frequency, as returned by
        ``value_counts(normalize=True)``.
    """
    stats = {}

    num_features = df.select_dtypes(include=[np.number])
    stats['numerical'] = pd.DataFrame({
        'mean': num_features.mean(),
        'median': num_features.median(),
        'min': num_features.min(),
        'max': num_features.max(),
        'std': num_features.std(),
        '5th_percentile': num_features.quantile(0.05),
        '95th_percentile': num_features.quantile(0.95),
        'missing_values': num_features.isnull().sum()
    })

    cat_features = df.select_dtypes(exclude=[np.number])

    # Build the per-feature proportion dicts as an explicit object-dtype Series
    # indexed by column name. Assigning a plain dict to a DataFrame column
    # depends on pandas' implicit dict-to-column coercion; an explicit Series
    # guarantees one dict per feature row regardless of that coercion.
    class_proportions = pd.Series(
        [cat_features[col].value_counts(normalize=True).to_dict() for col in cat_features.columns],
        index=cat_features.columns,
        dtype=object,
    )

    stats['categorical'] = pd.DataFrame({
        'unique_classes': cat_features.nunique(),
        'missing_values': cat_features.isnull().sum(),
        'class_proportions': class_proportions,
    })

    return stats


# Compute the summary statistics once; the result is reused by the categorical display cell
stats = compute_statistics(df)


def save_statistics(stats, num_output_file="numerical_statistics.csv", cat_output_file="categorical_statistics.txt"):
    """Write the statistics produced by ``compute_statistics`` to disk.

    Parameters
    ----------
    stats : dict
        Mapping with a ``'numerical'`` DataFrame and a ``'categorical'``
        DataFrame. The categorical frame must provide the columns
        ``unique_classes``, ``missing_values`` and ``class_proportions``
        (one dict of ``label -> proportion`` per feature).
    num_output_file : str
        Destination of the numerical statistics, written as CSV.
    cat_output_file : str
        Destination of the categorical statistics, written as a plain-text
        report with one section per feature.

    Returns
    -------
    None
    """
    stats['numerical'].to_csv(num_output_file)

    categorical = stats['categorical']
    # Looked up once: the column does not change while the report is written.
    proportions_by_feature = categorical['class_proportions']

    # Explicit UTF-8 so class labels containing non-ASCII characters are written
    # identically on every platform instead of using the locale's default encoding.
    with open(cat_output_file, "w", encoding="utf-8") as f:
        f.write("Categorical Statistics\n")
        f.write("====================\n\n")

        for col, data in categorical.iterrows():
            f.write(f"Feature: {col}\n")
            f.write(f"Unique Classes: {data['unique_classes']}\n")
            f.write(f"Missing Values: {data['missing_values']}\n")
            f.write("Class Proportions:\n")
            # A feature without an entry simply gets an empty proportions section.
            for class_label, proportion in proportions_by_feature.get(col, {}).items():
                f.write(f"  {class_label}: {proportion:.4f}\n")
            f.write("\n")

Unnamed: 0,mean,median,min,max,std,5th_percentile,95th_percentile,missing_values
Age,24.3126,22.77789,14.0,61.0,6.345968,17.891428,38.09807,0
Height,1.701677,1.700499,1.45,1.98,0.093305,1.548291,1.85,0
Weight,86.586058,83.0,39.0,173.0,26.191172,48.5,131.916152,0
FCVC,2.419043,2.385502,1.0,3.0,0.533927,1.523214,3.0,0
NCP,2.685628,3.0,1.0,4.0,0.778039,1.0,3.750881,0
CH2O,2.008011,2.0,1.0,3.0,0.612953,1.0,3.0,0
FAF,1.010298,1.0,0.0,3.0,0.850592,0.0,2.677133,0
TUE,0.657866,0.62535,0.0,2.0,0.608927,0.0,2.0,0


In [43]:
# Print a text summary for every categorical feature
categorical_stats = stats['categorical']
for col, data in categorical_stats.iterrows():
    # Fixed header lines for this feature
    summary_lines = [
        f"Feature: {col}",
        f"Unique Classes: {data['unique_classes']}",
        f"Missing Values: {data['missing_values']}",
        "Class Proportions:",
    ]
    # One indented line per class label, in the stored order
    class_shares = categorical_stats['class_proportions'].get(col, {})
    summary_lines.extend(
        f"  {class_label}: {proportion:.4f}" for class_label, proportion in class_shares.items()
    )
    print("\n".join(summary_lines))
    # print("\n") emits two newlines, leaving a blank gap between features
    print("\n")


Feature: Gender
Unique Classes: 2
Missing Values: 0
Class Proportions:
  Male: 0.5059
  Female: 0.4941


Feature: family_history_with_overweight
Unique Classes: 2
Missing Values: 0
Class Proportions:
  yes: 0.8176
  no: 0.1824


Feature: FAVC
Unique Classes: 2
Missing Values: 0
Class Proportions:
  yes: 0.8839
  no: 0.1161


Feature: CAEC
Unique Classes: 4
Missing Values: 0
Class Proportions:
  Sometimes: 0.8361
  Frequently: 0.1146
  Always: 0.0251
  no: 0.0242


Feature: SMOKE
Unique Classes: 2
Missing Values: 0
Class Proportions:
  no: 0.9792
  yes: 0.0208


Feature: SCC
Unique Classes: 2
Missing Values: 0
Class Proportions:
  no: 0.9545
  yes: 0.0455


Feature: CALC
Unique Classes: 4
Missing Values: 0
Class Proportions:
  Sometimes: 0.6637
  no: 0.3027
  Frequently: 0.0332
  Always: 0.0005


Feature: MTRANS
Unique Classes: 5
Missing Values: 0
Class Proportions:
  Public_Transportation: 0.7485
  Automobile: 0.2165
  Walking: 0.0265
  Motorbike: 0.0052
  Bike: 0.0033


Feature: NObey

In [37]:
def create_folders():
    """Create the output folders for the generated plots in the working directory.

    One folder is created per plot family. Folders that already exist are left
    untouched, so the function can be called repeatedly.

    Raises
    ------
    FileExistsError
        If one of the folder names already exists as a non-directory.
    """
    plot_folders = ['boxplots', 'violinplots', 'error_bars', 'histograms', 'conditional_histograms', 'regressions',
                    'heatmaps']
    for folder in plot_folders:
        # exist_ok=True replaces the separate exists() check, which could race
        # with another process creating the folder between check and creation.
        os.makedirs(folder, exist_ok=True)

In [38]:
def _finalize_plot(title, output_path, rotate_xticks=False):
    """Title, lay out, save and close the current pyplot figure."""
    if rotate_xticks:
        plt.xticks(rotation=45, ha='right')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(output_path, bbox_inches='tight')
    # Close explicitly: many figures are created in loops and would otherwise pile up in memory.
    plt.close()


def visualize_data(df):
    """Generate exploratory plots for ``df`` and save them as PNG files.

    The output folders are created first (via ``create_folders``). The
    following files are then written relative to the working directory:

    * ``histograms/`` - one histogram per numerical column.
    * ``boxplots/``, ``violinplots/``, ``error_bars/``,
      ``conditional_histograms/`` - one plot per
      (categorical column, numerical column) pair.
    * ``heatmaps/correlation_heatmap.png`` - correlation matrix of the
      numerical columns.
    * ``regressions/`` - one regression plot per ordered pair of distinct
      numerical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. Columns with a numeric dtype are treated as numerical,
        all others as categorical.

    Returns
    -------
    None
    """
    create_folders()

    categorical_cols = df.select_dtypes(exclude=[np.number]).columns
    numerical_cols = df.select_dtypes(include=[np.number]).columns

    # Histogram - basic. It depends only on the numerical column, so it is drawn
    # once per column here instead of being re-rendered (and overwritten) for
    # every categorical column.
    for num_col in numerical_cols:
        plt.figure(figsize=(12, 6))
        sns.histplot(df[num_col], kde=True, bins=50)
        _finalize_plot(f'Histogram of {num_col}', f'histograms/histogram_{num_col}.png')

    for cat_col in categorical_cols:
        for num_col in numerical_cols:
            # Boxplot
            plt.figure(figsize=(12, 6))
            sns.boxplot(x=df[cat_col], y=df[num_col])
            _finalize_plot(f'Boxplot of {num_col} by {cat_col}',
                           f'boxplots/boxplot_{num_col}_by_{cat_col}.png',
                           rotate_xticks=True)

            # Violinplot
            plt.figure(figsize=(12, 6))
            sns.violinplot(x=df[cat_col], y=df[num_col])
            _finalize_plot(f'Violinplot of {num_col} by {cat_col}',
                           f'violinplots/violinplot_{num_col}_by_{cat_col}.png',
                           rotate_xticks=True)

            # Error bars: per-class point estimate with a 95% confidence interval
            plt.figure(figsize=(12, 6))
            sns.pointplot(data=df, x=cat_col, y=num_col, errorbar=("ci", 95), capsize=0.2,
                          linestyle="none")
            _finalize_plot(f'Error Bars of {num_col} by {cat_col}',
                           f'error_bars/error_bars_{num_col}_by_{cat_col}.png',
                           rotate_xticks=True)

            # Histogram with hue (conditional histogram), stacked per class
            plt.figure(figsize=(12, 6))
            sns.histplot(df, x=num_col, hue=cat_col, multiple="stack", kde=True, bins=30)
            _finalize_plot(f'Conditional Histogram of {num_col} by {cat_col}',
                           f'conditional_histograms/conditional_histogram_{num_col}_by_{cat_col}.png')

    # Correlation heatmap over the numerical columns
    num_features = df.select_dtypes(include=[np.number])
    correlation_matrix = num_features.corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    _finalize_plot('Correlation Heatmap', 'heatmaps/correlation_heatmap.png')

    # Regression analysis: both (x, y) and (y, x) orderings are plotted
    for num_col_1 in numerical_cols:
        for num_col_2 in numerical_cols:
            if num_col_1 == num_col_2:
                continue
            plt.figure(figsize=(12, 6))
            sns.regplot(x=df[num_col_1], y=df[num_col_2], scatter_kws={'s': 20}, line_kws={'color': 'red'})
            plt.xlabel(num_col_1)
            plt.ylabel(num_col_2)
            _finalize_plot(f'Regression between {num_col_1} and {num_col_2}',
                           f'regressions/regression_{num_col_1}_vs_{num_col_2}.png')

In [39]:
# Generate every plot for the loaded dataset; the PNG files are written into the plot folders
visualize_data(df)