# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Function to load data with error handling
def load_data(file_path):
    """Load a CSV file into a DataFrame, returning ``None`` on failure.

    ``#`` starts a comment in the file and malformed rows are skipped.
    Leading/trailing whitespace is stripped from the column names.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed ``DataFrame``, or ``None`` when the file is missing,
        cannot be opened, cannot be parsed, or contains no data. In the
        failure case a message describing the problem is printed.
    """
    # Keep the try body to the single call that can raise.
    try:
        df = pd.read_csv(file_path, comment='#', on_bad_lines='skip')
    except FileNotFoundError as e:
        print(f"File not found: {e}")
        return None
    except pd.errors.EmptyDataError as e:
        # Raised for an empty file; it is not a ParserError subclass, so it
        # must be handled explicitly to honour the "None on failure" contract.
        print(f"No data found in the CSV file: {e}")
        return None
    except pd.errors.ParserError as e:
        print(f"Error parsing the CSV file: {e}")
        return None
    except OSError as e:
        # Other I/O problems (e.g. the path is a directory, permission denied).
        print(f"Could not read the file: {e}")
        return None

    df.columns = df.columns.str.strip()  # Strip any spaces from column names
    return df

# Define the file paths (one body-measurement CSV per sex).
male_file_path = "nhanes_adult_male_bmx_2020.csv"
# No space before the extension, consistent with the male file's naming;
# a stray space here makes the lookup fail with "file not found".
female_file_path = "nhanes_adult_female_bmx_2020.csv"

# Load the datasets
male_df = load_data(male_file_path)
female_df = load_data(female_file_path)

# load_data returns None on failure. Abort with a non-zero exit status via
# SystemExit rather than the interactive-only exit() helper, which is not
# guaranteed to exist in scripts and reports success (status 0).
if male_df is None or female_df is None:
    raise SystemExit(
        "Failed to load one or both datasets. "
        "Please check file paths and data format."
    )

# Preview the first rows of each dataset
male_preview = male_df.head()
female_preview = female_df.head()
print("Male DataFrame Head:\n", male_preview)
print("\nFemale DataFrame Head:\n", female_preview)

# NumPy array versions of the two datasets
male, female = male_df.to_numpy(), female_df.to_numpy()

# Define correct column names
weight_col = "BMXWT"  # Correct column for weight
height_col = "BMXHT"  # Correct column for height
waist_col = "BMXWAIST"  # Correct column for waist circumference
hip_col = "BMXHIP"  # Correct column for hip circumference


def _numeric_values(df, column):
    """Return the non-missing numeric entries of ``df[column]`` as an array.

    Entries that cannot be parsed as numbers are coerced to NaN and dropped,
    so the result may be shorter than ``df`` and is not row-aligned with it.
    """
    return pd.to_numeric(df[column], errors='coerce').dropna().to_numpy()


# Weight and height are both mandatory: stop with a non-zero exit status
# (SystemExit, not the interactive-only exit() helper) if either is missing.
for _column, _label in ((weight_col, "Weight"), (height_col, "Height")):
    if _column not in male_df.columns or _column not in female_df.columns:
        raise SystemExit(
            f"{_label} column '{_column}' not found in one of the datasets."
        )

# NOTE: weights and heights are filtered independently, so position i in a
# weight array and in a height array need not belong to the same participant.
male_weights = _numeric_values(male_df, weight_col)
female_weights = _numeric_values(female_df, weight_col)
male_heights = _numeric_values(male_df, height_col) / 100  # Convert cm to meters
female_heights = _numeric_values(female_df, height_col) / 100  # Convert cm to meters

# Check if the arrays are empty (min/max and the statistics below need data)
if male_weights.size == 0 or female_weights.size == 0:
    raise SystemExit("One or both weight arrays are empty. Please check the data.")
if male_heights.size == 0 or female_heights.size == 0:
    raise SystemExit("One or both height arrays are empty. Please check the data.")

# Shared x-axis range so the two histograms can be compared directly
min_weight = min(group.min() for group in (female_weights, male_weights))
max_weight = max(group.max() for group in (female_weights, male_weights))

# One histogram per sex, stacked vertically: (data, bar colour, panel title)
histogram_panels = (
    (female_weights, 'pink', 'Female Weights'),
    (male_weights, 'blue', 'Male Weights'),
)

plt.figure(figsize=(10, 6))
for panel_index, (panel_weights, panel_color, panel_title) in enumerate(histogram_panels, start=1):
    plt.subplot(2, 1, panel_index)
    plt.hist(panel_weights, bins=30, color=panel_color, alpha=0.7)
    plt.title(panel_title)
    plt.xlim(min_weight, max_weight)

plt.tight_layout()
plt.show()

# Boxplot for weights
weight_box_labels = ['Female', 'Male']
plt.figure(figsize=(8, 6))
plt.boxplot([female_weights, male_weights])
# boxplot draws its boxes at positions 1..N by default. Labelling the ticks
# explicitly avoids the `labels=` keyword, which newer Matplotlib releases
# deprecate in favour of `tick_labels=`, and works on old and new versions.
plt.xticks(list(range(1, len(weight_box_labels) + 1)), weight_box_labels)
plt.title('Boxplot of Weights')
plt.ylabel('Weight (kg)')
plt.show()

# Compute statistical metrics
def _weight_summary(values):
    """Return descriptive statistics for an array of weights as a dict.

    Mean, median, standard deviation and variance come from NumPy with its
    default arguments; skewness and kurtosis come from pandas' ``Series``
    methods.
    """
    as_series = pd.Series(values)
    return {
        'mean': np.mean(values),
        'median': np.median(values),
        'std': np.std(values),
        'variance': np.var(values),
        'skewness': as_series.skew(),
        'kurtosis': as_series.kurt(),
    }


female_weight_stats = _weight_summary(female_weights)
male_weight_stats = _weight_summary(male_weights)

print("Female Weight Statistics:", female_weight_stats)
print("Male Weight Statistics:", male_weight_stats)

# Compute BMI
def _weight_height_bmi(df):
    """Return an ``(n, 3)`` array of weight, height in metres and BMI.

    Weight and height are taken from the same rows of *df*: a row is kept
    only when both values are numeric and the height is positive. This keeps
    each BMI tied to one participant; filtering the two columns separately
    would pair values from different rows or produce arrays of different
    lengths.
    """
    paired = df[[weight_col, height_col]].apply(pd.to_numeric, errors='coerce').dropna()
    paired = paired[paired[height_col] > 0]  # avoid division by zero below
    weights = paired[weight_col].to_numpy(dtype=float)
    heights_m = paired[height_col].to_numpy(dtype=float) / 100  # cm to meters
    return np.column_stack((weights, heights_m, weights / heights_m ** 2))


female_measurements = _weight_height_bmi(female_df)
male_measurements = _weight_height_bmi(male_df)
if female_measurements.shape[0] == 0 or male_measurements.shape[0] == 0:
    raise SystemExit(
        "No rows with both weight and height available; cannot compute BMI."
    )

female_bmi = female_measurements[:, 2]
male_bmi = male_measurements[:, 2]

# Standardize the dataset (columns: weight, height, BMI)
zfemale = zscore(female_measurements, nan_policy='omit')
zmale = zscore(male_measurements, nan_policy='omit')

# Select relevant columns for scatterplot matrix (reordered: height, weight, BMI)
zfemale_selected = zfemale[:, [1, 0, 2]]
zmale_selected = zmale[:, [1, 0, 2]]

sns.pairplot(pd.DataFrame(zfemale_selected, columns=['Height', 'Weight', 'BMI']))
plt.show()

# Compute correlation coefficients (female participants; columns as variables)
pearson_corr = np.corrcoef(zfemale_selected, rowvar=False)
spearman_corr = pd.DataFrame(zfemale_selected).corr(method='spearman').to_numpy()

print("Pearson's Correlation Coefficients:\n", pearson_corr)
print("Spearman's Correlation Coefficients:\n", spearman_corr)

# Compute waist-to-height and waist-to-hip ratios if columns exist
def _waist_ratios(df):
    """Return ``(waist_to_height, waist_to_hip)`` Series for *df*.

    Every ratio is computed row by row from the DataFrame columns, so the
    numerator and denominator always belong to the same participant. Waist is
    divided by the height column as stored, not by a height converted to
    metres, so the ratio is dimensionless.
    NOTE(review): this assumes waist is recorded in the same unit as the
    height column (cm) - confirm against the data dictionary.
    Undefined ratios (missing values or division by zero) become NaN.
    """
    waist = pd.to_numeric(df[waist_col], errors='coerce')
    hip = pd.to_numeric(df[hip_col], errors='coerce')
    height = pd.to_numeric(df[height_col], errors='coerce')
    non_finite = [np.inf, -np.inf]
    waist_to_height = (waist / height).replace(non_finite, np.nan)
    waist_to_hip = (waist / hip).replace(non_finite, np.nan)
    return waist_to_height, waist_to_hip


ratio_columns_present = all(
    column in frame.columns
    for frame in (male_df, female_df)
    for column in (waist_col, hip_col)
)

if ratio_columns_present:
    male_waist_to_height, male_waist_to_hip = _waist_ratios(male_df)
    female_waist_to_height, female_waist_to_hip = _waist_ratios(female_df)

    # Boxplot for ratios
    ratio_labels = ['Female Waist-to-Height', 'Female Waist-to-Hip',
                    'Male Waist-to-Height', 'Male Waist-to-Hip']
    plt.figure(figsize=(10, 6))
    plt.boxplot([female_waist_to_height.dropna(), female_waist_to_hip.dropna(),
                 male_waist_to_height.dropna(), male_waist_to_hip.dropna()])
    # Boxes sit at positions 1..N by default. Set the tick labels explicitly
    # rather than through the `labels=` keyword, which newer Matplotlib
    # releases deprecate.
    plt.xticks(list(range(1, len(ratio_labels) + 1)), ratio_labels)
    plt.title('Boxplot of Ratios')
    plt.ylabel('Ratio')
    plt.show()
else:
    print("Waist/Hip columns not found. Skipping ratio analysis.")

# Conclusion: closing summary, assembled line by line and printed once
conclusion_lines = (
    "",
    "**Conclusion:**",
    "",
    "The analysis provides insights into the body measurements of adult males and females. ",
    "- Histograms and boxplots show differences in weight distributions between genders. ",
    "- Correlation coefficients reveal relationships between various body measurements. ",
    "- Ratios like waist-to-height and waist-to-hip provide additional indicators of health risks.",
    "",
)
conclusion = "\n".join(conclusion_lines)
print(conclusion)