# NHANES 2017-2018 データ探索

## 目的
- NHANESデータの基本統計量を把握
- 欠損値パターンを分析
- 重要変数の分布を確認
- 異常値の検出

## データ概要
- 参加者数: 9,254名
- データソース: NHANES 2017-2018
- 評価項目: 心血管、代謝、腎、肝、血液マーカー

In [None]:
# ライブラリのインポート
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from pathlib import Path
import sys

# Add the project root (the parent of the current working directory) to the
# import path so the `src` package imported below can be resolved.
# The membership check keeps re-runs of this cell from stacking duplicate
# entries on sys.path.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src.data.loader import NHANESLoader
from src.data.preprocessor import NHANESPreprocessor
from src.data.validator import DataValidator

# Figure style. This is applied first on purpose: seaborn's style presets
# also assign 'font.sans-serif', so calling set_style() after the font setup
# would overwrite the font list configured below.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Japanese-capable font settings (set after the seaborn style so they persist)
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# pandas display settings
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

## 1. データの読み込み

In [None]:
# Initialise the data loader on the raw-data directory
data_dir = project_root / 'data' / 'raw'
loader = NHANESLoader(data_dir)

# List the CSV files in a stable, sorted order. glob() yields entries in
# arbitrary filesystem order, and the following cell loads csv_files[0], so
# without sorting the loaded file could differ between machines or runs.
csv_files = sorted(data_dir.glob('*.csv'))
print(f"Found {len(csv_files)} CSV files:")
for f in csv_files:
    print(f"  - {f.name}")

In [None]:
# Load the first CSV file from the listing; when the listing is empty only a
# hint is printed and df_raw is left undefined.
if not csv_files:
    print("No CSV files found. Please place data files in data/raw/")
else:
    df_raw = loader.load_csv(csv_files[0].name)
    print(
        f"\nLoaded data shape: {df_raw.shape}",
        f"Columns: {df_raw.columns.tolist()}",
        sep="\n",
    )

## 2. 基本統計量

In [None]:
# Print the dataset information returned by the loader
if 'df_raw' in locals():
    info = loader.get_data_info(df_raw)

    # Banner, then the row / column / memory figures
    print("=" * 60, "DATA INFORMATION", "=" * 60, sep="\n")
    print(
        f"Number of rows: {info['n_rows']:,}",
        f"Number of columns: {info['n_columns']}",
        f"Memory usage: {info['memory_usage_mb']:.2f} MB",
        sep="\n",
    )

    # Frequency of each value found in info['dtypes']
    print("\nData types:")
    dtype_counts = pd.Series(info['dtypes']).value_counts()
    print(dtype_counts)

In [None]:
# Summary statistics of the raw data.
# The table is printed explicitly: a bare expression nested inside an `if`
# block is not echoed by the notebook, so the result would be discarded.
if 'df_raw' in locals():
    print(df_raw.describe())

## 3. 欠損値分析

In [None]:
# Missing-value count and percentage per column (columns without gaps are dropped)
if 'df_raw' in locals():
    missing_data = (
        df_raw.isnull().sum().to_frame('Missing_Count')
        .assign(
            Missing_Percentage=lambda frame: (frame['Missing_Count'] / len(df_raw) * 100).round(2)
        )
        .loc[lambda frame: frame['Missing_Count'] > 0]
        .sort_values('Missing_Percentage', ascending=False)
    )

    print("\n" + "=" * 60, "MISSING DATA SUMMARY", "=" * 60, sep="\n")
    print(missing_data)

    # Horizontal bar chart, drawn only when at least one column has missing values
    if not missing_data.empty:
        plt.figure(figsize=(12, 6))
        missing_data['Missing_Percentage'].plot.barh(color='salmon')
        plt.xlabel('Missing Percentage (%)')
        plt.title('Missing Data by Variable')
        plt.tight_layout()
        plt.show()

In [None]:
# Visualise the missing-data pattern with missingno's matrix plot.
# NOTE(review): plt.title() applies to whichever axes is current once
# msno.matrix() returns — presumably the main matrix axes; confirm the title
# lands where intended before reordering these calls.
if 'df_raw' in locals():
    msno.matrix(df_raw, figsize=(14, 8), fontsize=10)
    plt.title('Missing Data Pattern')
    plt.tight_layout()
    plt.show()

## 4. データ前処理

In [None]:
# Run the preprocessing step and report how the data changed
if 'df_raw' in locals():
    preprocessor = NHANESPreprocessor()
    df_processed = preprocessor.preprocess(df_raw)

    print("\n" + "=" * 60, "PREPROCESSING SUMMARY", "=" * 60, sep="\n")

    # Shapes before and after, then the names of the added columns
    summary = preprocessor.get_preprocessing_summary(df_raw, df_processed)
    print(
        f"Original shape: {summary['original_shape']}",
        f"Processed shape: {summary['processed_shape']}",
        sep="\n",
    )
    added_columns = summary['new_columns']
    print(f"\nNew columns added: {len(added_columns)}")
    for col in added_columns:
        print(f"  - {col}")

## 5. 人口統計的特徴

In [None]:
# Age distribution: histogram with a median marker, plus counts per age group
if 'df_processed' in locals() and 'RIDAGEYR' in df_processed.columns:
    age_years = df_processed['RIDAGEYR']
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, group_ax = axes

    # Left panel: histogram of the non-missing ages with the median marked
    hist_ax.hist(age_years.dropna(), bins=50, color='steelblue', edgecolor='black')
    hist_ax.set(xlabel='Age (years)', ylabel='Frequency', title='Age Distribution')
    hist_ax.axvline(age_years.median(), color='red', linestyle='--', label='Median')
    hist_ax.legend()

    # Right panel: filled only when an 'age_group' column is present
    if 'age_group' in df_processed.columns:
        age_group_counts = df_processed['age_group'].value_counts().sort_index()
        bar_positions = range(len(age_group_counts))
        group_ax.bar(bar_positions, age_group_counts.values, color='lightgreen', edgecolor='black')
        group_ax.set_xticks(bar_positions)
        group_ax.set_xticklabels(age_group_counts.index, rotation=45)
        group_ax.set(ylabel='Count', title='Age Group Distribution')

    plt.tight_layout()
    plt.show()

    print("\nAge statistics:")
    print(age_years.describe())

In [None]:
# Gender distribution as a pie chart, followed by the underlying counts
if 'df_processed' in locals() and 'gender_label' in df_processed.columns:
    gender_counts = df_processed['gender_label'].value_counts()

    pie_options = dict(autopct='%1.1f%%', colors=['lightblue', 'lightpink'], startangle=90)
    plt.figure(figsize=(8, 6))
    plt.pie(gender_counts.values, labels=gender_counts.index, **pie_options)
    plt.title('Gender Distribution')
    plt.axis('equal')
    plt.show()

    print("\nGender counts:", gender_counts, sep="\n")

## 6. 重要変数の分布

In [None]:
# Lipid profile: one histogram per lipid variable present in the data
if 'df_processed' in locals():
    lipid_vars = ['LBXTC', 'LBDHDD', 'LBDLDL', 'LBXTR']
    lipid_labels = ['Total Cholesterol', 'HDL', 'LDL', 'Triglycerides']
    # Labels are looked up by variable name. Pairing the filtered variables
    # with a prefix of the label list would mislabel every panel that comes
    # after the first missing column.
    lipid_label_by_var = dict(zip(lipid_vars, lipid_labels))

    existing_lipid_vars = [v for v in lipid_vars if v in df_processed.columns]

    if existing_lipid_vars:
        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        axes = axes.ravel()

        for ax, var in zip(axes, existing_lipid_vars):
            label = lipid_label_by_var[var]
            data = df_processed[var].dropna()
            mean_value = data.mean()
            median_value = data.median()

            ax.hist(data, bins=50, color='steelblue', alpha=0.7, edgecolor='black')
            ax.axvline(mean_value, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_value:.1f}')
            ax.axvline(median_value, color='green', linestyle='--', linewidth=2, label=f'Median: {median_value:.1f}')
            ax.set_xlabel(f'{label} (mg/dL)')
            ax.set_ylabel('Frequency')
            ax.set_title(f'{label} Distribution')
            ax.legend()

        # Hide the panels of the 2x2 grid that have no variable to show
        for unused_ax in axes[len(existing_lipid_vars):]:
            unused_ax.set_visible(False)

        plt.tight_layout()
        plt.show()

In [None]:
# Metabolic markers: one histogram per marker present in the data
if 'df_processed' in locals():
    metabolic_vars = ['LBXGLU', 'LBXGH', 'LBXIN']
    metabolic_labels = ['Glucose', 'HbA1c', 'Insulin']
    # Labels are looked up by variable name. Pairing the filtered variables
    # with a prefix of the label list would mislabel every panel that comes
    # after the first missing column.
    metabolic_label_by_var = dict(zip(metabolic_vars, metabolic_labels))

    existing_metabolic_vars = [v for v in metabolic_vars if v in df_processed.columns]

    if existing_metabolic_vars:
        n_vars = len(existing_metabolic_vars)
        # squeeze=False always returns a 2-D array of axes, so the single-panel
        # case needs no special handling after flattening.
        fig, axes = plt.subplots(1, n_vars, figsize=(6*n_vars, 5), squeeze=False)
        axes = axes.ravel()

        for ax, var in zip(axes, existing_metabolic_vars):
            label = metabolic_label_by_var[var]
            data = df_processed[var].dropna()
            mean_value = data.mean()

            ax.hist(data, bins=50, color='orange', alpha=0.7, edgecolor='black')
            ax.axvline(mean_value, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_value:.1f}')
            ax.set_xlabel(label)
            ax.set_ylabel('Frequency')
            ax.set_title(f'{label} Distribution')
            ax.legend()

        plt.tight_layout()
        plt.show()

## 7. 変数間相関分析

In [None]:
# Pairwise correlation between the lipid markers present in the data
if 'df_processed' in locals():
    lipid_vars = ['LBXTC', 'LBDHDD', 'LBDLDL', 'LBXTR']
    existing_lipid_vars = [name for name in lipid_vars if name in df_processed.columns]

    # A correlation matrix is only drawn for two or more variables
    if len(existing_lipid_vars) >= 2:
        correlation_matrix = df_processed[existing_lipid_vars].corr()

        heatmap_options = dict(
            annot=True,
            cmap='coolwarm',
            center=0,
            square=True,
            linewidths=1,
            cbar_kws={"shrink": 0.8},
        )
        plt.figure(figsize=(10, 8))
        sns.heatmap(correlation_matrix, **heatmap_options)
        plt.title('Lipid Profile Correlation Matrix')
        plt.tight_layout()
        plt.show()

## 8. データ検証

In [None]:
# Validate the processed data and print the validator's report
if 'df_processed' in locals():
    validator = DataValidator()
    validation_results = validator.validate(df_processed)

    validation_report = validator.generate_report()
    print(validation_report)

## 9. データ保存

In [None]:
# Save the preprocessed data as CSV under <project_root>/data/processed
if 'df_processed' in locals():
    processed_dir = project_root / 'data' / 'processed'
    # parents=True also creates a missing 'data' directory instead of raising
    # FileNotFoundError; exist_ok=True keeps re-runs of this cell harmless.
    processed_dir.mkdir(parents=True, exist_ok=True)

    output_path = processed_dir / 'nhanes_processed.csv'
    df_processed.to_csv(output_path, index=False)
    print(f"\nProcessed data saved to: {output_path}")
    print(f"Shape: {df_processed.shape}")

## まとめ

このノートブックでは:
1. NHANESデータの読み込みと基本統計量の確認
2. 欠損値パターンの分析
3. データの前処理（欠損値処理、外れ値処理、派生変数生成）
4. 人口統計的特徴の可視化
5. 重要変数の分布確認
6. 変数間相関の分析
7. データ品質の検証
8. 前処理済みデータの保存

を実行しました。

次のステップ:
- 02_feature_engineering.ipynb: 追加の特徴量エンジニアリング
- 03_risk_model_development.ipynb: リスクスコアモデルの開発とテスト