In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [9]:
# Load the raw data
df = pd.read_excel('../sample_data.xlsx')

# Normalize column names (the sheet misspells the target column)
df = df.rename(columns={"moratlity_rate": "mortality_rate"})

# Encode the categorical column. Any value other than 'NY'/'CA' maps to NaN
# and is removed together with the other incomplete rows below.
df['address'] = df['address'].map({'NY': 0, 'CA': 1})

# Separate features and target
X = df.drop(columns=['mortality_rate'])
y = df['mortality_rate']

# Coerce object columns to numeric; unparseable entries become NaN
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = pd.to_numeric(X[col], errors='coerce')
# The target gets the same treatment so a stray string cannot reach the model
y = pd.to_numeric(y, errors='coerce')

# Drop rows with a missing value in EITHER the features or the target.
# Filtering both with one mask keeps X and y aligned; dropping on X alone
# would let a NaN target through and make the estimators fail in fit().
complete_rows = X.notna().all(axis=1) & y.notna()
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# Train/test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# Preview the first rows of the prepared frame (column renamed, address already mapped to 0/1)
df.head()

Unnamed: 0,year,age,address,mortality_rate,male_ratio,female_ratio,male_count,female_count,ns1_ratio,ns2_ratio,ns3_ratio,ns4_ratio,sm1_ratio,sm2_ratio,ns1_count,ns2_count,ns3_count,ns4_count,sm1_count,sm2_count
0,2015,10,0,0.0004,0.45,0.55,45,55,0.333,0.333,0.333,0.0,0.0,0.0,30,30,30,0,0,0
1,2015,20,0,0.0003,0.45,0.55,45,55,0.0,0.0,0.333,0.333,0.333,0.333,0,0,30,30,30,30
2,2015,30,0,0.0002,0.45,0.55,45,55,0.2,0.2,0.2,0.2,0.2,0.2,50,50,50,50,50,50
3,2015,40,0,0.00025,0.45,0.55,45,55,0.333,0.333,0.333,0.0,0.0,0.0,30,30,30,0,0,0
4,2015,50,0,0.00031,0.45,0.55,45,55,0.0,0.0,0.333,0.333,0.333,0.333,0,0,30,30,30,30


In [11]:
# Decision tree: train and evaluate
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Predict on the test set and compute the metrics.
# RMSE is taken as sqrt(MSE): the `squared=False` flag of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed, whereas this form
# runs unchanged on every version.
# NOTE(review): a perfect held-out score may indicate rows duplicated across
# the train/test split - confirm.
y_pred_dt = dt.predict(X_test)
rmse_dt = mean_squared_error(y_test, y_pred_dt) ** 0.5
r2_dt = r2_score(y_test, y_pred_dt)
print("Decision Tree - RMSE:", rmse_dt, ", R^2:", r2_dt)

Decision Tree - RMSE: 0.0 , R^2: 1.0


In [14]:
# Random forest: train and evaluate
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# RMSE is taken as sqrt(MSE): the `squared=False` flag of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed, whereas this form
# runs unchanged on every version.
y_pred_rf = rf.predict(X_test)
rmse_rf = mean_squared_error(y_test, y_pred_rf) ** 0.5
r2_rf = r2_score(y_test, y_pred_rf)
print("Random Forest - RMSE:", rmse_rf, ", R^2:", r2_rf)

Random Forest - RMSE: 2.9504296378017462e-05 , R^2: 0.9972974505595682


In [15]:
# XGBoost: train and evaluate
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)

# RMSE is taken as sqrt(MSE): the `squared=False` flag of mean_squared_error
# was deprecated in scikit-learn 1.4 and later removed, whereas this form
# runs unchanged on every version.
y_pred_xgb = xgb.predict(X_test)
rmse_xgb = mean_squared_error(y_test, y_pred_xgb) ** 0.5
r2_xgb = r2_score(y_test, y_pred_xgb)
print("XGBoost - RMSE:", rmse_xgb, ", R^2:", r2_xgb)

XGBoost - RMSE: 0.00020612619967424057 , R^2: 0.8680924918260379


In [None]:
# Side-by-side comparison of the three fitted models
model_scores = [
    ("Decision Tree", rmse_dt, r2_dt),
    ("Random Forest", rmse_rf, r2_rf),
    ("XGBoost", rmse_xgb, r2_xgb),
]
# One row per model, metrics rounded to 6 decimal places
metrics_df = pd.DataFrame(model_scores, columns=["Model", "RMSE", "R^2"]).round(6)
print(metrics_df.to_markdown(index=False))

| Model         |     RMSE |      R^2 |
|:--------------|---------:|---------:|
| Decision Tree | 0        | 1        |
| Random Forest | 3e-05    | 0.997297 |
| XGBoost       | 0.000206 | 0.868092 |


In [21]:
# Extract and print feature importances
def show_importances(model, feature_names, model_name):
    """Print a model's feature importances as a markdown table, largest first.

    Parameters
    ----------
    model : fitted estimator
        Either exposes ``feature_importances_`` or, failing that,
        ``get_booster().get_score(...)``.
    feature_names : sequence of str
        Feature labels in the column order the model was trained on.
    model_name : str
        Label used in the printed header.

    Returns
    -------
    None
        The table is written to stdout.
    """
    if hasattr(model, "feature_importances_"):
        imp = model.feature_importances_
    else:
        # Booster fallback. get_score() keys are the real feature names when
        # the booster was trained with named columns and positional "f0",
        # "f1", ... otherwise, so try both. Features never used in a split
        # are absent from the dict and count as 0.
        imp_dict = model.get_booster().get_score(importance_type='weight')
        imp = np.array([
            imp_dict.get(name, imp_dict.get(f"f{i}", 0))
            for i, name in enumerate(feature_names)
        ])
    imp_series = pd.Series(imp, index=feature_names).sort_values(ascending=False)
    print(f"\n### {model_name} Feature Importances ###")
    print(imp_series.round(4).to_markdown())

# Report importances for each fitted model, in the order they were trained
for fitted_model, model_label in (
    (dt, "Decision Tree"),
    (rf, "Random Forest"),
    (xgb, "XGBoost"),
):
    show_importances(fitted_model, X.columns, model_label)


### Decision Tree Feature Importances ###
|              |      0 |
|:-------------|-------:|
| age          | 0.8371 |
| address      | 0.1512 |
| ns4_ratio    | 0.0076 |
| ns2_count    | 0.0022 |
| ns1_count    | 0.0015 |
| ns3_ratio    | 0.0005 |
| sm1_ratio    | 0      |
| sm1_count    | 0      |
| ns4_count    | 0      |
| ns3_count    | 0      |
| sm2_ratio    | 0      |
| year         | 0      |
| ns2_ratio    | 0      |
| ns1_ratio    | 0      |
| female_count | 0      |
| male_count   | 0      |
| female_ratio | 0      |
| male_ratio   | 0      |
| sm2_count    | 0      |

### Random Forest Feature Importances ###
|              |      0 |
|:-------------|-------:|
| age          | 0.7996 |
| address      | 0.131  |
| ns4_count    | 0.0097 |
| ns2_count    | 0.0083 |
| ns1_ratio    | 0.007  |
| sm1_count    | 0.0065 |
| ns3_count    | 0.0062 |
| year         | 0.0056 |
| ns2_ratio    | 0.0047 |
| sm1_ratio    | 0.0045 |
| ns4_ratio    | 0.0041 |
| ns1_count    | 0.0039 |
| ns

In [22]:
# Forecast mortality for the next 30 years with the selected model (XGBoost here).
# NOTE(review): in the comparison table XGBoost has the lowest test R^2 of the
# three models - confirm that choosing it as the "best" model is intentional.
best_model = xgb

future_years = list(range(2026, 2056))
age_groups   = [30, 40, 50, 60, 70, 80]
# Scenario values to be supplied by the user
address       = 1     # CA
male_ratio    = 0.47
female_ratio  = 0.53
ns_ratios     = [0.25, 0.25, 0.25, 0.25]
sm_ratios     = [0.15, 0.15]
male_count    = 4700
female_count  = 5300
ns_counts     = [2000, 2000, 2000, 2000]
sm_counts     = [1500, 1500]

# Build one input row per (year, age) combination; every other feature is
# held at the scenario value above.
future_rows = [
    {
        'year': year,
        'age': age,
        'address': address,
        'male_ratio': male_ratio,
        'female_ratio': female_ratio,
        'male_count': male_count,
        'female_count': female_count,
        'ns1_ratio': ns_ratios[0], 'ns2_ratio': ns_ratios[1],
        'ns3_ratio': ns_ratios[2], 'ns4_ratio': ns_ratios[3],
        'sm1_ratio': sm_ratios[0], 'sm2_ratio': sm_ratios[1],
        'ns1_count': ns_counts[0], 'ns2_count': ns_counts[1],
        'ns3_count': ns_counts[2], 'ns4_count': ns_counts[3],
        'sm1_count': sm_counts[0], 'sm2_count': sm_counts[1],
    }
    for year in future_years
    for age in age_groups
]

future_df = pd.DataFrame(future_rows)

# Select the features in exactly the training column order, so the prediction
# does not depend on the order in which the scenario dict lists its keys and a
# missing field raises a KeyError here rather than inside the model.
future_features = future_df[list(X_train.columns)]
# NOTE(review): the recorded forecast is identical for every year; tree-based
# models cannot extrapolate beyond the training range of `year` - confirm this
# model is suitable for a 30-year projection.
future_df['predicted_mortality'] = best_model.predict(future_features)

# Pivot to a year x age table for readability
pivot = future_df.pivot(index='year', columns='age', values='predicted_mortality').round(6)
print("\n## 2026–2055년 XGBoost 예측 사망률 ##")
print(pivot.to_markdown())


## 2026–2055년 XGBoost 예측 사망률 ##
|   year |       30 |       40 |       50 |       60 |      70 |       80 |
|-------:|---------:|---------:|---------:|---------:|--------:|---------:|
|   2026 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 0.00064 | 0.000788 |
|   2027 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 0.00064 | 0.000788 |
|   2028 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 0.00064 | 0.000788 |
|   2029 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 0.00064 | 0.000788 |
|   2030 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 0.00064 | 0.000788 |
|   2031 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 0.00064 | 0.000788 |
|   2032 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 0.00064 | 0.000788 |
|   2033 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 0.00064 | 0.000788 |
|   2034 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 0.00064 | 0.000788 |
|   2035 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 0.00064 | 0.000788 |
|   2036 | 0.000405 | 0.000405 | 0.000405 | 0.000405 | 

In [20]:
# Inspect the raw XGBoost predictions for the test set
y_pred_xgb

array([0.00040457, 0.00160118, 0.00040457, 0.00040457, 0.00113654,
       0.00064024, 0.00040457, 0.00040457, 0.00040457, 0.00040457,
       0.00040457, 0.00040457, 0.00040457, 0.00040457, 0.00113654,
       0.00040457, 0.00040457, 0.00160118, 0.00040457, 0.00040457,
       0.00078776], dtype=float32)