<a href="https://colab.research.google.com/github/shwoa/Capstone/blob/main/CatBoost(GWAS%2C10000).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# Colab-only step: prompt for the input files through the browser upload dialog.
# The later cells open these files by bare filename, so they are expected to end
# up in the current working directory (the recorded output shows them being saved there).
from google.colab import files
uploaded = files.upload()

Saving 2016년 표현형 데이터.xlsx to 2016년 표현형 데이터.xlsx
Saving GWAS_SNP(10000).csv to GWAS_SNP(10000).csv
Saving 유전형인코딩_최빈값.csv to 유전형인코딩_최빈값.csv


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import os
import warnings
# NOTE(review): this silences every Python warning for the rest of the session,
# not just noisy ones — consider narrowing the filter to a category or module.
warnings.filterwarnings("ignore")

# 📁 Input file paths (bare filenames, resolved against the working directory)
snp_path = "GWAS_SNP(10000).csv"
geno_path = "유전형인코딩_최빈값.csv"
pheno_path = "2016년 표현형 데이터.xlsx"

In [None]:
# ✅ Load the three input tables
# top_snp_df: GWAS SNP list; the training cell reads its "Trait" and "SNP" columns.
top_snp_df = pd.read_csv(snp_path)
# geno_df: first CSV column becomes the index. The training cell looks SNP IDs up
# in this index and transposes, i.e. it treats rows as SNPs and columns as samples.
geno_df = pd.read_csv(geno_path, index_col=0)
# pheno_df: first column becomes the index. The training cell selects rows by the
# genotype sample IDs and picks one trait column at a time.
pheno_df = pd.read_excel(pheno_path, index_col=0)

In [None]:
# 💡 Map each GWAS trait name to its phenotype column header (must match pheno_df).
# Every header is the trait name followed by its unit in parentheses, so the
# mapping is generated from (name, unit) pairs.
trait_name_map = {
    name: f"{name} ({unit})"
    for name, unit in (
        ("과중", "g"),
        ("과장", "mm"),
        ("과폭", "mm"),
        ("과피두께", "mm"),
        ("당도", "%"),
        ("과실경도", "kg"),
    )
}

# 📂 Make sure the model output folder exists (no error if it already does)
os.makedirs("saved_models_catboost_gwas", exist_ok=True)

In [None]:
# 🎯 Traits to model: one CatBoost regressor is trained per distinct "Trait" value.
# Each iteration appends {"표현형", "MSE", "R²"} to `results` and writes the fitted
# model to saved_models_catboost_gwas/<trait>_catboost_model.cbm.
traits = top_snp_df["Trait"].unique()
results = []

for trait in traits:
    try:
        # SNPs listed for this trait, restricted to those present as rows of geno_df.
        top_snps = top_snp_df[top_snp_df["Trait"] == trait]["SNP"].tolist()
        top_snps = [snp for snp in top_snps if snp in geno_df.index]
        if len(top_snps) == 0:
            print(f"❌ {trait} - 사용 가능한 SNP 없음")
            continue

        # geno_df is indexed by SNP; transpose so rows are samples and columns are SNP features.
        X = geno_df.loc[top_snps].T

        # Translate the GWAS trait name into the phenotype column header.
        trait_col = trait_name_map.get(trait, trait)
        if trait_col not in pheno_df.columns:
            print(f"❌ {trait} → '{trait_col}' 표현형 데이터에 없음")
            continue

        # Keep only samples that have both a genotype and a measured phenotype.
        # A plain .loc[X.index] raises KeyError as soon as one genotyped sample is
        # absent from pheno_df, and missing target values would otherwise reach the
        # fit and the metrics; reindex + dropna handles both cases per sample
        # instead of losing the whole trait.
        y = pheno_df[trait_col].reindex(X.index).dropna()
        n_dropped = len(X) - len(y)
        if n_dropped:
            print(f"ℹ️ {trait} - 표현형 값이 없는 샘플 {n_dropped}개 제외")
        if len(y) < 2:
            # train_test_split needs at least one training and one test sample.
            print(f"❌ {trait} - 표현형 값이 있는 샘플 부족 ({len(y)}개)")
            continue
        X = X.loc[y.index]

        # 🔀 Train/validation split (80/20, fixed seed for reproducibility)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # 🧠 Fit CatBoost with default hyper-parameters
        model = CatBoostRegressor(verbose=0, random_seed=42)
        model.fit(X_train, y_train)

        # Save the fitted model; the prediction cell recovers the trait from this filename.
        model_path = f"saved_models_catboost_gwas/{trait}_catboost_model.cbm"
        model.save_model(model_path)

        # 📈 Evaluate on the held-out split
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results.append({"표현형": trait, "MSE": mse, "R²": r2})
    except Exception as e:
        # Best effort: report the failure and continue with the remaining traits.
        print(f"⚠️ {trait} 예측 중 오류 발생: {e}")

In [None]:
# 📋 Gather the per-trait metrics collected by the training loop into one table
results_df = pd.DataFrame(data=results)

# Show the table in the notebook under a short heading
print("✅ 모델 성능 요약:")
display(results_df)

# Persist the summary; the positional row index is not written
results_df.to_csv(
    "CatBoost_GWAS_모델성능.csv",
    index=False,
)

✅ 모델 성능 요약:


Unnamed: 0,표현형,MSE,R²
0,과중,1117.657774,0.543886
1,과장,45.050793,0.519385
2,과폭,109.521656,0.561617
3,과피두께,0.570648,0.596555
4,과실경도,0.008641,0.31216
5,당도,0.547497,0.670605


In [None]:
# ✅ IDs of the first 10 samples (geno_df columns are samples)
# NOTE(review): these samples come from the same genotype table the models were
# trained and evaluated on, so predictions for this file are not out-of-sample.
sample_ids = geno_df.columns[:10]

# ✅ Keep only those samples, then flip to a samples x SNPs layout
new_samples_df = geno_df.loc[:, sample_ids].transpose()

# ✅ Write the subset to disk for the prediction step
new_samples_df.to_csv("새로운_유전형.csv")


In [None]:
import pandas as pd
import os
from catboost import CatBoostRegressor

# 🔹 New genotype data (rows = samples, columns = SNPs)
new_samples_df = pd.read_csv("새로운_유전형.csv", index_col=0)

# 🔹 Directory holding the saved CatBoost GWAS models
model_dir = "saved_models_catboost_gwas"
# One record per (sample, trait) pair: {"샘플", "표현형", "예측값"}
all_predictions = []

# os.listdir() returns entries in arbitrary order; sorting keeps the order of the
# prediction rows (and of the CSV written later) identical from run to run.
for file in sorted(os.listdir(model_dir)):
    if file.endswith(".cbm"):  # ✅ CatBoost model files only
        # The training cell names files "<trait>_catboost_model.cbm".
        trait = file.replace("_catboost_model.cbm", "")
        model_path = os.path.join(model_dir, file)

        try:
            # ✅ Load the model
            model = CatBoostRegressor()
            model.load_model(model_path)

            # ✅ Select exactly the SNP columns the model was trained on, in the same order
            required_snps = model.feature_names_
            missing_snps = [snp for snp in required_snps if snp not in new_samples_df.columns]
            if missing_snps:
                # Name the missing features instead of surfacing an opaque pandas KeyError.
                print(f"⚠️ {trait} 예측 오류: 입력 데이터에 SNP {len(missing_snps)}개 누락 (예: {missing_snps[:5]})")
                continue
            input_df = new_samples_df[required_snps]

            # ✅ Predict
            preds = model.predict(input_df)

            # ✅ Store one record per sample
            for sample_id, pred in zip(input_df.index, preds):
                all_predictions.append({
                    "샘플": sample_id,
                    "표현형": trait,
                    "예측값": pred
                })

        except Exception as e:
            # Best effort: report the failure and continue with the remaining models.
            print(f"⚠️ {trait} 예측 오류: {e}")

In [None]:
# 📋 Turn the collected (sample, trait, prediction) records into a table
results_pred_df = pd.DataFrame(data=all_predictions)

# Preview only the first rows in the notebook
print("✅ 새로운 샘플 예측 완료:")
preview = results_pred_df.head()
display(preview)

# Save the full table; the positional row index is not written
results_pred_df.to_csv(
    "CatBoost_GWAS_예측결과.csv",
    index=False,
)

✅ 새로운 샘플 예측 완료:


Unnamed: 0,샘플,표현형,예측값
0,TC1_175,과중,87.185185
1,TC1_187,과중,32.039517
2,TC1_001,과중,42.469872
3,TC1_009,과중,14.212549
4,TC1_016,과중,45.875434
