<a href="https://colab.research.google.com/github/shunnyK/AI/blob/main/weather_classfication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import numpy as np

# -----------------------------
# 1) Data preparation
# -----------------------------
# Example path on a mounted Colab Drive
path = "/content/drive/MyDrive/weather.csv"
# If the file was uploaded to the session instead: path = "/mnt/data/weather.csv"

df = pd.read_csv(path)

# (1) Choose the label
if "RainTomorrow" in df.columns:
    # Yes/No -> 1/0 (case and surrounding whitespace are normalized first)
    y = df["RainTomorrow"].astype(str).str.strip().str.lower().map({"yes": 1, "no": 0})
    # Any other string (a missing value becomes the string "nan") maps to NaN;
    # treat those rows as unlabelled and remove them from both df and y.
    keep_idx = y.isin([0, 1])
    df = df.loc[keep_idx].copy()
    y = y.loc[keep_idx].astype(int)
    # Leakage guard: remove RainTomorrow and RISK_MM from the feature frame
    drop_cols = [c for c in ["RainTomorrow", "RISK_MM"] if c in df.columns]
    X = df.drop(columns=drop_cols)
else:
    # Without RainTomorrow, derive a binary label from RISK_MM > 0
    if "RISK_MM" not in df.columns:
        raise ValueError("라벨을 만들 수 없습니다: 'RainTomorrow'도 없고 'RISK_MM'도 없습니다.")
    # NaN > 0 evaluates to False, so a missing RISK_MM would silently become
    # label 0. RISK_MM is not part of X, so the later dropna() would never
    # catch it; discard the unlabelled rows here, mirroring the branch above.
    df = df.loc[df["RISK_MM"].notna()].copy()
    y = (df["RISK_MM"] > 0).astype(int)
    X = df.drop(columns=["RISK_MM"])

# (2) Drop every row that has a missing value in any feature or in the label.
# The label travels with the features as a temporary "target" column so both
# are filtered by the same dropna() call.
labelled = pd.concat([X, y.rename("target")], axis=1)
complete_rows = labelled.dropna()
y = complete_rows["target"]
X = complete_rows.drop(columns=["target"])

# (3) Categorical columns -> one-hot encoding
X = pd.get_dummies(X)

# (4) Train/test split, stratified on the label
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -----------------------------
# 2) Model definitions
# -----------------------------
dt = DecisionTreeClassifier(random_state=42)

# Centering is disabled only when the training frame exposes a truthy
# `.sparse` attribute; in every other case the default scaler is used.
if hasattr(X_train, "sparse") and X_train.sparse:
    feature_scaler = StandardScaler(with_mean=False)
else:
    feature_scaler = StandardScaler()

# KNN runs behind the scaler in a single pipeline.
distance_weighted_knn = KNeighborsClassifier(n_neighbors=5, weights="distance")
knn = Pipeline([
    ("scaler", feature_scaler),
    ("clf", distance_weighted_knn),
])

# -----------------------------
# 3) Model training
# -----------------------------
# Fit both estimators on the same training split: the tree first, then KNN.
for estimator in (dt, knn):
    estimator.fit(X_train, y_train)

# -----------------------------
# 4) Model evaluation
# -----------------------------
models = {"Decision Tree": dt, "KNN": knn}

print("=== Test Accuracy ===")
# best_pred is initialised together with the other trackers so all three
# names exist even before the first comparison.
best_name, best_acc, best_pred = None, -1, None
for name, model in models.items():
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    print(f"{name:13s}: {acc:.4f}")
    # Strict ">" keeps the earlier model when two accuracies tie.
    if acc > best_acc:
        best_name, best_acc, best_pred = name, acc, pred

print(f"\n=== Best Model: {best_name} (accuracy={best_acc:.4f}) ===")
print("\nClassification report")
# Pin the label set to both classes. Without it, a test split or prediction
# vector containing a single class makes the two target_names mismatch the
# inferred labels (an error), and the confusion matrix would shrink to 1x1.
class_labels = [0, 1]
print(
    classification_report(
        y_test,
        best_pred,
        labels=class_labels,
        target_names=["NoRain","Rain"],
        zero_division=0,  # report 0 for undefined metrics instead of warning
    )
)

print("Confusion matrix (rows=true, cols=pred):")
print(confusion_matrix(y_test, best_pred, labels=class_labels))