In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score



# -----------------------------

# 1) 데이터 준비

# -----------------------------

cols = ["sepal_length", "sepal_width", "petal_length", "petal_width", "label"]

df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/iris.data", header=None, names=cols).dropna()



X = df.drop(columns=["label"])

y = df["label"]



X_train, X_test, y_train, y_test = train_test_split(

    X, y, test_size=0.2, stratify=y, random_state=42

)



# -----------------------------

# 2) 모델 구성

# -----------------------------

dt = DecisionTreeClassifier(random_state=42)

rf = RandomForestClassifier(n_estimators=200, random_state=42)

lr = LogisticRegression(max_iter=500)



# -----------------------------

# 3) 모델 학습

# -----------------------------

dt.fit(X_train, y_train)

rf.fit(X_train, y_train)

lr.fit(X_train, y_train)



# -----------------------------

# 4) 모델 평가

# -----------------------------

dt_acc = accuracy_score(y_test, dt.predict(X_test))

rf_acc = accuracy_score(y_test, rf.predict(X_test))

lr_acc = accuracy_score(y_test, lr.predict(X_test))



print("=== Test Accuracy ===")

print(f"Decision Tree : {dt_acc:.4f}")

print(f"Random Forest : {rf_acc:.4f}")

print(f"Logistic Reg. : {lr_acc:.4f}")

=== Test Accuracy ===
Decision Tree : 0.9333
Random Forest : 0.9000
Logistic Reg. : 0.9667


In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score



# -----------------------------

# 1) 데이터 준비

# -----------------------------



df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/breast_cancer.csv", header=None, names=cols, skiprows=1).dropna()



X = df.drop(columns=["label"])

y = df["label"]



X_train, X_test, y_train, y_test = train_test_split(

    X, y, test_size=0.2, stratify=y, random_state=42

)



# -----------------------------

# 2) 모델 구성

# -----------------------------

dt = DecisionTreeClassifier(random_state=42)

rf = RandomForestClassifier(n_estimators=200, random_state=42)

lr = LogisticRegression(max_iter=500)



# -----------------------------

# 3) 모델 학습

# -----------------------------

dt.fit(X_train, y_train)

rf.fit(X_train, y_train)

lr.fit(X_train, y_train)



# -----------------------------

# 4) 모델 평가

# -----------------------------

dt_acc = accuracy_score(y_test, dt.predict(X_test))

rf_acc = accuracy_score(y_test, rf.predict(X_test))

lr_acc = accuracy_score(y_test, lr.predict(X_test))



print("=== Test Accuracy ===")

print(f"Decision Tree : {dt_acc:.4f}")

print(f"Random Forest : {rf_acc:.4f}")

print(f"Logistic Reg. : {lr_acc:.4f}")

=== Test Accuracy ===
Decision Tree : 0.9298
Random Forest : 0.9298
Logistic Reg. : 0.8158


In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score



# -----------------------------

# 1) 데이터 준비

# -----------------------------



df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/heart.csv", header=None, names=cols, skiprows=1).dropna()



X = df.drop(columns=["label"])

y = df["label"]



X_train, X_test, y_train, y_test = train_test_split(

    X, y, test_size=0.2, stratify=y, random_state=42

)



# -----------------------------

# 2) 모델 구성

# -----------------------------

dt = DecisionTreeClassifier(random_state=42)

rf = RandomForestClassifier(n_estimators=200, random_state=42)

lr = LogisticRegression(max_iter=500)



# -----------------------------

# 3) 모델 학습

# -----------------------------

dt.fit(X_train, y_train)

rf.fit(X_train, y_train)

lr.fit(X_train, y_train)



# -----------------------------

# 4) 모델 평가

# -----------------------------

dt_acc = accuracy_score(y_test, dt.predict(X_test))

rf_acc = accuracy_score(y_test, rf.predict(X_test))

lr_acc = accuracy_score(y_test, lr.predict(X_test))



print("=== Test Accuracy ===")

print(f"Decision Tree : {dt_acc:.4f}")

print(f"Random Forest : {rf_acc:.4f}")

print(f"Logistic Reg. : {lr_acc:.4f}")

=== Test Accuracy ===
Decision Tree : 0.6885
Random Forest : 0.7377
Logistic Reg. : 0.7705


In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



# -----------------------------

# 1) 데이터 준비

# -----------------------------



# Define column names for the car evaluation dataset
car_cols = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "label"]

df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/car_evaluation.csv", header=None, names=car_cols).dropna()



X = df.drop(columns=["label"])

y = df["label"]



# Identify categorical features for one-hot encoding
categorical_features = ["buying", "maint", "doors", "persons", "lug_boot", "safety"]



# Create a column transformer to apply one-hot encoding to categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep other columns (if any)
)


X_train, X_test, y_train, y_test = train_test_split(

    X, y, test_size=0.2, stratify=y, random_state=42

)



# -----------------------------

# 2) 모델 구성

# -----------------------------

# Create pipelines for each model including the preprocessor
dt = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', DecisionTreeClassifier(random_state=42))])

rf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))])

lr = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(max_iter=1000))]) # Increased max_iter for convergence


# -----------------------------

# 3) 모델 학습

# -----------------------------

dt.fit(X_train, y_train)

rf.fit(X_train, y_train)

lr.fit(X_train, y_train)



# -----------------------------

# 4) 모델 평가

# -----------------------------

dt_acc = accuracy_score(y_test, dt.predict(X_test))

rf_acc = accuracy_score(y_test, rf.predict(X_test))

lr_acc = accuracy_score(y_test, lr.predict(X_test))



print("=== Test Accuracy ===")

print(f"Decision Tree : {dt_acc:.4f}")

print(f"Random Forest : {rf_acc:.4f}")

print(f"Logistic Reg. : {lr_acc:.4f}")

=== Test Accuracy ===
Decision Tree : 0.9740
Random Forest : 0.9798
Logistic Reg. : 0.9017


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# -----------------------------
# 1) 데이터 준비
# -----------------------------
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/diabetes.csv")

X = df.drop(columns=["Outcome"])
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# -----------------------------
# 2) 모델 정의
# -----------------------------
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
lr = LogisticRegression(max_iter=1000)

# -----------------------------
# 3) 모델 학습
# -----------------------------
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

# -----------------------------
# 4) 모델 평가
# -----------------------------
dt_acc = accuracy_score(y_test, dt.predict(X_test))
rf_acc = accuracy_score(y_test, rf.predict(X_test))
lr_acc = accuracy_score(y_test, lr.predict(X_test))

print("=== Test Accuracy ===")
print(f"Decision Tree : {dt_acc:.4f}")
print(f"Random Forest : {rf_acc:.4f}")
print(f"Logistic Reg. : {lr_acc:.4f}")


=== Test Accuracy ===
Decision Tree : 0.7273
Random Forest : 0.7468
Logistic Reg. : 0.7143


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# -----------------------------
# 1) 데이터 준비
# -----------------------------
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/diabetes.csv")

X = df.drop(columns=["Outcome"])
y = df["Outcome"]   # 보통은 분류용인데, 이번엔 연속값 예측한다고 가정

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 2) 모델 정의
# -----------------------------
lr = LinearRegression()
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(n_estimators=200, random_state=42)

# -----------------------------
# 3) 모델 학습
# -----------------------------
lr.fit(X_train, y_train)
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)

# -----------------------------
# 4) 모델 평가
# -----------------------------
models = {"Linear Regression": lr, "Decision Tree": dt, "Random Forest": rf}

for name, model in models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    print(f"=== {name} ===")
    print(f"MSE  : {mse:.4f}")
    print(f"RMSE : {rmse:.4f}")
    print(f"R^2  : {r2:.4f}\n")

=== Linear Regression ===
MSE  : 0.1710
RMSE : 0.4136
R^2  : 0.2550

=== Decision Tree ===
MSE  : 0.2532
RMSE : 0.5032
R^2  : -0.1030

=== Random Forest ===
MSE  : 0.1683
RMSE : 0.4102
R^2  : 0.2671

