In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline


# =====================================================
# 1) Dataset provided by the instructor
# =====================================================

# Column order of every row in `data`: the REMISS target first, then six features.
columns = ["REMISS", "CELL", "SMEAR", "INFIL", "LI", "BLAST", "TEMP"]

# Raw observations, one list per row, in the column order given above.
data = [
    [1, 0.8, 0.83, 0.66, 1.9, 1.1, 1],
    [1, 0.9, 0.36, 0.32, 1.4, 0.74, 0.99],
    [0, 0.8, 0.88, 0.7, 0.8, 0.18, 0.98],
    [0, 1, 0.87, 0.87, 0.7, 1.05, 0.99],
    [1, 0.9, 0.75, 0.68, 1.3, 0.52, 0.98],
    [0, 1, 0.65, 0.65, 0.6, 0.52, 0.98],
    [1, 0.95, 0.97, 0.92, 1, 1.23, 0.99],
    [0, 0.95, 0.87, 0.83, 1.9, 1.35, 1.02],
    [0, 1, 0.45, 0.45, 0.8, 0.32, 1],
    [0, 0.95, 0.36, 0.34, 0.5, 0, 1.04],
    [0, 0.85, 0.39, 0.33, 0.7, 0.28, 0.99],
    [0, 0.7, 0.76, 0.53, 1.2, 0.15, 0.98],
    [0, 0.8, 0.46, 0.37, 0.4, 0.38, 1.01],
    [0, 0.2, 0.39, 0.08, 0.8, 0.11, 0.99],
    [0, 1, 0.9, 0.9, 1.1, 1.04, 0.99],
    [1, 1, 0.84, 0.84, 1.9, 2.06, 1.02],
    [0, 0.65, 0.42, 0.27, 0.5, 0.11, 1.01],
    [0, 1, 0.75, 0.75, 1, 1.32, 1],
    [0, 0.5, 0.44, 0.22, 0.6, 0.11, 0.99],
    [1, 1, 0.63, 0.63, 1.1, 1.07, 0.99],
    [0, 1, 0.33, 0.33, 0.4, 0.18, 1.01],
    [0, 0.9, 0.93, 0.84, 0.6, 1.59, 1.02],
    [1, 1, 0.58, 0.58, 1, 0.53, 1],
    [0, 0.95, 0.32, 0.3, 1.6, 0.89, 0.99],
    [1, 1, 0.6, 0.6, 1.7, 0.96, 0.99],
    [1, 1, 0.69, 0.69, 0.9, 0.4, 0.99],
    [0, 1, 0.73, 0.73, 0.7, 0.4, 0.99],
]

df = pd.DataFrame(data, columns=columns)

# Feature matrix: every column except the target, as a plain NumPy array.
X = df.drop(columns="REMISS").to_numpy()
# Target vector: the REMISS column.
y = df["REMISS"].to_numpy()

# =====================================================
# 2) Train/Test Split (80/20)
# =====================================================

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# =====================================================
# 3) Feature Scaling
# =====================================================

# Fit the scaler on the training rows only, then apply that same
# transform to both splits so the test rows never influence the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# =====================================================
# 4) Logistic Regression using scikit-learn
# =====================================================

# Reference model: scikit-learn's LogisticRegression with its default settings,
# trained on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out split: fraction of test labels predicted correctly.
y_pred_sklearn = model.predict(X_test)
acc_sklearn = accuracy_score(y_test, y_pred_sklearn)
print("Scikit-learn Accuracy:", acc_sklearn)

# =====================================================
# 5) Batch Gradient Descent (From Scratch) - keep tuning the learning rate and epochs until the score gets significantly close to 1
# =====================================================

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    The value is computed from exp(-|z|), which always lies in (0, 1], so
    inputs of very large magnitude cannot overflow the exponential (the
    naive form overflows for large negative z).

    Args:
        z: Real scalar or array-like.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the same
        shape as ``z``; every value lies in [0, 1].
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    # For z >= 0 use 1 / (1 + e^-z); for z < 0 use the algebraically equal
    # e^z / (1 + e^z). Both only need exp(-|z|).
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar; arrays pass through.
    return result[()]

def batch_gradient_descent(X, y, lr=0.01, epochs=10000):
    """Fit logistic-regression weights and bias with full-batch gradient descent.

    Parameters start at zero. Each epoch computes sigmoid predictions for
    every row of ``X``, averages (prediction - target) * feature over all
    rows, and moves the parameters one step of size ``lr`` against it.

    Args:
        X: 2-D array of shape (samples, features).
        y: 1-D array of targets, one per row of ``X``.
        lr: Step size applied to each update.
        epochs: Number of full passes (one update per pass).

    Returns:
        Tuple ``(w, b)``: the weight vector (one entry per feature) and the
        scalar bias.
    """
    n_samples, n_features = X.shape
    weights = np.zeros(n_features)
    bias = 0
    inv_n = 1 / n_samples

    for _ in range(epochs):
        # Prediction error for every sample under the current parameters.
        residual = sigmoid(X @ weights + bias) - y

        grad_w = inv_n * (X.T @ residual)
        grad_b = inv_n * np.sum(residual)

        weights = weights - lr * grad_w
        bias = bias - lr * grad_b

    return weights, bias

# Train the from-scratch batch model on the training split.
w_batch, b_batch = batch_gradient_descent(X_train, y_train)

# Linear scores for the test rows, turned into 0/1 predictions by
# thresholding the sigmoid probability at 0.5.
z_test = X_test @ w_batch + b_batch
y_pred_batch = (sigmoid(z_test) >= 0.5).astype(int)

acc_batch = accuracy_score(y_test, y_pred_batch)
print("Batch Gradient Descent Accuracy:", acc_batch)

# =====================================================
# 6) Stochastic Gradient Descent (From Scratch) - keep tuning the learning rate and epochs until the score gets significantly close to 1
# =====================================================

def stochastic_gradient_descent(X, y, lr=0.01, epochs=10000):
    """Fit logistic-regression weights and bias with per-sample updates.

    Parameters start at zero. In every epoch the rows of ``X`` are visited
    one at a time, always in index order (no shuffling), and the parameters
    are updated immediately after each row.

    Args:
        X: 2-D array of shape (samples, features).
        y: 1-D array of targets, one per row of ``X``.
        lr: Step size applied to each per-sample update.
        epochs: Number of passes over all rows.

    Returns:
        Tuple ``(w, b)``: the weight vector (one entry per feature) and the
        scalar bias.
    """
    _, n_features = X.shape
    weights = np.zeros(n_features)
    bias = 0

    for _ in range(epochs):
        for row, features in enumerate(X):
            # Prediction error for this single sample, pre-scaled by lr.
            prediction = sigmoid(features @ weights + bias)
            step = lr * (prediction - y[row])

            weights -= step * features
            bias -= step

    return weights, bias

# Train the from-scratch per-sample model on the training split.
w_sgd, b_sgd = stochastic_gradient_descent(X_train, y_train)

# Linear scores for the test rows, turned into 0/1 predictions by
# thresholding the sigmoid probability at 0.5.
z_test_sgd = X_test @ w_sgd + b_sgd
y_pred_sgd = (sigmoid(z_test_sgd) >= 0.5).astype(int)

acc_sgd = accuracy_score(y_test, y_pred_sgd)
print("Stochastic Gradient Descent Accuracy:", acc_sgd)


# ----- Experiment: polynomial features + cross-validation -----
# Author's note: cv was varied up and down from 5. With this little data,
# using many folds makes the model pick up patterns from tiny splits, so the
# result is not genuinely accurate and the per-fold scores swing instead of
# staying stable.
pipeline_steps = [
    # degree=1 passes the features through unchanged; raise it to add polynomial terms.
    PolynomialFeatures(degree=1, include_bias=False),
    StandardScaler(),
    LogisticRegression(max_iter=10000),
]
model = make_pipeline(*pipeline_steps)

# The whole pipeline (including scaling) is passed in with the unscaled X.
scores = cross_val_score(model, X, y, cv=2)

print("เทสๆAccuracy each fold:", scores)
print("เทสๆAccuracy avg:", scores.mean())

Scikit-learn Accuracy: 0.8333333333333334
Batch Gradient Descent Accuracy: 0.8333333333333334
Stochastic Gradient Descent Accuracy: 0.8333333333333334
เทสๆAccuracy each fold: [0.78571429 0.76923077]
เทสๆAccuracy avg: 0.7774725274725275
