In [2]:
pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
# === cell 2: imports & settings ===
# (Code cell)
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import joblib

# display settings
pd.set_option('display.max_columns', 50)
plt.rcParams['figure.figsize'] = (8,5)


In [5]:
# === cell 3: load data ===
data_path = "electives_data.csv"
df = pd.read_csv(data_path)
print("Shape:", df.shape)
df.head()


Shape: (30, 6)


Unnamed: 0,course_code,course_name,difficulty,workload,professor,result
0,830342-3,ประชากรและการอนามัยเจริญพันธุ์,2,3,4,1
1,835355-3,ศิลปะบำบัด,3,3,3,1
2,199432-1,สารพิษในชีวิตประจำวัน,4,3,2,0
3,830341-4,สังคมวิทยาสุขภาพ,2,4,3,0
4,837210-1,สังคมดิจิทัลและประเด็นการพัมนา,3,2,3,1


In [7]:
# === cell 6: เลือกฟีเจอร์ (อย่างน้อย 3 ปัจจัยตามโจทย์) ===

# ระบุชื่อคอลัมน์ที่ใช้เป็น target และ features
target_col = 'result'
features = ['difficulty', 'workload', 'professor']

# สร้างชุดข้อมูล X, y สำหรับเทรนโมเดล
X = df[features].copy()
y_raw = df[target_col].astype(str).copy()


In [8]:
# === cell 7: encode target ===
le_target = LabelEncoder()
y = le_target.fit_transform(y_raw)
print("Classes:", le_target.classes_)


Classes: ['0' '1']


In [9]:
# === cell 8: check types and decide preprocessing ===
numeric_cols = [c for c in features if pd.api.types.is_numeric_dtype(X[c])]
categorical_cols = [c for c in features if c not in numeric_cols]
print("Numeric:", numeric_cols)
print("Categorical:", categorical_cols)


Numeric: ['difficulty', 'workload', 'professor']
Categorical: []


In [10]:
# === cell 9: build preprocessing pipelines ===
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])


In [11]:
# === cell 10: train/test split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (24, 3) Test: (6, 3)


In [12]:
# === cell 11: train multiple models (DecisionTree, LogisticRegression, SVM) ===
models = {
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=2000, random_state=42),
    'SVM': SVC(probability=True, random_state=42)
}

fitted = {}
scores = {}
for name, model in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    fitted[name] = pipe
    scores[name] = acc
    print(f"--- {name} ---")
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred, target_names=le_target.classes_))
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


--- DecisionTree ---
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         4

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

Confusion matrix:
 [[2 0]
 [0 4]]
--- LogisticRegression ---
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         4

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6

Confusion matrix:
 [[2 0]
 [0 4]]
--- SVM ---
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00         4

    accuracy

In [13]:
# === cell 12: cross-validation check (optional, but useful with small data) ===
from sklearn.model_selection import cross_val_score
for name, model in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])
    cv_scores = cross_val_score(pipe, X, y, cv=5, scoring='accuracy')
    print(f"{name} CV mean acc: {cv_scores.mean():.3f} (std {cv_scores.std():.3f})")


DecisionTree CV mean acc: 0.800 (std 0.067)
LogisticRegression CV mean acc: 0.900 (std 0.082)
SVM CV mean acc: 0.900 (std 0.082)


In [14]:
# === cell 13: choose best model and save ===
best_name = max(scores, key=scores.get)
best_pipe = fitted[best_name]
print("Best model:", best_name, "accuracy:", scores[best_name])
# Save pipeline + label encoder + features
out_path = r"C:\Users\Dell\Downloads\gdss2025\best_model_joblib.pkl"
joblib.dump({'pipeline': best_pipe, 'label_encoder': le_target, 'features': features}, out_path)
print("Saved model to:", out_path)


Best model: DecisionTree accuracy: 1.0
Saved model to: C:\Users\Dell\Downloads\gdss2025\best_model_joblib.pkl


In [16]:
sample = pd.DataFrame({
    'difficulty': [3],
    'workload': [4],
    'professor': [3]   # ใช้ค่า numeric แทน 'Prof A'
})
pred = best_pipe.predict(sample)
print("Pred (label):", le_target.inverse_transform(pred))


Pred (label): ['0']


In [17]:
df.to_csv(r"C:\Users\Dell\Downloads\gdss2025\cleaned_data_for_report.csv", index=False)
print("Saved cleaned data for report.")


Saved cleaned data for report.


In [19]:
# แก้ไขโค้ดในส่วนที่คุณบันทึกโมเดล
from joblib import dump

# 1. สร้าง Dictionary ที่มีข้อมูลทั้งหมดที่ app.py ต้องการ
model_package = {
    'pipeline': best_pipe,
    'label_encoder': le_target,
    'features': features # รายชื่อคอลัมน์ที่ใช้ในการทำนาย
}

# 2. บันทึก Dictionary นี้ลงในไฟล์ .pkl
dump(model_package, "best_model_joblib.pkl")
print("Saved complete model package to best_model_joblib.pkl")

Saved complete model package to best_model_joblib.pkl
