In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
import joblib

# 1. Setup
if not os.path.exists('model'):
    os.makedirs('model')

# 2. Load Data
# Ensure the file name matches what you downloaded
if os.path.exists('Dry_Bean_Dataset.csv'):
    df = pd.read_csv('Dry_Bean_Dataset.csv')
elif os.path.exists('Dry_Bean.csv'):
    df = pd.read_csv('Dry_Bean.csv')
else:
    print("Error: Dataset CSV not found. Please download it from Kaggle.")
    exit()

X = df.drop('Class', axis=1)
y = df['Class']

# Encode Target (Strings -> Numbers)
le = LabelEncoder()
y_encoded = le.fit_transform(y)
joblib.dump(le, 'model/label_encoder.pkl') # Save for app

# Split (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Scale Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, 'model/scaler.pkl')

# Save Sample Test Data (with original labels for user clarity)
test_df = X_test.copy()
test_df['Class'] = le.inverse_transform(y_test)
test_df.to_csv('sample_test_data.csv', index=False)
print("Sample test data saved.")

# 3. Define Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=3000, multi_class='multinomial'),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', objective='multi:softprob')
}

# 4. Train & Evaluate
results = []
print("Training models... (This may take a minute)")

for name, model in models.items():
    # Use scaled data for Logistic/KNN, raw for others (optional, but good practice)
    if name in ["Logistic Regression", "KNN"]:
        X_t, X_v = X_train_scaled, X_test_scaled
    else:
        X_t, X_v = X_train, X_test

    model.fit(X_t, y_train)
    preds = model.predict(X_v)
    probs = model.predict_proba(X_v)

    # Calculate Metrics (Weighted for Multi-Class)
    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "AUC": roc_auc_score(y_test, probs, multi_class='ovr', average='weighted'),
        "Precision": precision_score(y_test, preds, average='weighted'),
        "Recall": recall_score(y_test, preds, average='weighted'),
        "F1": f1_score(y_test, preds, average='weighted'),
        "MCC": matthews_corrcoef(y_test, preds)
    }
    results.append(metrics)

    joblib.dump(model, f'model/{name.replace(" ", "_")}.pkl')
    print(f"✔ {name} completed.")

# 5. Output for README
print("\n=== Copy this table to your README ===")
print(pd.DataFrame(results).round(3).to_markdown(index=False))

Sample test data saved.
Training models... (This may take a minute)




✔ Logistic Regression completed.
✔ Decision Tree completed.
✔ KNN completed.
✔ Naive Bayes completed.
✔ Random Forest completed.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✔ XGBoost completed.

=== Copy this table to your README ===
| Model               |   Accuracy |   AUC |   Precision |   Recall |    F1 |   MCC |
|:--------------------|-----------:|------:|------------:|---------:|------:|------:|
| Logistic Regression |      0.927 | 0.994 |       0.928 |    0.927 | 0.927 | 0.912 |
| Decision Tree       |      0.89  | 0.933 |       0.891 |    0.89  | 0.89  | 0.868 |
| KNN                 |      0.923 | 0.982 |       0.924 |    0.923 | 0.924 | 0.908 |
| Naive Bayes         |      0.758 | 0.962 |       0.756 |    0.758 | 0.756 | 0.709 |
| Random Forest       |      0.925 | 0.993 |       0.926 |    0.925 | 0.926 | 0.91  |
| XGBoost             |      0.924 | 0.994 |       0.925 |    0.924 | 0.925 | 0.909 |
