# Analyse how tuning hyperparameters on one data set impacts the model results.

Model performance depends on how data is split and how hyperparameters are tuned. Understanding the balance between training, validation and test sets is key to building ML models that generalise well beyond a single data set. 

First, using logistic regression and 60:20:20

In [2]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
 
# Load the default wine dataset
data = load_wine()
X = data.data
y = data.target

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# Fit the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Evaluate the validation set
val_preds = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Evaluate the test set
test_preds = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Detailed classification report on test set
print("\nClassification Report on Test Set:")
print(classification_report(y_test, test_preds, target_names=data.target_names))

Validation Accuracy: 0.9444
Test Accuracy: 0.9444

Classification Report on Test Set:
              precision    recall  f1-score   support

     class_0       1.00      1.00      1.00        12
     class_1       0.88      1.00      0.93        14
     class_2       1.00      0.80      0.89        10

    accuracy                           0.94        36
   macro avg       0.96      0.93      0.94        36
weighted avg       0.95      0.94      0.94        36



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train using SVM and 70:15:15

In [3]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

data = load_wine()
X = data.data
y = data.target

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, random_state=42, stratify=y_temp
)

model = SVC(kernel='rbf', C=1.0, random_state=42)
model.fit(X_train, y_train)

val_preds = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {val_accuracy:.4f}")

test_preds = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

print("\nClassification Report on Test Set:")
print(classification_report(y_test, test_preds, target_names=data.target_names))

Validation Accuracy: 0.7407
Test Accuracy: 0.7407

Classification Report on Test Set:
              precision    recall  f1-score   support

     class_0       1.00      0.89      0.94         9
     class_1       0.61      1.00      0.76        11
     class_2       1.00      0.14      0.25         7

    accuracy                           0.74        27
   macro avg       0.87      0.68      0.65        27
weighted avg       0.84      0.74      0.69        27

