In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

# 1. Load data from Numbers.txt
# The file is parsed as whitespace-delimited. Adjust 'sep' according to your
# file's delimiter (e.g., sep='\t' for tab-separated or sep=',' for
# comma-separated).
# `sep=r"\s+"` is the documented replacement for `delim_whitespace=True`,
# which is deprecated since pandas 2.2.
# NOTE(review): read_csv infers a header from the first line by default; if
# Numbers.txt has no header row, pass header=None so the first sample is not
# consumed as column names -- confirm against the file.
data = pd.read_csv("Numbers.txt", sep=r"\s+")

# The first column is the target variable; every remaining column is a feature.
X = data.iloc[:, 1:].values
y = data.iloc[:, 0].values
print(y)


# Split data into training and test sets (30% held out for testing; the fixed
# random_state keeps the split reproducible between runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# 2. Baseline classifiers, keyed by display name. Apart from the explicit
# arguments shown, every hyperparameter is left at its library default (untuned).
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["K-Nearest Neighbors"] = KNeighborsClassifier()  # default k=5
models["Random Forest"] = RandomForestClassifier(random_state=42)

# --- PART 1: Cross-validation without tuning ---
# Score each untuned classifier with 5-fold cross-validation on the training
# split only, and report the mean and standard deviation of the fold accuracies.
print("=== Cross-validation without tuning ===")
for label, estimator in models.items():
    fold_scores = cross_val_score(
        estimator, X_train, y_train, cv=5, scoring="accuracy"
    )
    mean_acc = fold_scores.mean()
    std_acc = fold_scores.std()
    print(f"{label}: CV Accuracy: {mean_acc:.3f} (+/- {std_acc:.3f})")

# --- PART 2: Cross-validation with tuning (using GridSearchCV) ---
print("\n=== Cross-validation with tuning ===")

# Candidate hyperparameter values for each classifier. The keys must match the
# display names used in `models`, because the grid is looked up by name below.
param_grids = {
    "Logistic Regression": {"C": [0.1, 1, 10]},
    "K-Nearest Neighbors": {"n_neighbors": [1, 3, 5, 7]},
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [None, 5, 10],
    },
}

# Maps each display name to the best estimator found by its grid search.
best_models = {}
for name, model in models.items():
    print(f"\nTuning {name}:")
    # Exhaustive search over this classifier's grid, scored by 5-fold CV
    # accuracy on the training split; fit() returns the fitted search object.
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[name],
        cv=5,
        scoring="accuracy",
    ).fit(X_train, y_train)
    print(f"Best parameters for {name}: {grid.best_params_}")
    print(f"Best CV Accuracy for {name}: {grid.best_score_:.3f}")
    best_models[name] = grid.best_estimator_

# --- PART 3: Demonstrate optimism: training error, CV error after tuning, and test error ---
# For each tuned model, compare accuracy measured three ways: on the data it was
# fit to, by 5-fold cross-validation on the training split, and on the held-out
# test split.
print("\n=== Optimism of training: Error comparison ===")
for name, model in best_models.items():
    # No explicit fit is needed here: every entry is a GridSearchCV
    # `best_estimator_`, which (with the default refit=True) has already been
    # refit on the whole of X_train / y_train with the best parameters.

    # Training accuracy (optimistically high, i.e. training error is
    # optimistically low: the model is scored on the data it was fit to)
    y_train_pred = model.predict(X_train)
    train_acc = accuracy_score(y_train, y_train_pred)

    # Cross-validated accuracy after tuning. cross_val_score fits clones of the
    # estimator, so the fitted `model` used above and below is left untouched.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_acc = np.mean(cv_scores)

    # Test accuracy (unseen data)
    y_test_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)

    print(f"\n{name}:")
    print(f"Training Accuracy: {train_acc:.3f}")
    print(f"Cross-Validated Accuracy: {cv_acc:.3f}")
    print(f"Test Accuracy: {test_acc:.3f}")

    # Flexible models (like KNN with a small k or Random Forest) may show a larger gap
    # between the high training accuracy and lower test accuracy, demonstrating the optimism of training.


[0 9 8 ... 5 9 9]
=== Cross-validation without tuning ===
Logistic Regression: CV Accuracy: 0.920 (+/- 0.013)
K-Nearest Neighbors: CV Accuracy: 0.926 (+/- 0.013)
Random Forest: CV Accuracy: 0.934 (+/- 0.015)

=== Cross-validation with tuning ===

Tuning Logistic Regression:
Best parameters for Logistic Regression: {'C': 0.1}
Best CV Accuracy for Logistic Regression: 0.929

Tuning K-Nearest Neighbors:
Best parameters for K-Nearest Neighbors: {'n_neighbors': 3}
Best CV Accuracy for K-Nearest Neighbors: 0.936

Tuning Random Forest:
Best parameters for Random Forest: {'max_depth': None, 'n_estimators': 200}
Best CV Accuracy for Random Forest: 0.936

=== Optimism of training: Error comparison ===

Logistic Regression:
Training Accuracy: 0.985
Cross-Validated Accuracy: 0.929
Test Accuracy: 0.900

K-Nearest Neighbors:
Training Accuracy: 0.968
Cross-Validated Accuracy: 0.936
Test Accuracy: 0.908

Random Forest:
Training Accuracy: 1.000
Cross-Validated Accuracy: 0.936
Test Accuracy: 0.907
