In [5]:
# ==========================
# 🧠 Import Libraries
# ==========================
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# ==========================
# 📥 Load Data
# ==========================
df = pd.read_csv('train.csv')[['Age','Pclass','Fare','Survived']]

# Split into features and target
X = df.drop(columns=['Survived'])
y = df['Survived']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# ==========================
# ⚙️ Create Pipeline (KNNImputer + LogisticRegression)
# ==========================
pipe = Pipeline([
    ('imputer', KNNImputer()),
    ('model', LogisticRegression(max_iter=1000))
])

# ==========================
# 🔍 Define Hyperparameter Grid for both Imputer & Model
# ==========================
param_grid = {
    'imputer__n_neighbors': [7 ],
    'imputer__weights': ['uniform', 'distance'],
    'model__C': [0.01, 0.1, 1, 10],
    'model__solver': ['liblinear', 'lbfgs'],
    'model__penalty': ['l2']
}

# ==========================
# 🚀 Apply Grid Search CV
# ==========================
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1
)

grid.fit(X_train, y_train)

# ==========================
# 📊 Display Best Results
# ==========================
print("🔍 Best Hyperparameters Found:")
for param, value in grid.best_params_.items():
    print(f"  {param}: {value}")

print(f"\n✅ Best Cross-Validation Accuracy: {grid.best_score_:.4f}")

# ==========================
# 🎯 Evaluate on Test Set
# ==========================
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

test_acc = accuracy_score(y_test, y_pred)
print(f"\n🎯 Test Accuracy using best parameters: {test_acc:.4f}")


🔍 Best Hyperparameters Found:
  imputer__n_neighbors: 7
  imputer__weights: distance
  model__C: 1
  model__penalty: l2
  model__solver: liblinear

✅ Best Cross-Validation Accuracy: 0.7120

🎯 Test Accuracy using best parameters: 0.6816


In [4]:
from sklearn.model_selection import cross_val_score

for k in [1, 3, 5, 7]:
    knn = KNNImputer(n_neighbors=k, weights='distance')
    X_train_trf = knn.fit_transform(X_train)
    X_test_trf = knn.transform(X_test)
    
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train_trf, y_train)
    scores = cross_val_score(lr, X_train_trf, y_train, cv=5)
    
    print(f"n_neighbors={k}: CV mean={scores.mean():.4f}, Test={accuracy_score(y_test, lr.predict(X_test_trf)):.4f}")


n_neighbors=1: CV mean=0.7121, Test=0.6872
n_neighbors=3: CV mean=0.7078, Test=0.7151
n_neighbors=5: CV mean=0.7092, Test=0.7095
n_neighbors=7: CV mean=0.7078, Test=0.7039
