In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the dataset CSV into a DataFrame
file_path = 'cleaned_german_credit_data_updated.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Convention used here: every column except the final one is a predictor,
# and the final column holds the class label (binary or multiclass).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Define the parameter grid for k: every integer from 1 to 30 inclusive.
# The 'knn__' prefix routes the parameter to the 'knn' step of the pipeline below.
param_grid = {'knn__n_neighbors': np.arange(1, 31)}

# Create a k-NN model
knn_model = KNeighborsClassifier()

# Scaling is placed inside a pipeline so that, during cross-validation, the
# scaler is re-fit on the training portion of each fold only. Fitting the
# scaler on all of X_train before the search would let the validation folds
# influence the scaling statistics and bias the choice of k.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', knn_model),
])

# Perform GridSearchCV to find the optimal k (5-fold CV, scored by accuracy).
# Note: the search is fit on the UNSCALED training features; the pipeline scales them.
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best k value
best_k = grid_search.best_params_['knn__n_neighbors']
print(f'Optimal number of neighbors (k): {best_k}')

# GridSearchCV refits the best configuration on the whole training set
# (refit=True is the default), so the steps extracted here are already fitted.
best_pipeline = grid_search.best_estimator_
scaler = best_pipeline.named_steps['scaler']
best_knn_model = best_pipeline.named_steps['knn']

# Scaled copies of the features, produced with the scaler fitted on X_train only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Use the best model to make predictions on the held-out test set
y_pred = best_knn_model.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the best k-NN model: {accuracy:.2f}')

# Display confusion matrix and classification report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

found 0 physical cores < 1
  File "C:\Users\Samsung\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Optimal number of neighbors (k): 26
Accuracy of the best k-NN model: 0.54
Confusion Matrix:
[[17 33]
 [15 40]]

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.34      0.41        50
           1       0.55      0.73      0.62        55

    accuracy                           0.54       105
   macro avg       0.54      0.53      0.52       105
weighted avg       0.54      0.54      0.52       105

