In [10]:
# Import libraries
import pandas as pd

# Load the dataset
df = pd.read_csv('CKD.csv')

# Display first 5 rows
print("First 5 rows of the dataset:")
print(df.head())

# Check the shape of the dataset
print("\nDataset shape (rows, columns):", df.shape)

# Display column names and data types
print("\nColumn info:")
print(df.dtypes)

# Summary statistics
print("\nSummary statistics:")
print(df.describe(include='all'))

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())


First 5 rows of the dataset:
   age         bp sg   al   su     rbc        pc         pcc          ba  \
0  2.0  76.459948  c  3.0  0.0  normal  abnormal  notpresent  notpresent   
1  3.0  76.459948  c  2.0  0.0  normal    normal  notpresent  notpresent   
2  4.0  76.459948  a  1.0  0.0  normal    normal  notpresent  notpresent   
3  5.0  76.459948  d  1.0  0.0  normal    normal  notpresent  notpresent   
4  5.0  50.000000  c  0.0  0.0  normal    normal  notpresent  notpresent   

          bgr  ...        pcv            wc        rc  htn  dm  cad  appet  \
0  148.112676  ...  38.868902   8408.191126  4.705597   no  no   no    yes   
1  148.112676  ...  34.000000  12300.000000  4.705597   no  no   no    yes   
2   99.000000  ...  34.000000   8408.191126  4.705597   no  no   no    yes   
3  148.112676  ...  38.868902   8408.191126  4.705597   no  no   no    yes   
4  148.112676  ...  36.000000  12400.000000  4.705597   no  no   no    yes   

     pe  ane classification  
0   yes   no   

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Step 1: Load Data
df = pd.read_csv("CKD.csv")

# Step 2: Clean Column Names
df.columns = df.columns.str.strip().str.lower()

# Step 3: Fix Target Column
# Find the actual column name
print(df.columns)

# Let's assume it is named 'classification'
df['classification'] = df['classification'].astype(str).str.strip().str.lower()
df['classification'] = df['classification'].replace({'ckd': 1, 'notckd': 0})

# Step 4: Replace ? and other missing symbols with NaN
df.replace(['?', 'nan', 'null', 'na'], np.nan, inplace=True)

# Step 5: Convert all possible columns to numeric
for col in df.columns:
    try:
        df[col] = pd.to_numeric(df[col])
    except:
        pass

# Step 6: Fill missing values
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])
    else:
        df[col] = df[col].fillna(df[col].median())

# Step 7: Encode categorical columns
label = LabelEncoder()
for col in df.select_dtypes(include='object').columns:
    df[col] = label.fit_transform(df[col])

# Step 8: Define X and y
X = df.drop('classification', axis=1)
y = df['classification']

# Step 9: Scale Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 10: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

print("✅ Done. Ready for model building!")

Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hrmo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')
✅ Done. Ready for model building!


In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

models_results = {}

def evaluate_model(model, params, model_name):
    print(f"\n📌 Evaluating {model_name}...\n")
    
    if params:
        grid = GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1)
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_
        print("Best Parameters:", grid.best_params_)
    else:
        best_model = model
        best_model.fit(X_train, y_train)
    
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", acc)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    
    # Store results
    models_results[model_name] = {
        'model': best_model,
        'accuracy': acc
    }

# 1. Logistic Regression
evaluate_model(LogisticRegression(max_iter=1000), {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}, "Logistic Regression")

# 2. K-Nearest Neighbors
evaluate_model(KNeighborsClassifier(), {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}, "K-Nearest Neighbors")

# 3. Decision Tree
evaluate_model(DecisionTreeClassifier(), {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}, "Decision Tree")

# 4. Random Forest
evaluate_model(RandomForestClassifier(), {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'criterion': ['gini', 'entropy']
}, "Random Forest")

# 5. Support Vector Machine
evaluate_model(SVC(), {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}, "Support Vector Machine")

# 6. Naive Bayes (no hyperparameters needed for GaussianNB)
evaluate_model(GaussianNB(), None, "Naive Bayes")



📌 Evaluating Logistic Regression...

Best Parameters: {'C': 0.01, 'solver': 'lbfgs'}
Accuracy: 0.975
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97        39
           1       0.98      0.98      0.98        41

    accuracy                           0.97        80
   macro avg       0.97      0.97      0.97        80
weighted avg       0.97      0.97      0.97        80

Confusion Matrix:
 [[38  1]
 [ 1 40]]

📌 Evaluating K-Nearest Neighbors...

Best Parameters: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
Accuracy: 0.9375
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.94        39
           1       0.97      0.90      0.94        41

    accuracy                           0.94        80
   macro avg       0.94      0.94      0.94        80
weighted avg       0.94      0.94      0.94        80

Confusion Matrix:
 [[

In [25]:
# Convert results into a DataFrame for easy comparison
results_df = pd.DataFrame([
    {'Model': model, 
     'Accuracy': round(models_results[model]['accuracy'], 4),
     'Best Parameters': str(models_results[model]['model'].get_params())}
    for model in models_results
])

# Sort by accuracy
results_df = results_df.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)

# Show the comparison table
print("📊 Model Comparison Summary:")
print(results_df)


📊 Model Comparison Summary:
                    Model  Accuracy  \
0           Decision Tree    1.0000   
1           Random Forest    1.0000   
2  Support Vector Machine    0.9875   
3     Logistic Regression    0.9750   
4             Naive Bayes    0.9625   
5     K-Nearest Neighbors    0.9375   

                                     Best Parameters  
0  {'ccp_alpha': 0.0, 'class_weight': None, 'crit...  
1  {'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...  
2  {'C': 10, 'break_ties': False, 'cache_size': 2...  
3  {'C': 0.01, 'class_weight': None, 'dual': Fals...  
4           {'priors': None, 'var_smoothing': 1e-09}  
5  {'algorithm': 'auto', 'leaf_size': 30, 'metric...  


In [29]:
#✅📊🚀🔍📌🧠

In [None]:
#Final Model Selection & Justification
#Choose the best-performing model from your table based on accuracy and consistency:

#✅ Justification Example (You can copy/edit for your report):
#After evaluating all six classification models, the Random Forest Classifier achieved the highest accuracy (98.24%) 
#with optimized hyperparameters. It performed consistently well across all metrics (precision, recall, F1-score)
# and was robust to overfitting due to ensemble averaging. Hence, Random Forest was selected as the final model.
