In [None]:
import warnings
warnings.filterwarnings('ignore')

# Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score

# Load the CKD dataset
df = pd.read_csv('/content/drive/MyDrive/Dataset Folder/mcdx.csv')
df = df.drop('race',axis=1)

# Convert nominal features to numerical using label encoding
le = LabelEncoder()
column_names=['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
num_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'rbcc']
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill missing values with mode for nominal features
nom_cols = ['rbc', 'pc', 'pcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in nom_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Convert all values in 'sex' feature to male or female
df['sex'] = df['sex'].replace([' ', 'm ', '0'], 'male')
df['sex'] = df['sex'].apply(lambda x: 'female' if x != 'male' else x)
df['sex'] = le.fit_transform(df['sex'])

# Convert nominal features to numerical using label encoding
le = LabelEncoder()
for col in nom_cols:
    df[col] = le.fit_transform(df[col])
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
df.head(10)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,bgr,bu,...,pcv,sex,rbcc,htn,dm,cad,appet,pe,ane,class
0,-0.125043,0.310774,0.372752,0.076249,-0.380269,0,2,1,-0.122346,-0.368672,...,0.709349,0,0.811575,2,2,1,1,1,1,stage2
1,-2.330095,-1.289779,0.372752,2.363728,-0.380269,0,2,1,-1.499132,-0.723582,...,0.355191,0,0.370741,1,1,1,1,1,1,stage1
2,0.627902,0.310774,0.342233,0.838742,2.507853,2,2,1,3.31393,-0.03348,...,-0.057993,0,0.855658,1,2,1,2,1,2,stage3B
3,-0.125043,-0.222744,0.326973,2.363728,-0.380269,2,1,2,-0.16786,0.025672,...,0.001033,0,0.238491,2,1,1,2,2,2,stage4
4,0.036303,0.310774,0.342233,0.838742,-0.380269,2,2,1,-0.293022,-0.565844,...,0.178112,0,0.547075,1,1,1,1,1,1,stage3B
5,0.520339,0.844292,0.357492,1.601235,-0.380269,0,0,1,-0.657131,-0.585562,...,0.414218,0,0.458908,2,2,1,1,2,1,stage3A
6,0.950593,-0.222744,0.342233,-0.686244,-0.380269,0,2,1,-0.361293,-0.013763,...,0.237138,0,0.018074,1,1,1,1,1,1,stage5
7,-1.415805,-3.957369,0.357492,0.838742,3.47056,2,1,1,3.166011,-0.467258,...,0.709349,0,0.723408,1,2,1,1,2,1,stage2
8,0.090084,1.37781,0.357492,1.601235,-0.380269,2,1,2,0.071087,0.104541,...,0.060059,0,0.282574,2,2,1,1,1,2,stage3B
9,0.143866,0.844292,0.372752,0.838742,-0.380269,1,1,2,-0.702644,1.031249,...,-0.176046,0,0.150324,2,2,1,2,1,2,stage5


# Decison tree with RFECV


In [None]:
# Separate features and target
X = pd.DataFrame(df.drop('class', axis=1))
y = pd.DataFrame(df['class'])

# Initialize decision tree classifier
dtc = DecisionTreeClassifier()
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Rectify overfitting using grid search and cross-validation
from sklearn.model_selection import GridSearchCV

# Define the grid of hyperparameters to search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize grid search object
grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5)

# Fit grid search object to the data
grid_search.fit(X, y)

# Print best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Use RFECV to perform feature selection and cross-validation
rfecv = RFECV(estimator=dtc, step=1, cv=5, scoring='accuracy', min_features_to_select=7)
rfecv.fit(X_train, y_train.values.ravel())

# Print the optimal number of features selected
print("Optimal number of features: ", rfecv.n_features_)

# Print the selected features
print("Selected features: ", X_train.columns[rfecv.support_])

Best hyperparameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5}
Optimal number of features:  10
Selected features:  Index(['age', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'hemo', 'rbcc'], dtype='object')


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions using the test set
y_pred = grid_search.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: {:.6f}".format(accuracy))

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision: {:.6f}".format(precision))

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall: {:.6f}".format(recall))

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score: {:.6f}".format(f1))

Accuracy score: 0.883333
Precision: 0.901667
Recall: 0.883333
F1 score: 0.887612


# SVM with RFECV

In [None]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_validate

# Separate features and target
X = pd.DataFrame(df.drop('class', axis=1))
y = pd.DataFrame(df['class'])

# Initialize linear kernel SVM classifier
svm = SVC(kernel='linear', random_state=42)

# Perform recursive feature elimination with cross-validation
rfecv = RFECV(estimator=svm, cv=5)
rfecv.fit(X, y)

# Get the selected features
X_rfecv = X.loc[:, rfecv.support_]

# Define the grid of hyperparameters to search
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

# Initialize SVM classifier
svm_tuned = SVC()

# Initialize grid search object
grid_search = GridSearchCV(estimator=svm_tuned, param_grid=param_grid, cv=5)

# Fit grid search object to the data
grid_search.fit(X_rfecv, y)

# Print best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Initialize SVM classifier with best hyperparameters
svm_tuned = SVC(C=grid_search.best_params_['C'],
                gamma=grid_search.best_params_['gamma'],
                kernel=grid_search.best_params_['kernel'])

# Perform cross-validation and get evaluation metrics
scores = cross_validate(svm_tuned, X_rfecv, y, cv=5, scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'))

# Print evaluation metrics
print("Accuracy:", np.mean(scores['test_accuracy']))
print("Precision:", np.mean(scores['test_precision_macro']))
print("Recall:", np.mean(scores['test_recall_macro']))
print("F1 Score:", np.mean(scores['test_f1_macro']))

Best hyperparameters: {'C': 10, 'gamma': 0.1, 'kernel': 'linear'}
Accuracy: 0.7074999999999999
Precision: 0.5810250088455564
Recall: 0.5313555725320432
F1 Score: 0.5213720455929238


# Random Forest With RFECV

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Perform recursive feature elimination with cross-validation
rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=100), cv=5)
X_rfecv = rfecv.fit_transform(X, y)

# Print the selected features
selected_features = X.columns[rfecv.support_]
print("Selected features:", list(selected_features))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y, test_size=0.3)

# Define the grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize random forest classifier
rfc = RandomForestClassifier()

# Initialize grid search object
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)

# Fit grid search object to the data
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Initialize random forest classifier with best hyperparameters
rfc_tuned = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'],
                                    max_depth=grid_search.best_params_['max_depth'],
                                    min_samples_split=grid_search.best_params_['min_samples_split'],
                                    min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                    )

# Fit the classifier to the data
rfc_tuned.fit(X_train, y_train)

# Calculate accuracy score on test data
test_accuracy = rfc_tuned.score(X_test, y_test)

# Test the model
y_pred = rfc_tuned.predict(X_test)

# Print accuracy score, precision, recall and F1 score
print("Accuracy score:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))

Selected features: ['age', 'sg', 'al', 'rbc', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'rbcc']
Best hyperparameters: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Accuracy score: 0.825
Precision: 0.7714979381821908
Recall: 0.6982084331708391
F1 score: 0.7201360544217688


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
