In [None]:
import warnings
warnings.filterwarnings('ignore')

# Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score

# Load the CKD dataset
df = pd.read_csv('/content/drive/MyDrive/Dataset Folder/mcdx.csv')
df = df.drop('race',axis=1)

# Convert nominal features to numerical using label encoding
le = LabelEncoder()
column_names=['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
num_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'rbcc']
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill missing values with mode for nominal features
nom_cols = ['rbc', 'pc', 'pcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in nom_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Convert all values in 'sex' feature to male or female
df['sex'] = df['sex'].replace([' ', 'm ', '0'], 'male')
df['sex'] = df['sex'].apply(lambda x: 'female' if x != 'male' else x)
df['sex'] = le.fit_transform(df['sex'])

# Convert nominal features to numerical using label encoding
le = LabelEncoder()
for col in nom_cols:
    df[col] = le.fit_transform(df[col])
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Decision Tree with ANOVA

In [None]:
from sklearn.feature_selection import f_classif, SelectKBest
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Separate features and target
X = pd.DataFrame(df.drop('class', axis=1))
y = pd.DataFrame(df['class'])



# Use ANOVA to perform feature selection
k = 15  # Number of features to select
anova_selector = SelectKBest(score_func=f_classif, k=k)
X_anova = anova_selector.fit_transform(X, y.values.ravel())

# Print the selected features
selected_features = X.columns[anova_selector.get_support()]
print("Selected features: ", selected_features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_anova, y, test_size=0.3, random_state=0)

# Initialize decision tree classifier
dtc = DecisionTreeClassifier()

# Rectify overfitting using grid search and cross-validation
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train.values.ravel())

# Make predictions using the test set
y_pred = grid_search.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: {:.6f}".format(accuracy))

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision: {:.6f}".format(precision))

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall: {:.6f}".format(recall))

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score: {:.6f}".format(f1))

Selected features:  Index(['age', 'al', 'rbc', 'pc', 'bu', 'sc', 'sod', 'hemo', 'pcv', 'rbcc',
       'htn', 'dm', 'appet', 'pe', 'ane'],
      dtype='object')
Accuracy score: 0.825000
Precision: 0.846043
Recall: 0.825000
F1 score: 0.829187


# SVM with ANOVA

In [None]:
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_validate

# Separate features and target
X = pd.DataFrame(df.drop('class', axis=1))
y = pd.DataFrame(df['class'])

# Perform ANOVA F-value feature selection
k_best = SelectKBest(score_func=f_classif, k=15)
k_best.fit(X, y)

# Get the selected features
X_k_best = X.loc[:, k_best.get_support()]

# Define the grid of hyperparameters to search
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 1, 10],
    'kernel': ['linear', 'rbf','sigmoid']
}

# Initialize SVM classifier
svm_tuned = SVC()

# Initialize grid search object
grid_search = GridSearchCV(estimator=svm_tuned, param_grid=param_grid, cv=5)

# Fit grid search object to the data
grid_search.fit(X_k_best, y)

# Print best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Initialize SVM classifier with best hyperparameters
svm_tuned = SVC(C=grid_search.best_params_['C'],
                gamma=grid_search.best_params_['gamma'],
                kernel=grid_search.best_params_['kernel'])

# Perform cross-validation and get evaluation metrics
scores = cross_validate(svm_tuned, X_k_best, y, cv=5, scoring=('accuracy', 'precision_macro', 'recall_macro', 'f1_macro'))

# Print evaluation metrics
print("Accuracy:", np.mean(scores['test_accuracy']))
print("Precision:", np.mean(scores['test_precision_macro']))
print("Recall:", np.mean(scores['test_recall_macro']))
print("F1 Score:", np.mean(scores['test_f1_macro']))


Best hyperparameters: {'C': 10, 'gamma': 0.1, 'kernel': 'linear'}
Accuracy: 0.705
Precision: 0.5244452916218192
Recall: 0.5251570325099737
F1 Score: 0.5115014467070819


# RF with ANOVA

In [None]:
from sklearn.feature_selection import f_classif, SelectKBest
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier

# Separate features and target
X = pd.DataFrame(df.drop('class', axis=1))
y = pd.DataFrame(df['class'])



# Use ANOVA to perform feature selection
k = 15  # Number of features to select
anova_selector = SelectKBest(score_func=f_classif, k=k)
X_anova = anova_selector.fit_transform(X, y.values.ravel())

# Print the selected features
selected_features = X.columns[anova_selector.get_support()]
print("Selected features: ", selected_features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_anova, y, test_size=0.3, random_state=0)

# Define the grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize random forest classifier
rfc = RandomForestClassifier()

# Initialize grid search object
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)

# Fit grid search object to the data
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Initialize random forest classifier with best hyperparameters
rfc_tuned = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'],
                                    max_depth=grid_search.best_params_['max_depth'],
                                    min_samples_split=grid_search.best_params_['min_samples_split'],
                                    min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                    )
# Make predictions using the test set
y_pred = grid_search.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: {:.6f}".format(accuracy))

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision: {:.6f}".format(precision))

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall: {:.6f}".format(recall))

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score: {:.6f}".format(f1))


Selected features:  Index(['age', 'al', 'rbc', 'pc', 'bu', 'sc', 'sod', 'hemo', 'pcv', 'rbcc',
       'htn', 'dm', 'appet', 'pe', 'ane'],
      dtype='object')
Best hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy score: 0.850000
Precision: 0.847803
Recall: 0.850000
F1 score: 0.846425


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
