In [None]:
import warnings
warnings.filterwarnings('ignore')

# Preprocessing

# New Section

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the data
df = pd.read_csv('/content/drive/MyDrive/Dataset Folder/ckdb.csv')

# Replace '\t?' values with NaN
df = df.replace('\t?', np.nan)

# Convert numerical columns to float
num_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
df[num_cols] = df[num_cols].astype(float)

# Fill missing values with median for numerical features
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill missing values with mode for nominal features
nom_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in nom_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Convert nominal features to numerical using label encoding
le = LabelEncoder()
for col in nom_cols:
    df[col] = le.fit_transform(df[col])

# Normalize numerical features using z-score normalization
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Rename the target column
df = df.rename(columns={'classification': 'target'})


# Decision tree with anova

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Separate features and target
X = pd.DataFrame(df.drop('target', axis=1))
y = pd.DataFrame(df['target'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Perform ANOVA feature selection
k_best = SelectKBest(score_func=f_classif, k=7)
X_train_anova = k_best.fit_transform(X_train, y_train.values.ravel())

# Print the selected features
print("Selected features: ", X_train.columns[k_best.get_support()])

# Initialize decision tree classifier
dtc = DecisionTreeClassifier()

# Define the grid of hyperparameters to search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize grid search object
grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5)

# Fit grid search object to the data
grid_search.fit(X_train_anova, y_train)

# Print best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Initialize decision tree classifier with best hyperparameters
dtc = DecisionTreeClassifier(criterion=grid_search.best_params_['criterion'],
                             max_depth=grid_search.best_params_['max_depth'],
                             min_samples_split=grid_search.best_params_['min_samples_split'],
                             min_samples_leaf=grid_search.best_params_['min_samples_leaf'])

# Fit decision tree classifier with selected features
dtc.fit(X_train_anova, y_train.values.ravel())

# Make predictions using the test set
X_test_anova = k_best.transform(X_test)
y_pred = dtc.predict(X_test_anova)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: {:.6f}".format(accuracy))

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision: {:.6f}".format(precision))

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall: {:.6f}".format(recall))

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score: {:.6f}".format(f1))


Selected features:  Index(['id', 'sg', 'al', 'hemo', 'pcv', 'rc', 'htn'], dtype='object')
Best hyperparameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 5}
Accuracy score: 0.991667
Precision: 0.991775
Recall: 0.991667
F1 score: 0.991646


#SVM with ANOVA

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
# Perform ANOVA to select top k features
k = 5
anova = SelectKBest(score_func=f_classif, k=k)
X_train_anova = anova.fit_transform(X_train, y_train.values.ravel())
X_test_anova = anova.transform(X_test)

# Train SVM classifier on selected features
svm_pipeline_anova = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])

svm_param_grid_anova = {
    'svm__C': [10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1],
    'svm__kernel': ['linear', 'rbf']
}

svm_grid_search_anova = GridSearchCV(svm_pipeline_anova, svm_param_grid_anova, cv=5)

svm_grid_search_anova.fit(X_train_anova, y_train.values.ravel())

print("Best hyperparameters for SVM with ANOVA:", svm_grid_search_anova.best_params_)

# Make predictions using the test set for SVM with ANOVA
svm_y_pred_anova = svm_grid_search_anova.predict(X_test_anova)

# Calculate accuracy score for SVM with ANOVA
svm_accuracy_anova = accuracy_score(y_test, svm_y_pred_anova)
print("SVM with ANOVA accuracy score: {:.6f}".format(svm_accuracy_anova))

# Calculate precision
precision_anova = precision_score(y_test, svm_y_pred_anova, average='weighted')
print("Precision: {:.6f}".format(precision_anova))

# Calculate recall
recall_anova = recall_score(y_test, svm_y_pred_anova, average='weighted')
print("Recall: {:.6f}".format(recall_anova))

# Calculate F1 score
f1_anova = f1_score(y_test, svm_y_pred_anova, average='weighted')
print("F1 score: {:.6f}".format(f1_anova))


Best hyperparameters for SVM with ANOVA: {'svm__C': 10, 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
SVM with ANOVA accuracy score: 0.991667
Precision: 0.991852
Recall: 0.991667
F1 score: 0.991686


# Random Forest with ANOVA

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Perform feature selection using ANOVA F-test
selector = SelectKBest(f_classif, k=5)
X_anova = selector.fit_transform(X, y)

# Print the selected features
selected_features = X.columns[selector.get_support()]
print("Selected features:", list(selected_features))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_anova, y, test_size=0.3)
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)
# Define the grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize random forest classifier
rfc = RandomForestClassifier()

# Initialize grid search object
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)

# Fit grid search object to the data
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Initialize random forest classifier with best hyperparameters
rfc_tuned = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'],
                                    max_depth=grid_search.best_params_['max_depth'],
                                    min_samples_split=grid_search.best_params_['min_samples_split'],
                                    min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                    )

# Fit the classifier to the data
rfc_tuned.fit(X_train, y_train)

# Calculate accuracy score on test data
test_accuracy = rfc_tuned.score(X_test, y_test)

# Test the model
y_pred = rfc_tuned.predict(X_test)

# Print accuracy score, precision, recall and F1 score
print("Accuracy score:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))


Selected features: ['id', 'sg', 'hemo', 'pcv', 'htn']
Best hyperparameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy score: 0.9916666666666667
Precision: 0.9878048780487805
Recall: 0.99375
F1 score: 0.9906825064057768


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
