In [None]:
import warnings
warnings.filterwarnings('ignore')

# Preprocessing

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the data
df = pd.read_csv('ckdb.csv')

# Replace '\t?' values with NaN
df = df.replace('\t?', np.nan)

# Convert numerical columns to float
num_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']
df[num_cols] = df[num_cols].astype(float)

# Fill missing values with median for numerical features
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill missing values with mode for nominal features
nom_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']
for col in nom_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Convert nominal features to numerical using label encoding
le = LabelEncoder()
for col in nom_cols:
    df[col] = le.fit_transform(df[col])

# Normalize numerical features using z-score normalization
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Rename the target column
df = df.rename(columns={'classification': 'target'})


# Sample data

In [None]:
df.head(10)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,target
0,0,-0.210031,0.254214,0.421486,0.076249,-0.380269,1,1,0,0,...,0.603224,-0.197314,0.550044,1,4,1,0,0,0,ckd
1,1,-2.627234,-1.972476,0.421486,2.363728,-0.380269,1,1,0,0,...,-0.132789,-0.909782,0.074073,0,3,1,0,0,0,ckd
2,2,0.615355,0.254214,-1.421074,0.838742,2.507853,1,1,0,0,...,-0.99147,-0.316059,0.074073,0,4,1,1,0,1,ckd
3,3,-0.210031,-0.488016,-2.342354,2.363728,-0.380269,1,0,1,0,...,-0.868801,-0.632711,-0.996862,1,3,1,1,1,1,ckd
4,4,-0.033163,0.254214,-1.421074,0.838742,-0.380269,1,1,0,0,...,-0.500795,-0.395222,-0.163913,0,3,1,0,0,0,ckd
5,5,0.497443,0.996444,-0.499794,1.601235,-0.380269,1,1,0,0,...,-0.01012,-0.197314,-0.401898,1,4,1,0,1,0,ckd
6,6,0.969092,-0.488016,-1.421074,-0.686244,-0.380269,1,1,0,0,...,-0.378126,-0.118151,0.074073,0,3,1,0,0,0,ckd
7,7,-1.624979,0.254214,-0.499794,0.838742,3.47056,1,0,0,0,...,0.603224,-0.553548,0.312059,0,4,1,0,1,0,ckd
8,8,0.025793,1.738674,-0.499794,1.601235,-0.380269,1,0,1,0,...,-0.746133,0.515154,-0.87787,1,4,1,0,0,1,ckd
9,9,0.084749,0.996444,0.421486,0.838742,-0.380269,0,0,1,0,...,-1.236808,1.504692,-1.234848,1,4,1,1,0,1,ckd


# Decision tree with RFECV

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Separate features and target
X = pd.DataFrame(df.drop('target', axis=1))
y = pd.DataFrame(df['target'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize decision tree classifier
dtc = DecisionTreeClassifier()

# Define the grid of hyperparameters to search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize grid search object
grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5)

# Fit grid search object to the data
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Initialize decision tree classifier with best hyperparameters
dtc = DecisionTreeClassifier(criterion=grid_search.best_params_['criterion'],
                             max_depth=grid_search.best_params_['max_depth'],
                             min_samples_split=grid_search.best_params_['min_samples_split'],
                             min_samples_leaf=grid_search.best_params_['min_samples_leaf'])

# Use RFECV to perform feature selection and cross-validation
rfecv = RFECV(estimator=dtc, step=1, cv=5, scoring='accuracy', min_features_to_select=7)
rfecv.fit(X_train, y_train.values.ravel())

# Print the optimal number of features selected
print("Optimal number of features: ", rfecv.n_features_)

# Print the selected features
print("Selected features: ", X_train.columns[rfecv.support_])

# Fit decision tree classifier with selected features
dtc.fit(X_train[X_train.columns[rfecv.support_]], y_train.values.ravel())

# Make predictions using the test set
y_pred = dtc.predict(X_test[X_test.columns[rfecv.support_]])

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy score: {:.6f}".format(accuracy))

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision: {:.6f}".format(precision))

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall: {:.6f}".format(recall))

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score: {:.6f}".format(f1))


Best hyperparameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 5}
Optimal number of features:  7
Selected features:  Index(['id', 'bu', 'hemo', 'pcv', 'wc', 'rc', 'htn'], dtype='object')
Accuracy score: 0.991667
Precision: 0.991775
Recall: 0.991667
F1 score: 0.991646


# SVM with RFECV

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_validate
# Define a pipeline for SVM with feature selection and hyperparameter tuning
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selection', RFECV(estimator=SVC(kernel='linear'), cv=5)),
    ('svm', SVC())
])

# Define the grid of hyperparameters to search for SVM
svm_param_grid = {
    'svm__C': [10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1],
    'svm__kernel': ['linear', 'rbf']
}

# Initialize grid search object for SVM
svm_grid_search = GridSearchCV(svm_pipeline, svm_param_grid, cv=5)

# Fit grid search object to the data for SVM
svm_grid_search.fit(X_train, y_train.values.ravel())

# Print best hyperparameters for SVM
print("Best hyperparameters for SVM:", svm_grid_search.best_params_)

# Make predictions using the test set for SVM
svm_y_pred = svm_grid_search.predict(X_test)

# Calculate accuracy score for SVM
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print("SVM accuracy score: {:.6f}".format(svm_accuracy))

# Calculate precision
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision: {:.6f}".format(precision))

# Calculate recall
recall = recall_score(y_test, y_pred, average='weighted')
print("Recall: {:.6f}".format(recall))

# Calculate F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score: {:.6f}".format(f1))


Best hyperparameters for SVM: {'svm__C': 10, 'svm__gamma': 0.001, 'svm__kernel': 'rbf'}
SVM accuracy score: 0.991667
Precision: 0.991775
Recall: 0.991667
F1 score: 0.991646


# RF with RFECV

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Perform recursive feature elimination with cross-validation
rfecv = RFECV(estimator=RandomForestClassifier(n_estimators=100), cv=5)
X_rfecv = rfecv.fit_transform(X, y)

# Print the selected features
selected_features = X.columns[rfecv.support_]
print("Selected features:", list(selected_features))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_rfecv, y, test_size=0.2)

# Define the grid of hyperparameters to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize random forest classifier
rfc = RandomForestClassifier()

# Initialize grid search object
grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)

# Fit grid search object to the data
grid_search.fit(X_train, y_train)

# Print best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# Initialize random forest classifier with best hyperparameters
rfc_tuned = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'],
                                    max_depth=grid_search.best_params_['max_depth'],
                                    min_samples_split=grid_search.best_params_['min_samples_split'],
                                    min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                                    )

# Fit the classifier to the data
rfc_tuned.fit(X_train, y_train)

# Calculate accuracy score on test data
test_accuracy = rfc_tuned.score(X_test, y_test)

# Test the model
y_pred = rfc_tuned.predict(X_test)

# Print accuracy score, precision, recall and F1 score
print("Accuracy score:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 score:", f1_score(y_test, y_pred, average='macro'))

Selected features: ['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'appet', 'pe', 'ane']
Best hyperparameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy score: 0.9875
Precision: 0.6595744680851063
Recall: 0.6666666666666666
F1 score: 0.6630824372759857


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import output
output.eval_js('new Audio("http://commondatastorage.googleapis.com/codeskulptor-assets/week7-brrring.m4a").play()')