### Visualizing Predictions of Chronic Kidney Disease: A Machine Learning Approach

##### Student Name: Shreemithra Naveen, Student ID: 23895041
##### Student Name: Sai Laasya Gorantla, Student ID: 23884136




##### Importing the libraries

In [2]:
#Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier


In [3]:
# Load the raw kidney-disease records into the `disease` dataframe.
# NOTE(review): the recorded run of this cell raised FileNotFoundError, so
# 'kidney_disease.csv' has to be present in the notebook's working directory
# before any later cell can run.
disease = pd.read_csv('kidney_disease.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'kidney_disease.csv'

In [None]:
# Report the (rows, columns) dimensions of the freshly loaded dataframe
print(f"Shape of the disease dataframe {disease.shape}")

In [None]:
# First five rows. Leaving the frame as the cell's last expression lets the notebook
# render it as a formatted table; print() would fall back to wrapped plain text.
disease.head()

In [None]:
# Remove the 'id' column; the result is rebound to `disease`
disease = disease.drop(columns=['id'])

In [None]:
# Replace the column labels with descriptive names.
# The 25 names are assigned by position, so they must follow the dataframe's
# current column order exactly.
readable_names = [
    'age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar',
    'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
    'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
    'potassium', 'haemoglobin', 'packed_cell_volume',
    'white_blood_cell_count', 'red_blood_cell_count', 'hypertension',
    'diabetes_mellitus', 'coronary_artery_disease', 'appetite',
    'peda_edema', 'aanemia', 'class',
]
disease.columns = readable_names

In [None]:
# Distinct labels in the target column. `unique` must be called; without the
# parentheses the cell prints the bound method object instead of the values.
print(disease['class'].unique())

In [None]:
# Information about the dataframe: prints a per-column summary (non-null counts and dtypes)
disease.info()

### Handling null values


In [None]:
# Number of missing values in each column
disease.isna().sum()

It is clear that every column has null values, so we need to drop the affected rows or impute the missing values.

In [None]:
# Understanding the summary statistics of the dataframe's columns
disease.describe()

##### Converting the white blood cell count, red blood cell count and packed_cell_volume columns to numeric values, as they are stored as object type



In [None]:
# Parse the three count/volume columns as numbers.
# errors='coerce' turns any entry that cannot be parsed into NaN instead of raising.
count_columns = ['packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count']
disease[count_columns] = disease[count_columns].apply(pd.to_numeric, errors='coerce')

In [None]:
# Rows missing any of these fields are removed outright; per the original note,
# each of these columns has only 1, 2 or 4 missing values.
sparse_null_columns = [
    'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite',
    'peda_edema', 'aanemia', 'bacteria', 'pus_cell_clumps',
]
disease.dropna(subset=sparse_null_columns, inplace=True)

In [None]:
# Numeric features whose missing entries are imputed with the column mean
numerical_columns_null = ['age','blood_pressure', 'specific_gravity', 'albumin' ,'sugar', 'blood_glucose_random' , 'blood_urea' , 'serum_creatinine', 'sodium', 'potassium' ,'haemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count']

# Fill all listed columns in one assignment back onto the frame.
# `disease[feature].fillna(..., inplace=True)` works on a temporary Series: pandas 2.x
# emits a chained-assignment FutureWarning for it, and with Copy-on-Write enabled the
# original dataframe is left unchanged.
# fillna with a Series of means fills each column using the mean stored under its label.
disease[numerical_columns_null] = disease[numerical_columns_null].fillna(
    disease[numerical_columns_null].mean()
)

In [None]:
# Re-check the per-column missing-value counts after the numeric imputation
disease.isna().sum()

In [None]:
# Categorical columns that still contain missing values
categorical_null = ['red_blood_cells', 'pus_cell']

values = ['normal', 'abnormal']
# A dedicated, seeded generator makes the drawn fill label identical on every run
# (an unseeded random.choice changes the imputed data between runs).
fill_rng = random.Random(42)
for feature in categorical_null:
    # Show the distinct labels and the number of missing entries before filling
    print(disease[feature].unique(), disease[feature].isna().sum())
    # One label is drawn per column and used for all of that column's missing entries.
    # The result is assigned back because an in-place fillna on `disease[feature]`
    # acts on a temporary Series and is not guaranteed to update the dataframe.
    disease[feature] = disease[feature].fillna(fill_rng.choice(values))


In [None]:
# Re-check the per-column missing-value counts after the categorical imputation
disease.isna().sum()

In [None]:
# Keep only the object-typed (categorical) columns and list their names
categorical = disease.select_dtypes(include=['object'])
categorical.columns

In [None]:
# Draw one count plot per categorical column on a 3 x 4 grid
plt.figure(figsize=(15, 15))
for position, column in enumerate(categorical.columns, start=1):
    # subplot positions are 1-based, hence start=1 above
    plt.subplot(3, 4, position)
    sns.countplot(x=disease[column])
plt.show()

In [None]:
## Cleaning - normalise labels that carry stray tab/space characters
print(disease['class'].unique())

# For each affected column: malformed label -> intended label
label_fixes = {
    'diabetes_mellitus': {'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'},
    'coronary_artery_disease': {'\tno': 'no'},
    'class': {'ckd\t': 'ckd'},
}
for column, fixes in label_fixes.items():
    disease[column] = disease[column].replace(to_replace=fixes)

print(disease['class'].unique())

In [None]:
# Encode the target as a binary numeric column: 'ckd' -> 0, 'notckd' -> 1
class_codes = {'ckd': 0, 'notckd': 1}

print(disease['class'].unique())
# Any label absent from class_codes becomes NaN in the mapped result
disease['class'] = pd.to_numeric(disease['class'].map(class_codes), errors='coerce')
print(disease['class'].unique())

disease['class'].dtype


In [None]:
# Every column that is not object-typed is treated as numerical
numerical_columns = disease.select_dtypes(exclude=['object'])

numerical_columns.columns

In [None]:
# Pairwise relationships between all numerical columns.
# sns.pairplot is a figure-level function that creates its own figure, so a preceding
# plt.figure(...) call only produces an extra, empty figure and does not size the grid
# (the per-panel size is controlled by pairplot's `height` argument instead).
sns.pairplot(numerical_columns)
plt.show()

Label encoder and column values

In [None]:
# Correlation matrix of the numerical columns, computed once and reused below
correlations = numerical_columns.corr()

plt.figure(figsize=(15,15))
# Correlations are continuous values in [-1, 1], so a diverging colormap centred on 0
# is used; a qualitative palette such as 'Set2' assigns unrelated colours to
# neighbouring values and hides both the sign and the strength of a correlation.
sns.heatmap(correlations, annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0)
plt.show()


# Show the same matrix as a table
correlations

In [None]:
#EDA

# Features whose per-class distributions are plotted
features = ['red_blood_cell_count', 'packed_cell_volume', 'haemoglobin', 'albumin', 'specific_gravity']
class_palette = ({0:"red", 1:"green"})

# Readable legend text for each encoded class, and the same colours keyed by that text
class_labels = {0: 'CKD (0)', 1: 'Non-CKD (1)'}
label_palette = {class_labels[code]: colour for code, colour in class_palette.items()}

# Plot against a labelled copy so seaborn builds the legend itself. Overriding it with
# plt.legend(labels=[...]) attaches the labels to artists in draw order, which is not
# guaranteed to match the hue order, so the class names can end up on the wrong colours.
plot_data = disease.assign(Class=disease['class'].map(class_labels))

for feature in features:
    fig, ax = plt.subplots(figsize=(6,4))
    sns.histplot(
        data=plot_data,
        x=feature,
        hue='Class',  # the hue column name doubles as the legend title
        hue_order=list(label_palette),
        kde=True,
        palette=label_palette,
        ax=ax,
    )
    ax.set_title(f'Distribution of {feature} by Class')
    ax.set_xlabel(feature)
    plt.show()
    


In [None]:
##Scatter plot for relationship
# plt.figure (lower-case) registers a new current figure with the requested size.
# plt.Figure(...) only instantiates the Figure class without making it current,
# so the scatter plot was drawn on a default-sized figure.
plt.figure(figsize=(6,4))
sns.scatterplot(data = disease, x = 'haemoglobin', y = 'red_blood_cell_count', hue= 'class')
plt.xlabel("haemoglobin")
plt.ylabel("red_blood_cell_count")
plt.title("Relationship between red_blood_cell_count vs haemoglobin")
plt.show()

In [None]:
##Scatter plot for relationship
# plt.figure (lower-case) registers a new current figure with the requested size.
# plt.Figure(...) only instantiates the Figure class without making it current,
# so the scatter plot was drawn on a default-sized figure.
plt.figure(figsize=(6,4))
sns.scatterplot(data = disease, x = 'packed_cell_volume', y = 'red_blood_cell_count', hue= 'class')
# Label and title spelling corrected ("volumne" -> "volume")
plt.xlabel("packed volume")
plt.ylabel("red_blood_cell_count")
plt.title("Relationship between red_blood_cell_count vs packed volume")
plt.show()

In [None]:
#Model Building

#The response and features
X = disease[['red_blood_cell_count', 'packed_cell_volume', 'haemoglobin', 'albumin', 'specific_gravity']]
y = disease['class']

#Splitting the data into train and test (80% / 20%)
# random_state fixes the shuffle so the reported metrics are reproducible on re-run;
# stratify=y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

#Knn model (default hyper-parameters)
# NOTE(review): KNN is distance based and the features are used unscaled here;
# consider standardising them before fitting.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Predict once and reuse the result for every metric
knn_pred = knn.predict(X_test)

# accuracy score, confusion matrix and classification report of knn
knn_acc = accuracy_score(y_test, knn_pred)
print(f"Test Accuracy of KNN {knn_acc} ")

print(f"Confusion Matrix : {confusion_matrix(y_test, knn_pred)}")
print(f"Classification Report : {classification_report(y_test, knn_pred)}")

In [None]:
#Decision Tree Classifier

# random_state makes the fitted tree, and therefore the reported metrics, repeatable
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

# Predict once and reuse the result for every metric
dtc_pred = dtc.predict(X_test)

# accuracy score, confusion matrix and classification report of decision tree

dtc_acc = accuracy_score(y_test, dtc_pred)
print(f"Accuracy of Decision Tree Classifier is {dtc_acc}")

print(f"Confusion Matrix {confusion_matrix(y_test, dtc_pred)}")
print(f"Classification Report {classification_report(y_test, dtc_pred)}")

Compared with KNN, the Decision Tree Classifier performs better, with an accuracy of 96%.

In [None]:
# Per-column summary (non-null counts and dtypes) of the dataframe at this point
disease.info()

In [None]:
# Categorical feature columns to be one-hot encoded.
# The names must match the dataframe's column labels exactly: the edema column was
# named 'peda_edema' when the columns were renamed, so 'pedal_edema' would make
# pd.get_dummies raise a KeyError.
categorical_cols = [
    'red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria',
    'hypertension', 'diabetes_mellitus', 'coronary_artery_disease',
    'appetite', 'peda_edema', 'aanemia'
]


In [None]:
# One-hot encode the columns listed in categorical_cols into a new frame, df_encoded.
# drop_first=True omits the first level of each encoded column.
df_encoded = pd.get_dummies(disease, columns=categorical_cols, drop_first=True)
