In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
%matplotlib inline


# Seaborn theme for the figures; other accepted values include 'darkgrid', 'white' and 'dark'.
sns.set_style(style='whitegrid')


# Read the dataset from the working directory into the frame used by every later cell.
DATA_PATH = "kidney_disease.csv"
ds = pd.read_csv(DATA_PATH)

In [None]:
# Bar chart of the fraction of missing values per feature, largest first.
from matplotlib import style
style.use("fivethirtyeight")
plt.figure(figsize=(15,5))

missing = ds.isna().sum().sort_values(ascending=False)

# Divide by the real row count rather than a hard-coded 400 so the fractions
# stay correct if the CSV changes size or rows are filtered beforehand.
(missing / len(ds)).plot(kind="bar", color="blue")
plt.ylabel("Fraction of rows missing")
plt.title("Missing values per feature")
plt.show()

In [None]:
# Show the distinct raw values of every object-dtype (categorical) column.
cat_col = [name for name, dtype in ds.dtypes.items() if dtype == 'object']
for col in cat_col:
    distinct_values = ds[col].unique()
    print('{} has {} values '.format(col, distinct_values))
    print('\n')

In [None]:

# Unify ambiguous categorical values: map the known tab/space-polluted
# variants onto their clean spelling and turn the '\t?' placeholder into NaN.
ds = ds.replace({'\t?': np.nan, '\tyes': 'yes', '\tno': 'no', '\t43': '43', ' yes': 'yes', 'ckd\t': 'ckd'})

# Convert pcv, wc and rc to numeric dtypes; values that cannot be parsed
# become NaN (errors='coerce').
for col in ['pcv', 'wc', 'rc']:
    ds[col] = pd.to_numeric(ds[col], errors='coerce')

# Fill missing categorical values ('normal'/'abnormal', 'present'/'notpresent',
# 'yes'/'no', ...) with each column's most frequent value.
# The result is assigned back to ds[col]: calling fillna(..., inplace=True) on
# ds[col] mutates an intermediate object, which is deprecated in pandas 2.x
# and leaves ds unchanged once Copy-on-Write is enabled.
for col in ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'classification']:
    ds[col] = ds[col].fillna(ds[col].mode()[0])

# Display the cleaned target column as a quick check.
ds['classification']


In [None]:
# Binary target: 'ckd' becomes 0, every other label (i.e. not ckd) becomes 1.
ds['classification'] = ds['classification'].ne('ckd').astype(int)
ds['classification']


In [7]:
def kde(col):
    """Plot the kernel density estimate of one column, one curve per class.

    Reads the module-level frame ``ds`` and colours the curves by its
    ``classification`` column (0 or 1 once the target has been encoded).

    Parameters
    ----------
    col : str
        Name of the column in ``ds`` whose distribution is drawn.
    """
    facets = sns.FacetGrid(ds, hue="classification", height=6, aspect=2)
    # FacetGrid.map returns the grid itself, so the legend call can be chained.
    facets.map(sns.kdeplot, col).add_legend()

In [None]:
# Split the columns by dtype: object columns are treated as categorical,
# every other column as numerical.
column_dtypes = ds.dtypes
num_col = [name for name, kind in column_dtypes.items() if kind != 'object']
cat_col = [name for name, kind in column_dtypes.items() if kind == 'object']


print(num_col, cat_col, sep='\n')

In [None]:
# KDE of the 'hemo' (hemoglobin) column, one curve per class (ckd / notckd).
# Observation from the plot: subjects with lower hemo levels have ckd.
kde(col="hemo")

In [None]:
# Draw one kernel density estimate figure for every numerical feature.
for num_feature in num_col:
    kde(col=num_feature)

In [None]:
# Percentage of missing values per column, rounded to two decimals.
ds.isna().sum().div(ds.shape[0]).mul(100.00).round(2)

In [12]:
# Fill missing values of a numerical feature by random-sample imputation.
# NOTE: despite the name (kept for existing callers) no mean is computed.
def fill_mean(ftr, random_state=42):
    """Replace the NaNs of ``ds[ftr]`` with values drawn from its observed entries.

    Each missing cell receives a value sampled at random from the non-missing
    values of the same column. The module-level frame ``ds`` is modified in
    place.

    Parameters
    ----------
    ftr : str
        Name of the column in ``ds`` to impute.
    random_state : int or None, default 42
        Seed forwarded to ``Series.sample`` so that re-running the notebook
        yields the same imputed values. Pass ``None`` for a fresh draw.

    Raises
    ------
    ValueError
        If the column has missing values but no observed value to draw from.
    """
    missing_mask = ds[ftr].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        return  # nothing to impute

    observed = ds[ftr].dropna()
    if observed.empty:
        raise ValueError(f"cannot impute {ftr!r}: the column has no observed values")

    sample = observed.sample(
        n_missing,
        # Sampling without replacement fails when more values are missing
        # than observed; only then fall back to sampling with replacement.
        replace=n_missing > len(observed),
        random_state=random_state,
    )
    # Re-label the drawn values with the positions of the missing cells so
    # the assignment below aligns on the index.
    sample.index = ds.index[missing_mask]
    ds.loc[missing_mask, ftr] = sample

# Fill missing values of a categorical feature with its mode.
def fill_mode(ftr):
    """Replace the NaNs of ``ds[ftr]`` with the column's most frequent value.

    The first entry of ``Series.mode()`` is used as the fill value, and the
    filled column is written back to the module-level frame ``ds``.

    Parameters
    ----------
    ftr : str
        Name of the column in ``ds`` to impute.
    """
    column = ds[ftr]
    most_frequent = column.mode()[0]
    ds[ftr] = column.fillna(most_frequent)

In [None]:
# Impute every column: categorical features through fill_mode, then
# numerical features through fill_mean.
for column_group, imputer in ((cat_col, fill_mode), (num_col, fill_mean)):
    for feature in column_group:
        imputer(feature)


# Summary of dtypes and non-null counts after imputation.
ds.info()

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder

In [15]:
# Integer-encode every categorical (object-dtype) feature. fit_transform
# refits the shared encoder on each column, so after the loop `encoder` only
# holds the classes of the last column processed.
encoder = LabelEncoder()
for col in cat_col:
    ds[col] = encoder.fit_transform(ds[col])

# Target column, plus columns that must not be used as predictors.
# NOTE(review): "id" is assumed to be a row identifier rather than a clinical
# measurement -- confirm against the CSV. Leaving such a column in the feature
# matrix lets the models learn row order instead of the measurements. A name
# listed here that is absent from ds is simply ignored.
dependent_col = "classification"
excluded_cols = {dependent_col, "id"}
independent_col = [col for col in ds.columns if col not in excluded_cols]

X = ds[independent_col]
y = ds[dependent_col]

In [None]:
# Preview the first 20 rows of the feature matrix.
X.head(n=20)


In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

# Fit a 100-tree random forest with a fixed seed and predict the hold-out rows.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Score the predictions; every scorer is called with its default arguments.
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Persist the four scores, formatted to four decimals, as a small text report.
with open('random_forest_metrics.txt', 'w') as metrics_file:
    metrics_file.writelines([
        "Random Forest Metrics:\n",
        f"Accuracy: {rf_accuracy:.4f}\n",
        f"Precision: {rf_precision:.4f}\n",
        f"Recall: {rf_recall:.4f}\n",
        f"F1-Score: {rf_f1:.4f}\n",
    ])

# Confusion matrix and per-class report for the same predictions.
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(cm)
print("Classification Report", cr)


In [None]:
# Visualise the first decision tree of the fitted random forest.
tree = rf.estimators_[0]

# Labels in the order of the encoded target: 0 -> "ckd", 1 -> "notckd".
class_names = ["ckd", "notckd"]
plt.figure(figsize=(20,10))
# class_names is handed to plot_tree so every node is annotated with its
# majority class; without it the labels above are never shown.
plot_tree(
    tree,
    feature_names=independent_col,
    class_names=class_names,
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.show()

In [None]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# 1. Separate data by class
def separate_by_class(X, y):
    """Group the rows of ``X`` by their class label.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows; indexed with a boolean mask built from ``y``.
    y : array-like
        Class label of each row of ``X``.

    Returns
    -------
    dict
        Maps each distinct label in ``y`` to the matching rows of ``X`` as a
        NumPy array (``.values``).
    """
    rows_by_label = {}
    for label in np.unique(y):
        rows_by_label[label] = X[y == label].values
    return rows_by_label

# 2. Calculate Mean and Variance for each feature in each class
def calculate_mean_variance(separated_by_class):
    """Compute per-feature mean and variance for every class.

    Parameters
    ----------
    separated_by_class : dict
        Maps a class label to a 2-D array of that class's rows.

    Returns
    -------
    dict
        Maps each class label to a ``(mean, variance)`` tuple, both taken
        column-wise (``axis=0``). ``1e-9`` is added to every variance so that
        later divisions by the variance cannot hit zero.
    """
    return {
        label: (np.mean(rows, axis=0), np.var(rows, axis=0) + 1e-9)
        for label, rows in separated_by_class.items()
    }

# 3. Calculate Prior Probabilities for each class
def calculate_prior(y):
    """Return the relative frequency of every class label in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels.

    Returns
    -------
    dict
        Maps each distinct label to ``count / len(y)``.
    """
    labels, counts = np.unique(y, return_counts=True)
    total = len(y)
    return dict(zip(labels, (count / total for count in counts)))

# 4. Gaussian Probability Density Function
def gaussian_probability(x, mean, variance):
    """Evaluate the normal density with the given mean and variance at ``x``.

    Computes ``exp(-(x - mean)^2 / (2 * variance)) / sqrt(2 * pi * variance)``.
    The arithmetic is plain NumPy, so scalars and arrays are both accepted.
    """
    normaliser = 1 / np.sqrt(2 * np.pi * variance)
    squared_distance = (x - mean) ** 2
    return normaliser * np.exp(-squared_distance / (2 * variance))

# 5. Calculate Class Probabilities
def calculate_class_probabilities(input_data, mean_variance, priors):
    """Return the unnormalised log-posterior of every class for one sample.

    For each class the score is ``log(prior)`` plus the sum over features of
    the Gaussian log-density. The log-density is evaluated analytically,

        -0.5 * log(2 * pi * var) - (x - mean)^2 / (2 * var),

    instead of taking ``log`` of the density: for a value far from the class
    mean the density underflows to 0 and its log becomes ``-inf``, which makes
    the classes indistinguishable.

    Parameters
    ----------
    input_data : array-like
        Feature values of one sample, same length as each class's ``mean``.
    mean_variance : dict
        Maps a class label to a ``(mean, variance)`` pair of per-feature arrays.
    priors : dict
        Maps a class label to its prior probability.

    Returns
    -------
    dict
        Maps each class label to its log-score; larger means more likely.
    """
    x = np.asarray(input_data, dtype=float)
    probabilities = {}
    for class_value, (mean, variance) in mean_variance.items():
        mean = np.asarray(mean, dtype=float)
        variance = np.asarray(variance, dtype=float)
        log_likelihood = (
            -0.5 * np.log(2 * np.pi * variance)
            - (x - mean) ** 2 / (2 * variance)
        )
        # Start from the log-prior and add the per-feature log-likelihoods.
        probabilities[class_value] = np.log(priors[class_value]) + np.sum(log_likelihood)
    return probabilities

# 6. Prediction
def predict(input_data, mean_variance, priors):
    """Return the class label with the highest score for one sample.

    Scores come from ``calculate_class_probabilities``; when several classes
    tie, the one encountered first in the score dictionary wins.
    """
    scores = calculate_class_probabilities(input_data, mean_variance, priors)
    best_class, _ = max(scores.items(), key=lambda item: item[1])
    return best_class

# 7. Naive Bayes Training Function
def train_naive_bayes(X, y):
    """Fit the Naive Bayes parameters.

    Returns
    -------
    tuple
        ``(mean_variance, priors)``: the per-class feature statistics from
        ``calculate_mean_variance`` and the class priors from
        ``calculate_prior``.
    """
    class_stats = calculate_mean_variance(separate_by_class(X, y))
    return class_stats, calculate_prior(y)

# 8. Predict Multiple Samples
def predict_multiple(X, mean_variance, priors):
    """Predict a class label for every row of ``X``.

    ``X`` must expose ``.values`` (e.g. a pandas DataFrame); each row is
    passed to ``predict``. Returns the labels as a NumPy array.
    """
    labels = []
    for row in X.values:
        labels.append(predict(row, mean_variance, priors))
    return np.array(labels)

# Fit the hand-written Naive Bayes on the training split and predict the
# hold-out rows.
mean_variance, priors = train_naive_bayes(X_train, y_train)
predictions = predict_multiple(X_test, mean_variance, priors)

# Accuracy is the share of matching labels; the other scores use
# scikit-learn with default arguments.
nb_accuracy = np.mean(predictions == y_test)
nb_precision, nb_recall, nb_f1 = (
    scorer(y_test, predictions)
    for scorer in (precision_score, recall_score, f1_score)
)

# Persist the four scores, formatted to four decimals, as a small text report.
with open('naive_bayes_metrics.txt', 'w') as metrics_file:
    metrics_file.writelines([
        "Naive Bayes Metrics:\n",
        f"Accuracy: {nb_accuracy:.4f}\n",
        f"Precision: {nb_precision:.4f}\n",
        f"Recall: {nb_recall:.4f}\n",
        f"F1-Score: {nb_f1:.4f}\n",
    ])



In [None]:
# Logistic Regression Implementation
from sklearn.metrics import precision_score, recall_score, f1_score

# 1. Sigmoid Function
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    ``exp`` is only ever applied to ``-|z|``, so it cannot overflow: for
    ``z >= 0`` the usual form ``1 / (1 + exp(-z))`` is used and for ``z < 0``
    the algebraically equal ``exp(z) / (1 + exp(z))``.

    Parameters
    ----------
    z : float or array-like
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for scalar input, otherwise an array shaped like ``z``.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # at most 1, never overflows
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple turns a 0-d array into a scalar and leaves
    # arrays of higher rank unchanged.
    return result[()]

# 2. Cost Function
def compute_cost(X, y, weights):
    """Mean binary cross-entropy of a logistic model.

    Mathematically equal to ``-(1/m) * sum(y*log(h) + (1-y)*log(1-h))`` with
    ``h = sigmoid(X @ weights)``, but evaluated as
    ``mean(log(1 + exp(z)) - y * z)`` with ``z = X @ weights``. This form never
    takes ``log(0)``, so a saturated prediction yields a large finite cost
    instead of ``nan`` or ``inf``.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix (including the bias column if one is used).
    y : array-like, shape (m,)
        Binary targets (0 or 1).
    weights : array-like, shape (n,)
        Model coefficients.

    Returns
    -------
    float
        Average cost over the ``m`` samples.
    """
    m = len(y)
    z = np.dot(X, weights)
    targets = np.asarray(y, dtype=float)
    # logaddexp(0, z) == log(1 + exp(z)), computed without overflow.
    cost = np.sum(np.logaddexp(0.0, z) - targets * z) / m
    return cost

# 3. Gradient Descent
def gradient_descent(X, y, weights, learning_rate, n_iterations):
    """Run batch gradient descent for logistic regression.

    Each iteration moves the weights against the gradient
    ``X.T @ (sigmoid(X @ weights) - y) / m`` and records the cost of the
    updated weights.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix.
    y : array-like, shape (m,)
        Binary targets.
    weights : array-like, shape (n,)
        Starting coefficients; not modified in place.
    learning_rate : float
        Step size applied to the gradient.
    n_iterations : int
        Number of update steps.

    Returns
    -------
    tuple
        ``(weights, cost_history)``: the final coefficients and a list with
        one cost value per iteration.
    """
    n_samples = len(y)
    costs = []

    for _step in range(n_iterations):
        residual = sigmoid(np.dot(X, weights)) - y
        step = np.dot(X.T, residual) / n_samples
        weights = weights - learning_rate * step
        costs.append(compute_cost(X, y, weights))

    return weights, costs

# 4. Predict Function
def predict_logistic(X, weights, threshold=0.5):
    """Classify rows of ``X`` with a fitted logistic model.

    A row is labelled 1 when ``sigmoid(X @ weights)`` is at least
    ``threshold`` and 0 otherwise. ``X`` must already contain the bias column
    if the weights were trained with one.
    """
    linear_scores = np.dot(X, weights)
    is_positive = sigmoid(linear_scores) >= threshold
    return is_positive.astype(int)

# 5. Train Logistic Regression
def train_logistic_regression(X, y, learning_rate=0.01, n_iterations=1000):
    """Fit logistic regression with batch gradient descent.

    A column of ones is prepended to ``X`` as the bias term and the weights
    start at zero.

    Returns
    -------
    tuple
        ``(weights, cost_history)`` as produced by ``gradient_descent``; the
        first weight belongs to the bias column.
    """
    bias_column = np.ones((X.shape[0], 1))
    design_matrix = np.c_[bias_column, X]
    initial_weights = np.zeros(design_matrix.shape[1])

    return gradient_descent(design_matrix, y, initial_weights, learning_rate, n_iterations)

# Standardise the features with statistics taken from the training split only
# and reuse them for the test split. Gradient descent with a fixed 0.01 step
# is otherwise run on the raw columns, which can saturate the sigmoid and
# stall or oscillate when feature scales differ widely.
feature_mean = X_train.mean(axis=0)
feature_std = X_train.std(axis=0).replace(0, 1)  # constant columns: avoid dividing by zero
X_train_scaled = (X_train - feature_mean) / feature_std
X_test_scaled = (X_test - feature_mean) / feature_std

# Train the logistic regression model
weights, cost_history = train_logistic_regression(X_train_scaled, y_train)

# Add bias term to test data (must mirror what training does internally)
X_test_bias = np.c_[np.ones((X_test_scaled.shape[0], 1)), X_test_scaled]

# Make predictions
lr_predictions = predict_logistic(X_test_bias, weights)

# Calculate metrics
lr_accuracy = np.mean(lr_predictions == y_test)
lr_precision = precision_score(y_test, lr_predictions)
lr_recall = recall_score(y_test, lr_predictions)
lr_f1 = f1_score(y_test, lr_predictions)

# Write metrics to file
with open('logistic_regression_metrics.txt', 'w') as f:
    f.write("Logistic Regression Metrics\n")
    f.write("-" * 30 + "\n")
    f.write(f"Accuracy: {lr_accuracy:.4f}\n")
    f.write(f"Precision: {lr_precision:.4f}\n")
    f.write(f"Recall: {lr_recall:.4f}\n")
    f.write(f"F1-Score: {lr_f1:.4f}\n")

print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")


In [None]:
# Print the four scores of every model under a common header.
print("Comparing Model Performance")
print("-" * 50)

metric_labels = ("Accuracy", "Precision", "Recall", "F1-Score")
model_scores = [
    ("Logistic Regression:", (lr_accuracy, lr_precision, lr_recall, lr_f1)),
    ("Naive Bayes:", (nb_accuracy, nb_precision, nb_recall, nb_f1)),
    ("Random Forest:", (rf_accuracy, rf_precision, rf_recall, rf_f1)),
]

for position, (heading, scores) in enumerate(model_scores):
    if position:
        print()  # blank line between consecutive models
    print(heading)
    for label, score in zip(metric_labels, scores):
        print(f"{label}: {score:.4f}")

# Visualize comparison with bar plot
import matplotlib.pyplot as plt

# Grouped bar chart: one group per metric, one bar per model.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
lr_scores = [lr_accuracy, lr_precision, lr_recall, lr_f1]
nb_scores = [nb_accuracy, nb_precision, nb_recall, nb_f1]
rf_scores = [rf_accuracy, rf_precision, rf_recall, rf_f1]

x = np.arange(len(metrics))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 6))
# Shift each model's bars by one bar width so the three sit side by side.
for offset, scores, label in (
    (-width, lr_scores, 'Logistic Regression'),
    (0, nb_scores, 'Naive Bayes'),
    (width, rf_scores, 'Random Forest'),
):
    ax.bar(x + offset, scores, width, label=label)

ax.set(ylabel='Scores', title='Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

plt.tight_layout()
plt.show()
