In [2]:
# Data manipulation and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn libraries for models and evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Machine Learning Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression


In [4]:
import os

import pandas as pd

# Raw GitHub URL of the dataset. The access token that used to be embedded in
# this string has been removed: a credential saved in a notebook is readable by
# anyone who can open the file, so the old token should be treated as leaked
# and revoked.
url = "https://raw.githubusercontent.com/sb2043/DM-ML-portfolio/refs/heads/main/data/raw/Nominal%20Dataset/new_model.csv"

# A token is supplied at runtime through the GITHUB_RAW_TOKEN environment
# variable; when it is unset the bare URL is requested.
github_token = os.environ.get("GITHUB_RAW_TOKEN")
if github_token:
    url = f"{url}?token={github_token}"

data = pd.read_csv(url)
print(data.head())


     Bp     Sg   Al   Su  Rbc    Bu   Sc     Sod   Pot  Hemo    Wbcc  Rbcc  \
0  80.0  1.020  1.0  0.0  1.0  36.0  1.2  137.53  4.63  15.4  7800.0  5.20   
1  50.0  1.020  4.0  0.0  1.0  18.0  0.8  137.53  4.63  11.3  6000.0  4.71   
2  80.0  1.010  2.0  3.0  1.0  53.0  1.8  137.53  4.63   9.6  7500.0  4.71   
3  70.0  1.005  4.0  0.0  1.0  56.0  3.8  111.00  2.50  11.2  6700.0  3.90   
4  80.0  1.010  2.0  0.0  1.0  26.0  1.4  137.53  4.63  11.6  7300.0  4.60   

   Htn  Class  
0  1.0      1  
1  0.0      1  
2  0.0      1  
3  1.0      1  
4  0.0      1  


In [None]:
# Build the working DataFrame `kd_data` from the bytes stored under the key
# 'new_model.csv' in `uploaded`.
# NOTE(review): `uploaded` is not defined in any cell visible here — presumably
# it is produced by an earlier file-upload step; confirm it exists on a fresh
# kernel before relying on Restart & Run All.
import io
kd_data = pd.read_csv(io.BytesIO(uploaded['new_model.csv']))

In [None]:
# Overview of the loaded frame: column names, dtypes and non-null counts.
kd_data.info()

In [None]:
# (number of rows, number of columns) of the dataset.
kd_data.shape

In [None]:
# Preview the first five rows of the dataset.
kd_data.head()

In [None]:
# Data-quality check: how many rows are exact repeats of an earlier row, and
# how many null entries each column holds.
duplicates = kd_data.duplicated().sum()
missing_values = kd_data.isna().sum()

print("Missing values per column:\n", missing_values)
print("\nNumber of duplicate rows:", duplicates)


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
kd_data.describe()

In [None]:
# Replace readings that fall outside a plausible range (i.e. erroneous,
# implausible values) with the median of the remaining values in that column.


def _replace_implausible_with_median(values, lower=None, upper=None):
    """Return `values` with out-of-range and missing entries set to the median.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to clean.
    lower : float, optional
        Entries strictly below this bound are treated as implausible.
    upper : float, optional
        Entries strictly above this bound are treated as implausible.

    Returns
    -------
    pandas.Series
        A new Series in which implausible entries, and entries that were
        already missing, are replaced by the median of the plausible entries.
    """
    plausible = values
    # `mask` blanks the flagged entries with NaN and keeps a numeric dtype;
    # inserting pd.NA through `apply` would turn a float column into object
    # dtype and break later numeric summaries.
    if lower is not None:
        plausible = plausible.mask(plausible < lower)
    if upper is not None:
        plausible = plausible.mask(plausible > upper)
    # The median is taken after masking, so implausible readings do not
    # influence the replacement value.
    return plausible.fillna(plausible.median())


# Reload the original data to reset any prior transformations, so re-running
# this cell always starts from the raw values.
kd_data = pd.read_csv('new_model.csv')

# Column -> plausible range. Only the listed side of each range is enforced.
PLAUSIBLE_BOUNDS = {
    'Bu':   {'lower': 7},    # values lower than 7 are biologically implausible
    'Sc':   {'upper': 15},
    'Sod':  {'lower': 120},  # lower bound only (no upper bound)
    'Pot':  {'upper': 10},
    'Hemo': {'lower': 6},
    'Rbcc': {'lower': 3.5},
}

for column, bounds in PLAUSIBLE_BOUNDS.items():
    # Assign the cleaned column back explicitly: `df[col].fillna(..., inplace=True)`
    # works on an intermediate object and is not guaranteed to update the frame.
    kd_data[column] = _replace_implausible_with_median(kd_data[column], **bounds)


In [None]:
# Summary statistics again, to inspect the columns after the implausible-value
# replacement performed in the preceding cell.
kd_data.describe()

In [None]:
# Number of distinct values in each column.
kd_data.nunique()

In [None]:
# Print each column name followed by the distinct values found in that column.
# (`coulumns` keeps its original spelling in case other cells refer to it.)
coulumns = kd_data.columns

for col in coulumns:
    print(col)
    # `unique` has to be called: printing the bare attribute only shows the
    # bound-method repr, not the distinct values.
    print(kd_data[col].unique())

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
correlation_matrix = kd_data.corr()
column_labels = correlation_matrix.columns

plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    xticklabels=column_labels,
    yticklabels=column_labels,
)
plt.title('Correlation Heatmap')
plt.show()

# Correlation of every column with the target column 'Class',
# listed from the largest value down to the smallest ...
target_corr = correlation_matrix['Class'].sort_values(ascending=False)
print(target_corr)

print("X*X*X*X*X*X*X")

# ... and the same values listed from the smallest up to the largest.
target_corr = correlation_matrix['Class'].sort_values()
print(target_corr)

In [None]:
# One 20-bin histogram per numerical column, all on a single 15x15 figure.
kd_data.hist(figsize=(15, 15), bins=20)
# The figure created by `hist` is the current figure, so title it directly.
plt.gcf().suptitle('Distribution of Features')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Boxplot of Hemo for each Class, with a finely graduated y-axis.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=kd_data, x='Class', y='Hemo', ax=ax)
ax.set_title('Hemo vs Class')

# Ticks every 0.5 from 6 up to 18.5 (np.arange stops before 19).
ax.set_yticks(np.arange(6, 19, 0.5))

plt.show()

# Median of Hemo within each Class, printed beneath the figure.
median_hemo = kd_data.groupby('Class')['Hemo'].median()
print("Median Hemoglobin by Class:\n", median_hemo)


In [None]:
# Boxplot of Sc (serum creatinine) for each Class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=kd_data, x='Class', y='Sc', ax=ax)
ax.set(
    title='Serum Creatinine vs Class',
    xlabel='Class',
    ylabel='Serum Creatinine (mg/dL)',
)
# Ticks every 0.5, starting at 0 and stopping before (largest Sc value + 1).
sc_ticks = np.arange(0, kd_data['Sc'].max() + 1, 0.5)
ax.set_yticks(sc_ticks)
plt.show()

# Median of Sc within each Class.
median_sc = kd_data.groupby('Class')['Sc'].median()
print("Median Serum Creatinine by Class:\n", median_sc)

In [None]:
# Boxplot of Rbcc (red blood cell count) for each Class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=kd_data, x='Class', y='Rbcc', ax=ax)
ax.set(
    title='Red Blood Cell Count vs Class',
    xlabel='Class',
    ylabel='Red Blood Cell Count (millions/µL)',
)
# Ticks every 0.2, starting at 3.0 and stopping before (largest Rbcc + 0.5).
rbcc_ticks = np.arange(3.0, kd_data['Rbcc'].max() + 0.5, 0.2)
ax.set_yticks(rbcc_ticks)
plt.show()

# Median of Rbcc within each Class.
median_rbcc = kd_data.groupby('Class')['Rbcc'].median()
print("Median Red Blood Cell Count by Class:\n", median_rbcc)

In [None]:
# Boxplot of Sod (sodium) for each Class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=kd_data, x='Class', y='Sod', ax=ax)
ax.set(
    title='Sodium vs Class',
    xlabel='Class',
    ylabel='Sodium (mmol/L)',
)
# Ticks every 1 unit, starting at 120 and stopping before (largest Sod + 1).
sod_ticks = np.arange(120, kd_data['Sod'].max() + 1, 1)
ax.set_yticks(sod_ticks)
plt.show()

# Median of Sod within each Class.
median_sod = kd_data.groupby('Class')['Sod'].median()
print("Median Sodium by Class:\n", median_sod)

In [None]:
# Boxplot of Htn (hypertension flag) for each Class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=kd_data, x='Class', y='Htn', ax=ax)
ax.set(
    title='Hypertension Presence vs Class',
    xlabel='Class',
    ylabel='Hypertension Presence (0/1)',
)
# Only three ticks are needed for a 0/1 variable.
ax.set_yticks([0, 0.5, 1])
plt.show()


# Median of Htn within each Class.
median_htn = kd_data.groupby('Class')['Htn'].median()
print("Median Hypertension Presence by Class:\n", median_htn)

In [None]:
# Boxplot of Al (albumin in urine) for each Class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=kd_data, x='Class', y='Al', ax=ax)
ax.set(
    title='Albumin in Urine vs Class',
    xlabel='Class',
    ylabel='Albumin Level',
)
# Ticks every 1 unit, starting at 0 and stopping before (largest Al + 1).
al_ticks = np.arange(0, kd_data['Al'].max() + 1, 1)
ax.set_yticks(al_ticks)
plt.show()


# Median of Al within each Class.
median_al = kd_data.groupby('Class')['Al'].median()
print("Median Albumin by Class:\n", median_al)


In [None]:
# Boxplot of Bu (blood urea) for each Class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=kd_data, x='Class', y='Bu', ax=ax)
ax.set(
    title='Blood Urea vs Class',
    xlabel='Class',
    ylabel='Blood Urea (mg/dL)',
)
# Ticks every 10 units, starting at 0 and stopping before (largest Bu + 5).
bu_ticks = np.arange(0, kd_data['Bu'].max() + 5, 10)
ax.set_yticks(bu_ticks)
plt.show()

# Median of Bu within each Class.
median_bu = kd_data.groupby('Class')['Bu'].median()
print("Median Blood Urea by Class:\n", median_bu)

In [None]:

# Boxplot of Bu (blood urea) for each Class.
# NOTE(review): this cell repeats the Blood Urea boxplot of the preceding cell;
# the only difference is the upper limit passed to np.arange for the y ticks
# (max + 10 here instead of max + 5). Consider keeping just one of the two.
plt.figure(figsize=(8, 6))
sns.boxplot(x='Class', y='Bu', data=kd_data)  # kd_data is the source DataFrame
plt.title('Blood Urea vs Class')
plt.xlabel('Class')
plt.ylabel('Blood Urea (mg/dL)')
plt.yticks(np.arange(0, kd_data['Bu'].max() + 10, 10))  # Ticks in steps of 10 for a less crowded axis
plt.show()

# Calculate and print median values for Bu grouped by Class
median_bu = kd_data.groupby('Class')['Bu'].median()
print("Median Blood Urea by Class:\n", median_bu)


In [None]:
# Sc on the x-axis against Bu on the y-axis, points coloured by Class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=kd_data, x='Sc', y='Bu', hue='Class', palette='Set1', ax=ax)
ax.set(
    title='Serum Creatinine vs Blood Urea',
    xlabel='Serum Creatinine (mg/dL)',
    ylabel='Blood Urea (mg/dL)',
)
plt.show()


In [None]:
# Rbcc on the x-axis against Hemo on the y-axis, points coloured by Class.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=kd_data, x='Rbcc', y='Hemo', hue='Class', palette='Set1', ax=ax)
ax.set(
    title='Red Blood Cell Count vs Hemoglobin',
    xlabel='Red Blood Cell Count (millions/µL)',
    ylabel='Hemoglobin (g/dL)',
)
plt.show()



In [None]:
# Lower-triangle grid of pairwise scatter plots for every column, coloured by
# Class, with a KDE curve on each diagonal panel.
sns.pairplot(
    data=kd_data,
    hue='Class',
    corner=True,
    diag_kind='kde',
)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns shown in the 2 x 3 grid, in reading order.
features_to_plot = ['Bp', 'Bu', 'Sc', 'Sod', 'Pot', 'Hemo']

# One panel per feature: 30-bin histogram with a KDE curve overlaid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

for ax, feature in zip(axes.flat, features_to_plot):
    sns.histplot(kd_data[feature], kde=True, bins=30, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Four views of the cleaned frame `kd_data`: class totals, class counts split
# by hypertension, feature distributions by hypertension, and feature means by
# class and hypertension.

# 1. Total Records for Non-CKD and CKD
plt.figure(figsize=(8, 6))
# Sort by class label so the counts are in the same left-to-right order in
# which seaborn draws numeric categories. `value_counts()` alone orders by
# frequency, which put each text label above the wrong bar whenever the larger
# class label was also the more frequent one.
class_counts = kd_data['Class'].value_counts().sort_index()
sns.barplot(x=class_counts.index, y=class_counts.values, palette='Set2')
for i, value in enumerate(class_counts.values):
    # Write the count slightly above the top of bar number i.
    plt.text(i, value + 3, str(value), ha='center', fontsize=12)
plt.title('Total Records for Non-CKD and CKD')
plt.xlabel('Class')
plt.ylabel('Total Count')
plt.show()

# 2. Number of Non-CKD and CKD by Hypertension
plt.figure(figsize=(8, 6))
sns.countplot(x='Class', hue='Htn', data=kd_data, palette="Set2")
plt.title('Number of Non-CKD and CKD by Hypertension')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

# 3. Histogram distribution of each feature, split by Hypertension (Htn as hue)
features_to_plot = ['Bp', 'Bu', 'Sc', 'Sod', 'Pot', 'Hemo']

plt.figure(figsize=(15, 10))
for i, feature in enumerate(features_to_plot):
    plt.subplot(2, 3, i + 1)
    sns.histplot(kd_data, x=feature, hue='Htn', kde=True, bins=30, palette="Set2")
    plt.title(f'Distribution of {feature} by Hypertension')
    plt.xlabel(feature)
    plt.ylabel('Count')

plt.tight_layout()
plt.show()

# 4. Barplot showing mean values of features by Class and Hypertension
plt.figure(figsize=(15, 10))
for i, feature in enumerate(features_to_plot):
    plt.subplot(2, 3, i + 1)
    sns.barplot(x='Htn', y=feature, hue='Class', data=kd_data, palette="Set2")
    plt.title(f'Mean {feature} by Class and Hypertension')
    plt.xlabel('Hypertension (Htn)')
    plt.ylabel(f'Mean {feature}')

plt.tight_layout()
plt.show()
