In [69]:
import numpy as np
import pandas as pd


from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split

In [70]:
df = pd.read_csv("set.csv", na_values=['?'])
df.columns

Index(['age', 'bp', 'sg', 'al', 'sugar', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'class'],
      dtype='object')

In [71]:
# Summarize the data :

def summarize_dataframe(df):
    summary = pd.DataFrame({
        'Column': df.columns,
        'Data Type': df.dtypes.values,
        'Missing %': df.isnull().mean().values * 100,
        'Unique Values': [df[col].nunique() if df[col].dtype == 'object' else None for col in df.columns],
        'Range': [f"{df[col].min()} - {df[col].max()}" if df[col].dtype != 'object' else None for col in df.columns]
    })

    return summary

summary = summarize_dataframe(df)
print(summary)



   Column Data Type  Missing %  Unique Values           Range
0     age   float64       2.25            NaN      2.0 - 90.0
1      bp   float64       3.00            NaN    50.0 - 180.0
2      sg   float64      11.75            NaN   1.005 - 1.025
3      al   float64      11.50            NaN       0.0 - 5.0
4   sugar   float64      12.25            NaN       0.0 - 5.0
5     rbc    object      38.00            2.0            None
6      pc    object      16.25            2.0            None
7     pcc    object       1.00            2.0            None
8      ba    object       1.00            2.0            None
9     bgr   float64      11.00            NaN    22.0 - 490.0
10     bu   float64       4.75            NaN     1.5 - 391.0
11     sc   float64       4.25            NaN      0.4 - 76.0
12    sod   float64      21.75            NaN     4.5 - 163.0
13    pot   float64      22.00            NaN      2.5 - 47.0
14   hemo   float64      13.00            NaN      3.1 - 17.8
15    pc

## Data Processing

In [73]:
df['dm']= np.where(df['dm'] == ' yes','yes',df['dm'])

In [74]:
df['pe'].value_counts(dropna=False)
# df['dm'].unique()

pe
no      311
yes      74
good     14
NaN       1
Name: count, dtype: int64

In [75]:
# Identify Anomalous columns

ANO = ["htn"]

# Identify the columns where more than 30% observations are missing

TO_DROP = ["rbc","rc"]

print(df.shape)
df = df.drop(TO_DROP+ANO, axis=1)
print(df.shape)

(400, 25)
(400, 22)



#### Data processing on numerical fields


In [77]:
# split a DataFrame into two based on the data type of each column:

def split_dataframe_by_dtype(df):
    df_numeric = df.select_dtypes(include=['number'])  # Select numerical columns
    df_categorical = df.select_dtypes(exclude=['number'])  # Select categorical (non-numeric) columns
    return df_numeric, df_categorical

df_numeric, df_categorical = split_dataframe_by_dtype(df)
df_numeric.shape, df_categorical.shape


((400, 13), (400, 9))

In [78]:
# to handle missing values in numerical columns + Flooring & Capping

def handle_missing_and_outliers(df):
    df_numeric = df.select_dtypes(include=['number']).copy()  # Select numerical columns
    
    for col in df_numeric.columns:
        # Replace missing values with median
        median_value = df_numeric[col].median()
        df_numeric[col].fillna(median_value, inplace=True)

        # Flooring & Capping
        lower_bound = np.percentile(df_numeric[col], 1)  # 1st percentile
        upper_bound = np.percentile(df_numeric[col], 99)  # 99th percentile
        
        df_numeric[col] = np.clip(df_numeric[col], lower_bound, upper_bound)

    return df_numeric

df_numeric_cleaned = handle_missing_and_outliers(df_numeric)
df_numeric_cleaned.shape


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_numeric[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_numeric[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

(400, 13)

In [79]:
summary_numeric = summarize_dataframe(df_numeric_cleaned)
summary_numeric

Unnamed: 0,Column,Data Type,Missing %,Unique Values,Range
0,age,float64,0.0,,5.0 - 80.00999999999999
1,bp,float64,0.0,,50.0 - 110.0
2,sg,float64,0.0,,1.005 - 1.025
3,al,float64,0.0,,0.0 - 4.0
4,sugar,float64,0.0,,0.0 - 4.0
5,bgr,float64,0.0,,70.0 - 425.2199999999998
6,bu,float64,0.0,,15.0 - 235.05999999999995
7,sc,float64,0.0,,0.5 - 18.15899999999995
8,sod,float64,0.0,,113.0 - 150.0
9,pot,float64,0.0,,2.899 - 6.5009999999999994


#### Data processig on categorical fields

In [81]:
# Replace the missing values in categorical fileds with modal value of  the column

def impute_categorical_with_mode(df):
    for col in df.columns:
        mode_value = df[col].mode()[0]  # Get the most frequent value
        df[col].fillna(mode_value, inplace=True)  # Replace missing values with mode
    
    return df

df_categorical_imputed = impute_categorical_with_mode(df_categorical)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)  # Replace missing values with mode


In [82]:
df_categorical_imputed.head()

Unnamed: 0,pc,pcc,ba,dm,cad,appet,pe,ane,class
0,normal,notpresent,notpresent,yes,no,good,no,no,ckd
1,normal,notpresent,notpresent,no,no,good,no,no,ckd
2,normal,notpresent,notpresent,yes,no,poor,no,yes,ckd
3,abnormal,present,notpresent,no,no,poor,yes,yes,ckd
4,normal,notpresent,notpresent,no,no,good,no,no,ckd


In [83]:
## Encode categorical variables
# LabelEncoder for 2 class
# Onehotencoding for multiple classes

label_encoder = LabelEncoder()

columns_to_encode = ['pc', 'pcc', 'ba', 'dm', 'cad', 'ane', 'class']

for col in columns_to_encode:
    label_encoder.fit(df_categorical_imputed[col])
    df_categorical_imputed[col] = label_encoder.transform(df_categorical_imputed[col])

df_categorical_imputed.head(20)

Unnamed: 0,pc,pcc,ba,dm,cad,appet,pe,ane,class
0,1,0,0,1,0,good,no,0,0
1,1,0,0,0,0,good,no,0,0
2,1,0,0,1,0,poor,no,1,0
3,0,1,0,0,0,poor,yes,1,0
4,1,0,0,0,0,good,no,0,0
5,1,0,0,1,0,good,yes,0,0
6,1,0,0,0,0,good,no,0,0
7,0,0,0,1,0,good,yes,0,0
8,0,1,0,1,0,good,no,1,0
9,0,1,0,1,0,poor,no,1,0


In [84]:
## Encode categorical variables
# LabelEncoder for 2 class
# Onehotencoding for multiple classes

# label_encoder = LabelEncoder()

columns_to_encode = ['pc', 'pcc', 'ba', 'dm', 'cad', 'ane', 'class']


# Dictionary to hold encoders
label_encoders = {}

for col in columns_to_encode:
    label_encoder = LabelEncoder()
    label_encoder.fit(df_categorical_imputed[col])
    df_categorical_imputed[col] = label_encoder.transform(df_categorical_imputed[col])
    label_encoders[col] = label_encoder

df_categorical_imputed.head(20)

Unnamed: 0,pc,pcc,ba,dm,cad,appet,pe,ane,class
0,1,0,0,1,0,good,no,0,0
1,1,0,0,0,0,good,no,0,0
2,1,0,0,1,0,poor,no,1,0
3,0,1,0,0,0,poor,yes,1,0
4,1,0,0,0,0,good,no,0,0
5,1,0,0,1,0,good,yes,0,0
6,1,0,0,0,0,good,no,0,0
7,0,0,0,1,0,good,yes,0,0
8,0,1,0,1,0,good,no,1,0
9,0,1,0,1,0,poor,no,1,0


In [85]:
# One hot encoding for ['appet', 'pe']

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid dummy variable trap

# Select columns to encode
columns_to_encode = ['appet', 'pe']

# Fit and transform the categorical columns
onehot_encoder.fit(df_categorical_imputed[columns_to_encode])
encoded_array = onehot_encoder.transform(df_categorical_imputed[columns_to_encode])

# Convert the encoded array to a DataFrame
encoded_df = pd.DataFrame(encoded_array, columns=onehot_encoder.get_feature_names_out(columns_to_encode))

# Drop original categorical columns and concatenate the new one-hot encoded DataFrame
df_categorical_imputed = df_categorical_imputed.drop(columns_to_encode, axis=1)
df_categorical_imputed = pd.concat([df_categorical_imputed, encoded_df], axis=1)

df_categorical_imputed.head(20)


Unnamed: 0,pc,pcc,ba,dm,cad,ane,class,appet_no,appet_poor,pe_no,pe_yes
0,1,0,0,1,0,0,0,0.0,0.0,1.0,0.0
1,1,0,0,0,0,0,0,0.0,0.0,1.0,0.0
2,1,0,0,1,0,1,0,0.0,1.0,1.0,0.0
3,0,1,0,0,0,1,0,0.0,1.0,0.0,1.0
4,1,0,0,0,0,0,0,0.0,0.0,1.0,0.0
5,1,0,0,1,0,0,0,0.0,0.0,0.0,1.0
6,1,0,0,0,0,0,0,0.0,0.0,1.0,0.0
7,0,0,0,1,0,0,0,0.0,0.0,0.0,1.0
8,0,1,0,1,0,1,0,0.0,0.0,1.0,0.0
9,0,1,0,1,0,1,0,0.0,1.0,1.0,0.0


In [86]:
for col in df_categorical_imputed.columns:
    print(f"Value counts for column: {col}")
    print(df_categorical_imputed[col].value_counts(dropna=False))
    print("-" * 40)  # Separator for better readability


Value counts for column: pc
pc
1    324
0     76
Name: count, dtype: int64
----------------------------------------
Value counts for column: pcc
pcc
0    358
1     42
Name: count, dtype: int64
----------------------------------------
Value counts for column: ba
ba
0    378
1     22
Name: count, dtype: int64
----------------------------------------
Value counts for column: dm
dm
0    267
1    133
Name: count, dtype: int64
----------------------------------------
Value counts for column: cad
cad
0    360
1     40
Name: count, dtype: int64
----------------------------------------
Value counts for column: ane
ane
0    338
1     62
Name: count, dtype: int64
----------------------------------------
Value counts for column: class
class
0    250
1    150
Name: count, dtype: int64
----------------------------------------
Value counts for column: appet_no
appet_no
0.0    386
1.0     14
Name: count, dtype: int64
----------------------------------------
Value counts for column: appet_poor
appet_po

#### Combine numeric & categorical columns

In [88]:
print(df.shape)
df_cleaned = pd.concat([df_numeric_cleaned,df_categorical_imputed], axis =1)
print(df_cleaned.shape)

(400, 22)
(400, 24)


In [89]:
summarize_dataframe(df_cleaned)

Unnamed: 0,Column,Data Type,Missing %,Unique Values,Range
0,age,float64,0.0,,5.0 - 80.00999999999999
1,bp,float64,0.0,,50.0 - 110.0
2,sg,float64,0.0,,1.005 - 1.025
3,al,float64,0.0,,0.0 - 4.0
4,sugar,float64,0.0,,0.0 - 4.0
5,bgr,float64,0.0,,70.0 - 425.2199999999998
6,bu,float64,0.0,,15.0 - 235.05999999999995
7,sc,float64,0.0,,0.5 - 18.15899999999995
8,sod,float64,0.0,,113.0 - 150.0
9,pot,float64,0.0,,2.899 - 6.5009999999999994


## Data Splitting - Training & testing 

In [91]:
## DiVide the dataset into indepent and dependent features
X=df_cleaned.drop('class',axis=1)
y=df_cleaned['class']

## Split the data in training and tetsing sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  


## Scale these features
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test=scaler.transform(X_test)




In [92]:
X_val

array([[ 0.21271572,  1.94906988, -0.51825415, ..., -0.51298918,
         0.51946248, -0.46056619],
       [ 0.9652051 ,  0.34434682, -1.43282029, ...,  1.94935887,
        -1.92506684,  2.17124059],
       [-1.58168202,  0.34434682, -0.51825415, ..., -0.51298918,
        -1.92506684,  2.17124059],
       ...,
       [-0.36612226, -0.4580147 , -1.43282029, ..., -0.51298918,
        -1.92506684,  2.17124059],
       [ 0.32848332,  0.34434682,  0.39631199, ..., -0.51298918,
         0.51946248, -0.46056619],
       [ 0.79155371, -1.26037623, -1.43282029, ...,  1.94935887,
         0.51946248, -0.46056619]])

In [93]:
df_cleaned.shape
df_cleaned.columns

Index(['age', 'bp', 'sg', 'al', 'sugar', 'bgr', 'bu', 'sc', 'sod', 'pot',
       'hemo', 'pcv', 'wc', 'pc', 'pcc', 'ba', 'dm', 'cad', 'ane', 'class',
       'appet_no', 'appet_poor', 'pe_no', 'pe_yes'],
      dtype='object')

In [94]:
# pip install scikeras


# MODEL TRAINING-ANN

In [96]:
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
import datetime

# Set global random seed for reproducibility
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Optimizer and Loss function
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

# Build ANN Model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # HL1
    Dense(32, activation='relu'),  # HL2
    Dense(1, activation='sigmoid')  # Output layer
])

# Compile the model
model.compile(optimizer=opt, loss="binary_crossentropy", metrics=['accuracy'])




# Set up TensorBoard
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Set up Early Stopping
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train, y_train, validation_data=(X_val, y_val),
    epochs=50,
    callbacks=[tensorflow_callback, early_stopping_callback]
)

# Get final validation accuracy
val_accuracy = history.history['val_accuracy'][-1]
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Final Validation Accuracy: {val_accuracy}")
print(f"Test Accuracy: {test_accuracy}")





Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.8170 - loss: 0.4401 - val_accuracy: 0.9875 - val_loss: 0.0628
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9870 - loss: 0.0619 - val_accuracy: 0.9875 - val_loss: 0.0240
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9953 - loss: 0.0185 - val_accuracy: 1.0000 - val_loss: 0.0138
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9953 - loss: 0.0086 - val_accuracy: 1.0000 - val_loss: 0.0067
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 1.0000 - loss: 0.0037 - val_accuracy: 1.0000 - val_loss: 0.0034
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 1.0000 - loss: 0.0020 - val_accuracy: 1.0000 - val_loss: 0.0022
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

# RANDOM FOREST

In [98]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



rf_model = RandomForestClassifier(n_estimators=50, random_state=42)
rf_model.fit(X_train, y_train)

## Predict on validation and test sets
y_val_pred = rf_model.predict(X_val)
y_test_pred = rf_model.predict(X_test)

## Calculate accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

## Print the accuracies
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Validation Accuracy: 1.0000
Test Accuracy: 0.9875


# XGBOOST

In [100]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score




xgb_model = XGBClassifier(n_estimators=50, max_depth=3, min_child_weight=3, learning_rate=0.1, random_state=42, use_label_encoder=False, eval_metric='logloss')


xgb_model.fit(X_train, y_train)

# Validate model
y_valid_pred = xgb_model.predict(X_val)
valid_accuracy = accuracy_score(y_val, y_valid_pred)
print(f"Validation Accuracy: {valid_accuracy:.4f}")

# Test model
y_test_pred = xgb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

Validation Accuracy: 0.9875
Test Accuracy: 0.9875


Parameters: { "use_label_encoder" } are not used.



# CATBOOST

In [102]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score




# Initialize CatBoost Classifier
cat_model = CatBoostClassifier(
    iterations=100,        # Number of boosting rounds
    learning_rate=0.1,     # Step size
    depth=6,              # Maximum depth of trees
    random_seed=42,
    verbose=0,            # Suppresses training logs
    eval_metric='Accuracy'
)

# Train model
cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=10, verbose=0)

# Validate model
y_valid_pred = cat_model.predict(X_val)
valid_accuracy = accuracy_score(y_val, y_valid_pred)
print(f"Validation Accuracy: {valid_accuracy:.4f}")

# Test model
y_test_pred = cat_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")


Validation Accuracy: 1.0000
Test Accuracy: 0.9875


In [103]:
# pip install --upgrade scikit-learn scikeras tensorflow

In [104]:
# import sklearn
# import tensorflow as tf
# import scikeras

# print("Scikit-Learn:", sklearn.__version__)
# print("TensorFlow:", tf.__version__)
# print("SciKeras:", scikeras.__version__)


# Ensemble

In [106]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.optimizers import Adam
def create_nn_model():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)), 
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')  # Output layer for binary classification
    ])
    model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap NN model using KerasClassifier
nn_model = KerasClassifier(model=create_nn_model, epochs=10, batch_size=32, verbose=0)

# Train the NN model separately
nn_model.fit(X_train, y_train)


# Get NN predictions (convert probabilities to class labels)
nn_preds = (nn_model.predict(X_test) > 0.5).astype(int)






# Train Voting Classifier (excluding NN)
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('xgb', xgb_model),
        ('cat', cat_model)
    ],
    voting='hard'  # Majority voting
)

# Train ensemble model
ensemble_model.fit(X_train, y_train)

# Get ensemble predictions
ensemble_preds = ensemble_model.predict(X_test)

final_preds = ((ensemble_preds + nn_preds.flatten()) / 2 >= 0.3).astype(int)

# Accuracy Calculation
ensemble_accuracy = accuracy_score(y_test, final_preds)
print(f'Final Ensemble Accuracy: {ensemble_accuracy:.4f}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Final Ensemble Accuracy: 0.9750


Parameters: { "use_label_encoder" } are not used.



In [107]:
import pickle

# Save the final predictions
with open('final_preds.pkl', 'wb') as f:
    pickle.dump(final_preds, f)

print("Final predictions saved successfully!")


Final predictions saved successfully!


# Save the model Trasformations

In [109]:
# Pickle dump StandardScaler()

with open('scaler_SD.pkl','wb') as file:
    pickle.dump(scaler,file)


# Pickle dump label encoder,Onehot encoder

# with open('label_encoder_SD.pkl','wb') as file:
#     pickle.dump(label_encoder,file)

with open('onehot_encoder_SD.pkl','wb') as file:
    pickle.dump(onehot_encoder,file)

with open('label_encoder_SD.pkl', 'wb') as file:
    pickle.dump(label_encoders, file)    

# Rest is not used

In [111]:
import os
from sklearn.ensemble import RandomForestClassifier
import pickle

# Create the models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Save the model
with open('models/random_forest.pkl', 'wb') as f:
    pickle.dump(rf_model, f)


In [112]:
from xgboost import XGBClassifier
import pickle

# Train the model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Save the model
with open('models/xgboost.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)


Parameters: { "use_label_encoder" } are not used.



In [113]:
from catboost import CatBoostClassifier
import pickle

# Train the model
cat_model = CatBoostClassifier(verbose=0)
cat_model.fit(X_train, y_train)

# Save the model
with open('models/catboost.pkl', 'wb') as f:
    pickle.dump(cat_model, f)


In [114]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pickle

# Build and compile the ANN model
ann_model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
ann_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Save the model
with open('models/ann.pkl', 'wb') as f:
    pickle.dump(ann_model, f)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 77ms/step - accuracy: 0.5858 - loss: 0.6802 - val_accuracy: 0.8625 - val_loss: 0.5307
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9029 - loss: 0.5053 - val_accuracy: 0.9625 - val_loss: 0.4011
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9665 - loss: 0.3881 - val_accuracy: 0.9750 - val_loss: 0.3139
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9737 - loss: 0.3031 - val_accuracy: 0.9750 - val_loss: 0.2534
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9751 - loss: 0.2398 - val_accuracy: 0.9750 - val_loss: 0.2103
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9798 - loss: 0.1922 - val_accuracy: 0.9750 - val_loss: 0.1802
Epoch 7/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [115]:
import pickle

# Load transformer from the file
with open('models/data_transformation.pkl', 'rb') as file:
    transformer = pickle.load(file)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [116]:
if hasattr(transformer, 'named_transformers_'):
    for name, transformer_obj in transformer.named_transformers_.items():
        if hasattr(transformer_obj, 'categories_'):
            print(f"Categories for '{name}': {transformer_obj.categories_}")


Categories for 'cat': [array(['abnormal', 'normal', nan], dtype=object), array(['abnormal', 'normal', nan], dtype=object), array(['notpresent', 'present', nan], dtype=object), array(['notpresent', 'present', nan], dtype=object), array(['4', '5.2', '8', 'no', 'yes', nan], dtype=object), array(['no', 'yes', nan], dtype=object), array(['no', 'yes', nan], dtype=object), array(['good', 'no', 'poor', nan], dtype=object), array(['good', 'no', 'yes', nan], dtype=object), array(['no', 'yes', nan], dtype=object), array(['ckd', 'notckd'], dtype=object)]


In [117]:
import pandas as pd

# Example input dictionary (adjust according to your actual input fields)
data = {
    'age': 50,
    'bp': 80,
    'sg': 1.015,
    'al': 0,
    'su': 0,
    'rbc': 1,
    'pc': 1,
    'pcc': 1,
    'ba': 1,
    'bgr': 100,
    'bu': 30,
    'sc': 1.2,
    'sod': 135
}

# Create DataFrame from input
input_df = pd.DataFrame([data])
print(input_df)
try:
    input_scaled = transformer.transform(input_df)
    print("Input data transformed successfully!")
except ValueError as e:
    print(f"Transformation Error: {e}")


   age  bp     sg  al  su  rbc  pc  pcc  ba  bgr  bu   sc  sod
0   50  80  1.015   0   0    1   1    1   1  100  30  1.2  135
Transformation Error: columns are missing: {'sugar', 'rc', 'class', 'htn', 'hemo', 'dm', 'pot', 'cad', 'appet', 'pcv', 'pe', 'wc', 'ane'}


In [118]:
import pickle
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification

# Generate a synthetic dataset (Replace this with actual CKD dataset)
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define base models
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
catboost = CatBoostClassifier(verbose=0, random_state=42)
ann = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)

# Create Ensemble Model using Voting Classifier
ensemble_model = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('catboost', catboost), ('ann', ann)],
    voting='hard'  # 'hard' for majority voting, 'soft' for probability-based voting
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Save the trained ensemble model
with open("ensemble_model.pkl", "wb") as file:
    pickle.dump(ensemble_model, file)

print("✅ Ensemble model saved as 'ensemble_model.pkl'")


Parameters: { "use_label_encoder" } are not used.



✅ Ensemble model saved as 'ensemble_model.pkl'




In [119]:
expected_columns = transformer.feature_names_in_
print("Expected Columns:", expected_columns)


Expected Columns: ['age' 'bp' 'sg' 'al' 'sugar' 'rbc' 'pc' 'pcc' 'ba' 'bgr' 'bu' 'sc' 'sod'
 'pot' 'hemo' 'pcv' 'wc' 'rc' 'htn' 'dm' 'cad' 'appet' 'pe' 'ane' 'class']


In [120]:
print("Input DataFrame Columns:", input_df.columns)


Input DataFrame Columns: Index(['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod'],
      dtype='object')


In [121]:
for col in expected_columns:
    if col not in input_df.columns:
        input_df[col] = np.nan  # or some default value


In [122]:
input_df.rename(columns={'su': 'sugar'}, inplace=True)


In [123]:
if 'class' in input_df.columns:
    input_df.drop(columns=['class'], inplace=True)


In [124]:
transformed_data = transformer.transform(input_df)


ValueError: columns are missing: {'class'}

In [None]:
df_cleaned.head()
