In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
pwd

'/Users/aksharagarlapad/Desktop/dup'

In [3]:
df = pd.read_csv("set.csv", na_values=['?'])
df.columns

Index(['age', 'bp', 'sg', 'al', 'sugar', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu',
       'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'class'],
      dtype='object')

In [4]:
# Summarize the data :

def summarize_dataframe(df):
    summary = pd.DataFrame({
        'Column': df.columns,
        'Data Type': df.dtypes.values,
        'Missing %': df.isnull().mean().values * 100,
        'Unique Values': [df[col].nunique() if df[col].dtype == 'object' else None for col in df.columns],
        'Range': [f"{df[col].min()} - {df[col].max()}" if df[col].dtype != 'object' else None for col in df.columns]
    })

    return summary

summary = summarize_dataframe(df)
print(summary)



   Column Data Type  Missing %  Unique Values           Range
0     age   float64       2.25            NaN      2.0 - 90.0
1      bp   float64       3.00            NaN    50.0 - 180.0
2      sg   float64      11.75            NaN   1.005 - 1.025
3      al   float64      11.50            NaN       0.0 - 5.0
4   sugar   float64      12.25            NaN       0.0 - 5.0
5     rbc    object      38.00            2.0            None
6      pc    object      16.25            2.0            None
7     pcc    object       1.00            2.0            None
8      ba    object       1.00            2.0            None
9     bgr   float64      11.00            NaN    22.0 - 490.0
10     bu   float64       4.75            NaN     1.5 - 391.0
11     sc   float64       4.25            NaN      0.4 - 76.0
12    sod   float64      21.75            NaN     4.5 - 163.0
13    pot   float64      22.00            NaN      2.5 - 47.0
14   hemo   float64      13.00            NaN      3.1 - 17.8
15    pc

## Data Processing

In [6]:
df['dm']= np.where(df['dm'] == ' yes','yes',df['dm'])

In [7]:
df['pe'].value_counts(dropna=False)
# df['dm'].unique()

pe
no      311
yes      74
good     14
NaN       1
Name: count, dtype: int64

In [8]:
# Identify Anomalous columns

ANO = ["htn"]

# Identify the columns where more than 30% observations are missing

TO_DROP = ["rbc","rc"]

print(df.shape)
df = df.drop(TO_DROP+ANO, axis=1)
print(df.shape)

(400, 25)
(400, 22)


#### Data processing on numerical fields

In [10]:
# split a DataFrame into two based on the data type of each column:

def split_dataframe_by_dtype(df):
    df_numeric = df.select_dtypes(include=['number'])  # Select numerical columns
    df_categorical = df.select_dtypes(exclude=['number'])  # Select categorical (non-numeric) columns
    return df_numeric, df_categorical

df_numeric, df_categorical = split_dataframe_by_dtype(df)
df_numeric.shape, df_categorical.shape

((400, 13), (400, 9))

In [11]:
# to handle missing values in numerical columns + Flooring & Capping

def handle_missing_and_outliers(df):
    df_numeric = df.select_dtypes(include=['number']).copy()  # Select numerical columns
    
    for col in df_numeric.columns:
        # Replace missing values with median
        median_value = df_numeric[col].median()
        df_numeric[col].fillna(median_value, inplace=True)

        # Flooring & Capping
        lower_bound = np.percentile(df_numeric[col], 1)  # 1st percentile
        upper_bound = np.percentile(df_numeric[col], 99)  # 99th percentile
        
        df_numeric[col] = np.clip(df_numeric[col], lower_bound, upper_bound)

    return df_numeric

df_numeric_cleaned = handle_missing_and_outliers(df_numeric)
df_numeric_cleaned.shape


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_numeric[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_numeric[col].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

(400, 13)

In [12]:
summary_numeric = summarize_dataframe(df_numeric_cleaned)
summary_numeric

Unnamed: 0,Column,Data Type,Missing %,Unique Values,Range
0,age,float64,0.0,,5.0 - 80.00999999999999
1,bp,float64,0.0,,50.0 - 110.0
2,sg,float64,0.0,,1.005 - 1.025
3,al,float64,0.0,,0.0 - 4.0
4,sugar,float64,0.0,,0.0 - 4.0
5,bgr,float64,0.0,,70.0 - 425.2199999999998
6,bu,float64,0.0,,15.0 - 235.05999999999995
7,sc,float64,0.0,,0.5 - 18.15899999999995
8,sod,float64,0.0,,113.0 - 150.0
9,pot,float64,0.0,,2.899 - 6.5009999999999994


#### Data processig on categorical fields

In [14]:
# Replace the missing values in categorical fileds with modal value of  the column

def impute_categorical_with_mode(df):
    for col in df.columns:
        mode_value = df[col].mode()[0]  # Get the most frequent value
        df[col].fillna(mode_value, inplace=True)  # Replace missing values with mode
    
    return df

df_categorical_imputed = impute_categorical_with_mode(df_categorical)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(mode_value, inplace=True)  # Replace missing values with mode


In [15]:
df_categorical_imputed.columns

Index(['pc', 'pcc', 'ba', 'dm', 'cad', 'appet', 'pe', 'ane', 'class'], dtype='object')

In [16]:
## Encode categorical variables
# LabelEncoder for 2 class
# Onehotencoding for multiple classes

label_encoder = LabelEncoder()

columns_to_encode = ['pc', 'pcc', 'ba', 'dm', 'cad', 'ane', 'class']

for col in columns_to_encode:
    df_categorical_imputed[col] = label_encoder.fit_transform(df_categorical_imputed[col])

df_categorical_imputed.head(20)

Unnamed: 0,pc,pcc,ba,dm,cad,appet,pe,ane,class
0,1,0,0,1,0,good,no,0,0
1,1,0,0,0,0,good,no,0,0
2,1,0,0,1,0,poor,no,1,0
3,0,1,0,0,0,poor,yes,1,0
4,1,0,0,0,0,good,no,0,0
5,1,0,0,1,0,good,yes,0,0
6,1,0,0,0,0,good,no,0,0
7,0,0,0,1,0,good,yes,0,0
8,0,1,0,1,0,good,no,1,0
9,0,1,0,1,0,poor,no,1,0


In [17]:
# One hot encoding for ['appet', 'pe']

# Initialize OneHotEncoder
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid dummy variable trap

# Select columns to encode
columns_to_encode = ['appet', 'pe']

# Fit and transform the categorical columns
encoded_array = onehot_encoder.fit_transform(df_categorical_imputed[columns_to_encode])

# Convert the encoded array to a DataFrame
encoded_df = pd.DataFrame(encoded_array, columns=onehot_encoder.get_feature_names_out(columns_to_encode))

# Drop original categorical columns and concatenate the new one-hot encoded DataFrame
df_categorical_imputed = df_categorical_imputed.drop(columns_to_encode, axis=1)
df_categorical_imputed = pd.concat([df_categorical_imputed, encoded_df], axis=1)

df_categorical_imputed.head(20)


Unnamed: 0,pc,pcc,ba,dm,cad,ane,class,appet_no,appet_poor,pe_no,pe_yes
0,1,0,0,1,0,0,0,0.0,0.0,1.0,0.0
1,1,0,0,0,0,0,0,0.0,0.0,1.0,0.0
2,1,0,0,1,0,1,0,0.0,1.0,1.0,0.0
3,0,1,0,0,0,1,0,0.0,1.0,0.0,1.0
4,1,0,0,0,0,0,0,0.0,0.0,1.0,0.0
5,1,0,0,1,0,0,0,0.0,0.0,0.0,1.0
6,1,0,0,0,0,0,0,0.0,0.0,1.0,0.0
7,0,0,0,1,0,0,0,0.0,0.0,0.0,1.0
8,0,1,0,1,0,1,0,0.0,0.0,1.0,0.0
9,0,1,0,1,0,1,0,0.0,1.0,1.0,0.0


In [18]:
for col in df_categorical_imputed.columns:
    print(f"Value counts for column: {col}")
    print(df_categorical_imputed[col].value_counts(dropna=False))
    print("-" * 40)  # Separator for better readability


Value counts for column: pc
pc
1    324
0     76
Name: count, dtype: int64
----------------------------------------
Value counts for column: pcc
pcc
0    358
1     42
Name: count, dtype: int64
----------------------------------------
Value counts for column: ba
ba
0    378
1     22
Name: count, dtype: int64
----------------------------------------
Value counts for column: dm
dm
0    267
1    133
Name: count, dtype: int64
----------------------------------------
Value counts for column: cad
cad
0    360
1     40
Name: count, dtype: int64
----------------------------------------
Value counts for column: ane
ane
0    338
1     62
Name: count, dtype: int64
----------------------------------------
Value counts for column: class
class
0    250
1    150
Name: count, dtype: int64
----------------------------------------
Value counts for column: appet_no
appet_no
0.0    386
1.0     14
Name: count, dtype: int64
----------------------------------------
Value counts for column: appet_poor
appet_po

#### Combine numeric & categorical columns

In [20]:
print(df.shape)
df_cleaned = pd.concat([df_numeric_cleaned,df_categorical_imputed], axis =1)
print(df_cleaned.shape)

(400, 22)
(400, 24)


In [21]:
summarize_dataframe(df_cleaned)

Unnamed: 0,Column,Data Type,Missing %,Unique Values,Range
0,age,float64,0.0,,5.0 - 80.00999999999999
1,bp,float64,0.0,,50.0 - 110.0
2,sg,float64,0.0,,1.005 - 1.025
3,al,float64,0.0,,0.0 - 4.0
4,sugar,float64,0.0,,0.0 - 4.0
5,bgr,float64,0.0,,70.0 - 425.2199999999998
6,bu,float64,0.0,,15.0 - 235.05999999999995
7,sc,float64,0.0,,0.5 - 18.15899999999995
8,sod,float64,0.0,,113.0 - 150.0
9,pot,float64,0.0,,2.899 - 6.5009999999999994


## Data Splitting - Training & testing 

In [23]:
## DiVide the dataset into indepent and dependent features
X=df_cleaned.drop('class',axis=1)
y=df_cleaned['class']

## Split the data in training and tetsing sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  


## Scale these features
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test=scaler.transform(X_test)




In [24]:
X_val

array([[ 0.21271572,  1.94906988, -0.51825415, ..., -0.51298918,
         0.51946248, -0.46056619],
       [ 0.9652051 ,  0.34434682, -1.43282029, ...,  1.94935887,
        -1.92506684,  2.17124059],
       [-1.58168202,  0.34434682, -0.51825415, ..., -0.51298918,
        -1.92506684,  2.17124059],
       ...,
       [-0.36612226, -0.4580147 , -1.43282029, ..., -0.51298918,
        -1.92506684,  2.17124059],
       [ 0.32848332,  0.34434682,  0.39631199, ..., -0.51298918,
         0.51946248, -0.46056619],
       [ 0.79155371, -1.26037623, -1.43282029, ...,  1.94935887,
         0.51946248, -0.46056619]])

In [25]:
df_cleaned.shape

(400, 24)

In [26]:
pip install scikeras


Note: you may need to restart the kernel to use updated packages.


In [27]:
#MODEL TRAINING-ANN


In [28]:
import os
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
import datetime

# Set global random seed for reproducibility
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Optimizer and Loss function
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

# Build ANN Model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),  # HL1
    Dense(32, activation='relu'),  # HL2
    Dense(1, activation='sigmoid')  # Output layer
])

# Compile the model
model.compile(optimizer=opt, loss="binary_crossentropy", metrics=['accuracy'])




# Set up TensorBoard
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Set up Early Stopping
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train, y_train, validation_data=(X_val, y_val),
    epochs=50,
    callbacks=[tensorflow_callback, early_stopping_callback]
)

# Get final validation accuracy
val_accuracy = history.history['val_accuracy'][-1]
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Final Validation Accuracy: {val_accuracy}")
print(f"Test Accuracy: {test_accuracy}")





Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.8170 - loss: 0.4401 - val_accuracy: 0.9875 - val_loss: 0.0628
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9870 - loss: 0.0619 - val_accuracy: 0.9875 - val_loss: 0.0240
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9953 - loss: 0.0185 - val_accuracy: 1.0000 - val_loss: 0.0138
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9953 - loss: 0.0086 - val_accuracy: 1.0000 - val_loss: 0.0067
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 0.0037 - val_accuracy: 1.0000 - val_loss: 0.0034
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 1.0000 - loss: 0.0020 - val_accuracy: 1.0000 - val_loss: 0.0022
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [29]:
#RANDOM FOREST

In [30]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



rf_model = RandomForestClassifier(n_estimators=50, random_state=42)
rf_model.fit(X_train, y_train)

## Predict on validation and test sets
y_val_pred = rf_model.predict(X_val)
y_test_pred = rf_model.predict(X_test)

## Calculate accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

## Print the accuracies
print(f'Validation Accuracy: {val_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Validation Accuracy: 1.0000
Test Accuracy: 0.9875


In [31]:
#XGBOOST

In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score




xgb_model = XGBClassifier(n_estimators=50, max_depth=3, min_child_weight=3, learning_rate=0.1, random_state=42, use_label_encoder=False, eval_metric='logloss')


xgb_model.fit(X_train, y_train)

# Validate model
y_valid_pred = xgb_model.predict(X_val)
valid_accuracy = accuracy_score(y_val, y_valid_pred)
print(f"Validation Accuracy: {valid_accuracy:.4f}")

# Test model
y_test_pred = xgb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

Validation Accuracy: 0.9875
Test Accuracy: 0.9875


Parameters: { "use_label_encoder" } are not used.



In [33]:
#CATBOOST

In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score




# Initialize CatBoost Classifier
cat_model = CatBoostClassifier(
    iterations=100,        # Number of boosting rounds
    learning_rate=0.1,     # Step size
    depth=6,              # Maximum depth of trees
    random_seed=42,
    verbose=0,            # Suppresses training logs
    eval_metric='Accuracy'
)

# Train model
cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=10, verbose=0)

# Validate model
y_valid_pred = cat_model.predict(X_val)
valid_accuracy = accuracy_score(y_val, y_valid_pred)
print(f"Validation Accuracy: {valid_accuracy:.4f}")

# Test model
y_test_pred = cat_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")


Validation Accuracy: 1.0000
Test Accuracy: 0.9875


In [35]:
pip install --upgrade scikit-learn scikeras tensorflow


Note: you may need to restart the kernel to use updated packages.


In [36]:
import sklearn
import tensorflow as tf
import scikeras

print("Scikit-Learn:", sklearn.__version__)
print("TensorFlow:", tf.__version__)
print("SciKeras:", scikeras.__version__)


Scikit-Learn: 1.6.1
TensorFlow: 2.19.0
SciKeras: 0.13.0


In [37]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.optimizers import Adam
def create_nn_model():
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)), 
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')  # Output layer for binary classification
    ])
    model.compile(optimizer=Adam(learning_rate=0.01), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap NN model using KerasClassifier
nn_model = KerasClassifier(model=create_nn_model, epochs=10, batch_size=32, verbose=0)

# Train the NN model separately
nn_model.fit(X_train, y_train)

# Get NN predictions (convert probabilities to class labels)
nn_preds = (nn_model.predict(X_test) > 0.5).astype(int)



# Train Voting Classifier (excluding NN)
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('xgb', xgb_model),
        ('cat', cat_model)
    ],
    voting='hard'  # Majority voting
)

# Train ensemble model
ensemble_model.fit(X_train, y_train)

# Get ensemble predictions
ensemble_preds = ensemble_model.predict(X_test)

# Combine predictions (majority vote)
final_preds = np.round((ensemble_preds + nn_preds.flatten()) / 2).astype(int)

# Accuracy Calculation
ensemble_accuracy = accuracy_score(y_test, final_preds)
print(f'Final Ensemble Accuracy: {ensemble_accuracy:.4f}')



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Final Ensemble Accuracy: 0.9625


Parameters: { "use_label_encoder" } are not used.

