In [1]:
# Test cell - Run this first
import pandas as pd
import numpy as np
import sklearn
import tensorflow

print("‚úÖ All packages work!")
print(f"Pandas: {pd.__version__}")
print(f"NumPy: {np.__version__}")
print(f"Scikit-learn: {sklearn.__version__}")
print(f"TensorFlow: {tensorflow.__version__}")

‚úÖ All packages work!
Pandas: 2.2.2
NumPy: 2.0.2
Scikit-learn: 1.6.1
TensorFlow: 2.19.0


In [2]:
# ========================================
# LOAD CSV FROM GOOGLE DRIVE
# ========================================

from google.colab import drive
import pandas as pd

# Mount Google Drive
print("üìÅ Mounting Google Drive...")
drive.mount('/content/drive')
print("‚úÖ Drive mounted!\n")

# ============================================
# CONFIGURE YOUR FILE PATH HERE
# ============================================
FILE_PATH = '/content/drive/MyDrive/gallstone_data.csv'  # ‚¨ÖÔ∏è CHANGE THIS TO YOUR FILE PATH

# ============================================

# Load the CSV file
print(f"üìÇ Loading file from: {FILE_PATH}")
print("‚è≥ Reading CSV file...\n")

df = pd.read_csv(FILE_PATH)

# Clean the data
df = df.dropna(how='all')  # Remove completely empty rows
df = df.dropna(axis=1, how='all')  # Remove completely empty columns
df.columns = [str(col).strip() for col in df.columns]  # Clean column names

print("="*60)
print("üéâ SUCCESS! DATA LOADED FROM DRIVE!")
print("="*60)

print(f"Shape: {df.shape[0]} rows √ó {df.shape[1]} columns")
print(f"\nüìã Columns:")
print(df.columns.tolist())
print(f"\nüìä First 5 rows:")
print(df.head())
print(f"\nüìä Data types:")
print(df.dtypes)

print("‚úÖ Done!")

üìÅ Mounting Google Drive...
Mounted at /content/drive
‚úÖ Drive mounted!

üìÇ Loading file from: /content/drive/MyDrive/gallstone_data.csv
‚è≥ Reading CSV file...

üéâ SUCCESS! DATA LOADED FROM DRIVE!
Shape: 319 rows √ó 39 columns

üìã Columns:
['Gallstone Status', 'Age', 'Gender', 'Comorbidity', 'Coronary Artery Disease (CAD)', 'Hypothyroidism', 'Hyperlipidemia', 'Diabetes Mellitus (DM)', 'Height', 'Weight', 'Body Mass Index (BMI)', 'Total Body Water (TBW)', 'Extracellular Water (ECW)', 'Intracellular Water (ICW)', 'Extracellular Fluid/Total Body Water (ECF/TBW)', 'Total Body Fat Ratio (TBFR) (%)', 'Lean Mass (LM) (%)', 'Body Protein Content (Protein) (%)', 'Visceral Fat Rating (VFR)', 'Bone Mass (BM)', 'Muscle Mass (MM)', 'Obesity (%)', 'Total Fat Content (TFC)', 'Visceral Fat Area (VFA)', 'Visceral Muscle Area (VMA) (Kg)', 'Hepatic Fat Accumulation (HFA)', 'Glucose', 'Total Cholesterol (TC)', 'Low Density Lipoprotein (LDL)', 'High Density Lipoprotein (HDL)', 'Triglyceride', 'As

In [3]:
# ========================================
# GALLSTONE ML ANALYSIS - COMPLETE PIPELINE
# ========================================

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score,
                             recall_score, f1_score, roc_curve, roc_auc_score)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
import warnings
warnings.filterwarnings('ignore')

print("="*60)
print("GALLSTONE PREDICTION - MACHINE LEARNING ANALYSIS")
print("="*60)

GALLSTONE PREDICTION - MACHINE LEARNING ANALYSIS


In [4]:
print(f"\nDataset Shape: {df.shape}")
print(f"Target Variable: Gallstone Status")
print(f"  - 0: Gallstone Present")
print(f"  - 1: No Gallstone")


Dataset Shape: (319, 39)
Target Variable: Gallstone Status
  - 0: Gallstone Present
  - 1: No Gallstone


In [7]:
# ========================================
# DATA PREPROCESSING
# ========================================
print("\nüîß Preprocessing data...")

# Separate features and target
X = df.drop('Gallstone Status', axis=1)
y = df['Gallstone Status']

print(f"  - Features: {X.shape[1]} columns")
print(f"  - Samples: {X.shape[0]} rows")
print(f"  - Class distribution:")
print(y.value_counts())



üîß Preprocessing data...
  - Features: 38 columns
  - Samples: 319 rows
  - Class distribution:
Gallstone Status
0    161
1    158
Name: count, dtype: int64


In [8]:

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print(f"  - Training: {X_train.shape[0]} samples")
print(f"  - Testing: {X_test.shape[0]} samples")

  - Training: 239 samples
  - Testing: 80 samples


In [9]:
# ========================================
# NORMALIZATION
# ========================================
print("\nüìä Applying Normalization...")

# Z-Score
scaler_zscore = StandardScaler()
X_train_zscore = scaler_zscore.fit_transform(X_train)
X_test_zscore = scaler_zscore.transform(X_test)

# Min-Max
scaler_minmax = MinMaxScaler()
X_train_minmax = scaler_minmax.fit_transform(X_train)
X_test_minmax = scaler_minmax.transform(X_test)

print("  ‚úì Z-Score normalization applied")
print("  ‚úì Min-Max normalization applied")


üìä Applying Normalization...
  ‚úì Z-Score normalization applied
  ‚úì Min-Max normalization applied


In [10]:
# ========================================
# INITIALIZE STORAGE
# ========================================
results = {
    'Model': [], 'Normalization': [], 'Accuracy': [],
    'Precision': [], 'Recall': [], 'F1-Score': [], 'ROC-AUC': []
}
confusion_matrices = {}
roc_curves = {}

In [11]:
# ========================================
# TRAIN MODELS - MIN-MAX
# ========================================
print("\n" + "="*60)
print("üü¢ MIN-MAX NORMALIZATION MODELS")
print("="*60)



In [12]:
# 1D CNN
print("\n4Ô∏è‚É£  1D CNN...")
X_train_cnn = X_train_minmax.reshape(X_train_minmax.shape[0], X_train_minmax.shape[1], 1)
X_test_cnn = X_test_minmax.reshape(X_test_minmax.shape[0], X_test_minmax.shape[1], 1)

cnn = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(X_train_minmax.shape[1], 1)),
    Conv1D(32, 3, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn.fit(X_train_cnn, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

y_proba = cnn.predict(X_test_cnn, verbose=0).ravel()
y_pred = (y_proba > 0.5).astype(int)

results['Model'].append('1D CNN')
results['Normalization'].append('Min-Max')
results['Accuracy'].append(accuracy_score(y_test, y_pred))
results['Precision'].append(precision_score(y_test, y_pred))
results['Recall'].append(recall_score(y_test, y_pred))
results['F1-Score'].append(f1_score(y_test, y_pred))
results['ROC-AUC'].append(roc_auc_score(y_test, y_proba))
confusion_matrices['1D CNN (Min-Max)'] = confusion_matrix(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_curves['1D CNN'] = (fpr, tpr, results['ROC-AUC'][-1])
print(f"   Accuracy: {results['Accuracy'][-1]:.4f}")


4Ô∏è‚É£  1D CNN...
   Accuracy: 0.8250


In [13]:
!pip install -q keras-tuner

[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[90m‚ï∫[0m[90m‚îÅ[0m [32m122.9/129.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m129.1/129.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
import tensorflow as tf
import keras_tuner as kt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Dropout, Flatten
from tensorflow.keras.optimizers import Adam

def build_model(hp):
    model = Sequential()

    # 1Ô∏è‚É£ First Conv layer
    model.add(Conv1D(
        filters=hp.Int('conv1_filters', min_value=32, max_value=128, step=32),
        kernel_size=hp.Choice('conv1_kernel', values=[2, 3, 5]),
        activation='relu',
        input_shape=(X_train_minmax.shape[1], 1)
    ))

    # 2Ô∏è‚É£ Second Conv layer
    model.add(Conv1D(
        filters=hp.Int('conv2_filters', min_value=16, max_value=64, step=16),
        kernel_size=hp.Choice('conv2_kernel', values=[2, 3, 5]),
        activation='relu'
    ))

    model.add(Flatten())

    # 3Ô∏è‚É£ Dense layers
    model.add(Dense(
        units=hp.Int('dense1_units', min_value=32, max_value=128, step=32),
        activation='relu'
    ))
    model.add(Dropout(hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))

    model.add(Dense(
        units=hp.Int('dense2_units', min_value=16, max_value=64, step=16),
        activation='relu'
    ))

    model.add(Dense(1, activation='sigmoid'))

    # 4Ô∏è‚É£ Optimizer
    lr = hp.Choice('learning_rate', values=[1e-4, 5e-4, 1e-3, 5e-3])
    optimizer = Adam(learning_rate=lr)

    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [15]:
tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=50,
    factor=3,
    directory='cnn_tuning',
    project_name='gallstone_cnn_tuning'
)

# Reshape data for CNN
X_train_cnn = X_train_minmax.reshape(X_train_minmax.shape[0], X_train_minmax.shape[1], 1)
X_val_cnn = X_test_minmax.reshape(X_test_minmax.shape[0], X_test_minmax.shape[1], 1)

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

tuner.search(X_train_cnn, y_train, epochs=50, validation_data=(X_val_cnn, y_test),
             callbacks=[stop_early], batch_size=32, verbose=1)


Trial 90 Complete [00h 00m 13s]
val_accuracy: 0.8374999761581421

Best val_accuracy So Far: 0.887499988079071
Total elapsed time: 00h 08m 34s


In [16]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)

{'conv1_filters': 96, 'conv1_kernel': 2, 'conv2_filters': 16, 'conv2_kernel': 3, 'dense1_units': 96, 'dropout': 0.2, 'dense2_units': 48, 'learning_rate': 0.005, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0046'}


In [17]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(
    X_train_cnn, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test_cnn, y_test),
    callbacks=[stop_early],
    verbose=1
)

Epoch 1/50
[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m6s[0m 49ms/step - accuracy: 0.4659 - loss: 0.6968 - val_accuracy: 0.5000 - val_loss: 0.6917
Epoch 2/50
[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5065 - loss: 0.6933 - val_accuracy: 0.5000 - val_loss: 0.6903
Epoch 3/50
[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5055 - loss: 0.6900 - val_accuracy: 0.6250 - val_loss: 0.6746
Epoch 4/50
[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5918 - loss: 0.6719 - val_accuracy: 0.6000 - val_loss: 0.6391
Epoch 5/50
[1m8/8[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5642 - loss: 0.6769 - val_accuracy: 0.5875 - val_loss: 0.6414
Epoch

In [20]:
test_loss, test_acc = model.evaluate(X_test_cnn, y_test)
print(f"Final Test Accuracy: {test_acc:.4f}")

[1m3/3[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.8391 - loss: 0.4784
Final Test Accuracy: 0.8500


In [21]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
Best Hyperparameters:
- Conv1 Filters: {best_hps.get('conv1_filters')}
- Conv1 Kernel: {best_hps.get('conv1_kernel')}
- Conv2 Filters: {best_hps.get('conv2_filters')}
- Conv2 Kernel: {best_hps.get('conv2_kernel')}
- Dense1 Units: {best_hps.get('dense1_units')}
- Dense2 Units: {best_hps.get('dense2_units')}
- Dropout: {best_hps.get('dropout')}
- Learning Rate: {best_hps.get('learning_rate')}
""")

# Rebuild and train the best model
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(X_train_cnn, y_train,
                         epochs=50,
                         batch_size=32,
                         validation_split=0.2,
                         verbose=1)

# Evaluate
test_loss, test_acc = best_model.evaluate(X_val_cnn, y_test, verbose=0)
print(f"‚úÖ Tuned CNN Test Accuracy: {test_acc*100:.2f}%")


Best Hyperparameters:
- Conv1 Filters: 96
- Conv1 Kernel: 2
- Conv2 Filters: 16
- Conv2 Kernel: 3
- Dense1 Units: 96
- Dense2 Units: 48
- Dropout: 0.2
- Learning Rate: 0.005

Epoch 1/50
[1m6/6[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2s[0m 63ms/step - accuracy: 0.4565 - loss: 0.6950 - val_accuracy: 0.5208 - val_loss: 0.6969
Epoch 2/50
[1m6/6[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.4470 - loss: 0.6968 - val_accuracy: 0.5833 - val_loss: 0.6886
Epoch 3/50
[1m6/6[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.6013 - loss: 0.6828 - val_accuracy: 0.5208 - val_loss: 0.6922
Epoch 4/50
[1m6/6[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.6420 - loss: 0.6479 - val_accuracy: 0.5208 - val_loss: 0.7238
Epoch 5/50
[1m6/6[0m [32m

In [22]:
# 1D CNN
print("\n4Ô∏è‚É£  1D CNN...")
X_train_cnn = X_train_minmax.reshape(X_train_minmax.shape[0], X_train_minmax.shape[1], 1)
X_test_cnn = X_test_minmax.reshape(X_test_minmax.shape[0], X_test_minmax.shape[1], 1)

cnn = Sequential([
    Conv1D(64, 3, activation='relu', input_shape=(X_train_minmax.shape[1], 1)),
    Conv1D(32, 3, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn.fit(X_train_cnn, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

y_proba = cnn.predict(X_test_cnn, verbose=0).ravel()
y_pred = (y_proba > 0.5).astype(int)

results['Model'].append('1D CNN')
results['Normalization'].append('Min-Max')
results['Accuracy'].append(accuracy_score(y_test, y_pred))
results['Precision'].append(precision_score(y_test, y_pred))
results['Recall'].append(recall_score(y_test, y_pred))
results['F1-Score'].append(f1_score(y_test, y_pred))
results['ROC-AUC'].append(roc_auc_score(y_test, y_proba))
confusion_matrices['1D CNN (Min-Max)'] = confusion_matrix(y_test, y_pred)
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_curves['1D CNN'] = (fpr, tpr, results['ROC-AUC'][-1])
print(f"   Accuracy: {results['Accuracy'][-1]:.4f}")


4Ô∏è‚É£  1D CNN...




   Accuracy: 0.8500
