<a href="https://colab.research.google.com/github/shahad-jeza/my-first-nerural-network---Planar_data_classification_with_one_hidden_layer/blob/main/hospitilization_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import numpy as np
import pandas as pd

# Define parameters for synthetic data generation
num_samples = 1000

# Generate features for class 1 (hospitalization)
num_samples_class_1 = int(num_samples * (1/3))
age_class_1 = np.random.normal(loc=60, scale=10, size=num_samples_class_1)
gender_class_1 = np.random.choice([0, 1], size=num_samples_class_1)
diabetes_class_1 = np.random.choice([0, 1], size=num_samples_class_1)
hypertension_class_1 = np.random.choice([0, 1], size=num_samples_class_1)
bmi_class_1 = np.random.normal(loc=30, scale=5, size=num_samples_class_1)
albumin_class_1 = np.random.normal(loc=3, scale=0.5, size=num_samples_class_1)
hemoglobin_class_1 = np.random.normal(loc=13, scale=2, size=num_samples_class_1)
hospitalization_class_1 = np.ones(num_samples_class_1, dtype=int)

# Generate features for class 0 (non-hospitalization)
num_samples_class_0 = num_samples - num_samples_class_1
age_class_0 = np.random.normal(loc=45, scale=10, size=num_samples_class_0)
gender_class_0 = np.random.choice([0, 1], size=num_samples_class_0)
diabetes_class_0 = np.random.choice([0, 1], size=num_samples_class_0)
hypertension_class_0 = np.random.choice([0, 1], size=num_samples_class_0)
bmi_class_0 = np.random.normal(loc=25, scale=5, size=num_samples_class_0)
albumin_class_0 = np.random.normal(loc=4, scale=0.5, size=num_samples_class_0)
hemoglobin_class_0 = np.random.normal(loc=15, scale=2, size=num_samples_class_0)
hospitalization_class_0 = np.zeros(num_samples_class_0, dtype=int)

# Concatenate features and labels for both classes
features = np.concatenate([
    np.column_stack((age_class_0, gender_class_0, diabetes_class_0, hypertension_class_0, bmi_class_0, albumin_class_0, hemoglobin_class_0)),
    np.column_stack((age_class_1, gender_class_1, diabetes_class_1, hypertension_class_1, bmi_class_1, albumin_class_1, hemoglobin_class_1))
])
labels = np.concatenate([hospitalization_class_0, hospitalization_class_1])

# Shuffle the data
indices = np.random.permutation(num_samples)
features = features[indices]
labels = labels[indices]

# Create a DataFrame
data = pd.DataFrame(features, columns=['age', 'gender', 'diabetes', 'hypertension', 'bmi', 'albumin', 'hemoglobin'])
data['Hospitalization'] = labels

# Save the synthetic dataset to a CSV file
data.to_csv('synthetic_data_imbalanced.csv', index=False)

# Display the first few rows of the generated data
print(data.head())


         age  gender  diabetes  hypertension        bmi   albumin  hemoglobin  \
0  22.746765     0.0       1.0           1.0  34.587869  3.691712   14.221822   
1  44.685871     1.0       1.0           0.0  29.111877  4.246831   14.434705   
2  39.547920     1.0       1.0           1.0  27.858523  3.116607   14.123173   
3  47.007537     0.0       1.0           1.0  29.829547  2.442454   14.762961   
4  52.161293     1.0       1.0           1.0  31.049881  3.405791   10.860943   

   Hospitalization  
0                0  
1                0  
2                0  
3                1  
4                1  


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LeakyReLU, BatchNormalization
from tensorflow.keras.models import Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
data = pd.read_csv('synthetic_data_imbalanced.csv')



# Define features and target variable
X = data.drop('Hospitalization', axis=1)
y = data['Hospitalization']

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Separate the minority class
X_minority = X_train_scaled[y_train == 1]
y_minority = y_train[y_train == 1]

# Convert to numpy arrays for GAN training
X_minority = np.array(X_minority)


In [18]:
# GAN parameters
latent_dim = 100
input_dim = X_minority.shape[1]

# Define the generator model
def build_generator(latent_dim, input_dim):
    generator_input = Input(shape=(latent_dim,))
    x = Dense(128)(generator_input)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(256)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = BatchNormalization()(x)
    x = Dense(input_dim, activation='tanh')(x)
    generator = Model(generator_input, x)
    return generator

# Define the discriminator model
def build_discriminator(input_dim):
    discriminator_input = Input(shape=(input_dim,))
    x = Dense(256)(discriminator_input)
    x = LeakyReLU(alpha=0.2)(x)
    x = Dense(128)(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = Dense(1, activation='sigmoid')(x)
    discriminator = Model(discriminator_input, x)
    return discriminator

# Build and compile the discriminator
discriminator = build_discriminator(input_dim)
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Build the generator
generator = build_generator(latent_dim, input_dim)

# Create the GAN model
discriminator.trainable = False
gan_input = Input(shape=(latent_dim,))
gan_output = discriminator(generator(gan_input))
gan = Model(gan_input, gan_output)
gan.compile(optimizer='adam', loss='binary_crossentropy')

# Training the GAN
def train_gan(generator, discriminator, gan, X_train, epochs=5000, batch_size=32):
    for epoch in range(epochs):
        # Generate random noise as generator input
        noise = np.random.normal(0, 1, (batch_size, latent_dim))

        # Generate fake samples
        gen_samples = generator.predict(noise)

        # Select a random batch of real samples
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        real_samples = X_train[idx]

        # Create labels for real and fake samples
        real_labels = np.ones((batch_size, 1))
        fake_labels = np.zeros((batch_size, 1))

        # Train the discriminator
        d_loss_real = discriminator.train_on_batch(real_samples, real_labels)
        d_loss_fake = discriminator.train_on_batch(gen_samples, fake_labels)

        # Generate noise for the generator
        noise = np.random.normal(0, 1, (batch_size, latent_dim))
        valid_y = np.ones((batch_size, 1))

        # Train the generator
        g_loss = gan.train_on_batch(noise, valid_y)

        if epoch % 1000 == 0:
            print(f"Epoch: {epoch}, D Loss Real: {d_loss_real}, D Loss Fake: {d_loss_fake}, G Loss: {g_loss}")

# Train the GAN
train_gan(generator, discriminator, gan, X_minority)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 1000, D Loss Real: [0.21196363866329193, 0.90625], D Loss Fake: [0.16704697906970978, 0.96875], G Loss: 2.714158535003662
Epoch: 2000, D Loss Real: [0.2445041537284851, 0.90625], D Loss Fake: [0.1481976956129074, 0.96875], G Loss: 3.5085575580596924
Epoch: 3000, D Loss Real: [0.11026562005281448, 0.96875], D Loss Fake: [0.05859449505805969, 1.0], G Loss: 4.059751987457275
Epoch: 4000, D Loss Real: [0.10440551489591599, 0.96875], D Loss Fake: [0.08161604404449463, 0.96875], G Loss: 3.9217658042907715


In [19]:
# Generate synthetic samples
noise = np.random.normal(0, 1, (len(X_minority), latent_dim))
synthetic_samples = generator.predict(noise)

# Combine synthetic samples with original data
X_train_balanced = np.vstack((X_train_scaled, synthetic_samples))
y_train_balanced = np.concatenate((y_train, np.ones(len(synthetic_samples))))




In [20]:
# Train Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_balanced, y_train_balanced)

# Predictions on test set
y_pred = rf_classifier.predict(X_test_scaled)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)* 100
print("Accuracy:", accuracy)


Accuracy: 96.0


In [21]:
input_data = {
    'age': 62,              # Age in years
    'gender': 1,            # Gender (0 for Female, 1 for Male)
    'diabetes': 0,          # Diabetes (0 for No, 1 for Yes)
    'hypertension': 0,      # Hypertension (0 for No, 1 for Yes)
    'bmi': 22.5,            # BMI (Body Mass Index)
    'albumin': 2.8,         # Albumin level
    'hemoglobin': 14.5      # Hemoglobin level
}
# Convert input data to a numpy array (if necessary) and scale it using the scaler
input_array = scaler.transform([list(input_data.values())])

# Make a prediction
prediction = rf_classifier.predict(input_array)

# Print the prediction
print("Hospitalization Prediction:", prediction[0])


Hospitalization Prediction: 1.0




In [22]:
import joblib

# Save the trained model as a .pkl file
model_filename = 'random_forest_model.pkl'
joblib.dump(rf_classifier, model_filename)
print(f"Model saved to {model_filename}")


Model saved to random_forest_model.pkl


In [23]:
# Save the scaler
scaler_filename = 'scaler.pkl'
joblib.dump(scaler, scaler_filename)
print(f"Scaler saved to {scaler_filename}")

Scaler saved to scaler.pkl
