<a href="https://colab.research.google.com/github/shahad-jeza/my-first-nerural-network---Planar_data_classification_with_one_hidden_layer/blob/main/GAN_to_increase_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the dataset
file_path = '/content/combined_dialysis_data.csv'
data = pd.read_csv(file_path)

# Check for NaN values in the dataset
print(data.isnull().sum())

# Drop rows with NaN values in the target column or handle them appropriately
data = data.dropna(subset=['Hospitalized'])  # Assuming 'Hospitalized' is the target column name

# Separate features and target variable
X = data.drop(columns=['Hospitalized'])
y = data['Hospitalized']

# Split the data into majority and minority classes
minority_class = data[data['Hospitalized'] == 0]
majority_class = data[data['Hospitalized'] == 1]

# Normalize features based on the minority class
scaler = StandardScaler()
minority_features = minority_class.drop(columns=['Hospitalized'])
scaler.fit(minority_features)

# Normalize both minority and majority class features
X_scaled = scaler.transform(X)
minority_class_scaled = scaler.transform(minority_features)

# Split normalized features and target
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
X_scaled['Hospitalized'] = y.values

# Verify the normalization
print(X_scaled.head())


Age                          0
Gender                       0
Diabetes                     0
Hypertension                 0
Hemoglobin                   0
Albumin                      0
Dialysis_Vintage             0
Dialysis_Type                0
Hospitalization_Last_Year    0
ER_Visits_Last_Year          0
Hospitalized                 0
dtype: int64
        Age    Gender  Diabetes  Hypertension  Hemoglobin   Albumin  \
0  0.492399  1.001601  0.719020     -1.951331   -0.954420 -1.998239   
1 -0.216048 -0.998401 -1.390781      0.512471    0.071428  0.081469   
2  0.634089  1.001601 -1.390781      0.512471    0.251078 -2.010884   
3  1.555070 -0.998401 -1.390781     -1.951331   -1.243262 -0.264982   
4 -0.286893  1.001601  0.719020      0.512471    0.327239  1.685719   

   Dialysis_Vintage  Dialysis_Type  Hospitalization_Last_Year  \
0          1.142400      -0.410152                  -0.603845   
1          0.637984       2.438123                   1.656054   
2          0.385777     

In [13]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, LeakyReLU, BatchNormalization, Reshape, Flatten, Input
from tensorflow.keras.models import Sequential, Model

# Generator
def create_generator(input_dim, output_dim):
    generator = Sequential()
    generator.add(Dense(256, input_dim=input_dim))
    generator.add(LeakyReLU(alpha=0.2))
    generator.add(BatchNormalization(momentum=0.8))
    generator.add(Dense(512))
    generator.add(LeakyReLU(alpha=0.2))
    generator.add(BatchNormalization(momentum=0.8))
    generator.add(Dense(1024))
    generator.add(LeakyReLU(alpha=0.2))
    generator.add(BatchNormalization(momentum=0.8))
    generator.add(Dense(output_dim, activation='tanh'))
    return generator

# Discriminator
def create_discriminator(input_dim):
    discriminator = Sequential()
    discriminator.add(Dense(512, input_dim=input_dim))
    discriminator.add(LeakyReLU(alpha=0.2))
    discriminator.add(Dense(256))
    discriminator.add(LeakyReLU(alpha=0.2))
    discriminator.add(Dense(1, activation='sigmoid'))
    discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return discriminator

# Create the GAN
def create_gan(generator, discriminator):
    discriminator.trainable = False
    gan_input = Input(shape=(100,))
    x = generator(gan_input)
    gan_output = discriminator(x)
    gan = Model(gan_input, gan_output)
    gan.compile(loss='binary_crossentropy', optimizer='adam')
    return gan

# Instantiate the generator and discriminator
input_dim = 100
output_dim = minority_class_scaled.shape[1]
generator = create_generator(input_dim, output_dim)
discriminator = create_discriminator(output_dim)
gan = create_gan(generator, discriminator)

# Training the GAN
def train_gan(gan, generator, discriminator, data, epochs=5000, batch_size=64):
    for epoch in range(epochs):
        # Generate random noise as generator input
        noise = np.random.normal(0, 1, (batch_size, 100))

        # Generate synthetic samples
        generated_data = generator.predict(noise)

        # Get a random batch of real samples
        real_data = data[np.random.randint(0, data.shape[0], batch_size)]

        # Create labels for real and fake data
        real_labels = np.ones((batch_size, 1))
        fake_labels = np.zeros((batch_size, 1))

        # Train the discriminator
        d_loss_real = discriminator.train_on_batch(real_data, real_labels)
        d_loss_fake = discriminator.train_on_batch(generated_data, fake_labels)

        # Average the discriminator loss
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Train the generator
        noise = np.random.normal(0, 1, (batch_size, 100))
        g_loss = gan.train_on_batch(noise, real_labels)

        # Print the progress
        if epoch % 1000 == 0:
            print(f"Epoch: {epoch}, D Loss: {d_loss[0]}, D Accuracy: {d_loss[1]}, G Loss: {g_loss}")

# Train the GAN on the minority class data
train_gan(gan, generator, discriminator, minority_class_scaled, epochs=5000, batch_size=64)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch: 1000, D Loss: 0.004943319596350193, D Accuracy: 1.0, G Loss: 10.254497528076172
Epoch: 2000, D Loss: 0.00027668480470310897, D Accuracy: 1.0, G Loss: 10.944642066955566
Epoch: 3000, D Loss: 0.001735593075864017, D Accuracy: 1.0, G Loss: 11.306806564331055
Epoch: 4000, D Loss: 0.0022474460308785638, D Accuracy: 1.0, G Loss: 15.023110389709473


In [14]:
# Generate synthetic data
def generate_synthetic_data(generator, n_samples):
    noise = np.random.normal(0, 1, (n_samples, 100))
    generated_data = generator.predict(noise)
    return generated_data

# Generate 1000 synthetic samples
synthetic_data = generate_synthetic_data(generator, 1000)

# Transform synthetic data back to the original scale
synthetic_data_original_scale = scaler.inverse_transform(synthetic_data)

# Create a DataFrame for synthetic data
synthetic_df = pd.DataFrame(synthetic_data_original_scale, columns=X.columns)
synthetic_df['Hospitalized'] = 0  # Assign the minority class label

# Combine the synthetic data with the original dataset
augmented_data = pd.concat([data, synthetic_df])

# Verify the augmented data
print(augmented_data['Hospitalized'].value_counts())


Hospitalized
0.0    1625
1.0    1375
Name: count, dtype: int64


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the augmented dataset into features and target
X_aug = augmented_data.iloc[:, :-1]
y_aug = augmented_data.iloc[:, -1]

# Check for and handle any NaN values in the target variable
print(y_aug.isnull().sum())  # Should be zero, but check just in case
y_aug = y_aug.dropna()

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_aug, y_aug, test_size=0.3, random_state=42, stratify=y_aug)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create a Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


0
Accuracy: 0.8788888888888889
Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.95      0.89       488
         1.0       0.93      0.80      0.86       412

    accuracy                           0.88       900
   macro avg       0.89      0.87      0.88       900
weighted avg       0.88      0.88      0.88       900

Confusion Matrix:
[[462  26]
 [ 83 329]]
