In [None]:
import numpy as np
import pandas as pd

# Dataset dimensions
num_sequences = 24_000  # how many independent sequences to generate
sequence_length = 24  # time steps per sequence (hours)
num_features = 4  # channels: Ethylene, Temperature, Humidity, VOC

# Seed NumPy's global RNG so the synthetic data generated below is reproducible
np.random.seed(0)

def generate_synthetic_data_with_big_noise(num_sequences, sequence_length,
                                           noise_std_dev=5, missing_value_prob=0.1):
    """Generate noisy synthetic sensor sequences with a binary health label.

    Every sequence has four channels, in this order: ethylene, temperature,
    humidity and VOC. Each channel is a uniform draw per time step plus
    zero-mean Gaussian noise. Only the VOC range depends on the label:
    uniform(5, 50) for label 0 ('Healthy') and uniform(50, 200) for label 1
    ('Not Healthy'). Afterwards each individual value is independently
    replaced by NaN with probability ``missing_value_prob``.

    Random numbers come from NumPy's global legacy RNG (``np.random``), so
    call ``np.random.seed`` beforehand for reproducible output. Draws are
    consumed in the same order as the original per-element implementation,
    so seeded results with the default arguments are unchanged.

    Because the noise is added without clipping, values can fall outside the
    nominal uniform ranges (including below zero).

    Parameters
    ----------
    num_sequences : int
        Number of sequences to generate (>= 0).
    sequence_length : int
        Number of time steps in each sequence (>= 0).
    noise_std_dev : float, optional
        Standard deviation of the Gaussian noise added to every channel.
    missing_value_prob : float, optional
        Per-value probability of being replaced by NaN; must lie in [0, 1].

    Returns
    -------
    X : numpy.ndarray
        Float array of shape ``(num_sequences, sequence_length, 4)``.
    y : numpy.ndarray
        Integer array of shape ``(num_sequences,)`` holding 0 or 1.

    Raises
    ------
    ValueError
        If a size or the noise level is negative, or if
        ``missing_value_prob`` is outside [0, 1].
    """
    if num_sequences < 0 or sequence_length < 0:
        raise ValueError("num_sequences and sequence_length must be non-negative")
    if noise_std_dev < 0:
        raise ValueError("noise_std_dev must be non-negative")
    if not 0 <= missing_value_prob <= 1:
        raise ValueError("missing_value_prob must be in [0, 1]")

    n_channels = 4  # ethylene, temperature, humidity, VOC

    # Pre-allocating keeps the output shape correct even when num_sequences is 0
    # (stacking empty Python lists would collapse the time axis).
    X = np.empty((num_sequences, sequence_length, n_channels), dtype=float)
    y = np.empty(num_sequences, dtype=int)

    def _noisy_uniform(low, high):
        # Uniform baseline first, then Gaussian noise: this is the RNG draw order
        # of the original code, which keeps seeded output identical.
        baseline = np.random.uniform(low, high, sequence_length)
        return baseline + np.random.normal(0, noise_std_dev, sequence_length)

    for seq_idx in range(num_sequences):
        sample = X[seq_idx]  # view into X; writes below land in the output array
        sample[:, 0] = _noisy_uniform(0.1, 1)   # ethylene
        sample[:, 1] = _noisy_uniform(18, 25)   # temperature
        sample[:, 2] = _noisy_uniform(60, 70)   # humidity

        # 50% chance of each class; only the VOC range differs between them
        if np.random.rand() > 0.5:
            sample[:, 3] = _noisy_uniform(5, 50)
            y[seq_idx] = 0  # 'Healthy'
        else:
            sample[:, 3] = _noisy_uniform(50, 200)
            y[seq_idx] = 1  # 'Not Healthy'

        # One (time step, channel) grid of uniform draws, filled row by row, replaces
        # the nested scalar rand() calls while consuming the stream in the same order.
        missing_mask = np.random.rand(sequence_length, n_channels) < missing_value_prob
        sample[missing_mask] = np.nan

    return X, y

# Generate data with big noise
X, y = generate_synthetic_data_with_big_noise(num_sequences, sequence_length)

# Print shape and a sample
print("X shape:", X.shape)
print("y shape:", y.shape)
print("Sample X[0]:", X[0])
print("Sample y[0]:", y[0])

# Optionally, save the data to a CSV file.
# The 3-D array is flattened to long format: one row per (sequence, time step),
# in sequence-major order, with the sequence's label repeated on each of its rows.
# Reshaping the array directly avoids boxing every value into a Python list.
# NOTE: no sequence identifier is written; sequence boundaries are only implied
# by row order (every `sequence_length` consecutive rows form one sequence).
feature_columns = ['Ethylene', 'Temperature', 'Humidity', 'VOC']
df = pd.DataFrame(X.reshape(-1, X.shape[-1]), columns=feature_columns)
df['Label'] = np.repeat(y, sequence_length)
df.to_csv('synthetic_data_with_biiiiig_noise.csv', index=False)


X shape: (24000, 24, 4)
y shape: (24000,)
Sample X[0]: [[  2.15927066  18.81976692  60.99951523  28.98053982]
 [ -3.52680827  19.25650476  74.46886899  48.91416875]
 [-12.12246204  21.55631463  56.62191441  51.80771757]
 [  3.85848784  16.91654787          nan  51.95398627]
 [  4.80347031  15.39640588  63.85232441  39.1243338 ]
 [ -3.0295204   10.49775361  61.985245    -2.57830369]
 [ 11.84260161  23.45888899  71.84552646  18.29064792]
 [ -6.36923267  17.76413654  76.93006407  35.16040554]
 [  1.19608907  13.11318368  73.80904859   4.94321709]
 [         nan  22.02489042  72.99431002  28.17432306]
 [  8.47644861  14.57629526  62.68866433          nan]
 [  7.92279928  19.03235297  72.52469427  10.84396753]
 [  1.38597723          nan  66.79796134   1.19268483]
 [  2.82384957  19.61219521  67.97733939  48.17000624]
 [ -4.27499629  25.07307995  73.54729181  16.05410764]
 [ -9.72556597  14.40694709  65.03767826  18.18280396]
 [         nan  25.75866081  71.88775047  55.82543146]
 [  1.6311

In [None]:
import pandas as pd

# Load the dataset to examine its structure
file_path = 'synthetic_data_with_biiiiig_noise.csv'
data = pd.read_csv(file_path)

# Remove rows with null values
# NOTE(review): each CSV row is a single time step, so dropping rows removes
# individual time steps rather than whole sequences — confirm that any later
# sequence windowing accounts for the resulting gaps.
data = data.dropna()

# Display basic information about the dataset.
# DataFrame.info() prints its report and returns None, so the text is captured
# through `buf` to make `data_info` hold the actual summary.
import io

_info_buffer = io.StringIO()
data.info(buf=_info_buffer)
data_info = _info_buffer.getvalue()
data_head = data.head()

print(data_info)
data_head

<class 'pandas.core.frame.DataFrame'>
Index: 378665 entries, 0 to 575997
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Ethylene     378665 non-null  float64
 1   Temperature  378665 non-null  float64
 2   Humidity     378665 non-null  float64
 3   VOC          378665 non-null  float64
 4   Label        378665 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 17.3 MB


(None,
     Ethylene  Temperature   Humidity        VOC  Label
 0   2.159271    18.819767  60.999515  28.980540      0
 1  -3.526808    19.256505  74.468869  48.914169      0
 2 -12.122462    21.556315  56.621914  51.807718      0
 4   4.803470    15.396406  63.852324  39.124334      0
 5  -3.029520    10.497754  61.985245  -2.578304      0)

# LSTM

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Stacked two-layer LSTM binary classifier, assembled layer by layer.
# The input shape (time steps, features) is read from the training tensor.
model = Sequential()
model.add(
    LSTM(
        64,
        return_sequences=True,  # pass the full sequence on to the second LSTM
        input_shape=(X_train_seq_clean.shape[1], X_train_seq_clean.shape[2]),
    )
)
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # single probability output

# Adam with gradient value clipping; binary cross-entropy matches the sigmoid head
optimizer = Adam(learning_rate=0.001, clipvalue=1.0)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Fit, holding out a 0.2 fraction of the training data for validation
model.fit(
    X_train_seq_clean,
    y_train_seq_clean,
    validation_split=0.2,
    batch_size=64,
    epochs=5,
)

# Report accuracy on the held-out test set
test_loss, test_accuracy = model.evaluate(X_test_seq_clean, y_test_seq_clean)
print(f'Test Accuracy: {test_accuracy:.2f}')


Epoch 1/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 73ms/step - accuracy: 0.6624 - loss: 0.6661 - val_accuracy: 0.8897 - val_loss: 0.5411
Epoch 2/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.8804 - loss: 0.5049 - val_accuracy: 0.8824 - val_loss: 0.3385
Epoch 3/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8733 - loss: 0.3483 - val_accuracy: 0.8603 - val_loss: 0.3446
Epoch 4/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8658 - loss: 0.3400 - val_accuracy: 0.8676 - val_loss: 0.3105
Epoch 5/5
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8747 - loss: 0.3472 - val_accuracy: 0.8824 - val_loss: 0.2817
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9098 - loss: 0.2473 
Test Accuracy: 0.90


# LSTM+GRU

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Hybrid recurrent classifier: LSTM -> GRU -> LSTM, assembled layer by layer.
# The input shape (time steps, features) is read from the training tensor.
model = Sequential()
model.add(
    LSTM(
        32,
        return_sequences=True,  # the following GRU needs the full sequence
        input_shape=(X_train_seq_clean.shape[1], X_train_seq_clean.shape[2]),
    )
)
model.add(Dropout(0.2))
model.add(GRU(32, return_sequences=True))  # still sequence-valued for the final LSTM
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # single probability output

# Adam with gradient value clipping; binary cross-entropy matches the sigmoid head
optimizer = Adam(learning_rate=0.001, clipvalue=1.0)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Fit, holding out a 0.2 fraction of the training data for validation
model.fit(
    X_train_seq_clean,
    y_train_seq_clean,
    validation_split=0.2,
    batch_size=32,
    epochs=5,
)

# Report accuracy on the held-out test set
test_loss, test_accuracy = model.evaluate(X_test_seq_clean, y_test_seq_clean)
print(f'Test Accuracy: {test_accuracy:.2f}')


Epoch 1/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 54ms/step - accuracy: 0.4989 - loss: 0.6693 - val_accuracy: 0.8971 - val_loss: 0.4951
Epoch 2/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.8704 - loss: 0.4335 - val_accuracy: 0.8603 - val_loss: 0.3473
Epoch 3/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.8629 - loss: 0.3643 - val_accuracy: 0.8676 - val_loss: 0.3059
Epoch 4/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.8820 - loss: 0.3064 - val_accuracy: 0.8824 - val_loss: 0.2692
Epoch 5/5
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.9157 - loss: 0.2696 - val_accuracy: 0.9338 - val_loss: 0.2268
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9273 - loss: 0.1935 
Test Accuracy: 0.92
