In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load preprocessed data
df = pd.read_csv("../data/processed/dht11_preprocessed.csv")

In [2]:
df

Unnamed: 0,datatime,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density
0,2024-04-06 22:39:00,32.300000,53.0,147.750000,1.0,187.625000,0.0,141.125000,1.0,205.125000,1.0,240.827500,247.057500
1,2024-04-06 22:40:00,32.300000,53.0,147.777778,1.0,187.444444,0.0,142.000000,1.0,205.111111,1.0,241.173333,194.508889
2,2024-04-06 22:41:00,32.300000,53.0,148.000000,1.0,188.375000,0.0,142.000000,1.0,205.250000,1.0,236.781250,209.391250
3,2024-04-06 22:42:00,32.300000,53.0,147.555556,1.0,187.333333,0.0,141.111111,1.0,204.000000,1.0,241.542222,209.725556
4,2024-04-06 22:43:00,32.300000,53.0,147.111111,1.0,186.777778,0.0,140.555556,1.0,203.000000,1.0,239.328889,198.197778
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5921,2024-11-21 12:12:00,28.755556,40.0,47.777778,1.0,143.111111,0.0,231.111111,1.0,139.444444,1.0,216.365556,171.730000
5922,2024-11-21 12:13:00,28.900000,40.0,47.750000,1.0,143.500000,0.0,231.250000,1.0,139.625000,1.0,228.482500,179.511250
5923,2024-11-21 12:14:00,28.900000,40.0,48.000000,1.0,143.111111,0.0,231.777778,1.0,139.888889,1.0,233.612222,191.465556
5924,2024-11-21 12:15:00,28.900000,40.0,48.000000,1.0,141.888889,0.0,230.222222,1.0,140.000000,1.0,224.388889,178.370000


In [3]:
# Create statistical features for the scaled dataset
stat_features = df.describe().T[['mean', 'min', 'max', 'std']]

# Reset index for better readability
stat_features = stat_features.reset_index().rename(columns={'index': 'Feature'})

# Display the statistical features
print("Statistical Features (Mean, Min, Max, Standard Deviation):")
print(stat_features)


Statistical Features (Mean, Min, Max, Standard Deviation):
          Feature        mean  min          max        std
0     temperature   30.386946  0.0    40.600000   7.122737
1        humidity   46.439142  0.0    83.000000  15.440287
2      mq2_analog   51.290903  0.0  1016.714286  48.228161
3     mq2_digital    0.983694  0.0     1.000000   0.125348
4    sound_analog  133.878785  0.0   760.142857  35.060610
5   sound_digital    0.111192  0.0     1.000000   0.312500
6      mq9_analog  188.346382  7.5   524.200000  43.875589
7     mq9_digital    0.943405  0.0     1.000000   0.227657
8      mq8_analog  116.147676  0.0  1021.857143  57.648962
9     mq8_digital    0.973766  0.0     1.000000   0.158127
10   pm25_density  230.355502 -0.1   804.891111  32.606520
11   pm10_density  187.625015 -0.1   575.332222  30.378905


In [4]:
from sklearn.preprocessing import StandardScaler

# Select the relevant columns (excluding datetime and any non-numeric columns)
cols_to_scale = ['temperature', 'humidity', 'mq2_analog', 'mq2_digital', 
                 'sound_analog', 'sound_digital', 'mq9_analog', 'mq9_digital', 
                 'mq8_analog', 'mq8_digital', 'pm25_density', 'pm10_density']

# Initialize StandardScaler
scaler = StandardScaler()

# Apply scaling to the selected columns
df_scaled = df[cols_to_scale].copy()
df_scaled[cols_to_scale] = scaler.fit_transform(df_scaled[cols_to_scale])

# Now calculate correlations after scaling
correlation_matrix = df_scaled.corr()

# Display the correlation matrix
correlation_matrix


Unnamed: 0,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density
temperature,1.0,0.560368,0.138279,-0.031241,0.00224,0.124676,-0.006723,-0.049166,0.122661,-0.014223,0.073313,0.079456
humidity,0.560368,1.0,-0.097359,0.056664,-0.146078,0.135861,-0.144253,-0.053068,-0.208159,-0.004947,0.237449,-0.046328
mq2_analog,0.138279,-0.097359,1.0,-0.16941,0.573025,0.240117,0.351419,-0.104408,0.693583,-0.181198,-0.110764,0.069266
mq2_digital,-0.031241,0.056664,-0.16941,1.0,-0.052253,-0.260874,0.167739,0.323123,-0.001317,0.775212,0.25278,0.039815
sound_analog,0.00224,-0.146078,0.573025,-0.052253,1.0,-0.436985,0.338884,-0.110329,0.190898,-0.109706,0.073263,0.157613
sound_digital,0.124676,0.135861,0.240117,-0.260874,-0.436985,1.0,-0.146085,-0.077998,0.348412,-0.191224,-0.16632,-0.176152
mq9_analog,-0.006723,-0.144253,0.351419,0.167739,0.338884,-0.146085,1.0,-0.195167,0.588569,0.09458,0.057587,0.070532
mq9_digital,-0.049166,-0.053068,-0.104408,0.323123,-0.110329,-0.077998,-0.195167,1.0,-0.053011,0.25358,-0.107382,-0.16589
mq8_analog,0.122661,-0.208159,0.693583,-0.001317,0.190898,0.348412,0.588569,-0.053011,1.0,0.086167,-0.133426,0.032561
mq8_digital,-0.014223,-0.004947,-0.181198,0.775212,-0.109706,-0.191224,0.09458,0.25358,0.086167,1.0,0.120282,-0.01922


In [5]:
# Filter correlations greater than 0.3 and less than 1.0
high_corr = correlation_matrix.where((correlation_matrix > 0.3) & (correlation_matrix < 1.0))

# Drop NaN values to focus on relevant correlations
high_corr = high_corr.stack().reset_index()

# Rename columns for clarity
high_corr.columns = ["Parameter 1", "Parameter 2", "Correlation"]

# Ensure unique pairs by sorting parameters and dropping duplicates
high_corr["Sorted Pair"] = high_corr[["Parameter 1", "Parameter 2"]].apply(lambda x: tuple(sorted(x)), axis=1)
unique_corr = high_corr.drop_duplicates(subset="Sorted Pair").drop(columns="Sorted Pair")

# Print the unique pairs
print("Unique pairs with correlation greater than 0.3:")
for _, row in unique_corr.iterrows():
    print(f"{row['Parameter 1']} and {row['Parameter 2']}: {row['Correlation']:.2f}")


Unique pairs with correlation greater than 0.3:
temperature and humidity: 0.56
mq2_analog and sound_analog: 0.57
mq2_analog and mq9_analog: 0.35
mq2_analog and mq8_analog: 0.69
mq2_digital and mq9_digital: 0.32
mq2_digital and mq8_digital: 0.78
sound_analog and mq9_analog: 0.34
sound_digital and mq8_analog: 0.35
mq9_analog and mq8_analog: 0.59
pm25_density and pm10_density: 0.64


In [6]:
import numpy as np

# Ensure 'datatime' is in datetime format
df['datatime'] = pd.to_datetime(df['datatime'])

# Extract day, hour, and minute
df['day'] = df['datatime'].dt.day
df['hour'] = df['datatime'].dt.hour
df['minute'] = df['datatime'].dt.minute

# Group by unique minute intervals
grouped = df.groupby(['day', 'hour', 'minute'])

In [7]:
df

Unnamed: 0,datatime,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density,day,hour,minute
0,2024-04-06 22:39:00,32.300000,53.0,147.750000,1.0,187.625000,0.0,141.125000,1.0,205.125000,1.0,240.827500,247.057500,6,22,39
1,2024-04-06 22:40:00,32.300000,53.0,147.777778,1.0,187.444444,0.0,142.000000,1.0,205.111111,1.0,241.173333,194.508889,6,22,40
2,2024-04-06 22:41:00,32.300000,53.0,148.000000,1.0,188.375000,0.0,142.000000,1.0,205.250000,1.0,236.781250,209.391250,6,22,41
3,2024-04-06 22:42:00,32.300000,53.0,147.555556,1.0,187.333333,0.0,141.111111,1.0,204.000000,1.0,241.542222,209.725556,6,22,42
4,2024-04-06 22:43:00,32.300000,53.0,147.111111,1.0,186.777778,0.0,140.555556,1.0,203.000000,1.0,239.328889,198.197778,6,22,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5921,2024-11-21 12:12:00,28.755556,40.0,47.777778,1.0,143.111111,0.0,231.111111,1.0,139.444444,1.0,216.365556,171.730000,21,12,12
5922,2024-11-21 12:13:00,28.900000,40.0,47.750000,1.0,143.500000,0.0,231.250000,1.0,139.625000,1.0,228.482500,179.511250,21,12,13
5923,2024-11-21 12:14:00,28.900000,40.0,48.000000,1.0,143.111111,0.0,231.777778,1.0,139.888889,1.0,233.612222,191.465556,21,12,14
5924,2024-11-21 12:15:00,28.900000,40.0,48.000000,1.0,141.888889,0.0,230.222222,1.0,140.000000,1.0,224.388889,178.370000,21,12,15


In [8]:
import numpy as np
import pandas as pd

columns_for_lstm = ['temperature', 'humidity', 'mq2_analog', 'mq9_analog', 'mq8_analog', 'pm25_density', 'pm10_density', 'sound_analog', 'sound_digital']

# Group the data by 'day', 'hour', 'minute' to create sequences for each time slice
df_grouped = df.groupby(['day', 'hour', 'minute'])

# Initialize a list to store the sequences
sequences = []

min_length = 2  # This can be adjusted according to your needs

# Iterate over each group (day, hour, minute combination)
for (day, hour, minute), group in df_grouped:
    # Get the sequence (only the relevant columns)
    sequence = group[columns_for_lstm].values
    
    # Interpolate missing values using linear interpolation
    sequence_df = pd.DataFrame(sequence, columns=columns_for_lstm)
    
    # Interpolate to fill missing values
    sequence_df = sequence_df.interpolate(method='linear', axis=0)
    
    # Forward fill and backward fill to handle any remaining NaNs
    sequence_df = sequence_df.ffill().bfill()

    # If sequence length is shorter than min_length, pad it
    if len(sequence_df) < min_length:
        # Padding with zeros (for missing records)
        padding = np.zeros((min_length - len(sequence_df), sequence_df.shape[1]))  # Pad with zeros
        sequence_df = pd.DataFrame(np.vstack([sequence_df.values, padding]), columns=columns_for_lstm)

    # Append the sequence to the list
    sequences.append(sequence_df.values)

# Convert the list of sequences to a numpy array
lstm_sequences = np.array(sequences, dtype=object)

# Check the shape of lstm_sequences before proceeding
print("Shape of Time-Series Sequences for LSTM:", lstm_sequences.shape)

# Make sure all sequences are of the same length (in terms of time steps)
max_length = max(len(seq) for seq in lstm_sequences)
lstm_sequences_padded = np.array([np.pad(seq, ((0, max_length - len(seq)), (0, 0)), 'constant') for seq in lstm_sequences])

# Check the shape after padding
print("Shape of Padded Sequences:", lstm_sequences_padded.shape)

# Now, lstm_sequences_padded should be a 3D array with shape (num_samples, num_time_steps, num_features)


Shape of Time-Series Sequences for LSTM: (5838, 2, 9)
Shape of Padded Sequences: (5838, 2, 9)


In [9]:
# Save the sequences to a .npy file
np.save('../data/processed/lstm_sequences.npy', lstm_sequences_padded)

print("Sequences saved as lstm_sequences.npy")


Sequences saved as lstm_sequences.npy


In [10]:

# Calculate the number of records in each sequence
sequence_lengths = [len(seq) for seq in lstm_sequences]

# Find the minimum and maximum sequence lengths
min_seq_length = min(sequence_lengths)
max_seq_length = max(sequence_lengths)

# Display the results
print(f"Minimum sequence length: {min_seq_length}")
print(f"Maximum sequence length: {max_seq_length}")

Minimum sequence length: 2
Maximum sequence length: 2
