In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the preprocessed dataset into the working frame
preprocessed_csv_path = '../data/processed/new_dht11_preprocessed.csv'
df_preprocessed = pd.read_csv(preprocessed_csv_path)

# Peek at the leading rows as a quick sanity check
df_preprocessed.head()

Unnamed: 0,datatime,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density,anomaly_isolation_forest,anomaly_lstm,combined_anomaly,anomaly_duration
0,2024-04-06 22:39:00,32.3,53.0,147.75,1.0,187.625,0.0,141.125,1.0,205.125,1.0,240.8275,247.0575,1,,1,0.0
1,2024-04-06 22:40:00,32.3,53.0,147.777778,1.0,187.444444,0.0,142.0,1.0,205.111111,1.0,241.173333,194.508889,1,,1,0.0
2,2024-04-06 22:41:00,32.3,53.0,148.0,1.0,188.375,0.0,142.0,1.0,205.25,1.0,236.78125,209.39125,1,,1,0.0
3,2024-04-06 22:42:00,32.3,53.0,147.555556,1.0,187.333333,0.0,141.111111,1.0,204.0,1.0,241.542222,209.725556,1,,1,0.0
4,2024-04-06 22:43:00,32.3,53.0,147.111111,1.0,186.777778,0.0,140.555556,1.0,203.0,1.0,239.328889,198.197778,1,,1,0.0


In [3]:
# # Convert 'datatime' from milliseconds to datetime
# df_preprocessed['datatime'] = pd.to_datetime(df_preprocessed['datatime'], unit='ms', errors='coerce')

# # Convert the datetime object to Unix timestamp (seconds since epoch)
# df_preprocessed['datatime'] = df_preprocessed['datatime'].apply(lambda x: x.timestamp() if pd.notnull(x) else None)


In [4]:
# Display the frame as loaded, before any engineered feature columns are added
df_preprocessed

Unnamed: 0,datatime,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,mq9_digital,mq8_analog,mq8_digital,pm25_density,pm10_density,anomaly_isolation_forest,anomaly_lstm,combined_anomaly,anomaly_duration
0,2024-04-06 22:39:00,32.300000,53.000000,147.750000,1.0,187.625000,0.0,141.125000,1.0,205.125000,1.0,240.827500,247.057500,1,,1,0.0
1,2024-04-06 22:40:00,32.300000,53.000000,147.777778,1.0,187.444444,0.0,142.000000,1.0,205.111111,1.0,241.173333,194.508889,1,,1,0.0
2,2024-04-06 22:41:00,32.300000,53.000000,148.000000,1.0,188.375000,0.0,142.000000,1.0,205.250000,1.0,236.781250,209.391250,1,,1,0.0
3,2024-04-06 22:42:00,32.300000,53.000000,147.555556,1.0,187.333333,0.0,141.111111,1.0,204.000000,1.0,241.542222,209.725556,1,,1,0.0
4,2024-04-06 22:43:00,32.300000,53.000000,147.111111,1.0,186.777778,0.0,140.555556,1.0,203.000000,1.0,239.328889,198.197778,1,,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4464,2024-11-14 22:40:00,30.844444,38.444444,30.888889,1.0,131.444444,0.0,172.666667,1.0,98.777778,1.0,278.896667,211.108889,1,1.0,1,0.0
4465,2024-11-14 22:41:00,30.800000,39.000000,31.111111,1.0,140.111111,0.0,174.000000,1.0,99.111111,1.0,230.936667,210.924444,1,1.0,1,0.0
4466,2024-11-14 22:42:00,30.800000,39.000000,31.222222,1.0,143.444444,0.0,175.111111,1.0,99.222222,1.0,243.020000,216.735556,1,1.0,1,0.0
4467,2024-11-14 22:43:00,30.800000,39.000000,31.000000,1.0,133.111111,0.0,175.222222,1.0,99.222222,1.0,241.174444,209.172222,1,1.0,1,0.0


In [5]:
# Step 1: Feature Engineering
# Sensor channels that the diff, rolling-average and severity features are built from
sensor_columns = [
    'temperature',
    'humidity',
    'mq2_analog',
    'mq9_analog',
    'sound_analog',
    'pm25_density',
    'pm10_density',
]
# df_preprocessed['datatime'] = pd.to_datetime(df_preprocessed['datatime'], unit='s')


In [6]:
# For every sensor channel add two derived columns:
#   <sensor>_diff        - change relative to the previous row (NaN on the first row)
#   <sensor>_rolling_avg - mean over a 5-row window (NaN until 5 rows are available)
for sensor in sensor_columns:
    readings = df_preprocessed[sensor]
    df_preprocessed[f'{sensor}_diff'] = readings.diff()
    df_preprocessed[f'{sensor}_rolling_avg'] = readings.rolling(window=5).mean()


In [7]:
# Calculate anomaly duration (disabled: 'anomaly_duration' is already a column of the loaded CSV)
# df_preprocessed['anomaly_duration'] = df_preprocessed['datatime'].diff().dt.total_seconds().fillna(0)


In [8]:
# Severity: deviation of each reading from the mean of the rows labelled
# normal (combined_anomaly == 1), expressed in units of their standard deviation
normal_rows = df_preprocessed['combined_anomaly'] == 1
for sensor in sensor_columns:
    baseline = df_preprocessed.loc[normal_rows, sensor]
    normal_mean = baseline.mean()
    normal_std = baseline.std()
    df_preprocessed[f'{sensor}_severity'] = (df_preprocessed[sensor] - normal_mean) / normal_std


In [9]:
# Pairwise correlation between the raw sensor channels
corr_matrix = df_preprocessed[sensor_columns].corr()

# Keep every ordered pair of distinct sensors whose correlation exceeds 0.2.
# The full double loop is intentional: each pair is recorded in both orders.
strong_pairs = []
for first in sensor_columns:
    for second in sensor_columns:
        if corr_matrix.loc[first, second] > 0.2 and first != second:
            strong_pairs.append((first, second))

# One 0/1 Series per pair: 1 where both sensors moved in the same direction
correlation_scores = []
for first, second in strong_pairs:
    first_change = df_preprocessed[f'{first}_diff']
    second_change = df_preprocessed[f'{second}_diff']

    both_rising = (first_change > 0) & (second_change > 0)
    both_falling = (first_change < 0) & (second_change < 0)

    # Rows with a NaN or zero diff compare False on both sides and score 0
    correlation_scores.append((both_rising | both_falling).astype(int))

# Fraction of strong pairs that agreed on each row
agreement_counts = pd.DataFrame(correlation_scores).sum(axis=0)
df_preprocessed['correlation_behavior'] = agreement_counts / len(strong_pairs)



In [10]:
# Tally how many rows take each correlation_behavior value
print(df_preprocessed['correlation_behavior'].value_counts())

correlation_behavior
0.166667    1381
0.333333    1372
0.666667     718
0.500000     717
0.000000     190
0.833333      82
1.000000       9
Name: count, dtype: int64


In [11]:
# Inspect which sensor pairs passed the correlation cut-off (each pair appears in both orders)
strong_pairs

[('temperature', 'humidity'),
 ('humidity', 'temperature'),
 ('humidity', 'pm25_density'),
 ('mq2_analog', 'mq9_analog'),
 ('mq2_analog', 'sound_analog'),
 ('mq9_analog', 'mq2_analog'),
 ('mq9_analog', 'sound_analog'),
 ('sound_analog', 'mq2_analog'),
 ('sound_analog', 'mq9_analog'),
 ('pm25_density', 'humidity'),
 ('pm25_density', 'pm10_density'),
 ('pm10_density', 'pm25_density')]

In [12]:
# Display the frame again, now carrying the diff, rolling-average, severity and correlation_behavior columns
df_preprocessed

Unnamed: 0,datatime,temperature,humidity,mq2_analog,mq2_digital,sound_analog,sound_digital,mq9_analog,mq9_digital,mq8_analog,...,pm10_density_diff,pm10_density_rolling_avg,temperature_severity,humidity_severity,mq2_analog_severity,mq9_analog_severity,sound_analog_severity,pm25_density_severity,pm10_density_severity,correlation_behavior
0,2024-04-06 22:39:00,32.300000,53.000000,147.750000,1.0,187.625000,0.0,141.125000,1.0,205.125000,...,,,-0.154689,0.166393,2.673821,-1.133910,1.917901,0.292252,2.133198,0.000000
1,2024-04-06 22:40:00,32.300000,53.000000,147.777778,1.0,187.444444,0.0,142.000000,1.0,205.111111,...,-52.548611,,-0.154689,0.166393,2.674599,-1.110388,1.911611,0.303620,0.227340,0.166667
2,2024-04-06 22:41:00,32.300000,53.000000,148.000000,1.0,188.375000,0.0,142.000000,1.0,205.250000,...,14.882361,,-0.154689,0.166393,2.680819,-1.110388,1.944026,0.159254,0.767101,0.166667
3,2024-04-06 22:42:00,32.300000,53.000000,147.555556,1.0,187.333333,0.0,141.111111,1.0,204.000000,...,0.334306,,-0.154689,0.166393,2.668378,-1.134284,1.907741,0.315745,0.779226,0.666667
4,2024-04-06 22:43:00,32.300000,53.000000,147.111111,1.0,186.777778,0.0,140.555556,1.0,203.000000,...,-11.527778,211.776194,-0.154689,0.166393,2.655937,-1.149219,1.888388,0.242994,0.361131,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4464,2024-11-14 22:40:00,30.844444,38.444444,30.888889,1.0,131.444444,0.0,172.666667,1.0,98.777778,...,10.605556,205.649333,-0.860960,-1.068851,-0.597402,-0.285970,-0.039092,1.543568,0.829397,0.500000
4465,2024-11-14 22:41:00,30.800000,39.000000,31.111111,1.0,140.111111,0.0,174.000000,1.0,99.111111,...,-0.184444,206.055111,-0.882525,-1.021705,-0.591181,-0.250126,0.262803,-0.032854,0.822708,0.666667
4466,2024-11-14 22:42:00,30.800000,39.000000,31.222222,1.0,143.444444,0.0,175.111111,1.0,99.222222,...,5.811111,209.190889,-0.882525,-1.021705,-0.588071,-0.220256,0.378916,0.364319,1.033468,0.666667
4467,2024-11-14 22:43:00,30.800000,39.000000,31.000000,1.0,133.111111,0.0,175.222222,1.0,99.222222,...,-7.563333,209.688889,-0.882525,-1.021705,-0.594291,-0.217269,0.018965,0.303656,0.759157,0.333333


In [13]:

# Step 4: Weighted Scoring
# Blend four feature groups into a single score per row. Every source column
# is min-max scaled, multi-column groups are averaged across their columns,
# and the group values are combined using the weights below.
# (MinMaxScaler is imported in the notebook's import cell.)
weights = {
    'trend_features': 0.3,
    'anomaly_duration': 0.2,
    'severity': 0.3,
    'correlation_behavior': 0.2,
}

# Source columns that feed each feature group
features_to_normalize = {
    'trend_features': [f'{col}_diff' for col in sensor_columns],
    'anomaly_duration': ['anomaly_duration'],
    'severity': [f'{col}_severity' for col in sensor_columns],
    'correlation_behavior': ['correlation_behavior']
}

# A single scaler instance, re-fitted on each group in turn
scaler = MinMaxScaler()

# Write a '<column>_normalized' companion for every source column
for columns in features_to_normalize.values():
    normalized_names = [f'{col}_normalized' for col in columns]
    df_preprocessed[normalized_names] = scaler.fit_transform(df_preprocessed[columns])

# Per-row averages of the two multi-column groups
trend_columns = [f'{col}_normalized' for col in features_to_normalize['trend_features']]
severity_columns = [f'{col}_normalized' for col in features_to_normalize['severity']]
trend_component = df_preprocessed[trend_columns].mean(axis=1)
severity_component = df_preprocessed[severity_columns].mean(axis=1)

# NOTE: a row whose *_diff values are all NaN (the first row, which has no
# predecessor) has a NaN trend component and therefore a NaN weighted_score.
df_preprocessed['weighted_score'] = (
    weights['trend_features'] * trend_component
    + weights['anomaly_duration'] * df_preprocessed['anomaly_duration_normalized']
    + weights['severity'] * severity_component
    + weights['correlation_behavior'] * df_preprocessed['correlation_behavior_normalized']
)


In [14]:
# Data-driven cut-offs for weighted_score
score_values = df_preprocessed['weighted_score']
high_score_threshold = score_values.quantile(0.9)  # 90th percentile: gate for "Environmental Changes"
mid_score_threshold = score_values.quantile(0.5)   # median; not referenced by the classifier in this cell

# Data-driven cut-offs for correlation_behavior
corr_values = df_preprocessed['correlation_behavior']
high_corr_threshold = corr_values.quantile(0.9)  # 90th percentile: strong agreement
mid_corr_threshold = corr_values.quantile(0.5)   # median; not referenced by the classifier in this cell

# Refined classification function with dynamic thresholds and handling negative sensor data
def classify_with_dynamic_thresholds(row, score_threshold=None, corr_threshold=None):
    """Label one row as "Normal", "Fault" or "Environmental Changes".

    Rules, checked in order:
      1. Any negative sensor reading -> "Fault".
      2. combined_anomaly is anything other than -1 -> "Normal".
      3. combined_anomaly == -1 -> "Environmental Changes" when both
         weighted_score and correlation_behavior are strictly above their
         thresholds, otherwise "Fault" (NaN values compare False, so they
         yield "Fault").

    Args:
        row: Mapping-like row (e.g. a pandas Series from
            ``DataFrame.apply(..., axis=1)``) holding the sensor columns plus
            'combined_anomaly', 'weighted_score' and 'correlation_behavior'.
        score_threshold: Cut-off for 'weighted_score'. Defaults to the
            notebook-level ``high_score_threshold``.
        corr_threshold: Cut-off for 'correlation_behavior'. Defaults to the
            notebook-level ``high_corr_threshold``.

    Returns:
        str: one of "Normal", "Fault", "Environmental Changes".
    """
    sensor_fields = ['temperature', 'humidity', 'mq2_analog', 'sound_analog',
                     'mq9_analog', 'pm25_density', 'pm10_density']

    # Negative readings are treated as sensor faults regardless of the anomaly label
    for reading in row[sensor_fields]:
        if reading < 0:
            return "Fault"

    # Only rows flagged as anomalies (-1) go through the threshold logic
    if row['combined_anomaly'] != -1:
        return "Normal"

    # Thresholds are resolved lazily so the notebook globals are only needed
    # when a caller did not supply explicit values
    if score_threshold is None:
        score_threshold = high_score_threshold
    if not (row['weighted_score'] > score_threshold):
        return "Fault"

    if corr_threshold is None:
        corr_threshold = high_corr_threshold
    if row['correlation_behavior'] > corr_threshold:
        return "Environmental Changes"  # High score together with high agreement
    return "Fault"

# Label every row by running the classifier row-wise across the frame
anomaly_labels = df_preprocessed.apply(classify_with_dynamic_thresholds, axis=1)
df_preprocessed['anomaly_class'] = anomaly_labels

# Report how many rows landed in each class
class_counts = df_preprocessed['anomaly_class'].value_counts()
print(class_counts)


anomaly_class
Normal                   4036
Fault                     418
Environmental Changes      15
Name: count, dtype: int64


In [15]:
# Preview the score, the agreement measure and the resulting label for the first 20 rows
preview_columns = ['weighted_score', 'correlation_behavior', 'anomaly_class']
df_preprocessed[preview_columns].head(20)

Unnamed: 0,weighted_score,correlation_behavior,anomaly_class
0,,0.0,Normal
1,0.275635,0.166667,Normal
2,0.28006,0.166667,Normal
3,0.379669,0.666667,Normal
4,0.377643,0.666667,Normal
5,0.311557,0.333333,Normal
6,0.278145,0.166667,Normal
7,0.314184,0.333333,Normal
8,0.384243,0.666667,Normal
9,0.386813,0.666667,Normal


In [16]:
# Summary statistics for the two quantities the classifier compares against thresholds
for summary_column in ('weighted_score', 'correlation_behavior'):
    print(df_preprocessed[summary_column].describe())


count    4468.000000
mean        0.314785
std         0.052510
min         0.179377
25%         0.275456
50%         0.305880
75%         0.342948
max         0.670563
Name: weighted_score, dtype: float64
count    4469.000000
mean        0.358469
std         0.201184
min         0.000000
25%         0.166667
50%         0.333333
75%         0.500000
max         1.000000
Name: correlation_behavior, dtype: float64


In [17]:
# Persist the full frame (loaded columns, engineered features and anomaly_class) without the index
df_preprocessed.to_csv("../data/processed/classification_dht11_preprocessed.csv", index=False)

In [18]:
# Summary statistics of anomaly_duration, which comes from the input CSV (its recomputation above is commented out)
df_preprocessed['anomaly_duration'].describe()

count    4.469000e+03
mean     2.824703e+04
std      1.617212e+05
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.037100e+06
Name: anomaly_duration, dtype: float64

In [19]:
# Columns kept in the slimmed-down export: the sensor readings (no digital
# channels) followed by the score, the agreement measure and the class label
sensor_value_columns = [
    'temperature', 'humidity', 'mq2_analog', 'mq9_analog',
    'sound_analog', 'pm25_density', 'pm10_density',
]
derived_columns = ['weighted_score', 'correlation_behavior', 'anomaly_class']
columns_to_save = sensor_value_columns + derived_columns

# Project the frame down to just those columns
df_filtered = df_preprocessed.loc[:, columns_to_save]

# Write the reduced frame to its own CSV, without the index
filtered_csv_path = '../data/processed/sensor_data_filtered.csv'
df_filtered.to_csv(filtered_csv_path, index=False)

# Confirm completion
print("Filtered data saved to 'sensor_data_filtered.csv'.")


Filtered data saved to 'sensor_data_filtered.csv'.
