In [29]:
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('indoor_data.csv')

# 1. Handle missing values - remove unused location columns
df.drop(['latitude', 'longitude', 'elevation'], axis=1, inplace=True)

# 2. Convert timestamp to datetime and set as index
df['created_at'] = pd.to_datetime(df['created_at'])
df.set_index('created_at', inplace=True)

# 3. Rename columns with accurate sensor descriptions
column_rename = {
    'entry_id': 'entry_id',
    'field1': 'mq7_co_ppm',          # MQ7 Carbon Monoxide (ppm)
    'field2': 'mq135_air_quality',    # MQ135 Composite Air Quality Index
    'field3': 'temperature_c',        # Temperature (°C)
    'field4': 'humidity_pct',         # Humidity (%)
    'field5': 'eco2_ppm',             # eCO2 (ppm)
    'field6': 'tvoc_ppb',             # TVOC (ppb)
    'field7': 'dust_density_ugm3',    # Dust Density (µg/m³)
    'status': 'status'
}
df = df.rename(columns=column_rename)

# 4. Handle outliers with sensor-specific ranges
sensor_ranges = {
    'mq7_co_ppm': (10, 1000),         # Typical MQ7 range
    'mq135_air_quality': (0, 1000),    # MQ135 general range
    'temperature_c': (10, 40),         # Reasonable indoor temp
    'humidity_pct': (20, 90),          # Reasonable humidity
    'eco2_ppm': (400, 2000),           # Typical eCO2 range
    'tvoc_ppb': (0, 2000),             # Typical TVOC range
    'dust_density_ugm3': (0, 500)      # Typical dust range
}

for col, (min_val, max_val) in sensor_ranges.items():
    if col in df.columns:
        df = df[(df[col] >= min_val) & (df[col] <= max_val)]

# 5. Calculate time between readings
df['seconds_since_previous'] = df.index.to_series().diff().dt.total_seconds().fillna(0)

# 6. Add time-based features
df['hour_of_day'] = df.index.hour
df['day_of_week'] = df.index.dayofweek  # Monday=0, Sunday=6
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# 7. Air Quality Index Calculation (example - adjust formula as needed)
# Simple weighted average of relevant sensors
df['aqi'] = (0.3*df['mq135_air_quality'] + 
             0.2*df['eco2_ppm']/10 + 
             0.3*df['tvoc_ppb']/100 + 
             0.2*df['dust_density_ugm3'])

# 8. Final cleaning
df = df.dropna()  # Remove any remaining missing values
df = df[df['status'] == 'ok'] if 'status' in df.columns else df  # Filter by status if column exists

# 9. Save processed data
df.to_csv('processed_indoor_air_quality.csv')

print(f"Processing complete. Final dataset shape: {df.shape}")
print("Sample of processed data:")
print(df.head())
print("\nCleaned data saved to 'processed_indoor_air_quality.csv'")
# Display processed dataset info
print("\nProcessed dataset info:")
print(df.info())
print("\nFirst 5 rows after processing:")
print(df.head())

Processing complete. Final dataset shape: (0, 14)
Sample of processed data:
Empty DataFrame
Columns: [entry_id, mq7_co_ppm, mq135_air_quality, temperature_c, humidity_pct, eco2_ppm, tvoc_ppb, dust_density_ugm3, status, seconds_since_previous, hour_of_day, day_of_week, is_weekend, aqi]
Index: []

Cleaned data saved to 'processed_indoor_air_quality.csv'

Processed dataset info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 0 entries
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   entry_id                0 non-null      int64  
 1   mq7_co_ppm              0 non-null      int64  
 2   mq135_air_quality       0 non-null      float64
 3   temperature_c           0 non-null      float64
 4   humidity_pct            0 non-null      float64
 5   eco2_ppm                0 non-null      float64
 6   tvoc_ppb                0 non-null      float64
 7   dust_density_ugm3       0 non-null      flo