In [None]:
import pandas as pd
import ast
import os,sys

def _abort(problem):
    """Print the problem in the notebook's ``[ERROR]`` format and stop execution."""
    print(f"[ERROR] {problem}")
    sys.exit(1)


# --- Read File Selection ---
# The answer must be non-empty and name an existing regular file.
try:
    read_file = input("Enter read file path (e.g., raw_stream/sensor_data_x.csv): ").strip()
    if not (read_file and os.path.isfile(read_file)):
        raise FileNotFoundError(f"Read file does not exist: {read_file}")
except Exception as err:
    _abort(err)

# --- Write File Selection ---
# The answer must be non-empty; missing parent directories are created.
try:
    write_file = input("Enter write file path (e.g., processed/data_x.csv): ").strip()
    if write_file == "":
        raise ValueError("No write file path provided.")
    write_dir = os.path.dirname(write_file)
    if not write_dir:
        # A bare file name has no directory part: use the current directory.
        write_dir = "."
    os.makedirs(write_dir, exist_ok=True)
except Exception as err:
    _abort(err)

# --- Processing Logic ---
# The file is streamed in chunks. Each cell of "times" / "PM2.5" is parsed as a
# Python literal and the two columns are exploded in parallel, giving one
# reading per row.
#
# Per hour we keep the SUM and COUNT of readings instead of a per-chunk mean.
# An hour whose readings straddle a chunk boundary then collapses into one
# row with a correctly weighted mean, instead of appearing several times with
# partial means.
frames = []  # per-chunk DataFrames indexed by hour with columns ['sum', 'count']

for chunk in pd.read_csv(read_file, usecols=["times", "PM2.5"], dtype=str, chunksize=50_000):
    # Blank cells are read as NaN, which ast.literal_eval cannot parse.
    chunk = chunk.dropna(subset=['times', 'PM2.5'])
    # Skip rows whose cells merely repeat the column names.
    chunk = chunk[(chunk['times'] != 'times') & (chunk["PM2.5"] != "PM2.5")].reset_index(drop=True)
    chunk['times'] = chunk['times'].apply(ast.literal_eval)
    chunk['PM2.5'] = chunk['PM2.5'].apply(ast.literal_eval)
    chunk = chunk.explode(['times', 'PM2.5'], ignore_index=True)
    # Unparseable timestamps / values become NaT / NaN and are dropped below.
    chunk['times'] = pd.to_datetime(chunk['times'], errors='coerce')
    chunk['PM2.5'] = pd.to_numeric(chunk['PM2.5'], errors='coerce')
    chunk = chunk.dropna(subset=['times', 'PM2.5'])
    if chunk.empty:
        continue
    chunk['hour'] = chunk['times'].dt.floor('h')
    frames.append(chunk.groupby('hour')['PM2.5'].agg(['sum', 'count']))

if frames:
    # Merge the partial sums/counts of hours seen in more than one chunk.
    totals = pd.concat(frames).groupby(level=0).sum()
    df = (
        (totals['sum'] / totals['count'])
        .rename('pm25_mean')
        .rename_axis('hour')
        .reset_index()
    )
else:
    # pd.concat([]) would raise; produce an empty, correctly typed table instead.
    print("[WARN] No valid readings found; writing an empty result.")
    df = pd.DataFrame({
        'hour': pd.Series(dtype='datetime64[ns]'),
        'pm25_mean': pd.Series(dtype='float64'),
    })

df.to_csv(write_file, index=False)
print(f"Processed data saved to: {write_file}")

In [None]:
# This cell uses `np` and `plt`; import them here so it also runs on a fresh
# kernel (Restart & Run All) instead of failing with NameError.
import numpy as np
import matplotlib.pyplot as plt

# split into date + hour-of-day
df['date'] = df['hour'].dt.date
df['hod']  = df['hour'].dt.hour

# One value per (date, hour-of-day); duplicate hours are averaged.
g = df.groupby(['date','hod'], as_index=False)['pm25_mean'].mean()

# rows = hour of day, columns = calendar date
heat = (
    g.pivot(index='hod', columns='date', values='pm25_mean')
     .sort_index()
)
# plot (matplotlib, no explicit colors)
fig, ax = plt.subplots(figsize=(14, 6))
im = ax.imshow(heat.to_numpy(), aspect='auto', origin='lower')

ax.set_yticks(np.arange(heat.shape[0]))
ax.set_yticklabels(heat.index)

# Label roughly 30 dates at most so the x axis stays readable.
cols = heat.columns
step = max(1, len(cols)//30)
ax.set_xticks(np.arange(0, len(cols), step))
ax.set_xticklabels([c.strftime('%Y-%m-%d') for c in cols[::step]], rotation=90)

ax.set_xlabel('Date'); ax.set_ylabel('Hour of day')
ax.set_title('PM2.5 hourly mean (hour × day)')
fig.colorbar(im, ax=ax, label='µg/m³')
fig.tight_layout(); plt.show()


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

# --- 1) Ensure df['hour'] is datetime ---
# If it's already datetime, this is a no-op. If it's a string/number, it will parse it.
df = df.copy()
df['hour'] = pd.to_datetime(df['hour'], errors='coerce')

# Drop rows that failed to parse
df = df.dropna(subset=['hour'])

# --- 2) Derive date and hour-of-day ---
df['date'] = df['hour'].dt.normalize()     # same day with the time set to midnight
df['hod']  = df['hour'].dt.hour            # 0..23

# --- 3) Build pivot: rows = hour-of-day, cols = date, values = pm25_mean ---
heat = (
    df.pivot_table(index='hod', columns='date', values='pm25_mean', aggfunc='mean')
      .sort_index()
)

# If columns are not datetime-typed (rare), convert them for nicer axis formatting.
# The pandas dtype check is used because np.issubdtype raises TypeError on
# timezone-aware datetime dtypes, which NumPy cannot interpret.
if not pd.api.types.is_datetime64_any_dtype(heat.columns.dtype):
    heat.columns = pd.to_datetime(heat.columns)

# --- 4) Classify danger levels (heuristic) ---
# Each threshold is the INCLUSIVE upper limit of its category (e.g. 12.0 is still "Good").
bins  = [-np.inf, 12.0, 35.4, 55.4, 150.4, np.inf]
names = ["Good", "Moderate", "USG", "Unhealthy", "Very Unhealthy"]

vals = heat.values
# right=True selects bins[i-1] < x <= bins[i]; the default would push a value
# sitting exactly on a threshold into the next, worse category.
cat_idx = np.digitize(vals, bins, right=True) - 1
# NaN cells fall past the last bin; clip keeps the index valid until they are
# replaced by "No data" below.
cat_idx = np.clip(cat_idx, 0, len(names)-1)

# 2-D array of labels with the same shape as `vals`
custom = np.where(
    np.isnan(vals),
    "No data",
    np.array(names, dtype=object)[cat_idx]
)

# --- 5) Interactive heatmap ---
# Rows are hours of day, columns are dates; hour 0 is drawn at the bottom.
axis_titles = dict(x="Date", y="Hour of day", color="µg/m³")
fig = px.imshow(
    heat,
    x=heat.columns,
    y=heat.index,
    labels=axis_titles,
    color_continuous_scale="Viridis",
    aspect="auto",
    origin="lower",
)

# `custom` has the same 2-D shape as z, so every cell can show its status label on hover.
hover_text = "Date=%{x|%Y-%m-%d}<br>Hour=%{y}:00<br>PM2.5=%{z:.1f} µg/m³<br>Status=%{customdata}<extra></extra>"
fig.update_traces(customdata=custom, hovertemplate=hover_text)

layout_options = {
    "title": "PM2.5 hourly mean (interactive heatmap)",
    "xaxis_tickangle": -45,
    "yaxis_nticks": 24,
}
fig.update_layout(**layout_options)
fig.show()


In [None]:
import plotly.express as px

# Prepare data for the Plotly heatmap: long form, one row per (hour-of-day, date) cell of `heat`.
heat_df = heat.reset_index().melt(id_vars='hod', var_name='date', value_name='pm25_mean')

fig = px.density_heatmap(
    heat_df,
    x='date',
    y='hod',
    z='pm25_mean',
    # density_heatmap is a 2-D histogram. When z is given the default
    # aggregation is a SUM, so any bin that groups several cells would show
    # added-up concentrations. Averaging keeps the colour in µg/m³.
    histfunc='avg',
    color_continuous_scale='Viridis',
    labels={'hod': 'Hour of Day', 'date': 'Date', 'pm25_mean': 'PM2.5 (µg/m³)'},
    title='PM2.5 Hourly Mean (hour × day) Interactive Heatmap',
    hover_data={'pm25_mean': ':.2f'}
)

# The default y axis already increases upwards (hour 0 at the bottom), which
# matches the origin="lower" layout of the other heatmaps. Reversing it here
# would put hour 0 at the top.
fig.show()


In [None]:
import pandas as pd, numpy as np
import plotly.graph_objects as go

# Clean & sort: coerce both columns, drop rows that failed to parse, order by
# time, and index by time so a time-based rolling window can be used.
df = (
    df.assign(
        hour=lambda frame: pd.to_datetime(frame['hour'], errors='coerce'),
        pm25_mean=lambda frame: pd.to_numeric(frame['pm25_mean'], errors='coerce'),
    )
    .dropna(subset=['hour', 'pm25_mean'])
    .sort_values('hour')
    .set_index('hour')
)

# 24h rolling mean over the datetime index
trend = df['pm25_mean'].rolling('24h').mean()
df['pm25_roll'] = trend

# back to columns for plotting
df = df.reset_index()

# Raw values (fast WebGL renderer): thin line
raw_trace = go.Scattergl(
    x=df['hour'],
    y=df['pm25_mean'],
    name='PM2.5 (raw)',
    mode='lines',
    line=dict(width=1),
    hovertemplate="%{x|%Y-%m-%d %H:%M}<br>PM2.5=%{y:.1f} µg/m³<extra></extra>",
)

# Rolling mean (trend): thicker line drawn on top of the raw series
trend_trace = go.Scatter(
    x=df['hour'],
    y=df['pm25_roll'],
    name='24h rolling mean',
    mode='lines',
    line=dict(width=3),
    hovertemplate="%{x|%Y-%m-%d %H:%M}<br>24h mean=%{y:.1f} µg/m³<extra></extra>",
)

fig = go.Figure()
for trace in (raw_trace, trend_trace):
    fig.add_trace(trace)

# AQI-ish bands (EPA 24h thresholds for context)
# Bands are built from shared edges so they tile the axis without gaps:
# each band's upper limit is the next band's lower limit.
band_edges  = [0, 12.0, 35.4, 55.4, 150.4]
band_labels = ["Good", "Moderate", "USG", "Unhealthy"]
bands = list(zip(band_edges[:-1], band_edges[1:], band_labels))

for i, (y0, y1, label) in enumerate(bands):
    # Alternate the grey level so neighbouring bands can be told apart.
    shade = 0.04 if i % 2 == 0 else 0.10
    # Full-width rectangle (x in paper coordinates) drawn behind the traces.
    fig.add_shape(type="rect", xref="paper", yref="y",
                  x0=0, x1=1, y0=y0, y1=y1,
                  fillcolor=f"rgba(0,0,0,{shade})", line_width=0, layer="below")
    # Label just right of the plot area, vertically centred on the band.
    fig.add_annotation(xref="paper", x=1.002, y=(y0+y1)/2,
                       text=label, showarrow=False, xanchor="left",
                       font=dict(size=11, color="#555"))

# Quick zoom presets shown above the plot
range_buttons = [
    dict(count=24, step="hour", stepmode="backward", label="24h"),
    dict(count=7, step="day", stepmode="backward", label="7d"),
    dict(count=1, step="month", stepmode="backward", label="1m"),
    dict(step="all", label="All"),
]

# Time axis with a range slider below it and the preset buttons above it
time_axis = dict(
    title="Time",
    rangeslider=dict(visible=True),
    rangeselector=dict(buttons=range_buttons),
)

# Concentration axis always includes zero
value_axis = dict(title="µg/m³", rangemode="tozero")

fig.update_layout(
    title="PM2.5 over time (raw & 24h trend)",
    xaxis=time_axis,
    yaxis=value_axis,
    hovermode="x unified",
    margin=dict(r=80),  # extra right margin for labels placed outside the plot area
)
fig.show()


In [None]:
import plotly.express as px

# Work on a copy so the shared `df` is not modified by this cell.
df2 = df.copy()
df2['hour'] = pd.to_datetime(df2['hour'], errors='coerce')
df2 = df2.dropna(subset=['hour'])
df2['date'] = df2['hour'].dt.date
df2['hod']  = df2['hour'].dt.hour

# Optional smoothing per day/hour
day_hour = (df2.groupby(['date','hod'], as_index=False)
              .agg(pm=('pm25_mean','mean')))

# Facet grid geometry: one panel per date, `facet_wrap` panels per row.
facet_wrap = 4
n_days = day_hour['date'].nunique()
n_rows = max(1, -(-n_days // facet_wrap))  # ceiling division
# With facet_col_wrap, Plotly Express uses a fixed row spacing of 0.07, and
# subplot creation fails once the spacing exceeds 1 / (rows - 1). Shrink the
# gap for tall grids; grids of up to 8 rows keep the default 0.07.
row_gap = min(0.07, 0.5 / max(n_rows - 1, 1))

fig = px.line(
    day_hour, x='hod', y='pm',
    facet_col='date', facet_col_wrap=facet_wrap,
    facet_row_spacing=row_gap,
    markers=True,
    labels={'hod':'Hour of day','pm':'µg/m³'},
    title="Daily PM2.5 profiles (hourly means)"
)

# Nice axes & hover
# Facet titles arrive as "date=<value>"; keep only the value.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.update_traces(hovertemplate="Hour=%{x}:00<br>PM2.5=%{y:.1f} µg/m³<extra></extra>")
# Base height for one row of panels plus 70 px for each additional row.
fig.update_layout(height=500 + 70*((n_days-1)//facet_wrap),
                  hovermode="x unified")
fig.update_xaxes(dtick=2)
fig.show()
