Author: Wenxin Zhang<br>
This notebook does analysis and visualization on device data.

In [1]:
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy import stats


In [2]:
DATA_PATH = 'data/'

### Get all data files

In [3]:
data_files = []
for f in os.listdir(DATA_PATH):
        if not f.startswith('.'):
                data_files.append(f)


### Add a new column called device name (should be the name of the txt file), combine all dataframes

In [4]:
data_array = []
device_names = []
for d in data_files:
        device_name = d.split('.')[0]
        device_names.append(device_name)
        data_df = pd.read_csv(DATA_PATH + d, skiprows=1)
        data_df.columns = data_df.columns.str.replace(' ','')
        data_df['Device'] = device_name
        data_array.append(data_df)
all_device_data = pd.concat(data_array)

In [5]:

# add datetime
all_device_data['Datetime'] = pd.to_datetime(all_device_data['Date'] + ' ' + all_device_data['Time'])

In [6]:
device_names

['Device1', 'Device7', 'Device12', 'Device13', 'Device9']

In [7]:
to_numeric_cols = all_device_data.columns.drop(['Time', 'Date', 'Battery', 'Fix', 'Longitude','Latitude', 'Temp(C)', 'RH(%)', 'P(hPa)', 'Alti(m)', 'Device', 'Datetime'])
all_device_data[to_numeric_cols] = all_device_data[to_numeric_cols].apply(pd.to_numeric, errors='coerce', downcast='float').astype(float)

In [8]:
all_device_data

Unnamed: 0,Date,Time,Battery,Fix,Latitude,Longitude,Dp>0.3,Dp>0.5,Dp>1.0,Dp>2.5,...,PM10_Std,PM1_Env,PM2.5_Env,PM10_Env,Temp(C),RH(%),P(hPa),Alti(m),Device,Datetime
0,2022/3/31,11:29:11,51.84,-,-,-,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,20.959999,39.053711,1020.883118,-,Device1,2022-03-31 11:29:11
1,2022/3/31,11:29:17,50.21,-,-,-,6.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,21.000000,38.150391,1020.939697,-,Device1,2022-03-31 11:29:17
2,2022/3/31,11:29:22,50.21,-,-,-,6.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,21.010000,38.286133,1020.955811,-,Device1,2022-03-31 11:29:22
3,2022/3/31,11:29:27,50.21,-,-,-,12.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,21.040001,39.430664,1020.895569,-,Device1,2022-03-31 11:29:27
4,2022/3/31,11:29:32,50.21,-,-,-,18.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,21.070000,39.220703,1020.979248,-,Device1,2022-03-31 11:29:32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4984,2022/5/17,14:57:52,90.40,-,-,-,555.0,169.0,15.0,9.0,...,11.0,2.0,3.0,11.0,26.240000,26.206055,1020.653931,-,Device9,2022-05-17 14:57:52
4985,2022/5/17,14:57:57,89.95,-,-,-,510.0,141.0,12.0,6.0,...,7.0,1.0,1.0,7.0,26.260000,26.541016,1020.733459,-,Device9,2022-05-17 14:57:57
4986,2022/5/17,14:58:5,90.54,-,-,-,399.0,108.0,3.0,3.0,...,4.0,1.0,1.0,4.0,26.280001,26.115234,1020.719543,-,Device9,2022-05-17 14:58:05
4987,2022/5/17,14:58:10,90.54,-,-,-,609.0,167.0,3.0,3.0,...,5.0,2.0,2.0,5.0,26.280001,26.103516,1020.692017,-,Device9,2022-05-17 14:58:10


In [9]:
all_device_data = all_device_data[['Datetime', 'Date', 'Time', 'Dp>0.3',
       'Dp>0.5', 'Dp>1.0', 'Dp>2.5', 'Dp>5.0', 'Dp>10.0', 'PM1_Std',
       'PM2.5_Std', 'PM10_Std', 'PM1_Env', 'PM2.5_Env', 'PM10_Env','Device']]

In [10]:
all_device_data.dtypes

Datetime     datetime64[ns]
Date                 object
Time                 object
Dp>0.3              float64
Dp>0.5              float64
Dp>1.0              float64
Dp>2.5              float64
Dp>5.0              float64
Dp>10.0             float64
PM1_Std             float64
PM2.5_Std           float64
PM10_Std            float64
PM1_Env             float64
PM2.5_Env           float64
PM10_Env            float64
Device               object
dtype: object

In [11]:
rounded = pd.DataFrame(all_device_data['Datetime'].dt.round('5s'))
all_device_data['Datetime_round'] = rounded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_device_data['Datetime_round'] = rounded


In [12]:
all_device_data_rounded = all_device_data.groupby(['Datetime_round','Device']).mean().reset_index()
all_device_data_rounded

Unnamed: 0,Datetime_round,Device,Dp>0.3,Dp>0.5,Dp>1.0,Dp>2.5,Dp>5.0,Dp>10.0,PM1_Std,PM2.5_Std,PM10_Std,PM1_Env,PM2.5_Env,PM10_Env
0,2022-02-09 14:43:25,Device9,1152.0,343.0,36.0,0.0,0.0,0.0,7.0,9.0,9.0,7.0,9.0,9.0
1,2022-02-09 14:43:30,Device12,1125.0,318.0,30.0,9.0,3.0,0.0,5.0,7.0,9.0,5.0,7.0,9.0
2,2022-02-09 14:43:30,Device9,1131.0,335.0,39.0,3.0,0.0,0.0,7.0,9.0,9.0,7.0,9.0,9.0
3,2022-02-09 14:43:35,Device12,972.0,276.0,6.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0
4,2022-02-09 14:43:35,Device7,939.0,298.0,38.0,0.0,0.0,0.0,6.0,7.0,7.0,6.0,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18507,2022-05-27 15:46:05,Device1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18508,2022-05-27 15:46:10,Device1,57.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18509,2022-05-27 15:46:15,Device1,57.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18510,2022-05-27 15:46:20,Device1,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Helper Function

In [13]:
def plot_line_chart(df, x, y, title):
    plot = px.line(df,
                          x=x,
                          y=y,
                          color='Device',
                          title=title,
                          labels={'Datetime_round': 'Time'})
    return plot

In [14]:
def drop_numerical_outliers(df, z_thresh=4):
    constrains = df.select_dtypes(include=['float64']) \
        .apply(lambda x: np.abs(stats.zscore(x)) < z_thresh).all(axis=1)
    idx = df.index[constrains==False].tolist()
    new_df = df.drop(idx)
    return new_df

## Get data betweem a specific time 'YYYY-MM-DD HH:MM:SS'

In [15]:
START_TIME = '2022-04-22 16:00:00'
END_TIME = '2022-04-22 16:15:00'

In [16]:
data_in_time_range = all_device_data_rounded[(all_device_data_rounded['Datetime_round'] >= START_TIME) & (all_device_data_rounded['Datetime_round'] <= END_TIME)]

In [17]:
#dp3_in_time_range.to_csv("output.csv", index=False)
data_in_time_range

Unnamed: 0,Datetime_round,Device,Dp>0.3,Dp>0.5,Dp>1.0,Dp>2.5,Dp>5.0,Dp>10.0,PM1_Std,PM2.5_Std,PM10_Std,PM1_Env,PM2.5_Env,PM10_Env
16459,2022-04-22 16:03:05,Device13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16460,2022-04-22 16:03:05,Device7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16461,2022-04-22 16:03:05,Device9,18.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16462,2022-04-22 16:03:10,Device12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16463,2022-04-22 16:03:10,Device13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16974,2022-04-22 16:14:55,Device9,183.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16975,2022-04-22 16:15:00,Device12,288.0,93.0,21.0,9.0,9.0,6.0,0.0,0.0,8.0,0.0,0.0,8.0
16976,2022-04-22 16:15:00,Device13,207.0,69.0,9.0,3.0,3.0,3.0,1.0,1.0,3.0,1.0,1.0,3.0
16977,2022-04-22 16:15:00,Device7,117.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
lineplt_by_time = plot_line_chart(df=data_in_time_range, x='Datetime_round', y='Dp>0.3', title='Dp>0.3 Collected by Device')
lineplt_by_time.show()
lineplt_by_time.write_html('./Figures/lineplt_by_time.html')

In [19]:
Z_thresh = 4
data_removed_extreme = drop_numerical_outliers(data_in_time_range.copy(), Z_thresh)

In [20]:
removed_extreme_plot = plot_line_chart(data_removed_extreme, 'Datetime_round', 'Dp>0.3', 'Dp>0.3 Collected by Device with Extreme Removed')
removed_extreme_plot.show()
removed_extreme_plot.write_html('./Figures/removed_extreme_plot.html')

In [21]:
data_in_time_range.columns

Index(['Datetime_round', 'Device', 'Dp>0.3', 'Dp>0.5', 'Dp>1.0', 'Dp>2.5',
       'Dp>5.0', 'Dp>10.0', 'PM1_Std', 'PM2.5_Std', 'PM10_Std', 'PM1_Env',
       'PM2.5_Env', 'PM10_Env'],
      dtype='object')

In [22]:
measured_type_plot = go.Figure()
columns = ['Dp>0.3', 'Dp>0.5', 'Dp>1.0', 'Dp>2.5', 'Dp>5.0', 'Dp>10.0', 'PM1_Std', 'PM2.5_Std', 'PM10_Std', 'PM1_Env', 'PM2.5_Env', 'PM10_Env']
for c in columns:
    measured_type_plot.add_traces(plot_line_chart(data_in_time_range, 'Datetime_round', c, c + 'Collected by Device').update_traces(visible=False).data)
measured_type_plot.update_traces()
measured_type_plot.update_layout(
updatemenus=[
    dict(
        active=0,
        buttons=list([
            dict(label="Select",
                 method="update",
                 args=[{"visible": [False, False, False, False, False, False, False, False, False, False,False, False]},
                       {"title": "Select one field to start..."}]),
            dict(label="Dp>0.3",
                 method="update",
                 args=[{"visible": [True, False, False, False, False, False, False, False, False, False,False, False]},
                       {"title": "Dp>0.3"}]),
            dict(label="Dp>0.5",
                 method="update",
                 args=[{"visible": [False, True, False, False, False, False, False, False, False, False,False, False]},
                       {"title": "Dp>0.5"}]),
            dict(label="Dp>1.0",
                 method="update",
                 args=[{"visible": [False, False, True, False, False, False, False, False, False, False,False, False]},
                       {"title": "Dp>1.0"}]),
            dict(label="Dp>2.5",
                 method="update",
                 args=[{"visible": [False, False, False, True, False, False, False, False, False, False,False, False]},
                       {"title": "Dp>2.5"}]),
            dict(label="Dp>5.0",
                 method="update",
                 args=[{"visible": [False, False, False, False, True, False, False, False, False, False,False, False]},
                       {"title": "Dp>5.0"}]),
            dict(label="Dp>10.0",
                 method="update",
                 args=[{"visible": [False, False, False, False, False, True, False, False, False, False,False, False]},
                       {"title": "Dp>10.0"}]),
            dict(label="PM1_Std",
                 method="update",
                 args=[{"visible": [False, False, False, False, False, False, True, False, False, False,False, False]},
                       {"title": "PM1_Std"}]),
            dict(label="PM2.5_Std",
                 method="update",
                 args=[{"visible": [False, False, False, False, False, False, False, True, False, False,False, False]},
                       {"title": "PM2.5_Std"}]),
            dict(label="PM10_Std",
                 method="update",
                 args=[{"visible": [False, False, False, False, False, False, False, False, True, False, False, False]},
                       {"title": "PM10_Std"}]),
            dict(label="PM1_Env",
                 method="update",
                 args=[{"visible": [False, False, False, False, False, False, False, False, False, True, False, False]},
                       {"title": "PM1_Env"}]),
            dict(label="PM2.5_Env",
                 method="update",
                 args=[{"visible": [False, False, False, False, False, False, False, False, False, False, True, False]},
                       {"title": "PM2.5_Env"}]),
            dict(label="PM10_Env",
                 method="update",
                 args=[{"visible": [False, False, False, False, False, False, False, False, False, False, False, True]},
                       {"title": "PM10_Env"}]),

        ]),
    )
])

# measured_type_plot.add_trace(go.Scatter(
#     x=data_in_time_range['Datetime_round'],
#     y=data_in_time_range['Dp>0.3'],
#      fill='Device',
#     name = 'Dp>0.3'))
# measured_type_plot.add_trace( go.Scatter(
#         x=data_in_time_range['Datetime_round'],
#     y=data_in_time_range['Dp>0.5'],
#     name = 'Dp>0.5'))
measured_type_plot.show()
measured_type_plot.write_html('./Figures/measured_type_plot.html')

In [23]:
plot_line_chart(data_in_time_range, 'Datetime_round', 'Dp>0.3', 'Dp>0.3' + 'Collected by Device').data

(Scatter({
     'hovertemplate': 'Device=Device13<br>Time=%{x}<br>Dp>0.3=%{y}<extra></extra>',
     'legendgroup': 'Device13',
     'line': {'color': '#636efa', 'dash': 'solid'},
     'marker': {'symbol': 'circle'},
     'mode': 'lines',
     'name': 'Device13',
     'orientation': 'v',
     'showlegend': True,
     'x': array([datetime.datetime(2022, 4, 22, 16, 3, 5),
                 datetime.datetime(2022, 4, 22, 16, 3, 10),
                 datetime.datetime(2022, 4, 22, 16, 3, 15),
                 datetime.datetime(2022, 4, 22, 16, 3, 20),
                 datetime.datetime(2022, 4, 22, 16, 3, 30),
                 datetime.datetime(2022, 4, 22, 16, 3, 35),
                 datetime.datetime(2022, 4, 22, 16, 3, 40),
                 datetime.datetime(2022, 4, 22, 16, 3, 45),
                 datetime.datetime(2022, 4, 22, 16, 3, 50),
                 datetime.datetime(2022, 4, 22, 16, 3, 55),
                 datetime.datetime(2022, 4, 22, 16, 4),
                 datetime.dateti

In [24]:
def multi_plot(df, col, title):
    fig = go.Figure()
    for device in device_names:
        df_device = df[df['Device'] == device]
        fig.add_trace(
            go.Scatter(
                x=list(df_device['Datetime_round']),
                y=list(df_device[col]),
                name=device,
                line_shape='spline'
            )
        )

    button_all = dict(label='All',
                      method='update',
                      args=[{'visible': True,
                             'title': 'All',
                             'showlegend': True}])

    def create_layout_button(column):
        boolean = []
        for c in device_names:
            if c == column:
                boolean.append(True)
            else:
                boolean.append(False)
        return dict(label=column,
                    method='update',
                    args=[{'visible': boolean,
                           'title': column,
                           'showlegend': True}])

    fig.update_layout(
        updatemenus=[go.layout.Updatemenu(
            active=0,
            buttons=([button_all] * True) + [create_layout_button(device) for device in device_names])

        ],
        yaxis_type="log"
    )
    # Update remaining layout properties
    fig.update_layout(
        title_text=title,
        height=800

    )

    fig.show()


In [25]:
#data_in_time_range[data_in_time_range['Device'] == 'Device7']
multi_plot(data_in_time_range, 'Dp>0.3', 'Dp>0.3')

In [26]:
for c in columns:
    multi_plot(data_in_time_range,  c, c)

In [27]:
data_in_time_range

Unnamed: 0,Datetime_round,Device,Dp>0.3,Dp>0.5,Dp>1.0,Dp>2.5,Dp>5.0,Dp>10.0,PM1_Std,PM2.5_Std,PM10_Std,PM1_Env,PM2.5_Env,PM10_Env
16459,2022-04-22 16:03:05,Device13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16460,2022-04-22 16:03:05,Device7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16461,2022-04-22 16:03:05,Device9,18.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16462,2022-04-22 16:03:10,Device12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16463,2022-04-22 16:03:10,Device13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16974,2022-04-22 16:14:55,Device9,183.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16975,2022-04-22 16:15:00,Device12,288.0,93.0,21.0,9.0,9.0,6.0,0.0,0.0,8.0,0.0,0.0,8.0
16976,2022-04-22 16:15:00,Device13,207.0,69.0,9.0,3.0,3.0,3.0,1.0,1.0,3.0,1.0,1.0,3.0
16977,2022-04-22 16:15:00,Device7,117.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
