<a href="https://colab.research.google.com/github/rhacrsse/AutomIoT/blob/main/data_analysis/notebooks/time_series/01_time_series_analysis_SDNT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analisi trace di rete su dispositivi

## Smart Device Network Traffic (SDNT)

### SMARTBULB EZVIZ 64:F2:FB:DF:FB:E1 -> LABEL 1
### ~~SMARTPLUG EZVIZ 64:F2:FB:48:2C:5B -> LABEL 2~~
### SMARTBULB TAPO  00:5F:67:BF:09:EF -> LABEL 3
### SMARTPLUG TAPO  E8:48:B8:D6:A8:1D -> LABEL 4

## Mount Google Drive environment

In [None]:
# Colab-only helper used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive under /content/drive; the dataset paths in the cells below start from there.
drive.mount('/content/drive')

## Python Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from datetime import datetime as dt, timedelta as td
import time
import plotly.express as px
import socket
import glob
import os

## Python Dictionary Objects that identify a smart home device

In [None]:
#
# LABELS DEFINITION
# Each device is identified by the MAC address that the Linksys router
# associated with it during the tcpdump capture phase.
# Every label carries the same four keys: mac, app, device, name.
#
# Label 1 -> Ezviz Smart Bulb LB1
label1 = dict(
    mac="64:F2:FB:DF:FB:E1",
    app="EZVIZ",
    device="Smart Bulb",
    name="LB1",
)
# Label 2 -> Ezviz Smart Plug T31
# (no capture data available: the plug was connected to a different Wi-Fi
# network, not to the same access point as the other devices)
# label2 = {"mac":"64:F2:FB:48:2C:5B","app":"EZVIZ","device":"Smart Plug","name":"T31"}
# Label 3 -> Tapo Smart Bulb L530E
label3 = dict(
    mac="00:5F:67:BF:09:EF",
    app="Tapo",
    device="Smart Bulb",
    name="L530E",
)
# Label 4 -> Tapo Smart Plug P100
label4 = dict(
    mac="E8:48:B8:D6:A8:1D",
    app="Tapo",
    device="Smart Plug",
    name="P100",
)

## CHOICE
### a. Captured-data statistics for each smart device are computed on a 1 s aggregation window, chosen among the available 0.5 s, 1 s, 5 s and 10 s windows.
### b. The captures aggregated at 0.5 s cannot be used, because capture00500ms_seg0204.csv is empty.
### c. Tapo devices show a temporal shift (cause still unclear), whereas this does not happen for EZVIZ devices. The EVENT is not strictly located where there is a peak of traffic.

In [None]:
# Name of the label dictionary to analyse; later cells resolve it with eval(lblsel).
lblsel = "label3" # label selected

In [None]:
# capdata = captured data
#
# Load every segment file of the capture aggregated with the selected window
# and concatenate the segments into a single DataFrame.
path = "/content/drive/MyDrive/Thesis - Computer Science and Engineering - Master's degree/datasets/linksys/feature_sniffer/"
aggr_win = "01000"  # aggregation window in ms, zero-padded as in the file names
paths = sorted(glob.glob(path + 'capture' + aggr_win + 'ms_seg[0-9]*.csv'))
if not paths:
    # Without this guard pd.concat fails with "No objects to concatenate",
    # which hides the real cause (Drive not mounted or wrong folder/window).
    raise FileNotFoundError(
        f"No capture{aggr_win}ms_seg*.csv files found in {path!r}"
    )
capdata = pd.concat((pd.read_csv(f) for f in paths), ignore_index=True)

# Single-segment alternatives for the other aggregation windows:
#capdata = pd.read_csv(path + "capture00500ms_seg0104.csv")
#capdata = pd.read_csv(path + "capture01000ms_seg0104.csv")
#capdata = pd.read_csv(path + "capture05000ms_seg0104.csv")
#capdata = pd.read_csv(path + "capture10000ms_seg0104.csv")

# Keep only the rows of the device chosen in 'lblsel'. The label dictionary is
# looked up by name in the notebook namespace (no eval needed), and .copy()
# detaches the filtered frame so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
sel_label = globals()[lblsel]
capdata = capdata[capdata["Dev"] == sel_label["mac"]].copy()
# Store the timestamp as a plain integer so it is not shown in scientific
# notation. Any fractional part is truncated, exactly as the previous
# round(np.compat.long(a), 1) did (np.compat.long was just an alias of int).
capdata["TS"] = capdata["TS"].astype("int64")

In [None]:
# TIME SHIFT 1h
# NOTE(review): presumably refers to a one-hour offset between the capture
# clock and the ground-truth clock -- confirm.
#
# Reference time ranges of the two data sources:
#
# -> gtfile.csv start and end timestamp
#    First event time: 2022-09-20 17:31:20.935,1,Tapo,Smart Bulb,Turn ON bulb
#    Last  event time: 2022-09-22 19:30:57.121,3000,EZVIZ,Smart Bulb,Set preset mode sweet
#
# -> capture01000ms.csv = capture01000ms_seg0104.csv
#                         + capture01000ms_seg0204.csv
#                         + capture01000ms_seg0304.csv
#                         + capture01000ms_seg0404.csv
#    First packet time: 2022-09-20 17:19:24.476804
#    Last  packet time: 2022-09-22 20:26:21.938617

# AGGREGATION WINDOWS COMPARISON
# Naive datetimes: .timestamp() interprets them in the runtime's local timezone.
startDate = dt(year=2022, month=9, day=20, hour=17, minute=20, second=0)
endDate = dt(year=2022, month=9, day=20, hour=18, minute=20, second=0)
# Window bounds as epoch seconds, rounded to one decimal.
sdts, edts = (round(float(bound.timestamp()), 1) for bound in (startDate, endDate))

# Alternative one-hour windows (uncomment one pair and recompute sdts/edts as above):
#
# FIRST DAY CAPTURE EXAMPLE
#startDate=dt(2022,9,20,21,30,0)
#endDate=dt(2022,9,20,22,30,0)
#
# SECOND DAY CAPTURE EXAMPLE
#startDate=dt(2022,9,21,11,30,0)
#endDate=dt(2022,9,21,12,30,0)
#
# THIRD DAY CAPTURE EXAMPLE
#startDate=dt(2022,9,22,13,30,0)
#endDate=dt(2022,9,22,14,30,0)

In [None]:
# Tick positions for the time axis: one every 60 seconds, starting at sdts and
# stopping before edts, plus the matching human-readable labels.
# (Despite the "yticks" prefix, these are used for the x-axis in the plot cell.)
yticksts = np.arange(sdts, edts, 60)
yticksdt = [
    dt.fromtimestamp(tick).strftime('%Y-%m-%d %H:%M:%S')
    for tick in yticksts
]

In [None]:
# Rows of the capture that fall strictly inside the selected window (sdts, edts).
pltcapdata = capdata[(capdata["TS"] > sdts) & (capdata["TS"] < edts)]

In [None]:
# Get ground truth labelled events
grtdata = pd.read_csv("/content/drive/MyDrive/Thesis - Computer Science and Engineering - Master's degree/datasets/activity_log/gtfile.csv")
# Keep only the events of the selected app/device. The label dictionary is
# looked up by name in the notebook namespace instead of being eval'ed, and
# .copy() detaches the filtered frame so the TIMESTAMP assignment below does
# not trigger pandas' SettingWithCopyWarning.
sel_label = globals()[lblsel]
grtdata = grtdata.query(
    'APP == "{}" and DEVICE == "{}"'.format(sel_label["app"], sel_label["device"])
).copy()
# Parse the "YYYY-mm-dd HH:MM:SS.fff" strings once into epoch seconds (local
# time). timetuple() carries no sub-second field, so the result is truncated
# to whole seconds.
rests = grtdata["TIMESTAMP"].apply(lambda x: time.mktime(dt.strptime(x, "%Y-%m-%d %H:%M:%S.%f").timetuple()))
# Replace the string timestamps with the numeric ones so they can be compared
# against sdts/edts and plotted on the same axis as the capture data.
grtdata["TIMESTAMP"] = rests
# Select only labels in the selected date range (sdts,edts)
pltgrtdata = grtdata.query("TIMESTAMP > {} and TIMESTAMP < {}".format(sdts,edts))

### PLOT

In [None]:
# One large canvas for the selected one-hour window.
# (Smaller alternative used previously: figsize=(10,3), dpi=200.)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 10))

#
# PLOT NETWORK DATA
#
# One bar per aggregated sample: x = timestamp, height = SumTcpDLpckSz.
traffic = ax.bar(
    x=pltcapdata["TS"],
    height=pltcapdata["SumTcpDLpckSz"],
    width=10,
    align='center',
)

# Axis decorations.
ax.set_ylabel("Packet Size [B]")
ax.set_xlabel("Timestamp [s]")
ax.grid(True)
# NOTE(review): the title says "Mean Packet Size" while the plotted column is
# SumTcpDLpckSz -- confirm which wording is intended.
# Only the second word of the device name (e.g. "Bulb") goes into the title.
ax.set_title(
    "{} {} Mean Packet Size from [{}] to [{}] of [{}]".format(
        eval(lblsel).get("app"),
        eval(lblsel).get("device").split(" ")[1],
        startDate.strftime('%H:%M'),
        endDate.strftime('%H:%M'),
        endDate.strftime('%a %d %b %Y'),
    )
)

# One tick per minute first; thinned out to one every five further below.
ax.set_xticks(yticksts)
ax.set_xticklabels(yticksdt, rotation=90, size=8)

#
# PLOT LABELED DATA
#
# Every ground-truth event is drawn as a red dashed stem of fixed height 100.
testdata = ax.stem(
    pltgrtdata["TIMESTAMP"],
    np.full(len(pltgrtdata["TIMESTAMP"]), 100.0),
    linefmt="r--",
    markerfmt="r.",
    basefmt="r.",
)
ax.legend([traffic, testdata], ['Aggregate Value', 'Event'])

# Keep every fifth tick and relabel it with hours and minutes only.
xtickslocs = ax.get_xticks()[::5]
plt.xticks(
    xtickslocs,
    [dt.fromtimestamp(loc).strftime('%H:%M') for loc in xtickslocs],
)

# To export the figure, call plt.savefig(<path>, bbox_inches = "tight") before
# plt.show(). File names used so far under /home/angioletto/Downloads/:
#   time_series_paper.png
#   capture{00500,01000,05000,10000}ms_seg0104_first1h.png  (aggregation window examples)
#   time_series_aggr_01000ms_day_{01,02,03}.png             (first/second/third day examples)
plt.show()

### SUBPLOTS
We tried to create a column of subplots, each one covering a one-hour window shifted by one hour from the previous one.

In [None]:
lblsel = "label1"
# Resolve the label dictionary by name in the notebook namespace (no eval needed).
sel_label = globals()[lblsel]

# capdata = captured data
path = "/content/drive/MyDrive/Thesis - Computer Science and Engineering - Master's degree/datasets/linksys/feature_sniffer/"
aggr_win = "01000"  # aggregation window in ms, zero-padded as in the file names
paths = sorted(glob.glob(path + 'capture' + aggr_win + 'ms_seg[0-9]*.csv'))
if not paths:
    # Without this guard pd.concat fails with "No objects to concatenate",
    # which hides the real cause (Drive not mounted or wrong folder/window).
    raise FileNotFoundError(
        f"No capture{aggr_win}ms_seg*.csv files found in {path!r}"
    )
capdata = pd.concat((pd.read_csv(f) for f in paths), ignore_index=True)
# .copy() detaches the filtered frame so the TS assignment below does not
# trigger pandas' SettingWithCopyWarning.
capdata = capdata[capdata["Dev"] == sel_label["mac"]].copy()
# Integer timestamps (fraction truncated), same result as the former
# round(np.compat.long(a), 1) without relying on the deprecated numpy.compat.
capdata["TS"] = capdata["TS"].astype("int64")

# The ground truth does not depend on the plotted window, so it is read,
# filtered and parsed once here instead of once per subplot.
grtdata = pd.read_csv("/content/drive/MyDrive/Thesis - Computer Science and Engineering - Master's degree/datasets/activity_log/gtfile.csv")
grtdata = grtdata.query(
    'APP == "{}" and DEVICE == "{}"'.format(sel_label["app"], sel_label["device"])
).copy()
# Event times as epoch seconds (local time, truncated to whole seconds).
rests = grtdata["TIMESTAMP"].apply(lambda x: time.mktime(dt.strptime(x, "%Y-%m-%d %H:%M:%S.%f").timetuple()))
grtdata["TIMESTAMP"] = rests

# Ten stacked subplots, one per consecutive one-hour window.
fig_list, ax_list = plt.subplots(nrows=10,ncols=1,figsize=(20,100))
for idx, ax in enumerate(ax_list):
    # Window idx starts idx hours after the base start time.
    # Alternative base window: dt(2022,9,20,17,19,0) -> dt(2022,9,20,18,19,0)
    startDate = dt(2022,9,22,1,42,0) + td(hours=idx)
    sdts = round(startDate.timestamp(), 1)
    endDate = dt(2022,9,22,2,42,0) + td(hours=idx)
    edts = round(endDate.timestamp(), 1)

    # One tick per minute, labelled with the full date and time.
    yticksts = np.arange(sdts, edts, 60)
    yticksdt = [dt.fromtimestamp(tick).strftime('%Y-%m-%d %H:%M:%S') for tick in yticksts]

    # Capture rows and ground-truth events strictly inside the window.
    pltcapdata = capdata.query("TS > {} and TS < {}".format(sdts,edts))
    pltgrtdata = grtdata.query("TIMESTAMP > {} and TIMESTAMP < {}".format(sdts,edts))

    traffic = ax.bar(pltcapdata["TS"],pltcapdata["SumTcpDLpckSz"],align='center',width=10)
    ax.grid(True)
    ax.set_title("Network Traffic {} {} {} from [{}] to [{}]".format(
        sel_label["app"],
        sel_label["device"],
        sel_label["name"],
        startDate.strftime('%H:%M:%S %a %d %b %Y'),
        endDate.strftime('%H:%M:%S %a %d %b %Y')))
    ax.set_xlabel("Timestamp [s]")
    ax.set_ylabel("Packet Size [B]")
    ax.set_xticks(yticksts)
    ax.set_xticklabels(yticksdt,rotation=90,size=8)
    # Every ground-truth event is drawn as a red dashed stem of fixed height 100.
    testdata = ax.stem(pltgrtdata["TIMESTAMP"], np.full(len(pltgrtdata["TIMESTAMP"]), 100.0),linefmt="r--",markerfmt="r.",basefmt="r.")
    ax.legend([traffic, testdata],['Traffic Captured','Test Data'])

plt.show()