In [1]:
# Auto-reload imported modules before each cell executes, so edits to the
# local helper modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from wifi import get_wifi_df
from labels import get_labels_df, get_cls2label

In [3]:
# Parse the SHL-2021 training WiFi log with the project-local `get_wifi_df`
# helper and preview the first rows.
# NOTE(review): the preview suggests one row per access point seen in a scan
# (rows share `ts`/`n_spots` and differ by `bssid`) — confirm in wifi.py.
df = get_wifi_df('data/SHL-2021-Train/WiFi.txt')
df.head()

0it [00:00, ?it/s]

Unnamed: 0,ts,n_spots,bssid,ssid,rssi,freq,capabilities
0,2017-03-25 08:35:23.358,10,c0:05:c2:29:40:ff,VM8236218,-56.0,5220.0,[WPA2-PSK-CCMP+TKIP][WPS][ESS]
1,2017-03-25 08:35:23.358,10,c4:04:15:e4:5b:30,VM201213-2G,-70.0,2437.0,[WPA-PSK-CCMP+TKIP][WPA2-PSK-CCMP+TKIP][WPS][ESS]
2,2017-03-25 08:35:23.358,10,a0:63:91:a2:5a:50,VM702835-2G_EXT,-85.0,2462.0,[WPA-PSK-CCMP+TKIP][WPA2-PSK-CCMP+TKIP][WPS][ESS]
3,2017-03-25 08:35:23.358,10,c0:05:c2:29:40:f9,VM8236218,-55.0,2412.0,[WPA-PSK-CCMP+TKIP][WPA2-PSK-CCMP+TKIP][WPS][ESS]
4,2017-03-25 08:35:23.358,10,98:e7:f5:b9:3f:14,TALKTALKB93F0E,-65.0,2432.0,[WPA-PSK-CCMP+TKIP][WPA2-PSK-CCMP+TKIP][WPS][ESS]


In [4]:
df.shape

(13012752, 7)

In [5]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13012752 entries, 0 to 13012751
Data columns (total 7 columns):
 #   Column        Dtype         
---  ------        -----         
 0   ts            datetime64[ns]
 1   n_spots       uint16        
 2   bssid         object        
 3   ssid          object        
 4   rssi          float64       
 5   freq          float64       
 6   capabilities  object        
dtypes: datetime64[ns](1), float64(2), object(3), uint16(1)
memory usage: 3.0 GB


In [6]:
# Lookup returned by the project-local `get_cls2label` helper
# (presumably class id -> label name; confirm in labels.py).
cls2label = get_cls2label()

# Load the training labels (columns `ts` and `label`) and preview them.
label_df = get_labels_df('data/SHL-2021-Train/Label.txt')
label_df.head()

Unnamed: 0,ts,label
0,2017-03-25 08:46:23,4
1,2017-03-25 08:46:24,4
2,2017-03-25 08:46:25,4
3,2017-03-25 08:46:26,4
4,2017-03-25 08:46:27,4


In [7]:
import plotly.express as px
import plotly.graph_objects as go

ModuleNotFoundError: No module named 'plotly'

In [None]:
# Line plot of the label over time, using every 20th row of `label_df`
# (presumably to keep the figure light).
# Requires `plotly`; the import cell above must succeed before this can run.
fig = px.line(label_df.iloc[::20, :], x='ts', y='label')
fig.show()

In [8]:
label_df.head()

Unnamed: 0,ts,label
0,2017-03-25 08:46:23,4
1,2017-03-25 08:46:24,4
2,2017-03-25 08:46:25,4
3,2017-03-25 08:46:26,4
4,2017-03-25 08:46:27,4


In [9]:
from labels import merge_data_labels

In [10]:
# Combine the WiFi rows with the labels via the project-local helper.
# NOTE(review): WiFi timestamps carry milliseconds while label timestamps are
# whole seconds in the previews above — confirm how `merge_data_labels`
# aligns them (exact vs. nearest match).
merged_df = merge_data_labels(df, label_df)

In [11]:
def freq2band(freq):
    """Classify a frequency into the band whose reference value is nearest.

    Compares the distance of `freq` to 5220 and to 2420 and returns the tag
    '5khz' when 5220 is strictly closer, otherwise '2khz' (ties and NaN fall
    through to '2khz').

    NOTE(review): the tags say "khz", but the reference values look like
    Wi-Fi channel frequencies in MHz (2.4 GHz / 5 GHz bands). The strings are
    kept unchanged because other code matches on them.
    """
    gap_to_high = abs(freq - 5220)
    gap_to_low = abs(freq - 2420)
    if gap_to_high < gap_to_low:
        return '5khz'
    return '2khz'

## Feature Extraction

In [12]:
def ts_group_apply(group):
    """Build one feature row from all WiFi readings that share a timestamp.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one `ts` group. Must provide the columns `ts`, `n_spots`,
        `rssi` and `freq`; a `label` column is optional.

    Returns
    -------
    pandas.Series
        Features keyed by name: `hour`, `minute`, `day_of_week`, `n_spots`,
        optionally `label`, and `rssi_{band}_{stat}` for each band in
        ('cmb', '2khz', '5khz') and each stat in the module-level
        `aggregations` list.

        A Series is returned rather than a plain dict so that
        `groupby(...).apply` assembles the per-group results into a DataFrame
        with one column per feature. A dict would be stored as one opaque
        object per group, leaving a later `fillna` with nothing to fill.
        Key lookup, `.items()` and `in` work as they did on the dict.

    Notes
    -----
    Reads the module-level names `aggregations` and `freq2band` at call time,
    so both must be defined before this function is applied.
    """
    row_features = {}

    # ts features (taken from the first row of the group)
    ts = group.ts.iloc[0]
    row_features['hour'] = ts.hour
    row_features['minute'] = ts.minute
    row_features['day_of_week'] = ts.day_of_week

    # number of spots and optional label (also taken from the first row)
    row_features['n_spots'] = group.n_spots.iloc[0]
    if 'label' in group.columns:
        row_features['label'] = group.label.iloc[0]

    # RSSI statistics over all readings ('cmb') as well as grouped by frequency band
    freq_band = group.freq.apply(freq2band)

    for band_selector in ['cmb', '2khz', '5khz']:
        if band_selector == 'cmb':
            band_group = group.rssi
        else:
            band_group = group.rssi[freq_band == band_selector]

        agg_group = band_group.agg(aggregations)
        for stat_name, stat_value in agg_group.items():
            row_features[f'rssi_{band_selector}_{stat_name}'] = stat_value

    return pd.Series(row_features)

In [16]:
from pandarallel import pandarallel

# Set up pandarallel so pandas objects gain `parallel_apply`; the progress
# bar is switched off.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [None]:
# NOTE(review): `features` is not referenced again in the visible cells.
features = []

# RSSI statistics computed per band. `ts_group_apply` reads this list as a
# module-level global, so it must be defined before the apply below runs.
aggregations = ['count', 'min', 'mean', 'std', 'max']

# One group per distinct `ts` value.
group_iter = merged_df.groupby('ts')

# Run the per-timestamp feature extraction through pandarallel.
res_df = group_iter.parallel_apply(ts_group_apply)

## Preprocessing

In [None]:
# Replace missing values in the extracted features with 0.
# NOTE(review): NaNs presumably come from `std` over a single reading and from
# bands with no readings — confirm 0 is an acceptable stand-in for RSSI stats.
prep_df = res_df.fillna(0)

In [None]:
# Preview only the first rows instead of rendering the whole feature table.
prep_df.head()