In [1]:
import numpy as np 
import pandas as pd
from scipy.stats import skew
from scipy.signal import find_peaks
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, roc_auc_score
from itertools import chain
from collections import Counter
import itertools
import pickle
import os

from scipy import signal
import matplotlib.pyplot as plt

##**Functions**

In [2]:
def extract_list_feats(list_name: str, data, features_name: list, base=None):
    """
    Add seven summary statistics of a vector-valued column to ``data``.

    Every statistic is always written to ``data`` as ``<stat>_<list_name>``; the
    ``base`` flags only control which of those column names are appended to
    ``features_name``.

    :param list_name: Name of the column in ``data`` whose cells hold the vectors.
    :param data: DataFrame that is read from and extended in place.
    :param features_name: List of selected feature names; extended in place.
    :param base: Seven booleans, in the order max, min, mean, median, std, skew,
        max_sub_min. ``None`` falls back to ``DEFAULT_TRUE_LIST``.
    :return: ``(data, features_name)``.
    """
    flags = DEFAULT_TRUE_LIST if base is None else base

    # (column prefix, reducer) pairs; the position of each pair matches its flag index.
    statistics = (
        ('max', np.max),
        ('min', np.min),
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('skew', skew),
        ('max_sub_min', lambda x: np.max(x) - np.min(x)),
    )

    for position, (prefix, reducer) in enumerate(statistics):
        column = f'{prefix}_{list_name}'
        data[column] = data[list_name].apply(reducer)
        if flags[position]:
            features_name += [column]

    return data, features_name

In [3]:
def extract_features(data, bases=None):
    """
    Build the per-window feature columns from the ``RSSI`` vectors.

    Three derived vectors are added first (consecutive differences, their
    absolute values, and the absolute distance from the window median); summary
    statistics of ``RSSI`` and of each derived vector follow, then a handful of
    count/peak features. ``data`` is modified in place.

    :param data: DataFrame with one time window per row; the ``RSSI`` column
        holds a numpy array per row.
    :param bases: Dict keyed by 'RSSI', 'RSSI_diffs', 'RSSI_diffs_abs' and
        'RSSI_median_dist', each mapping to seven booleans that select which
        statistics are reported. ``None`` falls back to ``DEFAULT_TRUE_DICT``.
    :return: ``(data, features_name)`` - the extended frame and the ordered list
        of selected feature column names.
    """
    selection = DEFAULT_TRUE_DICT if bases is None else bases

    def count_peaks(values):
        # find_peaks returns (indices, properties); only the number of peaks is used.
        return len(find_peaks(values)[0])

    def peak_ratio(row):
        # Guard against division by zero when the raw signal has no peaks.
        if row['RSSI_peaks'] > 0:
            return row['RSSI_diffs_peaks'] / row['RSSI_peaks']
        return 0

    # Derived vectors.
    data['RSSI_diffs'] = data.RSSI.apply(lambda rssi: rssi[1:] - rssi[:-1])
    data['RSSI_diffs_abs'] = data.RSSI.apply(lambda rssi: abs(rssi[1:] - rssi[:-1]))
    data['RSSI_median_dist'] = data.RSSI.apply(lambda rssi: abs(rssi - np.median(rssi)))

    # Summary statistics of the raw vector and of each derived vector.
    features_name = []
    for vector in ('RSSI', 'RSSI_diffs', 'RSSI_diffs_abs', 'RSSI_median_dist'):
        data, features_name = extract_list_feats(vector, data, features_name, base=selection[vector])

    # Count and peak based features; these are always selected.
    data['max_count_same_value_RSSI'] = data.RSSI.apply(
        lambda rssi: np.max(np.unique(rssi, return_counts=True)[1]))
    data['RSSI_peaks'] = data.RSSI.apply(count_peaks)
    data['RSSI_diffs_peaks'] = data.RSSI_diffs.apply(count_peaks)
    data['peak_ratio_diffs_RSSI'] = data.apply(peak_ratio, axis=1)
    data['RSSI_values_count'] = data.RSSI.apply(lambda rssi: len(np.unique(rssi)))

    features_name += [
        'max_count_same_value_RSSI',
        'RSSI_peaks',
        'RSSI_diffs_peaks',
        'peak_ratio_diffs_RSSI',
        'RSSI_values_count',
    ]

    return data, features_name

In [4]:
def window(full_signal: np.ndarray, size: int = 360, stride: int = 360):
    """
    Cut a long signal into fixed-length time windows.

    :param full_signal: the signal to split into windows
    :param size: number of samples in each window
    :param stride: step between the starts of consecutive windows; when
        stride >= size the windows do not overlap
    :return: 2-D view with one window per row
    """
    # All windows with step 1 first, then keep every `stride`-th one starting at index 0.
    every_window = np.lib.stride_tricks.sliding_window_view(full_signal, size)
    return every_window[slice(0, None, stride)]

In [5]:
def make_data(X, y, window_size: int = 360, stride: int = 360):
    """
    Turn per-sample rows into per-window rows, grouped by ``Device_ID``.

    Every column is windowed per device. Columns that hold a single value within
    every device are collapsed back to a scalar per window; all other columns
    keep one array per window. The Welch frequencies and power spectral density
    of each ``RSSI`` window are added as ``welch_f`` / ``welch_psd``. Windows in
    which ``Num_People`` takes more than one value are dropped.

    The caller's ``X`` is not modified.

    :param X: the data; must contain ``Device_ID``, ``RSSI`` and ``Room_Num``.
    :param y: the labels (number of people), aligned with ``X`` by index.
    :param window_size: size of each time window
    :param stride: step between the starts of consecutive windows; when
        stride >= window_size the windows do not overlap.
    :return: tuple ``(windows, num_people, room_num)`` - the windowed DataFrame
        without the ``Num_People`` and ``Room_Num`` columns, the per-window
        label Series, and the per-window ``Room_Num`` Series. All three keep the
        index of the unfiltered window table, so it may contain gaps.
    """
    # Attach the labels on a new frame instead of writing into the caller's DataFrame.
    X = X.assign(Num_People=y)

    # True for columns that have exactly one distinct value inside every device.
    is_single = X.groupby(['Device_ID']).apply(lambda x: x.nunique() == 1).all()
    single_vals = list(is_single[is_single].index)
    multi_vals = list(is_single[~is_single].index)

    windows_df = X.groupby(['Device_ID']).RSSI.apply(
        lambda x: window(x.values, window_size, stride)).explode().to_frame().reset_index()
    for col in (multi_vals + single_vals):
        windows_df[col] = X.groupby(['Device_ID'])[col].apply(
            lambda x: window(x.values, window_size, stride)).explode().reset_index(drop=True).values
    # A single-valued column is constant inside a window, so its first element represents it.
    for col in single_vals:
        windows_df[col] = windows_df[col].apply(lambda x: x[0])

    # signal.welch returns (frequencies, psd); compute it once per window and split the pair.
    welch_pairs = windows_df['RSSI'].apply(lambda x: signal.welch(x))
    windows_df['welch_f'] = welch_pairs.str[0]
    windows_df['welch_psd'] = welch_pairs.str[1]

    # Drop windows that straddle a change in the number of people.
    label_changes = windows_df.Num_People.apply(lambda x: (len(np.unique(x)) > 1))
    df = windows_df[~label_changes].copy()
    df.Num_People = df.Num_People.apply(lambda x: x[0])
    return df.drop(columns=['Num_People', 'Room_Num']), df.Num_People, df.Room_Num

In [6]:
def pre_data(data, RSSI_value_selection, window_size, stride):
    """
    Full preprocessing of the raw data: choose the RSSI signal, drop unlabeled
    rows, drop devices with fewer rows than one window, build the windows and
    extract the features.

    The caller's ``data`` is not modified; all work happens on a copy.

    :param data: the raw data.
    :param RSSI_value_selection: Which signal values to use: "RSSI_Left",
        "RSSI_Right", "Min" or "Max" of the two; any other value selects the
        mean of the two, rounded up and cast to int.
    :param window_size: size of each time window
    :param stride: step between the starts of consecutive windows; when
        stride >= window_size the windows do not overlap.
    :return: tuple ``(train_x, train_y, raw_x, room_data)`` - the feature table
        and the labels (both re-indexed from 0, ``Device_ID`` removed from the
        features), the windowed raw data and the per-window ``Room_Num`` as
        returned by ``create_features`` (index not reset).
    """
    # Copy first: adding the RSSI column and dropping rows must not leak into the caller's frame.
    data = data.copy()

    if RSSI_value_selection=="RSSI_Left":
        data["RSSI"] = data.RSSI_Left
    elif RSSI_value_selection=="RSSI_Right":
        data["RSSI"] = data.RSSI_Right
    elif RSSI_value_selection=="Min":
        data["RSSI"] = data[['RSSI_Left','RSSI_Right']].min(axis=1).values
    elif RSSI_value_selection=="Max":
        data["RSSI"] = data[['RSSI_Left','RSSI_Right']].max(axis=1).values
    else:
        data["RSSI"] = np.ceil(data[['RSSI_Left','RSSI_Right']].mean(axis=1).values).astype('int')

    data = data.dropna(subset=["Num_People"])

    # Remove devices that cannot fill even one window; one value_counts pass
    # replaces a full scan of the frame per device.
    rows_per_device = data["Device_ID"].value_counts()
    too_short = rows_per_device.index[rows_per_device < window_size]
    data = data[~data["Device_ID"].isin(too_short)]

    train_x, train_y, raw_x, room_data = create_features(data, window_size, stride)
    train_x = train_x.reset_index(drop=True).drop(columns='Device_ID')
    train_y = train_y.reset_index(drop=True)
    return train_x, train_y, raw_x, room_data

In [7]:
def create_features(data, window_size, stride):
    """
    Window the data and compute the per-window features.

    :param data: the data, including the ``Num_People`` label column
    :param window_size: size of each time window
    :param stride: step between the starts of consecutive windows; when
        stride >= window_size the windows do not overlap.
    :return: tuple ``(features, y, windows, room_data)`` - the selected feature
        columns plus ``Device_ID``, the per-window labels, the windowed data
        the features were computed from, and the per-window ``Room_Num``.
    """
    signals = data.drop(columns='Num_People')
    labels = data['Num_People']

    windows, y, room_data = make_data(signals, labels, window_size=window_size, stride=stride)

    # extract_features adds columns in place, so hand it a copy and keep `windows` clean.
    featured, selected = extract_features(windows.copy())
    selected.append('Device_ID')

    return featured[selected], y, windows, room_data

#**Model**

##**Data preparation**


Download training data

In [8]:
# !gdown -O ../data/mafat_wifi_challenge_training_set_v1.csv 'https://drive.google.com/uc?id=121CbFZbU6kAWNjmjZF232DsiGF2-BoYy'

Read the training data into a DataFrame

In [59]:
data = pd.read_csv('../data/mafat_wifi_challenge_training_set_v1.csv')

window_size - defines the number of timestamps in each window

window_stride - defines the shift between windows, i.e., for window_size = 360 and window_stride = 1: timestamps 0 - 359 will be selected for the first window and timestamps 1-360 will be selected for the second window. And so on for the rest of the windows for each device.

In [10]:
window_size = 360 #@param {type:"integer"}
window_stride = 360 #@param {type:"integer"}

Select the signal values used for feature engineering (the "extract_features" function): RSSI_Left / RSSI_Right / the minimum of the two signals / the maximum of the two signals / the average of the two signals

In [11]:
RSSI_value_selection = "Average" #@param ["RSSI_Left","RSSI_Right","Min","Max","Average"]

In [12]:
"""
Lists of features to extract from each vector
"""

# Flag order used by extract_list_feats:
# [max, min, mean, median, std, skew, max_sub_min]

# Fallback when extract_list_feats is called with base=None: every statistic is selected.
DEFAULT_TRUE_LIST = [True] * 7
# Fallback when extract_features is called with bases=None.
# Despite the name, this is a per-vector selection and not all flags are True.
DEFAULT_TRUE_DICT = {
    'RSSI': [True, False, False, False, True, True, True],
    'RSSI_diffs': [True, True, True, False, True, True, True],
    'RSSI_diffs_abs': [False, False, True, True, True, False, True],
    'RSSI_median_dist': [True, False, True, True, True, False, True]
}

Preprocess training data

In [109]:
data_train_x, data_train_y, raw_x, room_data = pre_data(data, RSSI_value_selection, window_size, window_stride)

In [40]:
raw_x

Unnamed: 0,Device_ID,RSSI,Time,RSSI_Left,RSSI_Right,welch_f,welch_psd
1,287,"[-50, -50, -50, -50, -50, -50, -50, -50, -50, ...","[180.0, 180.5, 181.0, 181.5, 182.0, 182.5, 183...","[-52, -52, -52, -52, -52, -52, -52, -52, -52, ...","[-49, -48, -48, -48, -49, -48, -48, -48, -48, ...","[0.0, 0.00390625, 0.0078125, 0.01171875, 0.015...","[28.225202688409954, 65.03615991355555, 37.556..."
2,287,"[-50, -50, -50, -50, -50, -50, -50, -50, -50, ...","[360.0, 360.5, 361.0, 361.5, 362.0, 362.5, 363...","[-52, -52, -52, -52, -52, -52, -52, -52, -52, ...","[-48, -48, -49, -49, -48, -49, -49, -49, -49, ...","[0.0, 0.00390625, 0.0078125, 0.01171875, 0.015...","[17.281185598550888, 95.69667924742882, 131.87..."
3,287,"[-49, -50, -50, -50, -49, -50, -51, -50, -50, ...","[540.0, 540.5, 541.0, 541.5, 542.0, 542.5, 543...","[-51, -52, -52, -52, -51, -52, -53, -51, -52, ...","[-48, -48, -48, -49, -48, -48, -49, -49, -48, ...","[0.0, 0.00390625, 0.0078125, 0.01171875, 0.015...","[0.6897504444054456, 45.74248836468422, 205.04..."
4,287,"[-54, -54, -54, -54, -50, -50, -50, -50, -50, ...","[720.0, 720.5, 721.0, 721.5, 722.0, 722.5, 723...","[-48, -48, -48, -48, -51, -51, -51, -51, -51, ...","[-60, -60, -60, -60, -49, -49, -49, -49, -49, ...","[0.0, 0.00390625, 0.0078125, 0.01171875, 0.015...","[21.491571523914505, 57.43307085597603, 61.158..."
5,287,"[-50, -50, -50, -50, -50, -50, -50, -50, -50, ...","[900.0, 900.5, 901.0, 901.5, 902.0, 902.5, 903...","[-51, -51, -51, -51, -51, -51, -51, -51, -51, ...","[-49, -49, -49, -49, -49, -49, -49, -49, -49, ...","[0.0, 0.00390625, 0.0078125, 0.01171875, 0.015...","[61.126760753930284, 44.32071824607235, 259.04..."
...,...,...,...,...,...,...,...
7592,96300,"[-45, -45, -45, -45, -45, -45, -45, -45, -45, ...","[14220.0, 14220.5, 14221.0, 14221.5, 14222.0, ...","[-44, -44, -44, -44, -44, -44, -44, -44, -44, ...","[-46, -46, -46, -46, -46, -46, -46, -46, -46, ...","[0.0, 0.00390625, 0.0078125, 0.01171875, 0.015...","[3.780039203041675, 5.786948882031685, 0.59172..."
7594,96300,"[-46, -46, -46, -46, -46, -46, -46, -46, -46, ...","[14580.0, 14580.5, 14581.0, 14581.5, 14582.0, ...","[-46, -46, -46, -46, -46, -46, -46, -46, -46, ...","[-46, -46, -46, -46, -46, -46, -46, -46, -46, ...","[0.0, 0.00390625, 0.0078125, 0.01171875, 0.015...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7595,96300,"[-46, -46, -46, -46, -46, -46, -46, -46, -46, ...","[14760.0, 14760.5, 14761.0, 14761.5, 14762.0, ...","[-46, -46, -46, -46, -46, -46, -46, -46, -46, ...","[-46, -46, -46, -46, -46, -46, -46, -46, -46, ...","[0.0, 0.00390625, 0.0078125, 0.01171875, 0.015...","[0.003215190456431821, 0.01446728681209163, 0...."
7596,96300,"[-46, -46, -46, -46, -46, -46, -46, -46, -46, ...","[14940.0, 14940.5, 14941.0, 14941.5, 14942.0, ...","[-46, -46, -46, -46, -46, -46, -46, -46, -46, ...","[-46, -46, -46, -46, -46, -46, -46, -46, -46, ...","[0.0, 0.00390625, 0.0078125, 0.01171875, 0.015...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
tt = raw_x['RSSI'].iloc[500]

In [None]:
from scipy import signal
import matplotlib.pyplot as plt

In [None]:
f, Pxx_den = signal.welch(tt)

In [None]:
plt.semilogy(f, Pxx_den)
plt.xlabel('frequency [Hz]')
plt.ylabel('PSD [V**2/Hz]')
plt.show()

In [None]:
plt.semilogy(f, Pxx_den)
plt.xlabel('frequency [Hz]')
plt.ylabel('PSD [V**2/Hz]')
plt.show()

In [None]:
tt = raw_x['RSSI'].iloc[4]
f, Pxx_den = signal.welch(tt)
plt.semilogy(f, Pxx_den)
plt.xlabel('frequency [Hz]')
plt.ylabel('PSD [V**2/Hz]')
plt.show()

In [110]:
def get_embedded_data(X, method='pca', n_components=2, random_state=None):
    """
    Project ``X`` to a low-dimensional embedding.

    :param X: feature matrix accepted by the chosen estimator's ``fit_transform``.
    :param method: 'pca' or 'tsne'.
    :param n_components: dimension of the embedding.
    :param random_state: optional seed forwarded to the estimator; the default
        ``None`` keeps the estimators' unseeded behaviour.
    :return: the embedded data returned by ``fit_transform``.
    :raises ValueError: if ``method`` is not one of the supported names.
    """
    if method == 'pca':
        from sklearn.decomposition import PCA
        return PCA(n_components, random_state=random_state).fit_transform(X)
    if method == 'tsne':
        from sklearn.manifold import TSNE
        return TSNE(n_components, learning_rate='auto', init='random',
                    random_state=random_state).fit_transform(X)
    # Raise instead of print + exit(): exit() would terminate the kernel/process
    # and cannot be handled by the caller.
    raise ValueError(f"embedding method {method!r} not implemented; expected 'pca' or 'tsne'")

In [111]:
from sklearn.cluster import MiniBatchKMeans
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()



In [112]:
# Cluster the windows and embed them in 3-D, both on the feature table returned by pre_data.
# NOTE(review): data_train_x is still unscaled here (MinMaxScaler is only applied to it in a
# later cell), so features with larger ranges weigh more in both steps - confirm this is intended.
# NOTE(review): no random_state is passed for the t-SNE embedding, so M differs between runs.
kmeans = MiniBatchKMeans(n_clusters=16, random_state=42, batch_size=100, max_iter=100).fit(data_train_x)
M = get_embedded_data(data_train_x, method='tsne', n_components=3)


##**Train RandomForestClassifier model**

###**Track 1**

####**Convert classes to 0/1**

In [113]:
# Convert classes to 0/1 to evaluate the model's score for predicting room occupancy
# in Track 1 you are required to predict probability for room occupancy (in the range of 0-1).
# however, the data is used for both tracks, and it contains the raw number of people
# in the room, here we convert the raw data to 0 or 1.

data_train_track1 = data_train_y.copy()
data_train_track1.loc[data_train_y>0] = 1

In [114]:
data_train_x['y_track1'] = data_train_track1

In [115]:
data_train_x['y_track2'] = data_train_y

In [None]:
data_train_x.to_csv("processed_rows.csv", index=False)

In [None]:
import csv
a=raw_x[:]['RSSI']#.to_csv("raw_rows.csv", index=False)
b=pd.DataFrame(a.to_list())


In [None]:
df = pd.concat([b, data_train_x['y_track1'], data_train_x['y_track2']], axis=1)

In [None]:
df.to_csv("raw_rows.csv", index=False)

In [116]:
import csv
a=raw_x[:]['welch_psd']#.to_csv("raw_rows.csv", index=False)
b=pd.DataFrame(a.to_list())

In [117]:
c=pd.DataFrame(M, columns=['d1','d2','d3'])

In [118]:
room_data_ = room_data.to_frame().reset_index().Room_Num

In [123]:
df = pd.concat([b, c, pd.DataFrame(kmeans.labels_, columns=['kmeans'])], axis=1)

In [126]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,123,124,125,126,127,128,d1,d2,d3,kmeans
0,0.009559,3.069338e-03,4.652196e-03,3.289716e-02,6.200779e-03,1.857071e-03,0.023095,0.097289,0.067937,0.016320,...,0.000935,0.023491,0.016554,0.007626,0.003440,0.001639,0.600394,0.461405,0.102576,0.866667
1,0.005853,4.516341e-03,1.633566e-02,3.457599e-02,2.140257e-02,2.904353e-03,0.019708,0.037982,0.035138,0.048638,...,0.015916,0.004976,0.001741,0.002369,0.000996,0.000107,0.650114,0.499685,0.170268,0.666667
2,0.000234,2.158786e-03,2.539861e-02,7.802720e-03,5.154414e-02,2.716234e-02,0.021093,0.073614,0.082010,0.027854,...,0.044931,0.013133,0.011714,0.015237,0.001105,0.001351,0.590255,0.349214,0.162308,0.866667
3,0.007279,2.710516e-03,7.575770e-03,5.604375e-03,1.564395e-02,1.223604e-02,0.003596,0.023651,0.025596,0.008300,...,0.000640,0.002747,0.004715,0.000597,0.001955,0.002580,0.610990,0.266492,0.469885,0.400000
4,0.020702,2.091687e-03,3.208837e-02,1.559382e-01,7.942705e-02,4.289367e-02,0.040400,0.008548,0.001527,0.000610,...,0.173267,0.104971,0.201805,0.112277,0.052425,0.037544,0.481523,0.506225,0.355640,0.466667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6320,0.001280,2.731112e-04,7.329797e-05,7.508726e-05,6.377857e-05,4.642460e-05,0.000044,0.000049,0.000045,0.000051,...,0.000185,0.000155,0.000185,0.000095,0.000046,0.000037,0.875871,0.382005,0.651607,0.133333
6321,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.576346,0.013042,0.357079,0.066667
6322,0.000001,6.827740e-07,5.097234e-07,6.860506e-07,8.438643e-07,9.497556e-07,0.000001,0.000002,0.000002,0.000003,...,0.000044,0.000037,0.000044,0.000022,0.000011,0.000009,0.157521,0.654146,0.607453,0.066667
6323,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.562219,0.017908,0.363979,0.066667


In [125]:
df = pd.DataFrame(min_max_scaler.fit_transform(df), columns=df.columns)



In [127]:
df = pd.concat([df, room_data_, data_train_x['y_track1'], data_train_x['y_track2']], axis=1)

In [128]:
df#[df[1]==0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,126,127,128,d1,d2,d3,kmeans,Room_Num,y_track1,y_track2
0,0.009559,3.069338e-03,4.652196e-03,3.289716e-02,6.200779e-03,1.857071e-03,0.023095,0.097289,0.067937,0.016320,...,0.007626,0.003440,0.001639,0.600394,0.461405,0.102576,0.866667,6,0.0,0.0
1,0.005853,4.516341e-03,1.633566e-02,3.457599e-02,2.140257e-02,2.904353e-03,0.019708,0.037982,0.035138,0.048638,...,0.002369,0.000996,0.000107,0.650114,0.499685,0.170268,0.666667,6,0.0,0.0
2,0.000234,2.158786e-03,2.539861e-02,7.802720e-03,5.154414e-02,2.716234e-02,0.021093,0.073614,0.082010,0.027854,...,0.015237,0.001105,0.001351,0.590255,0.349214,0.162308,0.866667,6,0.0,0.0
3,0.007279,2.710516e-03,7.575770e-03,5.604375e-03,1.564395e-02,1.223604e-02,0.003596,0.023651,0.025596,0.008300,...,0.000597,0.001955,0.002580,0.610990,0.266492,0.469885,0.400000,6,0.0,0.0
4,0.020702,2.091687e-03,3.208837e-02,1.559382e-01,7.942705e-02,4.289367e-02,0.040400,0.008548,0.001527,0.000610,...,0.112277,0.052425,0.037544,0.481523,0.506225,0.355640,0.466667,6,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6320,0.001280,2.731112e-04,7.329797e-05,7.508726e-05,6.377857e-05,4.642460e-05,0.000044,0.000049,0.000045,0.000051,...,0.000095,0.000046,0.000037,0.875871,0.382005,0.651607,0.133333,3,0.0,0.0
6321,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.576346,0.013042,0.357079,0.066667,3,1.0,1.0
6322,0.000001,6.827740e-07,5.097234e-07,6.860506e-07,8.438643e-07,9.497556e-07,0.000001,0.000002,0.000002,0.000003,...,0.000022,0.000011,0.000009,0.157521,0.654146,0.607453,0.066667,3,1.0,1.0
6323,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.562219,0.017908,0.363979,0.066667,3,1.0,1.0


In [129]:
df.to_csv("psd_rows.csv", index=False)

In [130]:
data_train_x.drop(['y_track1', 'y_track2'], axis=1, inplace=True)
data_train_x = pd.DataFrame(min_max_scaler.fit_transform(data_train_x), columns=data_train_x.columns)

In [131]:
# df.drop(['y_track1', 'y_track2'], axis=1, inplace=True)
data_train_x

Unnamed: 0,max_RSSI,std_RSSI,skew_RSSI,max_sub_min_RSSI,max_RSSI_diffs,min_RSSI_diffs,mean_RSSI_diffs,std_RSSI_diffs,skew_RSSI_diffs,max_sub_min_RSSI_diffs,...,max_RSSI_median_dist,mean_RSSI_median_dist,median_RSSI_median_dist,std_RSSI_median_dist,max_sub_min_RSSI_median_dist,max_count_same_value_RSSI,RSSI_peaks,RSSI_diffs_peaks,peak_ratio_diffs_RSSI,RSSI_values_count
0,0.186275,0.171801,0.502745,0.098765,0.076923,0.9125,0.461538,0.232116,0.493128,0.082278,...,0.049383,0.228554,0.333333,0.094066,0.049383,0.318612,0.321429,0.409524,0.265432,0.421053
1,0.196078,0.180666,0.501891,0.111111,0.102564,0.9000,0.557692,0.242070,0.495164,0.101266,...,0.061728,0.234189,0.333333,0.164291,0.061728,0.318612,0.261905,0.323810,0.257576,0.473684
2,0.196078,0.181080,0.514729,0.111111,0.102564,0.9000,0.442308,0.264772,0.505484,0.101266,...,0.098765,0.220726,0.166667,0.193424,0.098765,0.381703,0.500000,0.561905,0.234127,0.473684
3,0.196078,0.139345,0.482292,0.098765,0.064103,0.9375,0.615385,0.199919,0.505627,0.063291,...,0.049383,0.107389,0.000000,0.132441,0.049383,0.611987,0.250000,0.323810,0.269841,0.368421
4,0.205882,0.248131,0.380148,0.530864,0.474359,0.5625,0.461538,0.494268,0.523367,0.455696,...,0.469136,0.232311,0.333333,0.221055,0.469136,0.208202,0.357143,0.428571,0.250000,0.578947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6320,0.225490,0.040468,0.500534,0.037037,0.025641,0.9625,0.519231,0.032234,0.297568,0.031646,...,0.024691,0.049468,0.000000,0.044247,0.024691,0.504732,0.011905,0.009524,0.166667,0.105263
6321,0.196078,0.000000,0.500000,0.000000,0.000000,1.0000,0.538462,0.000000,0.500000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
6322,0.215686,0.008331,1.000000,0.024691,0.025641,0.9750,0.538462,0.025289,0.500000,0.025316,...,0.024691,0.000626,0.000000,0.009281,0.024691,0.996845,0.011905,0.009524,0.166667,0.052632
6323,0.196078,0.000000,0.500000,0.000000,0.000000,1.0000,0.538462,0.000000,0.500000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000


In [132]:
df_combined = pd.concat([data_train_x, df], axis=1)

In [134]:
df_combined

Unnamed: 0,max_RSSI,std_RSSI,skew_RSSI,max_sub_min_RSSI,max_RSSI_diffs,min_RSSI_diffs,mean_RSSI_diffs,std_RSSI_diffs,skew_RSSI_diffs,max_sub_min_RSSI_diffs,...,126,127,128,d1,d2,d3,kmeans,Room_Num,y_track1,y_track2
0,0.186275,0.171801,0.502745,0.098765,0.076923,0.9125,0.461538,0.232116,0.493128,0.082278,...,0.007626,0.003440,0.001639,0.600394,0.461405,0.102576,0.866667,6,0.0,0.0
1,0.196078,0.180666,0.501891,0.111111,0.102564,0.9000,0.557692,0.242070,0.495164,0.101266,...,0.002369,0.000996,0.000107,0.650114,0.499685,0.170268,0.666667,6,0.0,0.0
2,0.196078,0.181080,0.514729,0.111111,0.102564,0.9000,0.442308,0.264772,0.505484,0.101266,...,0.015237,0.001105,0.001351,0.590255,0.349214,0.162308,0.866667,6,0.0,0.0
3,0.196078,0.139345,0.482292,0.098765,0.064103,0.9375,0.615385,0.199919,0.505627,0.063291,...,0.000597,0.001955,0.002580,0.610990,0.266492,0.469885,0.400000,6,0.0,0.0
4,0.205882,0.248131,0.380148,0.530864,0.474359,0.5625,0.461538,0.494268,0.523367,0.455696,...,0.112277,0.052425,0.037544,0.481523,0.506225,0.355640,0.466667,6,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6320,0.225490,0.040468,0.500534,0.037037,0.025641,0.9625,0.519231,0.032234,0.297568,0.031646,...,0.000095,0.000046,0.000037,0.875871,0.382005,0.651607,0.133333,3,0.0,0.0
6321,0.196078,0.000000,0.500000,0.000000,0.000000,1.0000,0.538462,0.000000,0.500000,0.000000,...,0.000000,0.000000,0.000000,0.576346,0.013042,0.357079,0.066667,3,1.0,1.0
6322,0.215686,0.008331,1.000000,0.024691,0.025641,0.9750,0.538462,0.025289,0.500000,0.025316,...,0.000022,0.000011,0.000009,0.157521,0.654146,0.607453,0.066667,3,1.0,1.0
6323,0.196078,0.000000,0.500000,0.000000,0.000000,1.0000,0.538462,0.000000,0.500000,0.000000,...,0.000000,0.000000,0.000000,0.562219,0.017908,0.363979,0.066667,3,1.0,1.0


In [133]:
df_combined.to_csv("combined_psd_rows.csv", index=False)

####**Fit Random Forest estimator to all training set**
#### No train-validation split is used (we use the submission as validation)

In [None]:
#Train the model on all training data and calculate the AUC metric for the first track
rfc  = RandomForestClassifier(max_depth=5, min_samples_leaf=2, min_samples_split=2,
                              n_estimators=350, random_state=0, class_weight="balanced", bootstrap = True)

# Fit on the 0/1 occupancy labels (track 1).
rfc.fit(data_train_x, data_train_track1)

# Hard class predictions on the training rows; not used by the AUC below.
train_predict_classification = rfc.predict(data_train_x)
# The AUC is computed on the same rows the model was fitted on, so it is a training score,
# not a validation estimate. Column 1 of predict_proba is the probability of class 1.
print(f'The auc for all training set: {round(roc_auc_score(data_train_track1, rfc.predict_proba(data_train_x)[:,1], average= None),3)}')

####**Save model - track 1**

In [None]:
# save model weights
filename = "model_track_1.sav"
# The context manager closes the file, so the pickle is fully flushed to disk
# before the zip step reads it.
with open(filename, 'wb') as model_file:
    pickle.dump(rfc, model_file)

##**Prepare submission**

**Attention!**

Full submission includes the following files in a zip archive:
1.   model.py (**must**) - contains a class named "model". The class must have implementations of "load", "__init__" and "predict" functions:
    *    __init__ - initialization function of the model class.
    *   load - a function that loads the model and model weights.
    *   predict - a function that receives one window each time (as a DataFrame) and returns a one value prediction.
    * **The file may contain other functions (within the class or outside of it)**
    * imports used by the class must be compatible with the permitted python packages.


2. metadata (**must**)
    * contain the command for running the model file - **do not change this file**


3.   model weights (**optional**)
    * in this example, we demonstrate how to save a Random Forest classifier   weights. however, these can be any kind of weights as long as they are compatible with the model and the permitted python packages. 
    * if the model depends on these weights, this file is mandatory.  


4. Helper_func.py (**optional**)
    * This file contains helper functions. The file can have a different name as long as it is compatible with model.py
    * if the model depends on these helper functions, this file is mandatory.  


Running the following cells will generate a zip file with a valid submission for track 1.

Notice the minor changes that can be made to make it a valid submission for track 2 .

This is the baseline submission; you can check its score on the leaderboard.

In [None]:
%%writefile helper_func.py
import numpy as np   
import pandas as pd
from scipy.stats import skew
from scipy.signal import find_peaks

def extract_features(X, bases=None):
    """
    Compute the feature columns for one time window.

    :param X: DataFrame holding the samples of a single window (one row per
        sample); must contain an ``RSSI`` column.
    :param bases: Dict keyed by 'RSSI', 'RSSI_diffs', 'RSSI_diffs_abs' and
        'RSSI_median_dist', each mapping to seven booleans that select which
        statistics are reported. ``None`` uses the selection the model was
        trained with.
    :return: ``(data, features_name)`` - a one-row frame with every computed
        column, and the ordered list of selected feature column names.
    """
    # Collapse the window into a single row: each cell holds the whole column as
    # a numpy array, which is the layout the feature functions expect.
    data = pd.DataFrame(columns=X.columns)
    for col in data.columns:
        data.loc[0,col]= np.array(X[col])

    selection = bases
    if selection is None:
        selection = {
            'RSSI': [True, False, False, False, True, True, True],
            'RSSI_diffs': [True, True, True, False, True, True, True],
            'RSSI_diffs_abs': [False, False, True, True, True, False, True],
            'RSSI_median_dist': [True, False, True, True, True, False, True],
        }

    def count_peaks(values):
        # find_peaks returns (indices, properties); only the number of peaks is used.
        return len(find_peaks(values)[0])

    def peak_ratio(row):
        # Guard against division by zero when the raw signal has no peaks.
        if row['RSSI_peaks'] > 0:
            return row['RSSI_diffs_peaks'] / row['RSSI_peaks']
        return 0

    # Derived vectors.
    data['RSSI_diffs'] = data.RSSI.apply(lambda rssi: rssi[1:] - rssi[:-1])
    data['RSSI_diffs_abs'] = data.RSSI.apply(lambda rssi: abs(rssi[1:] - rssi[:-1]))
    data['RSSI_median_dist'] = data.RSSI.apply(lambda rssi: abs(rssi - np.median(rssi)))

    # Summary statistics of the raw vector and of each derived vector.
    features_name = []
    for vector in ('RSSI', 'RSSI_diffs', 'RSSI_diffs_abs', 'RSSI_median_dist'):
        data, features_name = extract_list_feats(vector, data, features_name, base=selection[vector])

    # Count and peak based features; these are always selected.
    data['max_count_same_value_RSSI'] = data.RSSI.apply(
        lambda rssi: np.max(np.unique(rssi, return_counts=True)[1]))
    data['RSSI_peaks'] = data.RSSI.apply(count_peaks)
    data['RSSI_diffs_peaks'] = data.RSSI_diffs.apply(count_peaks)
    data['peak_ratio_diffs_RSSI'] = data.apply(peak_ratio, axis=1)
    data['RSSI_values_count'] = data.RSSI.apply(lambda rssi: len(np.unique(rssi)))

    features_name += [
        'max_count_same_value_RSSI',
        'RSSI_peaks',
        'RSSI_diffs_peaks',
        'peak_ratio_diffs_RSSI',
        'RSSI_values_count',
    ]

    return data, features_name

def extract_list_feats(list_name: str, data, features_name: list, base=None):
    """
    Add seven summary statistics of a vector-valued column to ``data``.

    Every statistic is always written to ``data`` as ``<stat>_<list_name>``; the
    ``base`` flags only control which of those column names are appended to
    ``features_name``.

    :param list_name: Name of the column in ``data`` whose cells hold the vectors.
    :param data: DataFrame that is read from and extended in place.
    :param features_name: List of selected feature names; extended in place.
    :param base: Seven booleans, in the order max, min, mean, median, std, skew,
        max_sub_min. ``None`` selects all seven.
    :return: ``(data, features_name)``.
    """
    if base is None:
        # This module does not define DEFAULT_TRUE_LIST, so the all-true default is spelled out here.
        base = [True] * 7

    # (column prefix, reducer) pairs; the position of each pair matches its flag index.
    statistics = (
        ('max', np.max),
        ('min', np.min),
        ('mean', np.mean),
        ('median', np.median),
        ('std', np.std),
        ('skew', skew),
        ('max_sub_min', lambda x: np.max(x) - np.min(x)),
    )

    for position, (prefix, reducer) in enumerate(statistics):
        column = f'{prefix}_{list_name}'
        data[column] = data[list_name].apply(reducer)
        if base[position]:
            features_name += [column]

    return data, features_name
    
def preprocess(X, RSSI_value_selection):
    """
    Compute the model features for one window.

    The caller's ``X`` is not modified; the ``RSSI`` column is added to a copy.

    :param X: DataFrame with the samples of a single window; must contain the
        column(s) required by ``RSSI_value_selection`` (``RSSI_Left`` and/or
        ``RSSI_Right``).
    :param RSSI_value_selection: Which signal values to use: "RSSI_Left",
        "RSSI_Right", "Min" or "Max" of the two; any other value (e.g.
        "Average") selects the mean of the two, rounded up and cast to int.
    :return: One-row DataFrame holding only the selected feature columns.
    """
    # Copy so that adding the RSSI column does not write into the caller's frame
    # (which may itself be a slice of a larger DataFrame).
    X = X.copy()

    if RSSI_value_selection=="RSSI_Left":
        X["RSSI"] = X.RSSI_Left
    elif RSSI_value_selection=="RSSI_Right":
        X["RSSI"] = X.RSSI_Right
    elif RSSI_value_selection=="Min":
        X["RSSI"] = X[['RSSI_Left','RSSI_Right']].min(axis=1).values
    elif RSSI_value_selection=="Max":
        X["RSSI"] = X[['RSSI_Left','RSSI_Right']].max(axis=1).values
    else:
        X["RSSI"] = np.ceil(X[['RSSI_Left','RSSI_Right']].mean(axis=1).values).astype('int')

    features, features_name = extract_features(X)
    # Selecting by features_name already leaves out Device_ID and every other
    # non-feature column, so no explicit drop is needed (and a window without
    # a Device_ID column no longer raises KeyError).
    return features[features_name]

In [None]:
%%writefile model.py

import pickle
import numpy as np
from os.path import isfile
import joblib
from sklearn.ensemble import RandomForestClassifier
from helper_func import preprocess
import os

class model:
    def __init__(self):
        '''
        Create the model wrapper.

        Holds an unfitted RandomForestClassifier (replaced by the trained one in
        ``load``) and the RSSI signal selection passed to ``preprocess``.
        '''
        forest_params = dict(max_depth=5, min_samples_leaf=2, min_samples_split=2,
                             n_estimators=350, random_state=0, class_weight="balanced",
                             bootstrap=True)
        self.model = RandomForestClassifier(**forest_params)
        self.RSSI_value_selection = 'Average'

    def predict(self, X):
        '''
        Edit this function to fit your model.

        Predict the occupancy probability for a single window.
        Make sure that the predicted values are in the correct format for the
        scoring metric.

        :param X: DataFrame holding one window (one row per sample). With the
                  'Average' selection, ``preprocess`` reads the 'RSSI_Left' and
                  'RSSI_Right' columns.
        :return: the predicted probability of class 1 (the room is occupied).
        '''
        # preprocess turns the whole window into a single row of features.
        features = preprocess(X, self.RSSI_value_selection)
        # Column 1 of predict_proba is class 1; there is exactly one row.
        occupied_probability = self.model.predict_proba(features)[:, 1]

        # Track 2 - for track 2 we naively assume that the model from track-1
        # predicts 0/1 correctly. When the room is occupied (1,2,3 - model
        # predicted 1) we assign the majority class (2) as the prediction:
        #y = 0 if y<0.5 else 2
        return occupied_probability[0]

    def load(self, dir_path):
        '''
        Edit this function to fit your model.

        Load the model that was trained on the train set.
        :param dir_path: path of the folder the model is submitted in
        '''
        weights_path = os.path.join(dir_path, 'model_track_1.sav')
        self.model = joblib.load(weights_path)

In [None]:
%%writefile metadata
command: python3 $program/model.py $input $output

zip the files to submit

In [None]:
!zip -r submission.zip model.py helper_func.py metadata model_track_1.sav

*You can use this notebook to save your file, download it, and submit it on CodaLab.

To download the zip file, use the file manager panel.
Use View > Table of contents to show the sidebar then click the Files tab. Right-click the file and select Download.

##**Example- Prediction with the submitted model**

In this section, we demonstrate how to predict with the submitted model on a single window (360 samples).

###download and read one window for prediction

(An example is based on the train set)

In [None]:
!gdown -O one_window_for_demo.csv https://drive.google.com/uc?id=1kVAMV-zEn2bGLLtMOYA7-gVLnofCo3m_

In [None]:
X = pd.read_csv('/content/one_window_for_demo.csv')

In [None]:
print(X.head(10))
print(f'window shape: {len(X)}')

###Create object model, load and predict

Unzip the submission files

In [None]:
!unzip -o '/content/submission.zip'

Create model object, load and predict

In [None]:
# Import only the class: a star import would also pull model.py's own imports into the notebook.
from model import model

M = model()
M.load('')
Y_test = []
# unique() keeps the order of first appearance, so Y_test[0] is always the first window in the file.
# The loop variable is deliberately not called `window`: that name is the windowing function
# defined earlier in this notebook, and shadowing it would break a re-run of the preprocessing.
for window_id in X.Num_Window.unique():
    # drop() returns a new frame, so the slice of X is never modified in place.
    X_test_window = X.loc[X['Num_Window'] == window_id].drop(columns='Num_Window')
    Y_test.append(M.predict(X_test_window))

print(f'Occupancy prediction: {round(Y_test[0],3)}')