## Sensor Noise Suppression using Multivariate Regression

### Import the dependencies

In [57]:
%load_ext lab_black

# Common Imports
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

from sklearn.decomposition import PCA

# Assessing performance
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Making a prediction with the direct multioutput regression model
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR, NuSVC

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc("axes", labelsize=14)
mpl.rc("xtick", labelsize=12)
mpl.rc("ytick", labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings

warnings.filterwarnings(action="ignore", message="^internal gelsd")
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
warnings.filterwarnings(action="ignore", category=FutureWarning)

# To make this notebook's output identical at every run
np.random.seed(42)

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


### Preprocess the data

In [2]:
data = (
    pd.read_csv("position_log_v2.csv")
    .query("marker % 2 == 0")
    .drop(["x", "y", "z", "anchors", "time"], axis=1)
    .reset_index(drop=True)
)

data["tag"] = data["tag"].replace(52630, 3)

data["marker"] = data["marker"].map(
    {j: i + 1 for i, j in enumerate(data.marker.unique())}
)

data.columns = [
    "position",
    "tag",
    "anchor_1",
    "anchor_2",
    "anchor_3",
    "anchor_4",
    "anchor_5",
]

data

Unnamed: 0,position,tag,anchor_1,anchor_2,anchor_3,anchor_4,anchor_5
0,1,1,204.0,871.0,1072.0,375.0,820.0
1,1,2,209.0,871.0,1082.0,468.0,820.0
2,1,3,189.0,882.0,1117.0,401.0,817.0
3,1,1,205.0,872.0,1072.0,375.0,824.0
4,1,2,214.0,875.0,1082.0,401.0,827.0
...,...,...,...,...,...,...,...
29415,12,2,336.0,602.0,814.0,160.0,603.0
29416,12,3,327.0,636.0,837.0,240.0,615.0
29417,12,1,332.0,618.0,801.0,184.0,586.0
29418,12,2,338.0,603.0,812.0,159.0,599.0


In [3]:
position = data.query("tag == 1").loc[:, ["position"]].reset_index(drop=True)

data_tag_1 = (
    data.query("tag == 1").drop(["position", "tag"], axis=1).reset_index(drop=True)
)

data_tag_2 = (
    data.query("tag == 2").drop(["position", "tag"], axis=1).reset_index(drop=True)
)

data_tag_3 = (
    data.query("tag == 3").drop(["position", "tag"], axis=1).reset_index(drop=True)
)

In [4]:
data_tag_1.columns = map("{}_tag_1".format, data_tag_1.columns)
data_tag_2.columns = map("{}_tag_2".format, data_tag_2.columns)
data_tag_3.columns = map("{}_tag_3".format, data_tag_3.columns)

In [5]:
data = position.join([data_tag_1, data_tag_2, data_tag_3])
data

Unnamed: 0,position,anchor_1_tag_1,anchor_2_tag_1,anchor_3_tag_1,anchor_4_tag_1,anchor_5_tag_1,anchor_1_tag_2,anchor_2_tag_2,anchor_3_tag_2,anchor_4_tag_2,anchor_5_tag_2,anchor_1_tag_3,anchor_2_tag_3,anchor_3_tag_3,anchor_4_tag_3,anchor_5_tag_3
0,1.0,204.0,871.0,1072.0,375.0,820.0,209.0,871.0,1082.0,468.0,820.0,189.0,882.0,1117.0,401.0,817.0
1,1.0,205.0,872.0,1072.0,375.0,824.0,214.0,875.0,1082.0,401.0,827.0,188.0,880.0,1116.0,398.0,817.0
2,1.0,206.0,870.0,1074.0,374.0,822.0,208.0,874.0,1078.0,417.0,819.0,187.0,885.0,1121.0,396.0,817.0
3,1.0,203.0,872.0,1069.0,373.0,821.0,211.0,873.0,1082.0,421.0,821.0,190.0,890.0,1121.0,396.0,815.0
4,1.0,199.0,870.0,1070.0,371.0,829.0,207.0,872.0,1082.0,433.0,825.0,188.0,880.0,1120.0,399.0,815.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9395,12.0,333.0,622.0,801.0,183.0,585.0,333.0,599.0,812.0,164.0,606.0,325.0,637.0,840.0,239.0,622.0
9396,12.0,333.0,617.0,802.0,184.0,584.0,331.0,602.0,811.0,159.0,601.0,325.0,634.0,836.0,242.0,625.0
9397,12.0,330.0,621.0,807.0,182.0,581.0,332.0,601.0,811.0,158.0,602.0,323.0,636.0,837.0,243.0,626.0
9398,12.0,329.0,618.0,802.0,182.0,583.0,333.0,600.0,810.0,155.0,598.0,325.0,637.0,838.0,241.0,625.0


In [6]:
# Amount of readings per location
data["position"].value_counts()

9.0     1026
10.0     911
8.0      911
6.0      911
3.0      910
2.0      798
1.0      798
11.0     796
12.0     796
7.0      796
4.0      567
5.0      180
Name: position, dtype: int64

In [19]:
x_pos = [1597, 766, 530, 839, 349, 1269, 530, 1597, 530, 1597, 944, 1315]
y_pos = [1958, 1690, 2040, 2244, 1467, 1744, 1690, 1690, 2302, 2302, 2083, 1925]

data_t = pd.DataFrame()

for i in data["position"].unique():
    df = data.query(f"position == {i}").drop("position", axis=1).reset_index(drop=True)

    df.index = df.index + 1
    df_out = df.stack()
    df_out.index = df_out.index.map("{0[1]}_{0[0]}".format)
    df_out = df_out.to_frame().T

    data_t = data_t.append(df_out).fillna(0).reset_index(drop=True)

data_t["x"] = x_pos
data_t["y"] = y_pos

In [20]:
X = data_t.drop(["x", "y"], axis=1)
Y = data_t[["x", "y"]].copy()

In [21]:
X

Unnamed: 0,anchor_1_tag_1_1,anchor_2_tag_1_1,anchor_3_tag_1_1,anchor_4_tag_1_1,anchor_5_tag_1_1,anchor_1_tag_2_1,anchor_2_tag_2_1,anchor_3_tag_2_1,anchor_4_tag_2_1,anchor_5_tag_2_1,...,anchor_1_tag_2_1026,anchor_2_tag_2_1026,anchor_3_tag_2_1026,anchor_4_tag_2_1026,anchor_5_tag_2_1026,anchor_1_tag_3_1026,anchor_2_tag_3_1026,anchor_3_tag_3_1026,anchor_4_tag_3_1026,anchor_5_tag_3_1026
0,204.0,871.0,1072.0,375.0,820.0,209.0,871.0,1082.0,468.0,820.0,...,,,,,,,,,,
1,898.0,210.0,469.0,648.0,570.0,937.0,191.0,469.0,514.0,598.0,...,,,,,,,,,,
2,1085.0,485.0,223.0,860.0,376.0,1044.0,454.0,205.0,786.0,378.0,...,,,,,,,,,,
3,868.0,583.0,406.0,674.0,193.0,900.0,568.0,363.0,682.0,172.0,...,,,,,,,,,,
4,1376.0,483.0,716.0,,905.0,1357.0,473.0,652.0,956.0,918.0,...,,,,,,,,,,
5,421.0,530.0,865.0,,696.0,1353.0,477.0,654.0,958.0,917.0,...,,,,,,,,,,
6,1116.0,296.0,391.0,747.0,615.0,444.0,592.0,797.0,,658.0,...,,,,,,,,,,
7,1123.0,622.0,272.0,909.0,358.0,1110.0,299.0,434.0,753.0,632.0,...,,,,,,,,,,
8,390.0,996.0,1053.0,632.0,724.0,1107.0,664.0,338.0,1015.0,333.0,...,353.0,993.0,1029.0,609.0,775.0,377.0,1034.0,1087.0,641.0,791.0
9,325.0,867.0,1127.0,306.0,928.0,349.0,995.0,1027.0,604.0,763.0,...,,,,,,,,,,


In [18]:
Y

Unnamed: 0,x,y
0,1597,1958
1,766,1690
2,530,2040
3,839,2244
4,349,1467
5,1269,1744
6,530,1690
7,1597,1690
8,530,2302
9,1597,2302


### LOOCV for Evaluating Machine Learning Algorithms

In [65]:
x_pos = [1597, 766, 530, 839, 349, 1269, 530, 1597, 530, 1597, 944, 1315]
y_pos = [1958, 1690, 2040, 2244, 1467, 1744, 1690, 1690, 2302, 2302, 2083, 1925]

for i in sorted(data["position"].astype(int).unique()):

    #####################################################################
    # X_train - train on 11/12
    #####################################################################

    # leave one position out
    training_data = data.query(f"position != {i}").reset_index(drop=True)
    position = training_data[["position"]].copy()
    X_train = training_data.drop("position", axis=1)

    # normalize the 11 positions
    min_max_scaler = MinMaxScaler()
    min_max_scaler.fit(X_train)
    scaled_train_features = min_max_scaler.transform(X_train)

    # convert the normalized matrix to a dataframe
    X_train = pd.DataFrame(
        scaled_train_features, index=X_train.index, columns=X_train.columns
    )

    # add which position each row corresponds to
    X_train = position.join(X_train)

    X_train_transposed = pd.DataFrame()

    for j in X_train["position"].unique():

        # impute the null rows with the mean (fill 0 where this is not possible)
        X_train.loc[X_train["position"] == j] = (
            X_train.query(f"position == {j}")
            .fillna(X_train.query(f"position == {j}").mean())
            .fillna(0)
        )

        # stack the position data into a single row
        df = (
            X_train.query(f"position == {j}")
            .copy(deep=True)
            .drop("position", axis=1)
            .reset_index(drop=True)
        )

        df.index = df.index + 1
        df_out = df.stack()
        df_out.index = df_out.index.map("{0[1]}_{0[0]}".format)
        df_out = df_out.to_frame().T

        X_train_transposed = (
            X_train_transposed.append(df_out).fillna(0).reset_index(drop=True)
        )

    x = x_pos.copy()
    y = y_pos.copy()

    x_test_pos = x.pop(i - 1)
    y_test_pos = y.pop(i - 1)

    Y_train = pd.DataFrame()
    Y_train["x"] = x
    Y_train["y"] = y

    ####################################################################
    # X_test - test on 1/12
    ####################################################################

    testing_set = data.query(f"position == {i}").reset_index(drop=True)
    position = testing_set[["position"]].copy()

    X_test = testing_set.drop("position", axis=1)

    scaled_test_features = min_max_scaler.transform(X_test)

    X_test = pd.DataFrame(
        scaled_test_features, index=X_test.index, columns=X_test.columns
    )

    X_test = position.join(X_test)

    X_test.loc[X_test["position"] == i] = (
        X_test.query(f"position == {i}")
        .fillna(X_test.query(f"position == {i}").mean())
        .fillna(0)
    )

    df = X_test.drop("position", axis=1).reset_index(drop=True)

    df.index = df.index + 1
    df_out = df.stack()
    df_out.index = df_out.index.map("{0[1]}_{0[0]}".format)
    X_test_transposed = df_out.to_frame().T

    Y_test = pd.DataFrame()
    Y_test["x"] = [x_test_pos]
    Y_test["y"] = [y_test_pos]

    #####################################################################
    # PCA
    #####################################################################

    X_train = PCA().fit_transform(X_train_transposed)

    print(X_train.shape)

    X_test = PCA().fit_transform(X_test_transposed)

    print(X_test.shape)

    break

#####################################################################
# Train a non-linear SVM
#####################################################################

#     model = NuSVC(nu=0.12)
#     wrapper = MultiOutputRegressor(model)
#     wrapper.fit(X_train, Y_train)

#     yhat = wrapper.predict(X_test)

#     print(yhat)

(11, 11)
(1, 1)


In [53]:
Y_test

Unnamed: 0,x,y
0,1315,1925


In [26]:
X_train_transposed.astype(bool).sum(axis=1)

0     11962
1     11959
2     13644
3      8497
4      2700
5     12754
6     11940
7     13665
8     15390
9     13664
10    11938
dtype: int64

In [38]:
X_test_transposed

Unnamed: 0,anchor_1_tag_1_1,anchor_2_tag_1_1,anchor_3_tag_1_1,anchor_4_tag_1_1,anchor_5_tag_1_1,anchor_1_tag_2_1,anchor_2_tag_2_1,anchor_3_tag_2_1,anchor_4_tag_2_1,anchor_5_tag_2_1,...,anchor_1_tag_2_796,anchor_2_tag_2_796,anchor_3_tag_2_796,anchor_4_tag_2_796,anchor_5_tag_2_796,anchor_1_tag_3_796,anchor_2_tag_3_796,anchor_3_tag_3_796,anchor_4_tag_3_796,anchor_5_tag_3_796
0,0.109612,0.517024,0.632543,-0.167832,0.476132,0.418385,0.357494,0.26743,0.225515,0.124836,...,0.115979,0.507371,0.636837,-0.155928,0.563732,0.117012,0.53085,0.664603,-0.097983,0.553001


In [42]:
Y_test

Unnamed: 0,x,y


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

fs = SelectKBest(score_func=f_regression, k=10)
X_selected = fs.fit_transform(X_train_transposed, pos_coord)
X_selected.shape

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10).fit_transform(X_train_transposed)
pca.shape

In [None]:
pca

In [None]:
df = (
    training_data.query("position == 5")
    .drop(["position", "x", "y"], axis=1)
    .reset_index(drop=True)
    .fillna(0)
)

df.index = df.index + 1
df_out = df.stack()
df_out.index = df_out.index.map("{0[1]}_{0[0]}".format)
pos_5 = df_out.to_frame().T
pos_5

In [None]:
# the problem is that the input vector needs to be 15 or does it not?
# it makes sense that the longer you stay the less sparse the matrix is going to be -- resulting in better accracy
# the input vector, thus, does not need to be of length 15
pos_5.append(pos_1).reset_index(drop=True)

In [None]:
for i in X_train["position"].unique():
    print(i, X_train.query("position == {}".format(i)).isnull().any(axis=1).sum())

In [None]:
X_train = data.drop(["position", "x", "y"], axis=1)
X_train

In [None]:
# Normalization
min_max_scaler = MinMaxScaler()
min_max_scaled_features = min_max_scaler.fit_transform(X_train)

In [None]:
X_train = pd.DataFrame(
    min_max_scaled_features, index=X_train.index, columns=X_train.columns
)
X_train = position.join(X_train)
X_train

In [None]:
for i in X_train["position"].unique():
    X_train.loc[X_train["position"] == i] = X_train.query(
        "position == {}".format(i)
    ).fillna(X_train.query("position == {}".format(i)).mean())

In [None]:
for i in X_train["position"].unique():
    print(i, X_train.query("position == {}".format(i)).isnull().any(axis=1).sum())

In [None]:
# Final X_train
X_train = X_train.drop(["position"], axis=1)
X_train

In [None]:
y_train = data[["x", "y"]].copy()
y_train

### Train the models

In [None]:
row = [
    0.004216,
    0.837327,
    0.920259,
    0.239286,
    0.777234,
    0.004296,
    0.839066,
    0.917794,
    0.351052,
    0.854139,
    0.004979,
    0.816065,
    0.956656,
    0.213820,
    0.802043,
]

In [None]:
model = LinearSVR()
wrapper = MultiOutputRegressor(model)
wrapper.fit(X_train, y_train)

yhat = wrapper.predict([row])

print("Predicted:", yhat[0].round())

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

yhat = model.predict([row])

print("Predicted:", yhat[0].round())

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2).fit(X_train)
pca_2d = pca.transform(X_train)

import pylab as pl
for i in range(0, pca_2d.shape[0]):
    if y_train[i] == 1:
        c1 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='r',marker='+')
    elif y_train[i] == 2:
        c2 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='g',marker='o')
    elif y_train[i] == 3:
        c3 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='b',marker='*')

pl.legend([c1, c2, c3], ['Position 1', 'Position 2', 'Position 3'])
pl.title('Dataset with 3 clusters and known outcomes')
pl.show()