## Noise Suppression using Multivariate Regression

### Import the dependencies

In [27]:
%load_ext lab_black

# Common Imports
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

# Assessing performance
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

# Classifiers
from sklearn.neighbors import KNeighborsClassifier

# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Making a prediction with the direct multioutput regression model
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR, NuSVC

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc("axes", labelsize=14)
mpl.rc("xtick", labelsize=12)
mpl.rc("ytick", labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings

warnings.filterwarnings(action="ignore", message="^internal gelsd")
warnings.filterwarnings("ignore")
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
warnings.filterwarnings(action="ignore", category=FutureWarning)

# To make this notebook's output identical at every run
np.random.seed(42)

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


### Preprocess the data

In [2]:
data = (
    pd.read_csv("position_log_v2.csv")
    .query("marker % 2 == 0")
    .drop(["x", "y", "z", "anchors", "time"], axis=1)
    .reset_index(drop=True)
)

data["tag"] = data["tag"].replace(52630, 3)

data["marker"] = data["marker"].map(
    {j: i + 1 for i, j in enumerate(data.marker.unique())}
)

data.columns = [
    "position",
    "tag",
    "anchor_1",
    "anchor_2",
    "anchor_3",
    "anchor_4",
    "anchor_5",
]

data

Unnamed: 0,position,tag,anchor_1,anchor_2,anchor_3,anchor_4,anchor_5
0,1,1,204.0,871.0,1072.0,375.0,820.0
1,1,2,209.0,871.0,1082.0,468.0,820.0
2,1,3,189.0,882.0,1117.0,401.0,817.0
3,1,1,205.0,872.0,1072.0,375.0,824.0
4,1,2,214.0,875.0,1082.0,401.0,827.0
...,...,...,...,...,...,...,...
29415,12,2,336.0,602.0,814.0,160.0,603.0
29416,12,3,327.0,636.0,837.0,240.0,615.0
29417,12,1,332.0,618.0,801.0,184.0,586.0
29418,12,2,338.0,603.0,812.0,159.0,599.0


In [3]:
position = data.query("tag == 1").loc[:, ["position"]].reset_index(drop=True)

data_tag_1 = (
    data.query("tag == 1").drop(["position", "tag"], axis=1).reset_index(drop=True)
)

data_tag_2 = (
    data.query("tag == 2").drop(["position", "tag"], axis=1).reset_index(drop=True)
)

data_tag_3 = (
    data.query("tag == 3").drop(["position", "tag"], axis=1).reset_index(drop=True)
)

In [4]:
data_tag_1.columns = map("{}_tag_1".format, data_tag_1.columns)
data_tag_2.columns = map("{}_tag_2".format, data_tag_2.columns)
data_tag_3.columns = map("{}_tag_3".format, data_tag_3.columns)

In [5]:
data = position.join([data_tag_1, data_tag_2, data_tag_3])
data

Unnamed: 0,position,anchor_1_tag_1,anchor_2_tag_1,anchor_3_tag_1,anchor_4_tag_1,anchor_5_tag_1,anchor_1_tag_2,anchor_2_tag_2,anchor_3_tag_2,anchor_4_tag_2,anchor_5_tag_2,anchor_1_tag_3,anchor_2_tag_3,anchor_3_tag_3,anchor_4_tag_3,anchor_5_tag_3
0,1.0,204.0,871.0,1072.0,375.0,820.0,209.0,871.0,1082.0,468.0,820.0,189.0,882.0,1117.0,401.0,817.0
1,1.0,205.0,872.0,1072.0,375.0,824.0,214.0,875.0,1082.0,401.0,827.0,188.0,880.0,1116.0,398.0,817.0
2,1.0,206.0,870.0,1074.0,374.0,822.0,208.0,874.0,1078.0,417.0,819.0,187.0,885.0,1121.0,396.0,817.0
3,1.0,203.0,872.0,1069.0,373.0,821.0,211.0,873.0,1082.0,421.0,821.0,190.0,890.0,1121.0,396.0,815.0
4,1.0,199.0,870.0,1070.0,371.0,829.0,207.0,872.0,1082.0,433.0,825.0,188.0,880.0,1120.0,399.0,815.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9395,12.0,333.0,622.0,801.0,183.0,585.0,333.0,599.0,812.0,164.0,606.0,325.0,637.0,840.0,239.0,622.0
9396,12.0,333.0,617.0,802.0,184.0,584.0,331.0,602.0,811.0,159.0,601.0,325.0,634.0,836.0,242.0,625.0
9397,12.0,330.0,621.0,807.0,182.0,581.0,332.0,601.0,811.0,158.0,602.0,323.0,636.0,837.0,243.0,626.0
9398,12.0,329.0,618.0,802.0,182.0,583.0,333.0,600.0,810.0,155.0,598.0,325.0,637.0,838.0,241.0,625.0


In [9]:
pos_uniq = sorted(data["position"].unique())

pos_coord = [
    (1597, 1958),
    (766, 1690),
    (530, 2040),
    (839, 2244),
    (349, 1467),
    (1269, 1744),
    (530, 1690),
    (1597, 1690),
    (530, 2302),
    (1597, 2302),
    (944, 2083),
    (1315, 1925),
]

pos_to_coord = dict(zip(pos_uniq, pos_coord))

for pos in pos_to_coord.keys():
    x, y = pos_to_coord[pos]
    data.loc[data["position"] == pos, "x"] = x
    data.loc[data["position"] == pos, "y"] = y

data

Unnamed: 0,position,anchor_1_tag_1,anchor_2_tag_1,anchor_3_tag_1,anchor_4_tag_1,anchor_5_tag_1,anchor_1_tag_2,anchor_2_tag_2,anchor_3_tag_2,anchor_4_tag_2,anchor_5_tag_2,anchor_1_tag_3,anchor_2_tag_3,anchor_3_tag_3,anchor_4_tag_3,anchor_5_tag_3,x,y
0,1.0,204.0,871.0,1072.0,375.0,820.0,209.0,871.0,1082.0,468.0,820.0,189.0,882.0,1117.0,401.0,817.0,1597.0,1958.0
1,1.0,205.0,872.0,1072.0,375.0,824.0,214.0,875.0,1082.0,401.0,827.0,188.0,880.0,1116.0,398.0,817.0,1597.0,1958.0
2,1.0,206.0,870.0,1074.0,374.0,822.0,208.0,874.0,1078.0,417.0,819.0,187.0,885.0,1121.0,396.0,817.0,1597.0,1958.0
3,1.0,203.0,872.0,1069.0,373.0,821.0,211.0,873.0,1082.0,421.0,821.0,190.0,890.0,1121.0,396.0,815.0,1597.0,1958.0
4,1.0,199.0,870.0,1070.0,371.0,829.0,207.0,872.0,1082.0,433.0,825.0,188.0,880.0,1120.0,399.0,815.0,1597.0,1958.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9395,12.0,333.0,622.0,801.0,183.0,585.0,333.0,599.0,812.0,164.0,606.0,325.0,637.0,840.0,239.0,622.0,1315.0,1925.0
9396,12.0,333.0,617.0,802.0,184.0,584.0,331.0,602.0,811.0,159.0,601.0,325.0,634.0,836.0,242.0,625.0,1315.0,1925.0
9397,12.0,330.0,621.0,807.0,182.0,581.0,332.0,601.0,811.0,158.0,602.0,323.0,636.0,837.0,243.0,626.0,1315.0,1925.0
9398,12.0,329.0,618.0,802.0,182.0,583.0,333.0,600.0,810.0,155.0,598.0,325.0,637.0,838.0,241.0,625.0,1315.0,1925.0


In [13]:
# Amount of readings per location
data["position"].value_counts()

9.0     1026
10.0     911
8.0      911
6.0      911
3.0      910
2.0      798
1.0      798
11.0     796
12.0     796
7.0      796
4.0      567
5.0      180
Name: position, dtype: int64

### LOOCV for Evaluating Machine Learning Algorithms

In [207]:
for i in sorted(data["position"].unique()):

    #####################################################################
    # X_train - train on 11/12
    #####################################################################

    training_data = data.query(f"position != {i}").reset_index(drop=True)
    position = training_data[["position"]].copy()

    X_train = training_data.drop(["position", "x", "y"], axis=1)
    Y_train = training_data[["x", "y"]].copy()

    min_max_scaler = MinMaxScaler()
    min_max_scaler.fit(X_train)

    scaled_train_features = min_max_scaler.transform(X_train)

    X_train = pd.DataFrame(
        scaled_train_features, index=X_train.index, columns=X_train.columns
    )

    X_train = position.join(X_train)

    X_train_transposed = pd.DataFrame()

    for j in X_train["position"].unique():

        X_train.loc[X_train["position"] == j] = (
            X_train.query(f"position == {j}")
            .fillna(X_train.query(f"position == {j}").mean())
            .fillna(0)
        )

        df = (
            X_train.query(f"position == {j}")
            .copy(deep=True)
            .drop("position", axis=1)
            .reset_index(drop=True)
        )

        df.index = df.index + 1
        df_out = df.stack()
        df_out.index = df_out.index.map("{0[1]}_{0[0]}".format)
        df_out = df_out.to_frame().T

        X_train_transposed = (
            X_train_transposed.append(df_out).fillna(0).reset_index(drop=True)
        )
        
    for pos in pos_to_coord.keys():
        x, y = pos_to_coord[pos]
        data.loc[data["position"] == pos, "x"] = x
        data.loc[data["position"] == pos, "y"] = y

#####################################################################
# X_test - test on 1/12
#####################################################################

#     testing_set = data.query(f"position == {i}").reset_index(drop=True)
#     position = testing_set[["position"]].copy()

#     X_test = testing_set.drop(["position", "x", "y"], axis=1)
#     Y_test = testing_set[["x", "y"]].copy()

#     scaled_test_features = min_max_scaler.transform(X_test)

#     X_test = pd.DataFrame(
#         scaled_test_features, index=X_test.index, columns=X_test.columns
#     )

#     X_test = position.join(X_test)

#     for j in X_test["position"].unique():
#         X_test.loc[X_test["position"] == j] = (
#             X_test.query(f"position == {j}")
#             .fillna(X_test.query(f"position == {j}").mean())
#             .fillna(0)
#         )

#     X_test = X_test.drop("position", axis=1)

#####################################################################
# Train a non-linear SVM
#####################################################################

#     model = NuSVC(nu=0.12)
#     wrapper = MultiOutputRegressor(model)
#     wrapper.fit(X_train, Y_train)

#     yhat = wrapper.predict(X_test)

In [210]:
X_train_transposed.astype(bool).sum(axis=1)

0     11962
1     11959
2     13644
3      8497
4      2700
5     12754
6     11940
7     13665
8     15390
9     13664
10    11938
dtype: int64

In [211]:
X_train_transposed

Unnamed: 0,anchor_1_tag_1_1,anchor_2_tag_1_1,anchor_3_tag_1_1,anchor_4_tag_1_1,anchor_5_tag_1_1,anchor_1_tag_2_1,anchor_2_tag_2_1,anchor_3_tag_2_1,anchor_4_tag_2_1,anchor_5_tag_2_1,...,anchor_1_tag_2_1026,anchor_2_tag_2_1026,anchor_3_tag_2_1026,anchor_4_tag_2_1026,anchor_5_tag_2_1026,anchor_1_tag_3_1026,anchor_2_tag_3_1026,anchor_3_tag_3_1026,anchor_4_tag_3_1026,anchor_5_tag_3_1026
0,0.004216,0.837327,0.920259,0.106294,0.777234,0.004296,0.839066,0.917794,0.244845,0.854139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.589376,0.003783,0.270474,0.488112,0.471236,0.629725,0.003686,0.279917,0.304124,0.562418,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.747049,0.350567,0.005388,0.784615,0.233782,0.721649,0.326781,0.005203,0.654639,0.273325,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.564081,0.474149,0.202586,0.524476,0.009792,0.597938,0.46683,0.169615,0.520619,0.002628,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.992411,0.348045,0.536638,0.962805,0.881273,0.99055,0.350123,0.470343,0.873711,0.982917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.187184,0.407314,0.697198,0.0,0.625459,0.987113,0.355037,0.472425,0.876289,0.981603,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.773187,0.112232,0.186422,0.626573,0.526316,0.206186,0.496314,0.621228,0.610346,0.641261,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.779089,0.523329,0.05819,0.853147,0.21175,0.778351,0.136364,0.243496,0.612113,0.607096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.161046,0.994956,0.899784,0.465734,0.659731,0.775773,0.584767,0.1436,0.949742,0.214192,...,0.128007,0.988943,0.862643,0.426546,0.795007,0.160996,0.993015,0.925697,0.476945,0.768838
9,0.106239,0.832282,0.979526,0.00979,0.909425,0.12457,0.9914,0.860562,0.420103,0.779238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [223]:
pos_to_coord

{1.0: (1597, 1958),
 2.0: (766, 1690),
 3.0: (530, 2040),
 4.0: (839, 2244),
 5.0: (349, 1467),
 6.0: (1269, 1744),
 7.0: (530, 1690),
 8.0: (1597, 1690),
 9.0: (530, 2302),
 10.0: (1597, 2302),
 11.0: (944, 2083),
 12.0: (1315, 1925)}

In [221]:
pd.DataFrame(pos_coord

AttributeError: 'list' object has no attribute 'to_dataframe'

In [220]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

fs = SelectKBest(score_func=f_regression, k=10)
X_selected = fs.fit_transform(X_train_transposed, pos_coord)
X_selected.shape

ValueError: Found input variables with inconsistent numbers of samples: [11, 12]

In [217]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10).fit_transform(X_train_transposed)
pca.shape

(11, 10)

In [218]:
pca

array([[ 3.73602600e+01, -1.90708554e+01,  1.29709301e+00,
        -5.00686651e+00, -1.31112756e+00, -2.64932970e+00,
        -8.71527414e+00, -9.77568919e+00, -4.74920918e+00,
         5.73150928e-01],
       [-1.83286363e+01, -6.06524111e+00, -4.84122375e+00,
         5.66579439e+00,  1.34632194e+01, -7.40420118e+00,
        -4.72398347e+00, -3.09583402e+00,  8.22919159e+00,
         8.36793889e-01],
       [-2.81805226e+01,  1.50351971e+01, -7.82271361e+00,
        -8.36223481e+00, -4.50480739e+00,  7.11914751e-01,
         5.21377043e+00, -3.90353651e+00, -1.53202805e+00,
         5.82798475e+00],
       [-2.22331593e+01, -6.62902144e+00,  1.69303440e+01,
        -9.93630829e+00,  8.05564051e+00,  1.58534822e+01,
         1.79688176e+00, -2.58386273e+00, -7.83510848e-01,
        -2.20242469e+00],
       [-2.47625339e+01, -1.98831763e+01,  2.81997135e+01,
         1.42785547e+01, -8.78673067e+00, -6.77229324e+00,
         1.58310972e+00,  1.39674660e+00, -1.23065543e+00,
        -5.

In [165]:
df = (
    training_data.query("position == 5")
    .drop(["position", "x", "y"], axis=1)
    .reset_index(drop=True)
    .fillna(0)
)

df.index = df.index + 1
df_out = df.stack()
df_out.index = df_out.index.map("{0[1]}_{0[0]}".format)
pos_5 = df_out.to_frame().T
pos_5

Unnamed: 0,anchor_1_tag_1_1,anchor_2_tag_1_1,anchor_3_tag_1_1,anchor_4_tag_1_1,anchor_5_tag_1_1,anchor_1_tag_2_1,anchor_2_tag_2_1,anchor_3_tag_2_1,anchor_4_tag_2_1,anchor_5_tag_2_1,...,anchor_1_tag_2_180,anchor_2_tag_2_180,anchor_3_tag_2_180,anchor_4_tag_2_180,anchor_5_tag_2_180,anchor_1_tag_3_180,anchor_2_tag_3_180,anchor_3_tag_3_180,anchor_4_tag_3_180,anchor_5_tag_3_180
0,1376.0,483.0,716.0,0.0,905.0,1357.0,473.0,652.0,956.0,918.0,...,1355.0,472.0,653.0,954.0,916.0,1385.0,507.0,691.0,990.0,969.0


In [175]:
# the problem is that the input vector needs to be 15 or does it not?
# it makes sense that the longer you stay the less sparse the matrix is going to be -- resulting in better accracy
# the input vector, thus, does not need to be of length 15
pos_5.append(pos_1).reset_index(drop=True)

Unnamed: 0,anchor_1_tag_1_1,anchor_2_tag_1_1,anchor_3_tag_1_1,anchor_4_tag_1_1,anchor_5_tag_1_1,anchor_1_tag_2_1,anchor_2_tag_2_1,anchor_3_tag_2_1,anchor_4_tag_2_1,anchor_5_tag_2_1,...,anchor_1_tag_2_798,anchor_2_tag_2_798,anchor_3_tag_2_798,anchor_4_tag_2_798,anchor_5_tag_2_798,anchor_1_tag_3_798,anchor_2_tag_3_798,anchor_3_tag_3_798,anchor_4_tag_3_798,anchor_5_tag_3_798
0,1376.0,483.0,716.0,0.0,905.0,1357.0,473.0,652.0,956.0,918.0,...,,,,,,,,,,
1,204.0,871.0,1072.0,375.0,820.0,209.0,871.0,1082.0,468.0,820.0,...,210.0,881.0,1078.0,424.0,827.0,185.0,875.0,1118.0,403.0,816.0


In [89]:
for i in X_train["position"].unique():
    print(i, X_train.query("position == {}".format(i)).isnull().any(axis=1).sum())

2.0 0
3.0 0
4.0 0
5.0 0
6.0 0
7.0 0
8.0 0
9.0 0
10.0 0
11.0 0
12.0 0


In [14]:
X_train = data.drop(["position", "x", "y"], axis=1)
X_train

Unnamed: 0,anchor_1_tag_1,anchor_2_tag_1,anchor_3_tag_1,anchor_4_tag_1,anchor_5_tag_1,anchor_1_tag_2,anchor_2_tag_2,anchor_3_tag_2,anchor_4_tag_2,anchor_5_tag_2,anchor_1_tag_3,anchor_2_tag_3,anchor_3_tag_3,anchor_4_tag_3,anchor_5_tag_3
0,204.0,871.0,1072.0,375.0,820.0,209.0,871.0,1082.0,468.0,820.0,189.0,882.0,1117.0,401.0,817.0
1,205.0,872.0,1072.0,375.0,824.0,214.0,875.0,1082.0,401.0,827.0,188.0,880.0,1116.0,398.0,817.0
2,206.0,870.0,1074.0,374.0,822.0,208.0,874.0,1078.0,417.0,819.0,187.0,885.0,1121.0,396.0,817.0
3,203.0,872.0,1069.0,373.0,821.0,211.0,873.0,1082.0,421.0,821.0,190.0,890.0,1121.0,396.0,815.0
4,199.0,870.0,1070.0,371.0,829.0,207.0,872.0,1082.0,433.0,825.0,188.0,880.0,1120.0,399.0,815.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9395,333.0,622.0,801.0,183.0,585.0,333.0,599.0,812.0,164.0,606.0,325.0,637.0,840.0,239.0,622.0
9396,333.0,617.0,802.0,184.0,584.0,331.0,602.0,811.0,159.0,601.0,325.0,634.0,836.0,242.0,625.0
9397,330.0,621.0,807.0,182.0,581.0,332.0,601.0,811.0,158.0,602.0,323.0,636.0,837.0,243.0,626.0
9398,329.0,618.0,802.0,182.0,583.0,333.0,600.0,810.0,155.0,598.0,325.0,637.0,838.0,241.0,625.0


In [None]:
# Normalization
min_max_scaler = MinMaxScaler()
min_max_scaled_features = min_max_scaler.fit_transform(X_train)

In [None]:
X_train = pd.DataFrame(
    min_max_scaled_features, index=X_train.index, columns=X_train.columns
)
X_train = position.join(X_train)
X_train

In [None]:
for i in X_train["position"].unique():
    X_train.loc[X_train["position"] == i] = X_train.query(
        "position == {}".format(i)
    ).fillna(X_train.query("position == {}".format(i)).mean())

In [None]:
for i in X_train["position"].unique():
    print(i, X_train.query("position == {}".format(i)).isnull().any(axis=1).sum())

In [None]:
# Final X_train
X_train = X_train.drop(["position"], axis=1)
X_train

In [None]:
y_train = data[["x", "y"]].copy()
y_train

### Train the models

In [None]:
row = [
    0.004216,
    0.837327,
    0.920259,
    0.239286,
    0.777234,
    0.004296,
    0.839066,
    0.917794,
    0.351052,
    0.854139,
    0.004979,
    0.816065,
    0.956656,
    0.213820,
    0.802043,
]

In [None]:
model = LinearSVR()
wrapper = MultiOutputRegressor(model)
wrapper.fit(X_train, y_train)

yhat = wrapper.predict([row])

print("Predicted:", yhat[0].round())

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)

yhat = model.predict([row])

print("Predicted:", yhat[0].round())

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2).fit(X_train)
pca_2d = pca.transform(X_train)

import pylab as pl
for i in range(0, pca_2d.shape[0]):
    if y_train[i] == 1:
        c1 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='r',marker='+')
    elif y_train[i] == 2:
        c2 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='g',marker='o')
    elif y_train[i] == 3:
        c3 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='b',marker='*')

pl.legend([c1, c2, c3], ['Position 1', 'Position 2', 'Position 3'])
pl.title('Dataset with 3 clusters and known outcomes')
pl.show()