## Noise Suppression using Multivariate Regression

### Import the dependencies

In [1]:
%load_ext lab_black

# Common Imports
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

# Assessing performance
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold

# Classifiers
from sklearn.neighbors import KNeighborsClassifier

# Regressors
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Direct multi-output regression (one regressor fitted per target)
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Enlarge the axis-label and tick-label fonts for every figure in one call.
mpl.rcParams.update(
    {
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 12,
    }
)

# Ignore useless warnings (see SciPy issue #5998)
import warnings

# Filters are registered in the same order as before. Note that the blanket
# "ignore" on the second line already silences every warning category, so the
# more specific filters around it are effectively redundant.
warnings.filterwarnings("ignore", message="^internal gelsd")
warnings.simplefilter("ignore")
warnings.simplefilter("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore", category=FutureWarning)

# Seed NumPy's global random generator so repeated runs produce the same output
np.random.seed(42)

### Preprocess the data

In [2]:
# Load the position log, keep only the rows whose marker is even, and rebuild
# the index so the remaining rows are numbered consecutively from 0.
# (The original chain was missing the closing parenthesis of reset_index,
# which left the whole expression unterminated.)
data = (
    pd.read_csv("position_logs.csv")
    .query("marker % 2 == 0 ")
    .reset_index(drop=True)
)

# Optional remap of the surviving marker values to consecutive labels 1..n.
# NOTE(review): later cells query "position == 1/2/3"; with this line disabled
# the labels stay as the raw (even) marker values - confirm which is intended.
# data.marker = data.marker.map({j: i + 1 for i, j in enumerate(data.marker.unique())})

# Give the columns descriptive names: the marker becomes "position", followed
# by the tag identifier and one reading per anchor.
data.columns = [
    "position",
    "tag",
    "anchor_1",
    "anchor_2",
    "anchor_3",
    "anchor_4",
    "anchor_5",
]

data

Unnamed: 0,position,tag,anchor_1,anchor_2,anchor_3,anchor_4,anchor_5
0,0,1,940,185.0,453.0,515.0,560.0
1,0,2,855,185.0,505.0,484.0,581.0
2,0,3,939,191.0,483.0,527.0,587.0
3,0,1,939,186.0,454.0,510.0,555.0
4,0,2,852,186.0,503.0,487.0,583.0
...,...,...,...,...,...,...,...
6136,4,2,861,526.0,383.0,629.0,190.0
6137,4,3,843,581.0,392.0,688.0,212.0
6138,4,1,802,600.0,404.0,636.0,183.0
6139,4,2,832,526.0,377.0,625.0,189.0


In [None]:
# One boolean mask per tag; each mask selects that tag's rows from `data`.
is_tag_1 = data["tag"] == 1
is_tag_2 = data["tag"] == 2
is_tag_3 = data["tag"] == 3

# The position label is taken from the tag-1 rows only.
position = data.loc[is_tag_1, ["position"]].reset_index(drop=True)

# Per-tag anchor readings, without the label columns and re-indexed from 0 so
# the three frames can later be aligned row by row.
data_tag_1 = (
    data.loc[is_tag_1].drop(columns=["position", "tag"]).reset_index(drop=True)
)

data_tag_2 = (
    data.loc[is_tag_2].drop(columns=["position", "tag"]).reset_index(drop=True)
)

data_tag_3 = (
    data.loc[is_tag_3].drop(columns=["position", "tag"]).reset_index(drop=True)
)

In [None]:
# Append the tag number to every column name so the three frames keep distinct
# column names once they are joined side by side.
data_tag_1.columns = [f"{name}_tag_1" for name in data_tag_1.columns]
data_tag_2.columns = [f"{name}_tag_2" for name in data_tag_2.columns]
data_tag_3.columns = [f"{name}_tag_3" for name in data_tag_3.columns]

In [None]:
# Put the readings of the three tags next to the position label, matching rows
# by index (left join on the rows of `position`).
tag_frames = [data_tag_1, data_tag_2, data_tag_3]
data = position.join(tag_frames)
data

In [None]:
# Pair each distinct position label (in order of first appearance) with a
# hard-coded (x, y) coordinate.
pos_uniq = data["position"].unique()
pos_coord = [(1597, 1958), (766, 1690), (530, 2040)]

pos_to_coord = dict(zip(pos_uniq, pos_coord))

# Write the coordinate of each position into the "x" and "y" columns of the
# rows that carry that label.
for pos, (x, y) in pos_to_coord.items():
    at_pos = data["position"] == pos
    data.loc[at_pos, "x"] = x
    data.loc[at_pos, "y"] = y

data

In [None]:
# Number of readings recorded at each position
data.loc[:, "position"].value_counts()

In [None]:
# One 5-bin histogram per column, drawn on a 15x15 figure.
data.hist(figsize=(15, 15), bins=5)
plt.show()

In [None]:
# Features are everything except the position label and the target coordinates.
X_train = data.drop(columns=["position", "x", "y"])

In [None]:
# Normalization: learn each column's min/max and rescale it in a single step.
# The fitted scaler is kept so it can be reused on new samples.
min_max_scaler = MinMaxScaler()
min_max_scaled_features = min_max_scaler.fit_transform(X_train)

In [None]:
# Wrap the scaled array back into a DataFrame that keeps the original index
# and column names.
scaled_frame = pd.DataFrame(
    min_max_scaled_features, index=X_train.index, columns=X_train.columns
)
# Re-attach the position label so rows can be selected per position below.
X_train = position.join(scaled_frame)
X_train

In [None]:
# Rows labelled position 1, before imputation
X_train[X_train["position"] == 1]

In [None]:
# Replace each missing reading with the mean of the same column over the rows
# that share its position label.
#
# The original statement handled only rows with position == 1. The loading
# step keeps even marker values and the remap to 1..n is commented out, so
# that label may not exist at all, which would leave the missing values of the
# actual positions untouched. Grouping by "position" covers every label that
# is present, whatever the labels are.
#
# NOTE(review): a column that is entirely missing within one position has no
# mean, so its NaNs remain - same as with the original approach.
position_means = X_train.groupby("position").transform("mean")
X_train = X_train.fillna(position_means)

In [None]:
# Rows labelled position 1, after imputation
X_train[X_train["position"] == 1]

In [None]:
# Number of position-1 rows that still contain at least one missing value
X_train[X_train["position"] == 1].isna().any(axis="columns").sum()

In [None]:
# Per-row flag: does this position-2 row contain any missing value?
# (Author's observation: every row at the 2nd position has a null, anchor 5.)
X_train[X_train["position"] == 2].isna().any(axis="columns")

In [None]:
# Rows labelled position 2, before imputation
X_train[X_train["position"] == 2]

In [None]:
# Fill the missing readings of the position-2 rows with the column means
# computed over those same rows, then write the rows back in place.
at_position_2 = X_train["position"] == 2
rows_at_2 = X_train.loc[at_position_2]
X_train.loc[at_position_2] = rows_at_2.fillna(rows_at_2.mean())

In [None]:
# Rows labelled position 2, after imputation
X_train[X_train["position"] == 2]

In [None]:
# Number of position-2 rows that still contain at least one missing value
X_train[X_train["position"] == 2].isna().any(axis="columns").sum()

In [None]:
# Rows labelled position 3, before imputation
X_train[X_train["position"] == 3]

In [None]:
# Fill the missing readings of the position-3 rows with the column means
# computed over those same rows, then write the rows back in place.
at_position_3 = X_train["position"] == 3
rows_at_3 = X_train.loc[at_position_3]
X_train.loc[at_position_3] = rows_at_3.fillna(rows_at_3.mean())

In [None]:
# Rows labelled position 3, after imputation
X_train[X_train["position"] == 3]

In [None]:
# Number of position-3 rows that still contain at least one missing value
X_train[X_train["position"] == 3].isna().any(axis="columns").sum()

In [None]:
# Count the rows, across all positions, that still hold at least one null
X_train.isna().any(axis="columns").sum()

In [None]:
# Display the current feature table (position label plus scaled readings)
X_train

In [None]:
# Final X_train: drop the position label so only the scaled readings remain
X_train = X_train.drop(columns=["position"])
X_train

In [None]:
# Targets: an independent copy of the (x, y) coordinate columns
y_train = data.loc[:, ["x", "y"]].copy()
y_train

### Train the models

In [None]:
# Fit a linear regression that predicts the (x, y) coordinates jointly.
model = LinearRegression()
model.fit(X_train, y_train)

# One probe sample of readings (assumed to follow X_train's column order).
row = [561, 387, 623, 192, 489, 556, 349, 598, 212, 458, 499, 392, 639, 269, 461]

# The model was trained on min-max-scaled features, so the probe has to go
# through the same fitted scaler; feeding it in raw units gives a prediction
# far outside the training range.
row_scaled = pd.DataFrame(
    min_max_scaler.transform(pd.DataFrame([row], columns=X_train.columns)),
    columns=X_train.columns,
)
yhat = model.predict(row_scaled)

print(f"Predicted: {yhat[0]}")

In [None]:
# Fit a k-nearest-neighbours regressor (default settings) on the (x, y) targets.
model = KNeighborsRegressor()
model.fit(X_train, y_train)

# One probe sample of readings (assumed to follow X_train's column order).
row = [561, 387, 623, 192, 489, 556, 349, 598, 212, 458, 499, 392, 639, 269, 461]

# Distances are computed in the scaled feature space the model was trained on,
# so the probe must be scaled with the same fitted scaler before prediction.
row_scaled = pd.DataFrame(
    min_max_scaler.transform(pd.DataFrame([row], columns=X_train.columns)),
    columns=X_train.columns,
)
yhat = model.predict(row_scaled)

print(f"Predicted: {yhat[0]}")

In [None]:
# Fit a decision-tree regressor (default settings) on the (x, y) targets.
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# One probe sample of readings (assumed to follow X_train's column order).
row = [561, 387, 623, 192, 489, 556, 349, 598, 212, 458, 499, 392, 639, 269, 461]

# The tree's split thresholds were learned on min-max-scaled features, so the
# probe must be scaled with the same fitted scaler before prediction.
row_scaled = pd.DataFrame(
    min_max_scaler.transform(pd.DataFrame([row], columns=X_train.columns)),
    columns=X_train.columns,
)
yhat = model.predict(row_scaled)

print(f"Predicted: {yhat[0]}")

In [None]:
# LinearSVR is wrapped in MultiOutputRegressor so that both target columns
# (x and y) can be fitted with it.
model = LinearSVR()
wrapper = MultiOutputRegressor(model)
wrapper.fit(X_train, y_train)

# One probe sample of readings (assumed to follow X_train's column order).
row = [561, 387, 623, 192, 489, 556, 349, 598, 212, 458, 499, 392, 639, 269, 461]

# The regressors were trained on min-max-scaled features, so the probe must be
# scaled with the same fitted scaler before prediction.
row_scaled = pd.DataFrame(
    min_max_scaler.transform(pd.DataFrame([row], columns=X_train.columns)),
    columns=X_train.columns,
)
yhat = wrapper.predict(row_scaled)

print(f"Predicted: {yhat[0]}")

In [None]:
# from sklearn.decomposition import PCA

# pca = PCA(n_components=2).fit(X_train)
# pca_2d = pca.transform(X_train)

# import pylab as pl
# for i in range(0, pca_2d.shape[0]):
#     if y_train[i] == 1:
#         c1 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='r',marker='+')
#     elif y_train[i] == 2:
#         c2 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='g',marker='o')
#     elif y_train[i] == 3:
#         c3 = pl.scatter(pca_2d[i,0],pca_2d[i,1],c='b',marker='*')

# pl.legend([c1, c2, c3], ['Position 1', 'Position 2', 'Position 3'])
# pl.title('Dataset with 3 clusters and known outcomes')
# pl.show()

### Tune the hyperparameters

In [None]:
# Cross-validation scheme for the grid search: 10 folds, repeated 3 times,
# with a fixed random_state so the splits are reproducible.
# NOTE(review): stratified splitting needs a single column of class labels as
# the target - confirm the target passed to the search satisfies this.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

In [None]:
# Hyperparameter candidates for the k-nearest-neighbours classifier:
# odd neighbour counts from 1 to 19, both weighting schemes, three metrics.
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

grid = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)

grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
)

# Stratified folds and accuracy scoring both need one class label per sample.
# The original call passed the two-column (x, y) frame, which these components
# reject. Each (x, y) pair was assigned from the row's position label, so that
# label is used as the classification target instead.
grid_search.fit(X_train, data["position"])

print("Parameters: %s with an accuracy of %f" % (grid_search.best_params_, grid_search.best_score_))

In [None]:
# Keep the estimator refitted with the best parameter combination found by
# the grid search.
knn_clf = grid_search.best_estimator_