In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
# Print the full path of every file below the Kaggle input directory so the
# dataset location can be read off the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-<name>'
# and later releases dropped the old names. Pick whichever spelling this
# installation ships; fall back to the legacy name so that an installation
# offering neither fails exactly as before.
talk_style_names = ('seaborn-v0_8-talk', 'seaborn-talk')
plt.style.use(next(
    (name for name in talk_style_names if name in plt.style.available),
    'seaborn-talk',
))

In [None]:
# Load the electric-motor recordings from the Kaggle input mount into a frame.
dataset = pd.read_csv(
    filepath_or_buffer='/kaggle/input/system-identification-of-an-electric-motor/Dataset_Electric_Motor.csv',
)
# Last expression of the cell: shows (rows, columns).
dataset.shape

In [None]:
# Show the first five rows to get a feel for the columns.
dataset.head(n=5)

# Distribution

According to [this introductory paper](https://arxiv.org/pdf/2003.07273.pdf), *id_k1* and *iq_k1* are to be treated as target features.

At the same time, they depend on elementary vectors label-encoded into integers.

Let's analyze how they are distributed.

In [None]:
# Number of distinct elementary-vector labels found in the 'n_k' column.
unique_elem_vecs = dataset['n_k'].nunique()

# Side-by-side bar charts of how often each label occurs in 'n_k' and 'n_1k';
# both panels share their axes so the bar heights are directly comparable.
fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True)
for column, axis in zip(('n_k', 'n_1k'), axes.ravel()):
    sns.countplot(x=column, data=dataset, palette="ch:.25", ax=axis)

The elementary vector labeled 1 appears significantly more often than the remaining elementary vectors.

In [None]:
# Describe each row's elementary-vector transition as the text "<n_k>-><n_1k>".
from_label = dataset['n_k'].astype(str)
to_label = dataset['n_1k'].astype(str)
pairs = (from_label + '->' + to_label).rename('pairs')
pairs.head()

In [None]:
# Tally every distinct "<n_k>-><n_1k>" string, most frequent first.
print('Transition between elementary vectors count')
pairs.value_counts(sort=True, ascending=False)

In [None]:
# Keep every 1000th row only, so the plots below stay cheap to draw.
reduced_data = dataset.iloc[::1000, :]
# Every column except the grouping key gets its own subplot column.
analyzed_cols = [name for name in dataset if name != 'n_k']

# One subplot row per elementary-vector label, one column per analyzed feature.
# `unique_elem_vecs` must already be bound by an earlier cell.
fig, axes = plt.subplots(
    nrows=unique_elem_vecs,
    ncols=len(analyzed_cols),
    sharex='col',
    figsize=(20, 20),
)

for label, group in reduced_data.groupby('n_k'):
    # Row index is label - 1, i.e. labels are assumed to run 1..unique_elem_vecs.
    row_axes = axes[label - 1]
    for col_idx, name in enumerate(analyzed_cols):
        # NOTE(review): recent seaborn releases warn that distplot is
        # deprecated - confirm against the installed version before upgrading.
        sns.distplot(group[name], ax=row_axes[col_idx])
    # Tag the left-most panel of the row with the label it belongs to.
    row_axes[0].set_ylabel(f'n_k = {label}')
plt.tight_layout()

It becomes evident that certain transitions in the elementary vectors are more common than others.

Moreover, depending on the current elementary vector, the distributions of the currents and of the rotor angle *epsilon_k* are either unimodal or bimodal.

More subtly, we recognize a semicircular shape in the 2D histogram between the currents (remember, the *d* and *q* currents are to be plotted perpendicular to each other).

It might be beneficial to add another feature denoting the current vector norm $\sqrt{i_d^2 + i_q^2}$.

On another note, *epsilon_k* is the rotor angle, which by design jumps discontinuously between its extreme values whenever it wraps around over time.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the rotor angle.
reduced_data.loc[:, 'epsilon_k'].describe()

Obviously, the value range is clipped to $[-\pi, \pi]$.

As many ML methods do not respond well to discontinuities in the input space with no corresponding effect on the target space, we replace epsilon by its sine and cosine.

# Feature Engineering

We add sine and cosine of the rotor angle and the current vector norm.

In [None]:
# Swap the raw rotor angle for its sine and cosine and append the Euclidean
# norm of the (id_k, iq_k) current vector; the raw angle column is dropped.
dataset = (
    dataset
    .assign(**{
        'sin_eps_k': np.sin(dataset['epsilon_k']),
        'cos_eps_k': np.cos(dataset['epsilon_k']),
        'i_norm': np.sqrt(dataset['id_k']**2 + dataset['iq_k']**2),
    })
    .drop(columns='epsilon_k')
)
dataset.head()

# Correlation Matrix

In [None]:
# Pairwise correlation between all columns of the subsampled frame.
# NOTE(review): `reduced_data` is whatever an earlier cell bound; if it was
# sampled before any feature engineering, engineered columns are not part of
# this matrix - confirm that this is intended.
corr = reduced_data.corr()
# Generate a mask for the upper triangle (diagonal included) so that every
# pair is drawn only once. The builtin `bool` is used because the `np.bool`
# alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Generate a custom diverging colormap
cmap = sns.diverging_palette(250, 15, s=75, l=40,n=9, center="dark", as_cmap=True)

plt.figure(figsize=(14,14))
# Assign the return value so the notebook does not echo the Axes repr.
_ = sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

We observe a strong linear correlation between consecutive current measurements in each of the *d* and *q* coordinates.

All other pair-wise comparisons are relatively uncorrelated.

# Linear Regression

We kick off regression with a linear model, as the correlation matrix suggests that the present currents alone already allow a decent estimate.
Since elementary vectors are to be treated as categorical, we one-hot encode them before training.

Moreover, in order to fit in RAM, we subsample the data.

In [None]:
# Keep every 100th row only, to bound memory use.
subsample = dataset.iloc[::100, :]

# One-hot encode both elementary-vector columns over the labels 1..7.
# The indicator columns are computed eagerly here. The previous version built
# them from lambdas inside a comprehension; those lambdas looked up `i` only
# when `assign` called them, i.e. after the loop had finished, so every
# generated column ended up as the indicator for label 7.
one_hot = {
    f'{col}_{i}': (subsample[col] == i).astype(int)
    for col in ('n_k', 'n_1k')
    for i in range(1, 8)
}
df = subsample.assign(**one_hot).drop(['n_k', 'n_1k'], axis=1)

target_cols = ['id_k1', 'iq_k1']
input_cols = [c for c in df if c not in target_cols]
# Shuffled K-fold splitter with a fixed seed for reproducible folds.
cv = KFold(shuffle=True, random_state=2020)

# Scaler fitted on the unscaled targets only; kept so that predictions can be
# mapped back to the original target scale later on.
ss_y = StandardScaler().fit(df[target_cols])
# Standardize every column (inputs and targets) using statistics of the whole
# subsample, i.e. before any train/test split.
df = pd.DataFrame(StandardScaler().fit_transform(df),
                     columns=df.columns)  # actually methodically unsound, but data is large enough

In [None]:

# Feature and target matrices taken from the standardized frame.
X = df[input_cols].values
y = df[target_cols].values

# Fit one ordinary-least-squares model per fold and collect its test error.
fold_mse = []
for fit_rows, eval_rows in cv.split(X, y):
    ols = LinearRegression()
    ols.fit(X[fit_rows], y[fit_rows])
    # Map predictions and ground truth back through the target scaler before
    # scoring, so the error is expressed on the unscaled target values.
    pred = ss_y.inverse_transform(ols.predict(X[eval_rows]))
    gtruth = ss_y.inverse_transform(y[eval_rows])
    fold_mse.append(mean_squared_error(pred, gtruth))
scores = np.asarray(fold_mse)

# Report mean +- two standard deviations plus the best and worst fold.
print('MSE:')
print(f'Scores Mean: {scores.mean():.4f} A² +- {2*scores.std():.4f} A²\nScores Min: {scores.min():.4f} A², Scores Max: {scores.max():.4f} A²')

This is a rather weak estimation.
Can you beat this score?