In [14]:
from typing import Optional, Tuple

import numpy as np
import plotly.express as ple

from hypex.dataset import Dataset, DefaultRole
from hypex.extensions.scipy_linalg import CholeskyExtension, InverseExtension

# Functions

In [21]:
def generate_data(
    size: int = 1000,
    x_interval: Tuple[float, float] = (-5, 5),
    y_interval: Tuple[float, float] = (-7, 7),
    x_scale: float = 5,
    y_scale: float = 3,
    rs: Optional[int] = None,
    dotA: Tuple[int, int] = (0, 0),
    dotB: Tuple[int, int] = (0, 5),
    dotC: Tuple[int, int] = (5, 0),
):
    """Build a noisy 2-D point cloud followed by three labelled marker points.

    Each coordinate is an evenly spaced ramp over its interval (``np.linspace``)
    plus normal noise with the given scale. The cloud rows get an empty
    ``mark``; the markers get ``'A'``, ``'B'`` and ``'C'``.

    Args:
        size: Number of cloud points (markers are added on top of this).
        x_interval: ``(start, stop)`` of the x ramp.
        y_interval: ``(start, stop)`` of the y ramp.
        x_scale: Standard deviation of the noise added to x.
        y_scale: Standard deviation of the noise added to y.
        rs: Seed passed to ``np.random.seed``; ``None`` leaves the global
            NumPy random state untouched. Any integer, including ``0``, seeds.
        dotA: ``(x, y)`` of marker A.
        dotB: ``(x, y)`` of marker B.
        dotC: ``(x, y)`` of marker C.

    Returns:
        The result of appending the marker Dataset to the cloud Dataset with
        ``reset_index=True`` (columns ``x``, ``y``, ``mark``).
    """
    # Compare against None: a truthiness check would silently ignore rs=0,
    # which is a perfectly valid seed.
    if rs is not None:
        np.random.seed(rs)

    # Draw x noise before y noise so a given seed keeps producing the same data.
    x_values = np.linspace(x_interval[0], x_interval[1], size) + np.random.normal(size=size, scale=x_scale)
    y_values = np.linspace(y_interval[0], y_interval[1], size) + np.random.normal(size=size, scale=y_scale)

    data = Dataset.from_dict(
        {
            'x': x_values,
            'y': y_values,
            'mark': [""] * size
        },
        roles = {}
    )
    dots = Dataset.from_dict(
        {
            'x': [dotA[0], dotB[0], dotC[0]],
            'y': [dotA[1], dotB[1], dotC[1]],
            'mark': ['A', 'B', 'C']
        },
        roles = {}
    )
    return data.append(dots, reset_index=True)

def dots_plot(data: Dataset, html_path:Optional[str]=None, title:str="Точки в L2 пространстве"):
    """Draw a scatter plot of the points, coloured and shaped by their ``mark``.

    Args:
        data: Dataset with ``x``, ``y`` and ``mark`` columns; its ``.data``
            attribute is handed to plotly as the data frame.
        html_path: If given (non-empty), the figure is also written to this
            path with ``write_html``.
        title: Figure title. The default keeps the original hard-coded text
            (Russian for "Points in L2 space"); pass another value when the
            plotted data lives in a different space, e.g. after a transformation.

    Returns:
        The plotly figure, so that a notebook cell ending with this call renders it.
    """
    p = ple.scatter(
        data_frame=data.data,
        x='x',
        y='y',
        color='mark',
        symbol='mark',
        # Colours are assigned to marks in order of appearance: cloud first, then A, B, C.
        color_discrete_sequence=["lightgray", "red", "green", "blue"],
        title=title,
    )
    p.update_traces(
        marker_size=10,
        marker_line=dict(width=0.5, color='DarkSlateGrey'),
        selector=dict(mode='markers')
    )
    if html_path:
        p.write_html(html_path)
    return p

def machalanobis_transform(data: Dataset):
    """Transform the ``x``/``y`` columns with the inverse Cholesky factor of their covariance.

    Steps, as performed below:
      1. take the covariance of the ``x`` and ``y`` columns;
      2. pass it through ``CholeskyExtension`` and then ``InverseExtension``
         (presumably Cholesky factorisation and matrix inversion — confirm
         against the extension implementations);
      3. multiply the coordinates by the transpose of that result;
      4. re-attach the original ``mark`` column and rename the two result
         columns from ``0``/``1`` back to ``x``/``y``.

    Args:
        data: Dataset with ``x``, ``y`` and ``mark`` columns.

    Returns:
        A Dataset with transformed ``x``, ``y`` and the unchanged ``mark``.
    """
    coords = data[["x", "y"]]
    factor = CholeskyExtension().calc(coords.cov())
    inverse_factor = InverseExtension().calc(factor)
    projected = coords.dot(inverse_factor.transpose())
    with_marks = projected.add_column(
        data.get_values(column="mark"),
        role={"mark": DefaultRole()},
    )
    # The dot product yields positional column names 0 and 1.
    return with_marks.rename({0: "x", 1: "y"})

def calc_dots_distances(data: Dataset, print_result=True):
    """Compute the Euclidean distances A-B and A-C between the marker points.

    The markers are read from the last three rows of ``data`` (third-from-last
    is treated as A, then B, then C). The last value of each row is dropped
    before the distance is computed — in this notebook that is the ``mark`` label.

    Args:
        data: Dataset whose final three rows are the markers.
        print_result: When true, print both distances.

    Returns:
        ``{"AB": <distance>, "AC": <distance>}``.
    """
    row_count = len(data)

    def _point(rows_from_end):
        # Strip the trailing element so only the coordinates remain.
        return np.array(data.get_values(row_count - rows_from_end)[:-1])

    point_a = _point(3)
    point_b = _point(2)
    point_c = _point(1)

    result = {
        "AB": np.linalg.norm(point_a - point_b),
        "AC": np.linalg.norm(point_a - point_c),
    }
    if print_result:
        print(f"Distance between A and B:\n{result['AB']}")
        print(f"Distance between A and C:\n{result['AC']}")
    return result

# Generate data

The points are evenly spaced along each axis (`np.linspace`) with additional normal noise. To demonstrate the effect of the Mahalanobis transformation, the space must contain correlation: here x and y both grow with the point index, so they are correlated, and the two coordinates are given different value ranges. It is also useful to set different noise scales for the two axes for clarity.  

Marker points are added to the main data array in order to trace the transformation of space using them and see how their relative position changes.

In this example, the default generation parameters are used, but you can change them.

In [19]:
# Default generation parameters, but with a fixed seed so that
# Restart & Run All reproduces the same points and distances.
data = generate_data(rs=42)
data

Unnamed: 0,x,y,mark
0,3.452629,-4.345336,
1,-7.319677,-9.713341,
2,-4.815879,-3.566020,
3,-2.932389,-6.697404,
4,-8.904575,-4.688942,
...,...,...,...
998,1.847381,6.215224,
999,1.612804,6.368563,
1000,0.000000,0.000000,A
1001,0.000000,5.000000,B


# Mahalanobis transformation

Using the Mahalanobis distance, it is possible to determine the similarity of an unknown and a known sample. It differs from the Euclidean distance in that it takes into account correlations between variables and is invariant to scale.

In [20]:
# Apply the transformation to the generated points; the bare last line renders the result.
macha_data = machalanobis_transform(data=data)
macha_data

Unnamed: 0,x,y,mark
0,0.604780,-1.234643,
1,-1.282153,-1.557487,
2,-0.843574,-0.401592,
3,-0.513652,-1.242973,
4,-1.559772,-0.322383,
...,...,...,...
998,0.323597,1.223360,
999,0.282507,1.275943,
1000,0.000000,0.000000,A
1001,0.000000,1.102996,B


# Plots

On the graphs, you can see how the relative position of the points and markers has changed after the transformation. If you pass `html_path` to `dots_plot`, the figure is also saved to that HTML file.

In [22]:
# Scatter plot of the original (untransformed) points and markers.
dots_plot(data=data)

In [23]:
# Same plotting helper, now on the transformed coordinates.
dots_plot(data=macha_data)

# Marker distances

Based on the displacement of the markers, it is possible to assess how the Mahalanobis transformation affected the relative distances between the markers.

In [25]:
# Marker distances in the original space (header typo "Distanceses" fixed).
print("Distances in L2")
d = calc_dots_distances(data, True)

Distanceses in L2
Distance between A and B:
5.0
Distance between A and C:
5.0


In [26]:
# Marker distances after the transformation (header typos fixed).
print("Distances in Mahalanobis")
d = calc_dots_distances(macha_data, True)

Distanceses in Machalanobis
Distance between A and B:
1.1029956192338142
Distance between A and C:
0.9627585035194691
