In [1]:
import matplotlib.pyplot as plt
import numpy as np

import glob

In [2]:
def mean(X):
    """Return the arithmetic mean of ``X`` along its first axis.

    Parameters
    ----------
    X : array_like
        Samples. A 1-D input yields a scalar; for an N-D input the mean is
        taken over axis 0 (one value per trailing position).

    Returns
    -------
    numpy scalar or numpy.ndarray
        ``sum over axis 0 / X.shape[0]``.

    Raises
    ------
    ZeroDivisionError
        If ``X`` has no entries along its first axis.
    """
    X = np.asarray(X)
    count = X.shape[0]
    if count == 0:
        # Keep the exception type that the plain ``0 / 0`` used to raise.
        raise ZeroDivisionError("mean of an empty array is undefined")
    # np.sum reduces in C; the builtin sum() walks the array element by
    # element in Python.
    return np.sum(X, axis=0) / count

# NOTE: np.var(X) defaults to the biased estimator (divides by N);
# this function divides by N - 1 instead.
def variance(X):
    """Return the unbiased sample variance of ``X``.

    The squared deviations from ``mean(X)`` are summed and scaled by
    ``1 / (N - 1)``, where ``N = X.shape[0]``.

    Raises
    ------
    ZeroDivisionError
        If ``X`` holds exactly one sample (``N - 1 == 0``).
    """
    centre = mean(X)
    squared_deviations = (X - centre) ** 2
    bessel_scale = 1 / (X.shape[0] - 1)
    return bessel_scale * sum(squared_deviations)

def sd(X):
    """Return the sample standard deviation of ``X``.

    This is the square root of ``variance(X)``, i.e. it uses the N - 1
    denominator.
    """
    sample_variance = variance(X)
    return np.sqrt(sample_variance)

def normalization_factor(X, Y):
    """Return the overall spread of the point set around its centroid.

    Computes ``sqrt(sum((X - mean(X))**2 + (Y - mean(Y))**2))``: the square
    root of the summed squared distances of every point from the point
    ``(mean(X), mean(Y))``.
    """
    dx = X - mean(X)
    dy = Y - mean(Y)
    total_squared_spread = sum(dx ** 2 + dy ** 2)
    return np.sqrt(total_squared_spread)

### All of the feature extractors below are meant to be applied to size-normalized X and Y coordinates

# Feature 1
def size_normalization(X):
    """Min-max scale ``X`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    X : array_like
        Coordinate values.

    Returns
    -------
    numpy.ndarray
        ``(X - min) / (max - min)``. When every value is identical (zero
        extent) the result is all zeros instead of the NaNs a 0/0 division
        would produce.

    Raises
    ------
    ValueError
        If ``X`` is empty (there is no minimum or maximum).
    """
    X = np.asarray(X)
    lowest = X.min()
    extent = X.max() - lowest
    # A constant coordinate (e.g. a perfectly vertical or horizontal stroke)
    # has zero extent; dividing the all-zero numerator by 1 yields zeros.
    divisor = extent if extent != 0 else 1
    return (X - lowest) / divisor

# Feature 2
# NOTE(review): the original author left open whether X and Y should be
# size-normalized before this feature is computed; build_features passes
# size-normalized coordinates.

def deviation_feature_1(X, Y):
    """Return the mean-centred coordinates scaled by a standard deviation.

    Returns
    -------
    tuple
        ``((X - mean(X)) / sd(Y), (Y - mean(Y)) / sd(X))``.
    """
    x_centred = X - mean(X)
    y_centred = Y - mean(Y)
    # NOTE(review): each axis is divided by the *other* axis' standard
    # deviation. The original code questioned whether X should use sd(X);
    # behaviour is kept as written -- confirm against the feature definition.
    return (x_centred / sd(Y), y_centred / sd(X))

# Feature 3
def deviation_feature_2(X, Y):
    """Return the mean-centred coordinates scaled by the joint spread.

    Both axes are divided by the same ``normalization_factor(X, Y)``.

    Returns
    -------
    tuple
        ``((X - mean(X)) / f, (Y - mean(Y)) / f)`` with
        ``f = normalization_factor(X, Y)``.
    """
    # The factor is identical for both axes, so compute it only once.
    factor = normalization_factor(X, Y)
    v1 = (X - mean(X)) / factor
    v2 = (Y - mean(Y)) / factor
    return (v1, v2)

# Feature 4
def zero_mean_feature(X, Y):
    """Return ``(X - mean(X), Y - mean(Y))``: each axis shifted to zero mean."""
    x_mean, y_mean = mean(X), mean(Y)
    return (X - x_mean, Y - y_mean)

# Feature 5
def distance_from_origin(X, Y):
    """Return the Euclidean distance ``sqrt(X**2 + Y**2)`` of each point from (0, 0)."""
    squared_radius = X ** 2 + Y ** 2
    return np.sqrt(squared_radius)

# Feature 6
def direction_with_horizontal_axis(X, Y):
    """Return ``arccos(X / R)`` for each point, with ``R = distance_from_origin(X, Y)``.

    ``np.arccos`` yields angles in [0, pi], so the sign of Y is not
    reflected in the result. For array inputs a point at the origin
    (``R == 0``) divides 0 by 0 and comes out as NaN.
    """
    radius = distance_from_origin(X, Y)
    cosine = X / radius
    return np.arccos(cosine)

# Feature 8
def distance_from_next_point(X, Y):
    """Return the change in radial distance between consecutive points.

    With ``R = distance_from_origin(X, Y)`` the result is
    ``[R[0], R[1] - R[0], R[2] - R[1], ...]``: the first entry is the first
    point's own radius, every later entry is the difference to the
    previous point's radius.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``R``; empty input gives an empty array.
    """
    R = distance_from_origin(X, Y)
    # Write into a separate buffer. Aliasing R (``tmp = R``) and updating it
    # in place made every step after the first subtract an already
    # overwritten difference instead of the previous radius.
    deltas = np.empty_like(R)
    deltas[:1] = R[:1]
    deltas[1:] = R[1:] - R[:-1]
    return deltas

# Feature 9
def angle_with_next_point(X, Y):
    """Return the change in direction angle between consecutive points.

    With ``theta = direction_with_horizontal_axis(X, Y)`` the result is
    ``[theta[0], theta[1] - theta[0], theta[2] - theta[1], ...]``. The
    differences are plain subtractions (no angle wrapping).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``theta``; empty input gives an empty
        array.
    """
    theta = direction_with_horizontal_axis(X, Y)
    # Use a separate output buffer. Aliasing theta (``tmp = theta``) and
    # updating it in place corrupted every difference after the first one,
    # because the "previous" angle had already been overwritten.
    deltas = np.empty_like(theta)
    deltas[:1] = theta[:1]
    deltas[1:] = theta[1:] - theta[:-1]
    return deltas

# Feature 11
def distance_from_centroid(X, Y):
    """Return the Euclidean distance of each point from the centroid.

    The centroid is ``(mean(X), mean(Y))``; the result is
    ``sqrt((x_centroid - X)**2 + (y_centroid - Y)**2)``.
    """
    x_centroid = mean(X)
    y_centroid = mean(Y)
    # The Y term must use the Y centroid (it previously reused x_centroid).
    return np.sqrt((x_centroid - X)**2 + (y_centroid - Y)**2)

# Feature 12
def direction_with_centroid(X, Y):
    """Return ``arctan(dy / dx)`` of the offset from each point to the centroid.

    ``dx = mean(X) - X`` and ``dy = mean(Y) - Y``. Because the ratio is
    passed to ``np.arctan``, the angle lies in [-pi/2, pi/2] and opposite
    directions map to the same value. For array inputs a point whose X
    equals the centroid's X divides by zero.
    """
    dx = mean(X) - X
    dy = mean(Y) - Y
    slope = dy / dx
    return np.arctan(slope)

In [3]:
def plot_without_normalization(X, Y):
    """Scatter-plot the coordinates exactly as given and show the figure."""
    marker_style = dict(s=50, c='cyan', alpha=0.5)
    plt.scatter(X, Y, **marker_style)
    plt.show()

def plot_with_normalization(X, Y):
    """Scatter-plot the coordinates after size_normalization and show the figure."""
    x_scaled = size_normalization(X)
    y_scaled = size_normalization(Y)
    plt.scatter(x_scaled, y_scaled, s=50, c='cyan', alpha=0.5)
    plt.show()

In [4]:
def build_features(points):
    """Build the per-point feature matrix for one sequence of points.

    Column 0 of ``points`` is read as X and column 1 as Y. Both are scaled
    with ``size_normalization`` and every feature is computed from the
    scaled coordinates.

    Output columns:
        0, 1    size-normalized X, Y
        2, 3    deviation_feature_1
        4, 5    deviation_feature_2
        6, 7    zero_mean_feature
        8       distance_from_origin
        9       direction_with_horizontal_axis
        10      distance_from_next_point
        11      angle_with_next_point
        12      distance_from_centroid
        13      direction_with_centroid

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number of points, 14)``.
    """
    raw_x = points[:, 0]
    raw_y = points[:, 1]
    data_point = np.zeros((raw_x.shape[0], 14))

    x_scaled = size_normalization(raw_x)
    y_scaled = size_normalization(raw_y)

    # Extractors returning an (x-part, y-part) pair fill two adjacent columns;
    # the rest fill one column each. Order defines the column layout above.
    paired_extractors = (
        deviation_feature_1,
        deviation_feature_2,
        zero_mean_feature,
    )
    single_extractors = (
        distance_from_origin,
        direction_with_horizontal_axis,
        distance_from_next_point,
        angle_with_next_point,
        distance_from_centroid,
        direction_with_centroid,
    )

    data_point[:, 0] = x_scaled
    data_point[:, 1] = y_scaled

    column = 2
    for extractor in paired_extractors:
        data_point[:, column], data_point[:, column + 1] = extractor(x_scaled, y_scaled)
        column += 2
    for extractor in single_extractors:
        data_point[:, column] = extractor(x_scaled, y_scaled)
        column += 1

    return data_point

In [5]:
# Convert every .txt sample under test_data/a-bA-tA/ into its feature matrix.
# NOTE(review): each file is overwritten in place with the build_features
# output, so the raw points are not kept. Re-running this cell would load
# the already-written feature file and treat its first two columns as X and
# Y -- confirm this is intended, or write to a separate output directory.
all_files = glob.glob('test_data/a-bA-tA/*.txt')

for file in all_files:
    # Load the whitespace-separated numeric table; build_features reads
    # columns 0 and 1 of it.
    points = np.loadtxt(file)
    points = build_features(points)
    # Save back to the same path with six decimal places per value.
    np.savetxt(file, points, fmt='%.6f')