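Prepare data for the MLP: build fixed-length feature vectors from the detected stars of each constellation sample and save them together with their class labels.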

In [185]:
import numpy as np
import os
import json
from sklearn.decomposition import PCA

In [186]:
def cosine_similarity(vector_a, vector_b):
    """Return the cosine of the angle between two vectors.

    The result is ``dot(a, b) / (|a| * |b|)``. When either vector has zero
    length the angle is undefined and the integer ``0`` is returned instead.
    """
    numerator = np.dot(vector_a, vector_b)
    length_a, length_b = np.linalg.norm(vector_a), np.linalg.norm(vector_b)
    # Guard clause: a zero-length vector has no direction.
    if length_a == 0 or length_b == 0:
        return 0
    return numerator / (length_a * length_b)

In [187]:
# Scan every class folder and record the largest number of stars that any
# single constellation sample contains; this becomes the padding width later.
input_dir = "../TrainingSamples"
non_class_entries = (".DS_Store", "readme.md", "previous", "Dataset_Details")

max_Star = 0
for entry in os.listdir(input_dir):
    if entry not in non_class_entries:
        results_file = os.path.join(input_dir, entry, "results.json")
        with open(results_file, 'r') as handle:
            detections = json.load(handle)
        # One star count per constellation stored in this class's results file.
        star_counts = [len(record["area"]) for record in detections.values()]
        max_Star = max([max_Star] + star_counts)

In [188]:
# Display the largest per-constellation star count found by the scan above.
max_Star

17

In [189]:
def _min_max_scale(values):
    """Scale values linearly to [0, 1]; a constant or single-value list maps to zeros."""
    if not values:
        return []
    low, high = min(values), max(values)
    span = high - low
    if span == 0:
        # All values identical: the range is zero, so avoid dividing by it.
        return [0.0] * len(values)
    return [(v - low) / span for v in values]


def _standardize(values):
    """Z-score values (population std); a constant or single-value list maps to zeros."""
    if not values:
        return []
    mean, std = np.mean(values), np.std(values)
    if min(values) == max(values) or std == 0:
        # No spread: (v - mean) / std would be NaN/inf, which poisons training.
        return [0.0] * len(values)
    return [(v - mean) / std for v in values]


def sort_normalize_data(data):
    """Sort each constellation's stars by area (largest first) and normalize them.

    Parameters
    ----------
    data : dict
        Maps a constellation name to a dict holding the parallel lists
        ``"area"``, ``"x"`` and ``"y"`` (one entry per star).

    Returns
    -------
    dict
        Maps each constellation name to a dict with these keys:

        * ``"area"`` - min-max scaled areas, in descending-area order.
        * ``"x"``, ``"y"`` - z-scored coordinates, in the same star order.
        * ``"cos<max, rest>"`` - cosine similarity between the raw ``(x, y)``
          vector of the largest star and the raw ``(x, y)`` vector of every
          other star (one value fewer than the number of stars).

    Notes
    -----
    Degenerate inputs no longer fail: when all areas (or all x / all y values)
    of a constellation are equal - including the single-star case - the
    corresponding normalized values are all ``0.0`` instead of raising
    ``ZeroDivisionError`` or producing NaN. An empty constellation yields
    empty lists.
    """
    data_out = {}
    for constellation, values in data.items():
        # Stable sort on negated area: descending order, ties keep input order.
        order = sorted(range(len(values["area"])), key=lambda i: -values["area"][i])
        sorted_area = [values["area"][i] for i in order]
        sorted_x = [values["x"][i] for i in order]
        sorted_y = [values["y"][i] for i in order]

        # Position of the largest star, built once instead of once per comparison.
        brightest = np.array([sorted_x[0], sorted_y[0]]) if order else None
        cos_to_brightest = [
            cosine_similarity(brightest, np.array([sorted_x[i], sorted_y[i]]))
            for i in range(1, len(sorted_x))
        ]

        data_out[constellation] = {
            "area": _min_max_scale(sorted_area),
            "x": _standardize(sorted_x),
            "y": _standardize(sorted_y),
            "cos<max, rest>": cos_to_brightest,
        }
    return data_out

In [190]:
def create_feature_vectors(data, max_stars, target):
    """Flatten normalized constellations into fixed-length feature rows.

    Each row is laid out as ``[area, x, y]`` for up to ``max_stars`` stars,
    followed by up to ``max_stars - 1`` values from ``"cos<max, rest>"``.
    Both sections are zero-padded when a constellation has fewer stars and
    truncated when it has more, so every row has the same length
    ``3 * max_stars + (max_stars - 1)``.

    Parameters
    ----------
    data : dict
        Maps a constellation name to a dict with the keys ``"area"``, ``"x"``,
        ``"y"`` and ``"cos<max, rest>"``.
    max_stars : int
        Number of star slots reserved in every row.
    target : object
        Label assigned to every row produced from ``data``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(feature_vectors, targets)`` with one row / one label per
        constellation. For empty ``data`` the feature array has shape
        ``(0, row_width)`` so it can still be stacked with other classes.
    """
    coord_width = max_stars * 3
    # max(..., 0) keeps the slice below from turning into [:-1] when max_stars is 0.
    cos_width = max(max_stars - 1, 0)

    feature_vectors = []
    targets = []
    for values in data.values():
        feature_vector = []
        for i in range(min(len(values["area"]), max_stars)):
            for key in ("area", "x", "y"):
                feature_vector.append(values[key][i])
        feature_vector += [0.0] * (coord_width - len(feature_vector))

        # Truncate the cosine section as well; leaving it unbounded made rows
        # of over-sized constellations longer than all the others.
        cos_features = list(values["cos<max, rest>"][:cos_width])
        feature_vector += cos_features
        feature_vector += [0.0] * (cos_width - len(cos_features))

        feature_vectors.append(feature_vector)
        targets.append(target)

    if not feature_vectors:
        return np.empty((0, coord_width + cos_width)), np.array(targets)
    return np.array(feature_vectors), np.array(targets)

In [191]:
# Read the coordinates and areas of the stars of every constellation sample,
# turn each class folder into feature rows plus labels, and save both arrays.
input_dir = "../TrainingSamples"
output_dir = "../data_model"
skipped_entries = [".DS_Store", "readme.md", "previous", "Dataset_Details"]

attribute_blocks = []  # one (n_samples, n_features) array per class folder
target_blocks = []     # matching label arrays, kept in the same order

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the saved arrays reproducible between runs and machines.
for class_name in sorted(os.listdir(input_dir)):
    if class_name in skipped_entries:
        continue
    class_path = os.path.join(input_dir, class_name)
    starpath = os.path.join(class_path, "results.json")
    with open(starpath, 'r') as json_file:
        star_df = json.load(json_file)
    # The label is the folder name up to the first underscore.
    attr, target = create_feature_vectors(sort_normalize_data(star_df), max_stars=max_Star, target=class_name.split("_")[0])

    if len(target) == 0:
        # Nothing to add for a class folder without any constellation samples.
        continue
    # Fail loudly on a malformed block. Printing and carrying on would leave
    # the labels with more rows than the features and save a misaligned dataset.
    if attr.ndim != 2 or (attribute_blocks and attr.shape[1] != attribute_blocks[0].shape[1]):
        raise ValueError(
            f"feature block for class {class_name!r} has shape {attr.shape}, "
            "which cannot be stacked with the other classes"
        )
    attribute_blocks.append(attr)
    target_blocks.append(target)

if not attribute_blocks:
    raise ValueError(f"no training samples found under {input_dir!r}")

# Stack once at the end instead of re-copying the growing array every iteration.
attributes = np.vstack(attribute_blocks)
targets = np.concatenate(target_blocks)

os.makedirs(output_dir, exist_ok=True)
np.save(os.path.join(output_dir, "star_coor_area_MLP.npy"),  attributes)
np.save(os.path.join(output_dir,"labels_MLP.npy") , targets)


In [192]:
# Read the saved arrays back from disk and report their shapes as a sanity check.
star = np.load(os.path.join(output_dir, "star_coor_area_MLP.npy"))
labels = np.load(os.path.join(output_dir,"labels_MLP.npy"))
for loaded_array in (star, labels):
    print(loaded_array.shape)

(1728, 67)
(1728,)


In [193]:
# Show the reloaded feature matrix (NumPy abbreviates large arrays in the repr).
star

array([[ 1.        , -1.71803765, -0.79372294, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        , -1.4360021 , -0.57534249, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        , -1.33406406, -0.85648781, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.        ,  1.05485699, -0.14749189, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        ,  1.14949122,  0.19919599, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.        , -1.05196684,  0.14412404, ...,  0.        ,
         0.        ,  0.        ]])