# Featurize Atoms Objects with Voronoi Tessellation
---

Feature dimension reduction will be performed using PCA

# Notebook Setup

## Import Modules

In [1]:
import os
import pickle
# import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

from sklearn.decomposition import PCA
from catlearn.fingerprint.voro import VoronoiFingerprintGenerator

## Script Inputs

In [2]:
# Build the input pickle path from its parts. The project root is read from
# the PROJ_irox environment variable (KeyError if it is not set).
proj_root = os.environ["PROJ_irox"]
input_dir = os.path.join(proj_root, "workflow/ml_modelling/00_ml_workflow/outdata")
file_i = os.path.join(input_dir, "01_irox_data_featurized.pickle")

# Load Data

In [3]:
# Deserialize the main dataframe from disk.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(file_i, mode="rb") as pickle_file:
    df_m = pickle.load(pickle_file)

# Generating Voronoi Fingerprints

In [4]:
# Collect the structures stored under ("default_columns", "atoms") and feed
# them to the Voronoi fingerprint generator.
atoms_list = df_m["default_columns"]["atoms"].tolist()
VFG = VoronoiFingerprintGenerator(atoms_list)
df_vor = VFG.generate()

# Underlying fingerprint values, captured before the index is replaced.
test_features = df_vor.values

# Give the Voronoi dataframe the same index labels as the main dataframe.
# Assumes generate() returns rows in the same order as the input list — TODO confirm.
# (The remaining set_index options are left at their pandas defaults.)
df_vor = df_vor.set_index(df_m.index.values)

Generate Voronoi fingerprint of 968 structures


# Removing Constant Columns (Only One Unique Value)

In [5]:
# A column whose values collapse to a single distinct entry carries no
# information for PCA, so collect those column labels and drop them.
columns_to_remove = [
    column
    for column in df_vor.columns
    if len(set(df_vor[column].tolist())) == 1
]

df_vor = df_vor.drop(columns=columns_to_remove)

# Principal Component Analysis

In [6]:
# Reduce the Voronoi fingerprint columns to their 20 leading principal components.
pca = PCA(
    n_components=20,
    # n_components=len(df_vor.columns.values),
    )

# Fit and project in a single call. The original fitted the model twice
# (fit, then fit_transform), which doubled the work and meant the printed
# diagnostics could describe a different fit than the returned features
# whenever a randomized SVD solver is selected.
pca_features = pca.fit_transform(df_vor.values)

# Diagnostics for the fit that produced pca_features.
print(pca.explained_variance_ratio_[0:5])
print(pca.singular_values_)

# Reuse df_vor's index so the PCA rows align with the main dataframe when the
# frames are later concatenated on their index. A default RangeIndex would
# leave the PCA columns as NaN unless the main index happened to be 0..n-1.
df_pca_features = pd.DataFrame(pca_features, index=df_vor.index)

[0.59202418 0.32869709 0.05278911 0.02406058 0.00240959]
[1.43460651e+04 1.06895889e+04 4.28385963e+03 2.89211892e+03
 9.15238696e+02 6.90682674e+01 4.19473093e+01 1.32580282e+01
 5.46530440e+00 3.31201459e+00 2.45408953e+00 2.00789795e+00
 1.39430974e+00 1.26570748e+00 9.92269604e-01 8.38090660e-01
 6.21571382e-01 3.94278091e-01 3.62405775e-01 2.31319327e-01]


In [12]:
# Print the running total of the explained-variance ratios, one line per
# principal component, to show how many components are needed to capture a
# given fraction of the variance.
var_cum = 0.
for ratio_i in pca.explained_variance_ratio_:
    var_cum += ratio_i
    print(var_cum)

0.5920241809332064
0.9207212699897339
0.9735103769229374
0.9975709598714368
0.9999805468020805
0.9999942692305316
0.9999993307634653
0.999999836392175
0.999999922313816
0.9999999538681003
0.9999999711923588
0.9999999827896651
0.9999999883819897
0.9999999929902875
0.9999999958225483
0.9999999978430343
0.9999999989543972
0.9999999994015738
0.9999999997793756
0.9999999999332966


# Append PCA Features to Dataframe

In [7]:
# Rebuild the main dataframe from its first two top-level column groups plus
# the PCA features, which are placed under a new "features_pca" top-level key.
top_levels = df_m.columns.levels[0]
keys_i = [top_levels[0], top_levels[1], "features_pca"]

df_list = [
    df_m[top_levels[0]],
    df_m[top_levels[1]],
    df_pca_features,
    ]

# `join_axes` was deprecated in pandas 0.25 and removed in 1.0. Reindexing the
# concatenated frame on the original index restricts and orders the rows in
# the same way.
df_m = pd.concat(
    df_list,
    # [df_m, df_pca_features],
    axis=1,
    keys=keys_i,
    ).reindex(df_m.index)

# Write Dataframe to Pickle

In [8]:
# Persist the augmented dataframe for the next stage of the workflow.
directory = "outdata"

# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)

with open(os.path.join(directory, "02_data_featurized.pickle"), "wb") as fle:
    pickle.dump(df_m, fle)

In [9]:
# names
# levels
# codes
# nlevels
# levshape

# df_m.columns.levshape

# df_m.columns?