# New ML Active Learning Workflow
---

# Import Modules

In [1]:
import os
import sys
import pickle

import pandas as pd

from sklearn.decomposition import PCA

from protosearch.ml_modelling.fingerprint import (
    FingerPrint,
    VoronoiFingerprint
    )

# Lift the row limit on DataFrame display (None = no limit), so frames shown
# in this notebook are printed in full rather than truncated.
pd.set_option('display.max_rows', None)

# Read Data

In [2]:
def _read_pickle(*path_parts):
    """Unpickle and return the object stored at $PROJ_irox/<path_parts>.

    Parameters
    ----------
    *path_parts : str
        Path components, relative to the directory named by the
        ``PROJ_irox`` environment variable.

    Raises
    ------
    KeyError
        If the ``PROJ_irox`` environment variable is not set.
    """
    pickle_path = os.path.join(os.environ["PROJ_irox"], *path_parts)
    with open(pickle_path, "rb") as handle:
        return pickle.load(handle)


# -----------------------------------------------------------------------------
# Training data: the OQMD frame and the DFT-calculation frame, stacked
# -----------------------------------------------------------------------------
df_dft_calcs = _read_pickle(
    "workflow/ml_modelling/parsing_chris_dft_data",
    "df_dft_calcs.pickle")

df_oqmd_data = _read_pickle(
    "workflow/ml_modelling",
    "df_oqmd_data.pickle")

df_train_all = pd.concat([df_oqmd_data, df_dft_calcs], sort=False)
# Keep only the rows that actually carry an "atoms" entry
df_train = df_train_all[df_train_all["atoms"].notnull()]

# -----------------------------------------------------------------------------
# Structure and prototype frames
# -----------------------------------------------------------------------------
df_struct = _read_pickle(
    "chris_prototypes_structures",
    "data_structures.pickle")

df_proto = _read_pickle(
    "chris_prototypes_structures",
    "data_prototypes.pickle")

# -----------------------------------------------------------------------------
# Table of ids (read from CSV; the "unique_ids" column is consumed downstream)
# -----------------------------------------------------------------------------
path_i = os.path.join(
    os.environ["PROJ_irox"],
    "data/ml_irox_data",
    "unique_ids.csv")
df_ids = pd.read_csv(path_i)

# Combining Static, OQMD, and DFT Calculated Structures

In [3]:
# Build one frame with a single row per id listed in df_ids["unique_ids"].
# For each id the row from df_train is preferred; df_struct is the fallback.

# Ids whose df_train (DFT) entry must be ignored in favour of df_struct.
# The DFT optimized z39g648rnl structure is reportedly yielding NaN's from the
# Voronoi tessellation (the earlier note read "isn't", presumably a typo —
# TODO confirm). For now, ignore the DFT structure and use the regular one.
# COMBAK | TEMP
ids_ignore_train = {"z39g648rnl"}

master_data = []
ids_missing = []  # ids found in neither df_train nor df_struct
for id_i in df_ids["unique_ids"]:
    # Test membership against the Index itself (hash lookup) rather than
    # `.index.values`, which is a linear scan of a NumPy array for every id.
    id_in_trains = (id_i in df_train.index) and (id_i not in ids_ignore_train)
    id_in_structs = id_i in df_struct.index

    if id_in_trains:
        # NOTE(review): if df_train has duplicate index labels this returns a
        # DataFrame instead of a Series — verify the index is unique.
        row_j = df_train.loc[id_i]
    elif id_in_structs:
        row_j = df_struct.loc[id_i]
    else:
        # Previously a None placeholder was appended here, which corrupts
        # (or crashes) the DataFrame construction below. Report the id and
        # skip it instead.
        print("NOOOOOOOOOOOOOOOOO NOT GOOD | No structure found for id:", id_i)
        ids_missing.append(id_i)
        continue

    master_data.append(row_j)

df_combined = pd.DataFrame(master_data)

# Featurizing

In [4]:
# Generate Voronoi fingerprints from the "atoms" column of df_combined.
FP = FingerPrint(
    feature_methods=["voronoi"],
    input_data=df_combined,
    input_index=["atoms"],
    )

FP.generate_fingerprints()
FP.clean_features()
# FP.join_input_to_fingerprints()

df_features = FP.fingerprints["voronoi"]

# Sanity check: count nulls per feature column.
# The previous test, `all(counts) == 0`, was True as soon as ANY column had
# zero nulls, so it could report "no nulls" while other columns had some.
null_counts = df_features.isnull().sum()
if (null_counts == 0).all():
    print("No null values in the voro fingerprint")
else:
    cols_with_nulls = null_counts[null_counts > 0]
    print(
        "Null values found in the voro fingerprint | columns affected:",
        list(cols_with_nulls.index))

Generate Voronoi fingerprint of 967 structures
No null values in the voro fingerprint


# Principal Component Analysis

In [5]:
# Project the fingerprint features onto their leading principal components.
N_PCA_COMPONENTS = 20  # number of principal components to keep
# Fixed seed: only has an effect when scikit-learn uses one of its randomized
# solvers ('randomized' / 'arpack'), in which case it makes re-runs repeatable.
PCA_RANDOM_STATE = 0

pca = PCA(n_components=N_PCA_COMPONENTS, random_state=PCA_RANDOM_STATE)

pca_features = pca.fit_transform(df_features)

# Label the columns from the width of the transformed array so the labels
# always match what fit_transform actually returned.
df_features_pca = pd.DataFrame(
    pca_features,
    columns=['PCA%i' % i for i in range(pca_features.shape[1])],
    index=df_features.index)

# Save features to pickle

In [7]:
# Persist the PCA-reduced feature frame as a pickle under $PROJ_irox.
path_i = os.path.join(
    os.environ["PROJ_irox"],
    "workflow/ml_modelling/00_ml_workflow/"
    "190611_new_workflow/data/"
    "df_features_pca.pickle")

# Create the output directory if needed; on a fresh checkout the `data/`
# folder may not exist and open(..., "wb") would raise FileNotFoundError.
os.makedirs(os.path.dirname(path_i), exist_ok=True)

with open(path_i, "wb") as fle:
    pickle.dump(df_features_pca, fle)