# Walkthrough 

This notebook provides examples of how to use the functions and classes defined in this repository's .py files.

In [1]:
# How to handle: KeyError: "column_name"
# If any of the functions don't seem to be working, make sure that your pandas df is indexed. If it is not, use the below code: 

import pandas as pd

def indexDf(infile_path):
    """Add an explicit "index" column to a CSV file, rewriting it in place.

    The CSV at ``infile_path`` is read, its row index is materialised as a
    leading column named "index", and the result is written back to the
    same path. If the file already has an "index" column it is left
    untouched, so this cell can be re-run safely.

    Args:
        infile_path: Path of the CSV file to read and overwrite.
    """
    # Read the CSV file
    df = pd.read_csv(infile_path)

    # Already indexed: another reset_index() would add a stray "level_0"
    # column, and the call after that would raise ValueError.
    if "index" in df.columns:
        return

    # Add an index column
    df_reset = df.reset_index()

    # Write the result back over the original CSV file
    df_reset.to_csv(infile_path, index=False)

# infile_path = "data archive/dielectric.csv"
# indexDf(infile_path)

## How to use Solvents.py

In [None]:
from Solvents import SolventData

Say you want to find a solvent's experimental data (e.g. o-dichlorobenzene):

In [None]:
# Path handed to SolventData; relative, so it depends on the notebook's working directory
dielectric_path = "data archive/dielectric.csv"
solvent_data = SolventData(dielectric_path)
# o-dichlorobenzene is looked up under the name "1,2-Dichlorobenzene"
# (presumably the spelling used in the data file; verify against the CSV)
odichlorbenzene_dict = solvent_data.getSolventData("1,2-Dichlorobenzene")
print(odichlorbenzene_dict)

Say you want to update the solvent archive file with new experimental solvent data:

In [None]:
# The input file must have a column called "Solvent" that holds the solvent's name.
# Solvents that already have experimental data may be present; the function only updates rows with missing experimental data.
dielectric_path = "data archive/dielectric.csv"
solvent_archive_path = "data archive/solvents_archive.csv"

# Uncomment the line below if the .csv is not indexed (indexDf is defined in the first code cell)
# indexDf(solvent_archive_path)

solvent_data = SolventData(dielectric_path)
solvent_data.main(solvent_archive_path)

## How to use Featurize.py

In [2]:
import Featurize

Say you want experimental data on a solvent, dioxane: 

In [None]:
# Look up experimental data by solvent name
solvent_name = "dioxane"
experimental_solvent = Featurize.getExperimentalSolvent(solvent_name)
print(experimental_solvent)

# Get a specific value from the returned dictionary by its key:
solvent_smiles = experimental_solvent["Solvent_SMILES"]
print(f"solvent smiles: {solvent_smiles}")

A few examples of how to use the individual feature functions:

In [None]:
# Example monomer: L-Lactide
# Define the parameters that the feature functions in the next cell take as arguments
canonical_monomer_smiles = "C[C@@H]1OC(=O)[C@H](C)OC1=O"
# degree of polymerization
dp = 5
# SMILES string of the solvent
solvent_smiles = "C1COCCO1"
# NOTE(review): "s" presumably denotes a solid monomer state -- confirm against Featurize.py
monomer_base_state = "s"
# NOTE(review): "ROP" presumably stands for ring-opening polymerization -- confirm against Polymerize.py
polymerization_type = "ROP"

In [None]:
# Requires the parameter cell directly above to have been run first
# (canonical_monomer_smiles, polymerization_type, dp, monomer_base_state, solvent_smiles).

# Features computed from the monomer SMILES alone
rdkit_features = Featurize.getRdkitDescriptors(canonical_monomer_smiles)
print(f"RDKIT features: {rdkit_features}")
# PEP feature for the chosen polymerization type and degree of polymerization
pep = Featurize.getPEP(canonical_monomer_smiles, polymerization_type,dp)
print(f"Enthalpy PEP Feature: {pep}")
# Features that depend on the monomer, its base state, and the solvent
solvent_features = Featurize.getSolventFeatures(canonical_monomer_smiles, monomer_base_state, solvent_smiles)
print(f"Solvent features: {solvent_features}")

## Create a .csv with all feature data given you have a .csv with monomer strings

In [3]:
import Featurize

In [None]:
# path to the .csv file with the monomer data that you want to featurize
infile_path = "monomer data build/entropy.csv"

# target is dS (J/mol/K) or dH (KJ/mol)
target = "dS (J/mol/K)"

# choose a degree of polymerization
dp = 5

# Featurize the input file for the chosen target and degree of polymerization
Featurize.main(infile_path, target, dp)

## How to use Polymerize.py

In [None]:
from Polymerize import Polymerization

Say you want to polymerize a monomer (ex. L-Lactide):

In [None]:
# Define the parameters passed to the Polymerization constructor in the next cell
# Monomer SMILES string (L-Lactide)
canonical_monomer_smiles = "C[C@@H]1OC(=O)[C@H](C)OC1=O"
# NOTE(review): "ROP" presumably stands for ring-opening polymerization -- confirm against Polymerize.py
polymerization_type = "ROP"
# degree of polymerization
dp = 5

In [None]:
# Create an object of the class from the parameters defined in the cell above
polymerize_obj = Polymerization(canonical_monomer_smiles, polymerization_type,dp)

# Run the polymerization and print what main() returns
# (presumably a dictionary, going by the variable name -- confirm in Polymerize.py)
polymer_dict = polymerize_obj.main()
print(polymer_dict)

## How to use PreprocessData.py

In [None]:
import PreprocessData

In [None]:
# Path to the .csv file with monomer feature data; used by the preprocessing cells below.
# Note: this reassigns the infile_path set in the Featurize section.
infile_path = "monomer data build/featurized_entropy.csv"

Impute, one-hot encode and clean RDKIT columns all at once:

In [None]:
# Uses the infile_path defined in the cell above
PreprocessData.main(infile_path)

Alternatively, you can do each preprocessing step individually:

In [None]:
# The steps are chained: each call's return value is passed as the input of the next.
# NOTE(review): each return value is presumably the path of the file that step wrote -- confirm in PreprocessData.py
imputed_path = PreprocessData.impute(infile_path)

imputed_and_one_hot_encoded_path = PreprocessData.oneHotEncode(imputed_path)

completed_preprocess_path = PreprocessData.cleanRDKIT(imputed_and_one_hot_encoded_path)

## How to use Models.py

In [None]:
import Models

In [None]:
# Preprocessed feature file. The path is relative, like the other paths in this
# notebook, so the example is not tied to one user's machine
# (assumes the notebook is run from the repository root).
infile_path = "monomer data build/cleaned_encoded_imputed_featurized_entropy.csv"
# Iteration count passed to Models.main
n_iters = 200
# Target column: dS (J/mol/K) or dH (KJ/mol)
target = "dS (J/mol/K)"

Models.main(infile_path, n_iters, target, get_hyperparams = True, get_models = True, XGB_only = False, bayes=True)

## How to use Data.py

In [None]:
import Data

In [None]:
# Results directory given as a relative path (assumes the notebook is run from
# the repository root) instead of an absolute path on one user's machine
Data.summarizeTrainTestResults("model_results")

In [None]:
# LOOCV results directory given as a relative path (assumes the notebook is run
# from the repository root) instead of an absolute path on one user's machine
Data.summarizeLOOCVResults("model_results_LOOCV")

In [None]:
# Results file to plot, given as a relative path (assumes the notebook is run
# from the repository root) instead of an absolute path on one user's machine
path = "model_results_LOOCV/XGB_LOOCV_4.csv"
Data.graphExpPred(path)

In [None]:
import Data
# Feature-ranking file to plot, given as a relative path (assumes the notebook is
# run from the repository root) instead of an absolute path on one user's machine
path = "final_results/feature_ranking_2.csv"
# Feature count passed to graphFeatureRanking
num_feat = 15
Data.graphFeatureRanking(path, num_feat)