In [1]:
from src.datasets.dataset_creation import RepresentationDataset
import pandas as pd
from src.representations.featurizer import ResidueFeaturizer

The Representation Dataset and the Representation In Memory Dataset are both used the same way.

# Set Up

Set up labels

In [2]:
# Load the temperature table and rewrite each pdb_id so that it carries the
# "_cleaned_centered.cif" suffix (presumably matching the file names in the
# raw data directory -- verify against the folder contents).
temp_df = pd.read_csv("../AFG/data/temp_pdb_train.csv", index_col=0).assign(
    pdb_id=lambda frame: frame["pdb_id"] + "_cleaned_centered.cif"
)
# Label lookup: suffixed pdb_id -> t_opt value.
temp_dict = temp_df.set_index("pdb_id")["t_opt"].to_dict()

Set up directories

In [3]:
# Parent folder for the generated datasets. The trailing slash matters:
# later cells build each dataset root with plain string concatenation.
dataset_directory = "../AFG/data/final_datasets/"
# Folder holding the input structure files handed to the dataset builder.
raw_data_directory = "../AFG/data/pdbs/work_cleaned_cif"

Set up feature lists

In [4]:
# Building blocks shared by several feature sets.
_structural_features = ["dihedral", "volume", "secondary structure", "accessible surface"]
# Residue property features that only the full feature set includes.
_property_features = ["isoelectric point", "hydrophobicity", "polarity"]

# Sequence-only feature sets.
onehot_feature_list = ["one hot"]
similarity_feature_list = ["Blossum"]

# One sequence encoding combined with the structural features.
structural_ohe_feature_list = onehot_feature_list + _structural_features
structural_blossum_feature_list = similarity_feature_list + _structural_features

# Everything: both sequence encodings, the property features, then the structural ones.
full_feature_list = (
    onehot_feature_list
    + similarity_feature_list
    + _property_features
    + _structural_features
)

Set up CA featurizers

In [5]:
# Every featurizer built in this cell shares the same residue center.
residue_center = "CA"


def _featurizer_for(feature_list):
    """Build a ResidueFeaturizer for ``feature_list`` using the shared ``residue_center``."""
    return ResidueFeaturizer(residue_center=residue_center, feature_list=feature_list)


# One featurizer per feature set defined in the previous cell.
full_res_featurizer = _featurizer_for(full_feature_list)
onehot_res_featurizer = _featurizer_for(onehot_feature_list)
similarity_res_featurizer = _featurizer_for(similarity_feature_list)
structural_ohe_res_featurizer = _featurizer_for(structural_ohe_feature_list)
structural_blossum_res_featurizer = _featurizer_for(structural_blossum_feature_list)

# Dataset Creation

Work Graph Dataset:
- Distance graph
- 8A
- All features
- CA residue center

In [6]:
# Root folder for this dataset. Note that the folder name ("work_graph")
# is not the same as the dataset_name passed below.
root = f"{dataset_directory}work_graph"
# Distance-based graph (cutoff 8, "8A" per the section heading) using the full feature set.
# The call is left as the last expression so the notebook displays the dataset repr.
RepresentationDataset(
    raw_data_dir=raw_data_directory,
    root=root,
    dataset_name="work_final_dataset",
    label_dict=temp_dict,
    representation="graph",
    edges_method="distance",
    distance=8,
    featurizer=full_res_featurizer,
)

RepresentationDataset(2267)

One Hot Graph Dataset:
- Distance graph
- 8A
- Only sequence one hot as feature
- CA residue center

In [7]:
# The dataset name doubles as the name of its root folder.
dataset_name = "onehot_8A_CA_graph"
root = f"{dataset_directory}{dataset_name}"
# Distance-based graph (cutoff 8, "8A" per the section heading) with one-hot features only.
# The call is left as the last expression so the notebook displays the dataset repr.
RepresentationDataset(
    raw_data_dir=raw_data_directory,
    root=root,
    dataset_name=dataset_name,
    label_dict=temp_dict,
    representation="graph",
    edges_method="distance",
    distance=8,
    featurizer=onehot_res_featurizer,
)

RepresentationDataset(2267)

Blossum Graph Dataset:
- Distance graph
- 8A
- Only sequence blossum as feature
- CA residue center

In [8]:
# The dataset name doubles as the name of its root folder.
dataset_name = "blossum_8A_CA_graph"
root = f"{dataset_directory}{dataset_name}"
# Distance-based graph (cutoff 8, "8A" per the section heading) with the "Blossum" feature only.
# The call is left as the last expression so the notebook displays the dataset repr.
RepresentationDataset(
    raw_data_dir=raw_data_directory,
    root=root,
    dataset_name=dataset_name,
    label_dict=temp_dict,
    representation="graph",
    edges_method="distance",
    distance=8,
    featurizer=similarity_res_featurizer,
)

RepresentationDataset(2267)

One hot structural Graph Dataset:
- Distance graph
- 8A
- Sequence one hot encoding and structural features
- CA residue center

In [9]:
# The dataset name doubles as the name of its root folder.
dataset_name = "ohe_structural_8A_CA_graph"
root = f"{dataset_directory}{dataset_name}"
# Distance-based graph (cutoff 8, "8A" per the section heading) with one-hot plus structural features.
# The call is left as the last expression so the notebook displays the dataset repr.
RepresentationDataset(
    raw_data_dir=raw_data_directory,
    root=root,
    dataset_name=dataset_name,
    label_dict=temp_dict,
    representation="graph",
    edges_method="distance",
    distance=8,
    featurizer=structural_ohe_res_featurizer,
)

RepresentationDataset(2267)

Blossum structural Graph Dataset:
- Distance graph
- 8A
- Sequence blossum encoding and structural features
- CA residue center

In [10]:
# The dataset name doubles as the name of its root folder.
dataset_name = "blossum_structural_8A_CA_graph"
root = f"{dataset_directory}{dataset_name}"
# Distance-based graph (cutoff 8, "8A" per the section heading) with "Blossum" plus structural features.
# The call is left as the last expression so the notebook displays the dataset repr.
RepresentationDataset(
    raw_data_dir=raw_data_directory,
    root=root,
    dataset_name=dataset_name,
    label_dict=temp_dict,
    representation="graph",
    edges_method="distance",
    distance=8,
    featurizer=structural_blossum_res_featurizer,
)

RepresentationDataset(2267)