## Table of contents
* [Prepare project](#prepare)
* [Function definitions](#function)
  * [Encode DRFP](#drfp)
  * [Encode RXNFP](#rxnfp)
  * [Split into Train / Test / Validation](#split)
  * [Main](#main)
* [Run](#run)

## Prepare project <a class="anchor" id="prepare"></a>

### Define parameters

In [None]:
# Fingerprint encoder to use; set this to 'drfp' or 'rxnfp'.
# The value selects which encoder library is imported, which encoding function main() calls,
# and it is used as the prefix of the train/valid/test file names.
fpencoder = 'drfp'

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler

if fpencoder == 'rxnfp':
    from rxnfp.transformer_fingerprints import RXNBERTFingerprintGenerator, get_default_model_and_tokenizer, generate_fingerprints
elif fpencoder == 'drfp':
    from drfp import DrfpEncoder
from tqdm import tqdm

tqdm.pandas()

### Create directories

In [None]:
# Changing paths is not recommended (you'd have to adapt the other notebooks accordingly).
output_path = 'experiments/data/'

# Create the output directory and the input data directory; existing directories are left untouched.
for directory in (output_path, 'data'):
    Path(directory).mkdir(parents=True, exist_ok=True)

### Download and prepare input data
Download the input files `rhea2ec.tsv` and `rhea-reaction-smiles.tsv` from the Rhea download site https://ftp.expasy.org/databases/rhea/tsv/ (internet connection required) and merge them into `data/rheadb.csv.gz`.

In [None]:
import urllib.request
from contextlib import closing

# Get the curated Rhea-EC mapping.
# Every mapping row is written four times with an extra column rhea_exp = RHEA_ID + 0..3,
# so that the mapping can be joined on each of those four ids.
# NOTE(review): presumably these are the master reaction and its three direction variants - confirm against the Rhea docs.
with closing(urllib.request.urlopen('https://ftp.expasy.org/databases/rhea/tsv/rhea2ec.tsv')) as r:
    header = r.readline().decode("utf-8")
    with open('data/rhea_ec_directions_expanded.tsv', 'w', encoding='utf-8') as w:
        w.write(header.strip()+'\trhea_exp\n')
        for raw_line in r:
            # Decode and parse each record once instead of once per written field.
            record = raw_line.decode("utf-8").strip()
            if not record:
                # Skip blank lines; int('') below would raise otherwise.
                continue
            rhea_id = int(record.split('\t')[0])
            w.writelines(f'{record}\t{rhea_id + i}\n' for i in range(4))

# Get the Rhea reaction SMILES and merge them with the EC mapping.
df_smiles = pd.read_csv('https://ftp.expasy.org/databases/rhea/tsv/rhea-reaction-smiles.tsv', sep='\t', header=None, names=['rhea_id','rxn_smiles'])
df_ec = pd.read_csv('data/rhea_ec_directions_expanded.tsv', sep='\t')
# Right join: keep every reaction SMILES, including those without an EC mapping (their 'ec' is NaN).
df = df_ec.merge(df_smiles, how='right', left_on='rhea_exp', right_on='rhea_id')
df.drop(columns=['DIRECTION', 'RHEA_ID', 'rhea_exp'], inplace=True)
# Only the EC column needs renaming; the SMILES columns were already named when they were read.
df.rename({'ID': 'ec'}, axis=1, inplace=True)
df.to_csv('data/rheadb.csv.gz', compression='gzip', index=False)

# The expanded mapping is only an intermediate file; remove it once the merged file exists.
if Path('data/rheadb.csv.gz').exists():
    Path('data/rhea_ec_directions_expanded.tsv').unlink()

## Function definitions <a class="anchor" id="function"></a>

### Encode DRFP <a class="anchor" id="drfp"></a>

In [None]:
def encodeDRFP(df):
    """Add a DRFP fingerprint column to ``df`` and save the result as TSV.

    The reaction SMILES in ``df.rxn_smiles`` are passed to
    ``DrfpEncoder.encode`` and the result is stored in the new column
    ``fps``. The frame is modified in place, written to
    ``data/reactions_with_fp_encoded_drfp.tsv`` and returned.
    """
    output = 'data/reactions_with_fp_encoded_drfp.tsv'

    # Encoder settings, kept together so they are easy to review.
    encoder_options = dict(
        show_progress_bar=True,
        root_central_atom=False,
        radius=2,
        include_hydrogens=True,
        n_folded_length=10240,
    )
    df["fps"] = DrfpEncoder.encode(df["rxn_smiles"], **encoder_options)

    df.to_csv(output, sep='\t')
    print('=> Created file', output)
    return df

### Encode RXNFP <a class="anchor" id="rxnfp"></a>

In [None]:
def encodeRXNFP(df):
    """Add min-max scaled RXNFP fingerprints to ``df`` and save the result as TSV.

    For every row the fingerprint of ``rxn_smiles`` is computed with the
    default RXNFP model. Rows whose fingerprint could not be computed are
    dropped. Each fingerprint dimension is then scaled to [0, 1] over the
    remaining rows and stored as a list in the column ``fps``. The raw
    fingerprint is kept in ``fps_unnormalized`` as a comma-separated string.

    The frame is modified in place, written to
    ``data/reactions_with_fp_encoded_rxnfp.tsv`` and returned.
    """
    model, tokenizer = get_default_model_and_tokenizer()

    rxnfp_generator = RXNBERTFingerprintGenerator(model, tokenizer)
    df["fps_unnormalized"] = df.progress_apply(rxnfp_for_entry, axis=1, args=[rxnfp_generator,])

    # rxnfp_for_entry yields None when the conversion fails. Drop those rows
    # before the matrix is built, otherwise they break the float conversion.
    df.dropna(subset=['fps_unnormalized'], inplace=True)

    # assumes every fingerprint is a flat sequence of numbers of equal length - TODO confirm
    array_fps = np.array(df["fps_unnormalized"].to_list(), dtype=float)
    scaler = MinMaxScaler(feature_range=(0, 1))
    fit_array = scaler.fit_transform(array_fps)

    # Reuse df's index so the values stay on their rows even after rows were dropped.
    df["fps"] = pd.Series(data=fit_array.tolist(), index=df.index)

    # Keep the raw fingerprint column in its bracket-less string form for the output file.
    df["fps_unnormalized"] = [str(fp).replace('[', '').replace(']', '') for fp in df["fps_unnormalized"]]

    output = 'data/reactions_with_fp_encoded_rxnfp.tsv'
    df.to_csv(output, sep='\t')
    print('=> Created file', output)
    return df

def rxnfp_for_entry(row, rxnfp_generator):
    """Return the fingerprint for ``row.rxn_smiles``, or ``None`` on failure.

    The fingerprint is whatever ``rxnfp_generator.convert`` returns. Any
    exception raised by the conversion is printed and swallowed so that one
    bad reaction does not abort a whole ``apply`` run.
    """
    try:
        return rxnfp_generator.convert(row.rxn_smiles)
    except Exception as e:
        print('Exception FP:', e)
    return None

### Split into Train / Test / Validation <a class="anchor" id="split"></a>

In [None]:
def _subset_frame(smiles, labels, fps, index):
    """Build the frame written for one subset: SMILES, label and ';'-joined fingerprint."""
    return pd.DataFrame(
        {
            "rxn_smiles": smiles[index],
            "label": labels[index],
            "fps": [";".join(map(str, fp)) for fp in fps[index]],
        }
    )


def splitIntoTrainTestValidation(df):
    """Write stratified train / valid / test CSV files for each EC level.

    The rows of ``df`` are split 80 % / 10 % / 10 % into train, test and
    validation subsets, stratified by the ``ec_1`` column. The same row split
    is used for the three label columns ``ec1``, ``ec12`` and ``ec123``. For
    each label column three files named
    ``{output_path}{fpencoder}-{i}-{ec}-{train|valid|test}.csv`` are written
    with the columns ``rxn_smiles``, ``label`` and ``fps``.

    ``df`` must contain the columns ``rxn_smiles``, ``fps``, ``ec_1``,
    ``ec1``, ``ec12`` and ``ec123``. The module-level ``output_path`` and
    ``fpencoder`` determine the file names.
    """
    X = df.rxn_smiles.to_numpy()
    fps = df.fps.to_numpy()
    groups = df.ec_1.to_numpy()

    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    sss_valid = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

    # The row split does not depend on the label column, so compute it once.
    splits = []
    for i, (train_index, test_valid_index) in enumerate(sss.split(X, groups)):
        for test_pos, valid_pos in sss_valid.split(
            X[test_valid_index],
            groups[test_valid_index],
        ):
            # The second split returns positions within the held-out subset.
            # Map them back to rows of the full arrays; indexing X with the
            # raw positions would select rows that overlap the training set.
            splits.append(
                (
                    i,
                    {
                        "train": train_index,
                        "valid": test_valid_index[valid_pos],
                        "test": test_valid_index[test_pos],
                    },
                )
            )

    for ec in ["ec1", "ec12", "ec123"]:
        y = df[ec].to_numpy()
        for i, subsets in splits:
            for subset_name, index in subsets.items():
                _subset_frame(X, y, fps, index).to_csv(
                    f"{output_path}{fpencoder}-{i}-{ec}-{subset_name}.csv", index=False
                )

    print('=> Created train, test and validation files in', output_path)

### Main <a class="anchor" id="main"></a>

In [None]:
def main():
    """Run the pipeline: load Rhea data, encode fingerprints and write the data splits.

    Reads ``data/rheadb.csv.gz``, keeps one row per ``MASTER_ID`` among the
    reactions that have an EC number, derives the EC level columns, removes
    EC class 7, encodes the reactions with the encoder selected by the
    module-level ``fpencoder`` and writes the train / valid / test files.

    Raises:
        ValueError: if ``fpencoder`` is neither ``'drfp'`` nor ``'rxnfp'``.
    """
    # Fail before any expensive work; without fingerprints the split step cannot run.
    if fpencoder not in ("drfp", "rxnfp"):
        raise ValueError(f'Fingerprint encoder type must be drfp or rxnfp, got {fpencoder!r}')

    # Read the Rhea input data.
    df = pd.read_csv('data/rheadb.csv.gz')
    # Exclude reactions without EC and keep one row per master reaction.
    # The explicit copy avoids chained-assignment warnings when columns are added below.
    df = df[df.ec.notna()].drop_duplicates(subset=['MASTER_ID']).copy()

    df[["ec_1", "ec_2", "ec_3", "ec_4"]] = df.ec.str.split(".", expand=True)
    df["ec1"]   = df.ec_1.astype(str)
    df["ec12"]  = df.ec_1.astype(str) + "." + df.ec_2.astype(str)
    df["ec123"] = df.ec_1.astype(str) + "." + df.ec_2.astype(str) + "." + df.ec_3.astype(str)

    # Remove transport reactions (EC class 7).
    # reset_index() keeps the previous row labels in an 'index' column, as before.
    df = df[df.ec1 != "7"].reset_index()

    print('-> Creating', fpencoder, 'fingerprints for', len(df), 'unique Rhea reactions with curated EC (excluding EC7)')
    if fpencoder == "drfp":
        df = encodeDRFP(df)
    elif fpencoder == "rxnfp":
        df = encodeRXNFP(df)

    print('-> Creating train, test, validation subsets for the training phase')
    splitIntoTrainTestValidation(df)

    print('=> Finished')

## Run  <a class="anchor" id="run"></a>

In [None]:
# Run the whole pipeline: encode the fingerprints and write the train/valid/test files.
main()