# Imports

In [2]:
import os
import sys
import re
import pickle
import random
import collections
from enum import (
    Enum,
    unique
)
from typing import (
    Dict,
    List,
    Optional
)

import numpy as np
from numpy import ndarray

import pandas as pd
from pandas import DataFrame

In [3]:
# Put the parent of the current working directory at the front of the import
# path (only if it is not already listed) so packages located there can be
# imported by the cells below.

QSRR_PATH: str = os.path.dirname(os.getcwd())

if sys.path.count(QSRR_PATH) == 0:
    sys.path.insert(0, QSRR_PATH)

In [4]:
from qsrr.process import MoleculeEncoder

# Load and Process Data

In [None]:
# File locations used by this notebook, all inside `<QSRR_PATH>/data`:
#   SMRT_DATASET                 - raw CSV that is read below
#   SMRT_DATASET_SMILES          - CSV written after the SMILES conversion
#   SMRT_DATASET_ENCODED_SMILES  - pickle written at the end of the notebook
DATA_PATH: str = os.path.join(QSRR_PATH, "data")
SMRT_DATASET: str = os.path.join(
    DATA_PATH, "2023-11-18-smrt_dataset.csv"
)
SMRT_DATASET_SMILES: str = os.path.join(
    DATA_PATH, "2023-11-18-smrt_dataset_smiles.csv"
)
SMRT_DATASET_ENCODED_SMILES: str = os.path.join(
    DATA_PATH, "2023-11-18-smrt_dataset_encoded_smiles.pkl"
)

In [None]:
# Load the raw dataset; fields in this file are separated by semicolons.
_data_df: DataFrame = pd.read_csv(SMRT_DATASET, delimiter=';')

In [None]:
# Preview the first five rows of the raw table.
display(_data_df.head(n=5))

In [None]:
# Summary statistics of the `rt` column, rounded to two decimals.
_data_df['rt'].describe().round(decimals=2)

In [None]:
# Histogram of the retention times of the full dataset. `rt` is divided by 60
# so the x-axis matches its "min" label (i.e. `rt` is presumably in seconds).
# NOTE(review): `plt` is not imported in any visible cell — presumably
# `matplotlib.pyplot`; confirm it is imported before a fresh "Run All".
plt.hist(_data_df['rt'].div(60), bins=50, edgecolor='black')
plt.xlabel('Retention time (min)')
plt.ylabel('Number of molecules')
plt.show()

In [None]:
# Filter out data with RT < 7 minutes.
# The threshold is 7 * 60, i.e. `rt` is presumably stored in seconds (the
# plots in this notebook divide it by 60 and label the axis in minutes).
_threshold_rt = 7 * 60

# Keep rows with rt >= threshold. The comparison is inclusive so that a row
# sitting exactly on the threshold is kept: only rows strictly below it are
# "filtered out", which is also exactly what `_count` reports below.
_data_adjusted_df = _data_df.loc[_data_df['rt'] >= _threshold_rt].reset_index(drop=True)

# `DataFrame.count()` returns one non-null count per column (a Series), taken
# over the rows that were removed.
_count = _data_df.loc[_data_df['rt'] < _threshold_rt].count()

In [None]:
# `_count` holds one non-null count per column; the value for the first column
# is reported as the number of removed rows (this equals the row count only if
# that column has no missing values).
print(f'Number of rows with rt values less than {_threshold_rt // 60} minutes: ', _count.iloc[0])

In [None]:
# Summary statistics of `rt` after filtering, rounded to two decimals.
display(_data_adjusted_df['rt'].describe().round(decimals=2))

In [None]:
# Histogram of the retention times that remain after filtering. `rt` is
# divided by 60 so the x-axis matches its "min" label.
# NOTE(review): `plt` is not imported in any visible cell — presumably
# `matplotlib.pyplot`; confirm it is imported before a fresh "Run All".
plt.hist(_data_adjusted_df['rt'].div(60), bins=50, edgecolor='black')
plt.xlabel('Retention time (min)')
plt.ylabel('Number of molecules')
plt.show()

# Generate Descriptors

In [None]:
# Convert InChI to SMILES, keeping the retention time of every compound that
# converts successfully. The conversion stays best-effort: a row that cannot be
# converted is skipped, but it is now counted and reported instead of being
# silently discarded.
# NOTE(review): `Chem` is not imported in any visible cell — presumably
# `from rdkit import Chem`; confirm it is imported before a fresh "Run All".
_smiles_list = []
_rt_list = []
_skipped_count = 0

# Iterating the two columns directly (instead of `iterrows`) means a missing
# "inchi"/"rt" column fails loudly here rather than producing empty lists.
for _inchi, _rt in zip(_data_adjusted_df["inchi"], _data_adjusted_df["rt"]):
    try:
        _mol = Chem.MolFromInchi(_inchi)
        # RDKit signals an unparsable InChI by returning None, not by raising.
        _smiles = Chem.MolToSmiles(_mol) if _mol is not None else None
    except Exception:
        # Narrower than a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate; covers e.g. a non-string (missing) InChI value.
        _smiles = None

    if _smiles is None:
        _skipped_count += 1
        continue

    # Append to both lists together so they can never get out of step.
    _smiles_list.append(_smiles)
    _rt_list.append(_rt)

if _skipped_count:
    print(f"Skipped {_skipped_count} rows whose InChI could not be converted to SMILES")

In [None]:
# Assemble the SMILES / retention-time table from the two lists and write it
# to CSV without the index column.
_data_adjusted_smiles_df = pd.DataFrame.from_dict(
    {'smiles': _smiles_list, 'rt': _rt_list}
)
_data_adjusted_smiles_df.to_csv(SMRT_DATASET_SMILES, index=False)

In [None]:
# Preview the first five rows of the SMILES / retention-time table.
_data_adjusted_smiles_df.head(n=5)

# Create Dataset


In [None]:
# Build vocabulary of SMILES characters.
# The call returns a pair that is unpacked into the vocabulary and its inverse.
# `file_path=None` is passed explicitly — presumably this means the vocabulary
# is not written to disk; confirm against `Encoder.build`.
# NOTE(review): `Encoder` is not imported in any visible cell — only
# `MoleculeEncoder` is imported from `qsrr.process`; confirm which name is intended.
_vocabulary, _inverse_vocabulary = Encoder.build(_data_adjusted_smiles_df, file_path=None)

In [None]:
# Inspect the inverse vocabulary returned by `Encoder.build`
# (presumably a mapping from integer index back to SMILES token — confirm).
print(_inverse_vocabulary)

In [None]:
# Encode the SMILES strings with the vocabulary, using `EncodingType.OneHot`.
# NOTE(review): the literal 90 is presumably the maximum / padded sequence
# length — confirm against `Encoder.encode` and consider a named constant.
# NOTE(review): `Encoder` and `EncodingType` are not imported in any visible
# cell — only `MoleculeEncoder` is imported from `qsrr.process`; confirm.
_encoded_smiles_array = Encoder.encode(_data_adjusted_smiles_df, _vocabulary, 90, EncodingType.OneHot)

In [None]:
# Shape of the encoded dataset; the array is indexed with three axes further
# down, the first one selecting a compound.
_encoded_smiles_array.shape

In [None]:
# Check if the process is reversible: pick one compound at random, take its
# encoded representation and decode it back into a SMILES string.
#
# `random.randrange(n)` draws from 0 .. n-1. The previous
# `random.randint(0, n)` includes `n` itself, which is one past the last valid
# row and would occasionally raise an IndexError.
_random_compound_idx: int = random.randrange(_encoded_smiles_array.shape[0])

# A 2-axis slice of the encoded array for the selected compound (not a string).
# Annotated as `ndarray` on the assumption that `Encoder.encode` returns a
# NumPy array — TODO confirm.
_random_smiles_encoded: ndarray = _encoded_smiles_array[_random_compound_idx, :, :]
_random_smiles_decoded: str = Encoder.decode(_random_smiles_encoded, _inverse_vocabulary, EncodingType.OneHot)

In [None]:
# Display the encoded representation of the randomly selected compound.
_random_smiles_encoded

In [None]:
# Round-trip check: True when the decoded string equals the SMILES stored at
# the same row label. This relies on the SMILES table and the encoded array
# sharing the same row order.
_random_smiles_decoded == _data_adjusted_smiles_df.loc[_random_compound_idx, "smiles"]

# Save the Dataset

In [None]:
# Serialize the encoded array into the pickle file (opened in binary write
# mode; the context manager closes the file even if pickling fails).
with open(SMRT_DATASET_ENCODED_SMILES, mode="wb") as f:
    pickle.dump(_encoded_smiles_array, f)