# Cleaning of CJHIF dataset

This notebook is used to manually correct the name-to-SMILES translation of names that could not be translated with PubChemPy or py2opsin.

## Manual correction of names to SMILES by occurrence.

### Load datasets

In [None]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

In [None]:
# Read the tab-separated translation table into a pandas dataframe and report its size
df = pd.read_table("../data/helper/cjhif_translation_table.tsv")
print(f"There are {len(df)} compounds in the translation table.")

In [None]:
# Read the tab-separated correction table (the file must already exist) and report its size
df_correction = pd.read_table("../data/helper/corrected_pubchem.tsv")
print(f"There are {len(df_correction)} compounds in the correction table.")

### Manually correct a name to SMILES

In [None]:
# Template cell: replace the two placeholders with the compound name and its
# SMILES before running.
new_data = ["NAME_HERE", "SMILES_HERE"]
# len(df_correction) is used as the label of the new row, so the pair is added
# as a new row as long as that label is not already in the index.
df_correction.loc[len(df_correction)] = new_data
print(f"{new_data[0]} has been added to the correction table.")

In [None]:
# Write the correction table to disk (tab-separated, without the index), then read it back
df_correction.to_csv("../data/helper/corrected_pubchem.tsv", sep="\t", index=False)
df_correction = pd.read_table("../data/helper/corrected_pubchem.tsv")

### Visualization of the non-translated names

In [None]:
# Extract the non-translated compounds: rows whose pubchem_isosmiles is "empty"
# and whose name is not itself the "empty" placeholder
not_translated = (df["pubchem_isosmiles"] == "empty") & (df["Compound"] != "empty")

# Remove the row if the name is already in the correction table. A single
# isin() lookup replaces re-filtering the whole frame once per corrected name;
# dropna() keeps the previous behaviour of never matching on a missing name.
already_corrected = df["Compound"].isin(df_correction["Name"].dropna())

# .copy() makes the result independent of df, so later cells can add columns
# to it without triggering pandas' SettingWithCopyWarning
empty_pubchem_iso = df[not_translated & ~already_corrected].copy()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd


def plot_top_compounds(df: pd.DataFrame, top_N: int = 30) -> None:
    """Plot and print the compounds with the highest occurrence counts.

    Args:
        df: Frame with a "Compound" column (names) and an "Occurrences"
            column (counts). If a name appears in several rows, the count of
            the last such row is used.
        top_N: Maximum number of compounds to show.

    The function draws a bar chart and then prints a numbered list of the
    same compounds. When there is nothing to show (empty frame or
    ``top_N == 0``) it prints a short notice and returns without plotting.
    """
    data = dict(zip(df["Compound"], df["Occurrences"]))

    # Highest counts first; sorted() is stable, so ties keep their input order
    sorted_data = sorted(data.items(), key=lambda x: x[1], reverse=True)

    top_compounds = sorted_data[:top_N]
    if not top_compounds:
        # zip(*[]) yields nothing, so the unpacking below would raise ValueError
        print("No compounds to plot.")
        return

    labels, values = zip(*top_compounds)

    plt.figure(figsize=(15, 7))
    plt.bar(labels, values, color="#5402A3")
    plt.xticks(rotation=45, ha="right")
    plt.ylabel("Occurrences")
    plt.title(f"Top {top_N} Most Common Compounds")
    plt.tight_layout()
    plt.show()

    for i, (compound, occurrence) in enumerate(top_compounds, start=1):
        print(f"{i}. {compound}: {occurrence}")


# Plot and list the 60 most frequent names that are still untranslated
# (pubchem_isosmiles is "empty" and the name is not in the correction table)
plot_top_compounds(empty_pubchem_iso, top_N=60)

### Manual correction of chiral reagents and catalysts

In [None]:
# Select only the compounds whose name contains (+), (-), (r) or (s)


def select_compound(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows whose compound name contains a chirality marker.

    A row is kept when its "Compound" value contains one of the literal
    substrings "(+)", "(-)", "(r)" or "(s)". Matching is case-sensitive.
    Rows with a missing name are treated as non-matching.

    Args:
        df: Frame with a "Compound" column of names.

    Returns:
        The matching rows of ``df``, with their original index labels.
    """
    # Raw string: "\(" in a normal literal is an invalid escape sequence.
    # The character class covers the four markers in a single pattern.
    chiral_marker = r"\([+\-rs]\)"
    # na=False keeps the mask strictly boolean when a name is NaN/None;
    # a mask containing NaN cannot be used for boolean indexing.
    return df[df["Compound"].str.contains(chiral_marker, na=False)]


# Keep only the untranslated names that select_compound() accepts
empty_pubchem_chiral = select_compound(empty_pubchem_iso)
# Display the ones that occur more than 50 times
empty_pubchem_chiral[empty_pubchem_chiral["Occurrences"] > 50]

## Clustering 

Here we use the DBSCAN algorithm to cluster the names that could not be translated. Once the names are clustered, we correct them manually, starting with the cluster that has the most occurrences.

In [None]:
# This takes about 10 minutes to run

import ast

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from sklearn.cluster import DBSCAN


def get_similarity_matrix(compounds):
    """Build the symmetric matrix of pairwise ``fuzz.ratio`` scores.

    Entry ``[i, j]`` holds ``fuzz.ratio(compounds[i], compounds[j])``. Only
    the pairs with ``j >= i`` are scored; each score is mirrored to ``[j, i]``.
    """
    size = len(compounds)
    scores = np.zeros((size, size))
    for row in range(size):
        first = compounds[row]
        for col in range(row, size):
            score = fuzz.ratio(first, compounds[col])
            scores[row, col] = score
            scores[col, row] = score
    return scores


compounds = empty_pubchem_iso["Compound"].tolist()
similarity_matrix = get_similarity_matrix(compounds)

# fuzz.ratio scores range from 0 to 100; rescale and invert them so that
# identical names are at distance 0
distance_matrix = 1 - similarity_matrix / 100.0

# eps=0.1 links names whose similarity score is at least 90.
# min_samples=1 makes every name a core point, so no name is labelled as noise.
db = DBSCAN(eps=0.1, min_samples=1, metric="precomputed")
clusters = db.fit_predict(distance_matrix)
# assign() returns a new frame instead of writing a column into a filtered
# slice of df, which would trigger pandas' SettingWithCopyWarning
empty_pubchem_iso = empty_pubchem_iso.assign(cluster=clusters)


# One row per cluster, most frequent clusters first
clustered_data = (
    empty_pubchem_iso.groupby("cluster")
    .agg(
        Total_Occurrences=("Occurrences", "sum"),
        Compound_List=("Compound", list),
        Num_Compounds=("Compound", "count"),
    )
    .sort_values(by="Total_Occurrences", ascending=False)
    .reset_index()
)

# save results
clustered_data.to_csv("clustered_data.csv", index=False)

In [None]:
# Reload the saved clusters. After the CSV round trip, Compound_List holds each
# list as text, which is why the cells below parse it with ast.literal_eval.
clustered_data = pd.read_csv("clustered_data.csv")

In [None]:
# Show the first 10 clusters (the file was written sorted by total
# occurrences, highest first)
clustered_data.head(10)

In [None]:
# Show the compounds in a cluster, selected by its dataframe index label
cluster_index = 1
# Compound_List is read from the CSV as the text of a Python list; parse it
# back into a list before printing
print(ast.literal_eval(clustered_data["Compound_List"][cluster_index]))

In [None]:
# Add every name of the selected cluster to the correction table, all with the
# same SMILES. Replace the placeholder and set cluster_index before running.

new_smiles = "NEW_SMILES_HERE"
cluster_index = 1
for k in ast.literal_eval(clustered_data["Compound_List"][cluster_index]):
    new_data = [k, new_smiles]
    df_correction.loc[len(df_correction)] = new_data

# Remove the handled cluster (the original referenced an undefined name, `index`)
clustered_data.drop(index=cluster_index, inplace=True)
# Save to the same file that is reloaded on the next line; writing to the
# working directory instead would discard the new rows on reload
df_correction.to_csv("../data/helper/corrected_pubchem.tsv", sep="\t", index=False)
df_correction = pd.read_csv("../data/helper/corrected_pubchem.tsv", sep="\t")
print("Dataset updated and saved.")