In [1]:
import pandas as pd

In [2]:
# Read the train / test / validation splits (in that order) from their CSV files.
# NOTE(review): the frame preview shows an 'Unnamed: 0' column — presumably a row
# index saved into the CSV; confirm whether it is needed downstream.
train, test, val = map(pd.read_csv, (
    'datasets/smiles2dock_train.csv',
    'datasets/smiles2dock_test.csv',
    'datasets/smiles2dock_val.csv',
))

In [4]:
# Show the training split as the cell output to inspect its columns and values.
train

Unnamed: 0,ligand,protein,score1
0,CHEMBL4165409,dpp4,-6.681
1,CHEMBL5012345,map3k15,-6.266
2,CHEMBL4209181,scn10a,-7.041
3,CHEMBL4594123,scn9a,-5.366
4,CHEMBL1559636,adcy5,-3.259
...,...,...,...
11977521,CHEMBL1375008,scn10a,-6.729
11977522,CHEMBL297168,scn10a,-3.061
11977523,CHEMBL3184097,gpr75,-8.563
11977524,CHEMBL1602012,scn9a,-9.391


In [6]:
def categorize_score_based_on_sigma(score, mean, sigma):
    """
    Categorizes a docking score by how many standard deviations it lies from the mean.

    Lower scores map to the stronger categories. 'Medium' is split into
    'Medium+' (below the mean) and 'Medium-' (at or above the mean), and
    'Very Strong' / 'Very Weak' cover scores at or beyond ±2 sigma of the mean.

    Parameters:
    - score: The docking score to categorize.
    - mean: Mean of the docking scores.
    - sigma: Standard deviation of the docking scores.

    Returns:
    - Category of the score based on sigma:
        score <= mean - 2*sigma                  -> 'Very Strong'
        mean - 2*sigma < score <= mean - sigma   -> 'Strong'
        mean - sigma   < score <  mean           -> 'Medium+'
        mean          <= score <  mean + sigma   -> 'Medium-'
        mean + sigma  <= score <  mean + 2*sigma -> 'Weak'
        score >= mean + 2*sigma                  -> 'Very Weak'
    - None if score, mean or sigma is NaN, because no category can be determined.
    """
    # NaN is the only value that is not equal to itself. Every ordering
    # comparison against NaN is False, so without this guard a missing value
    # would fall through the whole chain and be mislabelled 'Very Weak'.
    if score != score or mean != mean or sigma != sigma:
        return None

    if score <= mean - 2*sigma:
        return 'Very Strong'
    elif score <= mean - sigma:
        return 'Strong'
    elif score < mean:
        return 'Medium+'
    elif score < mean + sigma:
        return 'Medium-'
    elif score < mean + 2*sigma:
        return 'Weak'
    else:
        return 'Very Weak'

In [8]:
# Centre and spread of the training docking scores
sigma = train['score1'].std()
mean  = train['score1'].mean()

# Label every training row; mean and sigma are forwarded to the helper as
# extra positional arguments after the score itself
train['category'] = train['score1'].apply(categorize_score_based_on_sigma, args=(mean, sigma))

# Tally the rows per category and show the tally
category_counts_train = train['category'].value_counts()
print(category_counts_train)

category
Medium+        6762625
Medium-        4849468
Weak            149022
Very Weak       139964
Strong           75562
Very Strong        885
Name: count, dtype: int64


In [10]:
def get_category_count(df):
    """
    Labels every row of `df` with its sigma-based category and tallies the labels.

    The mean and standard deviation are computed from `df['score1']` itself,
    so each frame is categorized relative to its own score distribution.

    Parameters:
    - df: DataFrame with a 'score1' column. It is modified in place: a
      'category' column is added (or overwritten).

    Returns:
    - Series from `value_counts()` with the number of rows in each category.
    """
    scores = df['score1']
    center = scores.mean()
    spread = scores.std()

    # Row-wise labelling; centre and spread are passed on as the helper's
    # `mean` and `sigma` arguments
    df['category'] = scores.apply(categorize_score_based_on_sigma, args=(center, spread))

    return df['category'].value_counts()

In [11]:
# Categorize each split (train, test, val — in that order) and keep its
# per-category counts; each call also adds a 'category' column to the frame.
category_counts_train, category_counts_test, category_counts_val = map(
    get_category_count, (train, test, val)
)

In [17]:
# Fixed row order for the summary table; a category that is absent from a
# split's counts is filled in with 0 by the reindex below
all_categories = ['Very Strong', 'Strong', 'Medium+', 'Medium-', 'Weak', 'Very Weak']

train_counts, test_counts, val_counts = (
    counts.reindex(all_categories, fill_value=0)
    for counts in (category_counts_train, category_counts_test, category_counts_val)
)

# One column per split, one row per category
counts_df = pd.DataFrame(dict([
    ('Train', train_counts),
    ('Test', test_counts),
    ('Validation', val_counts),
]))

# Render the table as LaTeX source and print it
latex_table = counts_df.to_latex()
print(latex_table)

\begin{tabular}{lrrr}
\toprule
 & Train & Test & Validation \\
category &  &  &  \\
\midrule
Very Strong & 885 & 184 & 209 \\
Strong & 75562 & 14944 & 15470 \\
Medium+ & 6762625 & 962654 & 1940959 \\
Medium- & 4849468 & 689325 & 1387228 \\
Weak & 149022 & 22607 & 40578 \\
Very Weak & 139964 & 21362 & 37706 \\
\bottomrule
\end{tabular}

