Here, I try to obtain embeddings of the given smiles using chemberta so that later, we can use different types of simple machine learning models to predict their properties

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np
from functions import *

In [180]:
# model = SentenceTransformer("Derify/ChemMRL-alpha")
model = SentenceTransformer('../models/ChemBERTa-77M-MLM')

No sentence-transformers model found with name ../models/ChemBERTa-77M-MLM. Creating a new one with mean pooling.


In [3]:
# read data
df_train = pd.read_csv('../data/raw/neurips-open-polymer-prediction-2025/train.csv')
df_test = pd.read_csv('../data/raw/neurips-open-polymer-prediction-2025/test.csv')
print(f'Train shape: {df_train.shape}')
print(f'Test shape: {df_test.shape}')

Train shape: (7973, 7)
Test shape: (3, 2)


### Simple model, Nadaraya-Watson regression for each property

In [25]:
# get all the subsets of the df, where each subset has all the information for one property
data_tg = df_train.loc[~df_train['Tg'].isna(), ['id', 'SMILES', 'Tg']]
data_ffv = df_train.loc[~df_train['FFV'].isna(), ['id', 'SMILES', 'FFV']]
data_tc = df_train.loc[~df_train['Tc'].isna(), ['id', 'SMILES', 'Tc']]
data_density = df_train.loc[~df_train['Density'].isna(), ['id', 'SMILES', 'Density']]
data_rg = df_train.loc[~df_train['Rg'].isna(), ['id', 'SMILES', 'Rg']]

In [182]:
embeddings = model.encode(df_train['SMILES'])

In [183]:
# class holding all information of the regressor we are creating in one place
class NWRegressor():
    def __init__(self, df_train, model, kernel_bandwidth):
        # store the model as an attribute
        self.model = model
        # store as an attribute, the bandwidth of the kernel
        self.bandwidth = kernel_bandwidth
        # the df_train is also an integral part of the object, so should be stored as an attribute
        self.df_train = df_train
        # store all the data-subset attributes
        self.data_tg = df_train.loc[~df_train['Tg'].isna(), ['id', 'SMILES', 'Tg']]
        self.data_ffv = df_train.loc[~df_train['FFV'].isna(), ['id', 'SMILES', 'FFV']]
        self.data_tc = df_train.loc[~df_train['Tc'].isna(), ['id', 'SMILES', 'Tc']]
        self.data_density = df_train.loc[~df_train['Density'].isna(), ['id', 'SMILES', 'Density']]
        self.data_rg = df_train.loc[~df_train['Rg'].isna(), ['id', 'SMILES', 'Rg']]

    # separate method to initialize and assign embeddings because sometimes we may have them already computed and just want to assign so there is no need to do this in __init__
    def store_embeddings(self):
        # store the embeddings of all the smiles in df_train
        self.embeddings = self.model.encode(self.df_train['SMILES']) # shape: (nrow(df_train), 1024) = (7973, 1024)

    def kernel_fn(self, x):
        numerator = -(x * x)
        denominator = 2 * self.bandwidth * self.bandwidth
        return np.exp(numerator/denominator)
    
    def distance(self, arr1, arr2):
        # Compute pairwise distances (Euclidean by default)
        arr1_exp = arr1[:, np.newaxis, :]  # (x, 1, 120)
        arr2_exp = arr2[np.newaxis, :, :]  # (1, y, 120)
        sq_diff = (arr1_exp - arr2_exp) ** 2  # (x, y, 120)
        sq_distances = np.sum(sq_diff, axis=2)  # (x, y)
        distances = np.sqrt(sq_distances)  # (x, y)
        return distances

    # data_property is the dataframe corresponding to a certain property for which predictions are being made
    def predict_property(self, test_embeddings, data_property):
        # find out the distances between test_embeddings and self.embeddings' subset of data_property
        dists = self.distance(self.embeddings[data_property.index], test_embeddings) # shape: (nrow(data_property), test_embeddings)
        weights = self.kernel_fn(dists) # pass the mere distances to kernel to get weights
        # now, we scale the kernel-passed weights so that for each test_embedding, the weights' sum is 1
        weights_scaled = weights/weights.sum(axis = 0)
        # now we just tranpose as test_embeddings being as rownames are more intuitive
        weights_scaled = weights_scaled.T # shape: (test_embeddings,nrow(data_property))
        # now, just perform dot product with tg values vector to get tg value for each test_embedding
        train_tg_values = data_property.iloc[:, -1].values
        output_property_values = np.matmul(weights_scaled, train_tg_values)
        return output_property_values # shape: (test_embeddings,)

        # HERE... go through the details of the nadaraya-watson implementation. Where scaling, softmax etc. Then, finally
        # use the scaled weights to compute the avg value of tg.
    
    # predict on a list of smiles
    def predict(self, smiles):
        # create empty numpy arrays on which we can append as we get more and more predictions
        tg = np.empty(0)
        ffv = np.empty(0)
        tc = np.empty(0)
        density = np.empty(0)
        rg = np.empty(0)
        # loop is for being able to treat big list of smiles in parts
        for i in range(0, len(smiles), 1):
            smiles_subset = smiles[i:i+1000]
            test_embeddings = self.model.encode(smiles_subset)
            # call the predict method for each property and return the values
            tg = np.concatenate([tg, self.predict_property(test_embeddings, self.data_tg)])
            ffv = np.concatenate([ffv, self.predict_property(test_embeddings, self.data_ffv)])
            tc = np.concatenate([tc, self.predict_property(test_embeddings, self.data_tc)])
            density = np.concatenate([density, self.predict_property(test_embeddings, self.data_density)])
            rg = np.concatenate([rg, self.predict_property(test_embeddings, self.data_rg)])
        
        return {'Tg': tg, 'FFV': ffv, 'Tc': tc, 'Density': density, 'Rg': rg}

In [164]:
# finding good bandwidth using validation
from sklearn.model_selection import train_test_split
df_train_subset1, df_train_subset2 = train_test_split(df_train, test_size=0.05, train_size=0.95)

In [165]:
# resetting indices is important as the NWRegressor class assumes the df to have index in proper order
df_train_subset1 = df_train_subset1.reset_index(drop = True)
df_train_subset2 = df_train_subset2.reset_index(drop = True)

In [187]:
embeddings_df_train_subset1 = model.encode(list(df_train_subset1['SMILES']))

In [188]:
df_train

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.370410,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.378860,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.355470,,,
...,...,...,...,...,...,...,...
7968,2146592435,*Oc1cc(CCCCCCCC)cc(OC(=O)c2cccc(C(*)=O)c2)c1,,0.367498,,,
7969,2146810552,*C(=O)OCCN(CCOC(=O)c1ccc2c(c1)C(=O)N(c1cccc(N3...,,0.353280,,,
7970,2147191531,*c1cc(C(=O)NCCCCCCCC)cc(N2C(=O)c3ccc(-c4ccc5c(...,,0.369411,,,
7971,2147435020,*C=C(*)c1ccccc1C,261.662355,,,,


In [167]:
bw = 5
regressor = NWRegressor(df_train_subset1, model, bw)
regressor.embeddings = embeddings_df_train_subset1

In [168]:
results = regressor.predict(list(df_train_subset2['SMILES']))
results

{'Tg': array([96.9021827 , 96.86489382, 97.24017122, 97.08235879, 97.12738053,
        97.30993194, 96.41359025, 96.93743498, 97.02886353, 97.30738959,
        97.34384608, 97.30927689, 96.51444131, 96.47728191, 97.32933266,
        97.40552919, 97.42008   , 97.28600103, 97.31883791, 97.11382146,
        97.10139481, 97.17634599, 97.1814653 , 96.40158375, 97.02794687,
        97.11619299, 96.44739131, 96.73368298, 96.78446292, 96.80254243,
        97.24679163, 97.03527212, 97.28839675, 97.28476433, 96.86038435,
        96.85009155, 97.29550642, 97.34383014, 96.50819611, 96.51558432,
        96.77138167, 97.31019825, 97.16751858, 97.02672698, 97.16551505,
        96.70325237, 97.36286234, 97.41203767, 97.02902145, 96.50791265,
        96.87759389, 97.38628484, 97.18247764, 97.02217287, 96.9863213 ,
        97.14216936, 97.36011206, 97.28390467, 96.43026674, 96.73843746,
        96.82060294, 96.87652483, 96.57402842, 96.72118769, 96.99451865,
        97.3303303 , 97.46580452, 97.09954597

In [None]:
# mse between all the values we have in df_train_subset2, vs what we predicted
def MSE(results, df_train_subset2):
    # now, for all the values we did have in df_train_subset2, we will see how different our predictions were from them
    mask = df_train_subset2.iloc[:, -5:].isna().values
    # convert results to array
    results_arr = pd.DataFrame(results).values
    diff = results_arr - df_train_subset2.iloc[:, -5:].values
    mse = (diff[~mask]*diff[~mask]).mean()
    return mse

In [192]:
# loop through a range of bandwidth values and get the mse for each of them
for bw in [1, 3, 5, 7, 10, 15, 20]:
    regressor = NWRegressor(df_train_subset1, model, bw)
    regressor.embeddings = embeddings_df_train_subset1
    results = regressor.predict(list(df_train_subset2['SMILES']))
    mse = MSE(results, df_train_subset2)
    print(f'MSE for bandwidth {bw} is: {mse}')

KeyboardInterrupt: 

In [174]:
# loop through a range of bandwidth values and get the mse for each of them
for bw in [0.1, 0.5, 1, 3, 5, 7, 10, 15, 20]:
    regressor = NWRegressor(df_train_subset1, model, bw)
    regressor.embeddings = embeddings_df_train_subset1
    results = regressor.predict(list(df_train_subset2['SMILES']))
    mse = MSE(results, df_train_subset2)
    print(f'MSE for bandwidth {bw} is: {mse}')

MSE for bandwidth 0.1 is: 295.4577030396035
MSE for bandwidth 0.5 is: 381.51585066733224
MSE for bandwidth 1 is: 537.8446041863151
MSE for bandwidth 3 is: 596.609191776331
MSE for bandwidth 5 is: 601.4957767584965
MSE for bandwidth 7 is: 602.8462505183746
MSE for bandwidth 10 is: 603.5644226114141
MSE for bandwidth 15 is: 603.9479373244706
MSE for bandwidth 20 is: 604.0822210377564


In [178]:
# loop through a range of bandwidth values and get the mse for each of them
for bw in [0.09]:
    regressor = NWRegressor(df_train_subset1, model, bw)
    regressor.embeddings = embeddings_df_train_subset1
    results = regressor.predict(list(df_train_subset2['SMILES']))
    mse = MSE(results, df_train_subset2)
    print(f'MSE for bandwidth {bw} is: {mse}')

MSE for bandwidth 0.09 is: 310.3871255423069


In [104]:
regressor = NWRegressor(df_train, model, 10)
regressor.embeddings = embeddings

In [109]:
results = regressor.predict(list(df_test['SMILES']))
results

{'Tg': array([96.5414254 , 96.53859863, 96.52507728]),
 'FFV': array([0.36722244, 0.36721512, 0.36721965]),
 'Tc': array([0.25633297, 0.25630172, 0.2563517 ]),
 'Density': array([0.98554821, 0.98557284, 0.98552317]),
 'Rg': array([16.42099139, 16.41996536, 16.42111135])}

In [71]:
regressor.bandwidth = 1
results2 = regressor.predict(df_test['SMILES'])
results2

{'Tg': array([105.09817477, 104.73706845, 103.6838063 ]),
 'FFV': array([0.36823022, 0.36748777, 0.36795999]),
 'Tc': array([0.25606466, 0.25295064, 0.25783874]),
 'Density': array([0.99242539, 0.99511359, 0.98956572]),
 'Rg': array([16.56232519, 16.46827539, 16.55897016])}

In [73]:
regressor.bandwidth = 0.1
results3 = regressor.predict(df_test['SMILES'])
results3

{'Tg': array([117.60633137, 126.80818897,  48.24818847]),
 'FFV': array([0.38278817, 0.37967689, 0.35148612]),
 'Tc': array([0.21379043, 0.22237086, 0.18213848]),
 'Density': array([1.12927828, 1.10327563, 1.21231996]),
 'Rg': array([21.22370989, 21.5836155 , 18.07928376])}

In [87]:
# export into csv
def SaveOutput(results_dict, output_filename, df_test):
    df = pd.DataFrame()
    # get id col from df_test
    df['id'] = df_test['id']
    # get other info from results_dict
    df['Tg'] = results_dict['Tg']
    df['FFV'] = results_dict['FFV']
    df['Tc'] = results_dict['Tc']
    df['Density'] = results_dict['Density']
    df['Rg'] = results_dict['Rg']
    # save to specified location
    df.to_csv(output_filename, index = False)    

In [88]:
SaveOutput(results, 'nw_bw10.csv', df_test)

In [89]:
SaveOutput(results2, 'nw_bw1.csv', df_test)

In [90]:
SaveOutput(results3, 'nw_bw1tenth.csv', df_test)

In [114]:
data_tg

Unnamed: 0,id,SMILES,Tg
40,10142210,*NC(C)C(=O)NCC(=O)NCC(*)=O,208.639749
57,13838538,*CCCCCCSSCCCCSS*,-41.266724
63,16498242,*C=CCCCCCCCC*,-17.282022
108,30582999,*CCCCCCCCCCOC(=O)c1ccc(C(=O)NCCNC(=O)c2ccc(C(=...,4.250403
123,36217683,*c1nc2cc3sc(-c4cc(OCCCCCC)c(*)cc4OCCCCCC)nc3cc2s1,168.526313
...,...,...,...
7863,2116365788,*Nc1cc(SCCC#N)c(NC(=O)c2cccc(C(*)=O)c2)cc1SCCC#N,38.160660
7868,2117950580,*c1ccc(C2C(C(=O)OCC)C(*)C2C(=O)OCC)cc1,164.322463
7889,2124040823,*Oc1ccc(C=Cc2ccc(C=Cc3ccc(OC(=O)CCCCCCCCC(*)=O...,35.475235
7911,2130807414,*CC(*)C(=O)OCC1(C)COC(C)(C)OC1,95.741049


In [112]:
df_test

Unnamed: 0,id,SMILES
0,1109053969,*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...
1,1422188626,*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...
2,2032016830,*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...


In [91]:
temp = SentenceTransformer('/Users/shashankkatiyar/.cache/huggingface/hub/models--Derify--ChemMRL-alpha/snapshots/10bace4387f5ce86181c382b110ece9cd55d9dcc')

In [68]:
list(df_test['SMILES'])

['*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)(F)F)C(F)(F)F)cc3)cc2)cc1',
 '*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c5ccc(*)cc5)c4)cc3)cc2)cc1',
 '*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6c5C(=O)N(*)C6=O)cc4C3=O)c2)c1']

In [53]:
temp1 = np.random.randn(10,1024)
temp2 = np.random.randn(20,1024)
similarities = model.similarity(temp1, temp2)

In [55]:
similarities.shape

torch.Size([10, 20])

In [58]:
np.__version__

'1.26.4'

In [59]:
np.linalg.norm(temp1, ord = 2, axis = 1)

array([31.63327622, 32.68170798, 32.44342167, 31.57763404, 33.89966671,
       31.79731525, 31.81958678, 32.31310153, 32.28397145, 31.48179124])

In [4]:
embeddings = model.encode(sentences)

In [47]:
temp2

array([[-0.01229778, -1.22884953,  0.53899921, ...,  1.2770107 ,
        -0.77500096, -0.23875563],
       [ 0.98284899, -0.93888819, -0.44800889, ...,  0.04444339,
         0.2609156 ,  0.66138676],
       [-0.07738374, -0.09886296,  1.53370645, ...,  0.76334611,
        -1.50325518, -1.71726304],
       ...,
       [ 0.41544835, -0.44895731,  0.35903934, ...,  0.35689733,
        -1.41245859, -1.79132368],
       [ 2.70032027, -0.19512066, -2.31937895, ...,  0.46694383,
        -0.72352537, -0.44254963],
       [ 1.15613964,  0.10046639, -0.9822714 , ...,  0.66516305,
        -0.18370319,  2.55267478]])

In [51]:
similarities = model.similarity(temp2, temp)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (7973x1024 and 512x10)

In [8]:
similarities

tensor([[1.0000, 0.6260, 0.6811],
        [0.6260, 1.0000, 0.6901],
        [0.6811, 0.6901, 1.0000]])