## Imports

In [1]:
!pip install pinecone-client datasets PyTDC rdkit datamol pandas numpy molfeat




In [2]:
import tqdm
import pandas as pd
import numpy as np
import itertools
from tdc.generation import MolGen
from pinecone import Pinecone, ServerlessSpec
import datamol as dm
import molfeat
from molfeat.calc import FPCalculator, RDKitDescriptors2D
from molfeat.trans import MoleculeTransformer
from molfeat.store.modelstore import ModelStore
from molfeat.trans.pretrained import PretrainedMolTransformer, GraphormerTransformer

Failed to find the pandas get_adjustment() function to patch
Failed to patch pandas - PandasTools will have limited functionality


## Load data

In [3]:
from tdc.generation import MolGen
data = MolGen(name = 'ZINC')
split = data.get_split() 

Found local copy...
Loading...
Done!


In [4]:
mols = dm.convert.from_df(split['train'])

## Featurize

In [5]:
# df = dm.descriptors.batch_compute_many_descriptors(mols, properties_fn=None, add_properties=True, n_jobs=1, batch_size=None, progress=False, progress_leave=True)

In [6]:
# # Load some dummy data
# data = dm.data.freesolv().sample(100).smiles.values

# # Featurize a single molecule
# calc = FPCalculator("ecfp")
# calc(data[0])

# # Define a parallelized featurization pipeline
# mol_transf = MoleculeTransformer(calc, n_jobs=-1)
# mol_transf(data)

# # Easily save and load featurizers
# mol_transf.to_state_yaml_file("state_dict.yml")
# mol_transf = MoleculeTransformer.from_state_yaml_file("state_dict.yml")
# mol_transf(data)

# # List all available featurizers
# store = ModelStore()
# store.available_models

# # Find a featurizer and learn how to use it
# model_card = store.search(name="ChemBERTa-77M-MLM")[0]
# model_card.usage()

In [6]:
# # List all available featurizers
store = ModelStore()
store.available_models
model_card = store.search(name='pcqm4mv2_graphormer_base')[0]

In [7]:
store.available_models

[ModelInfo(name='cats2d', inputs='smiles', type='hashed', version=0, group='all', submitter='Datamol', description='2D version of the 6 Potential Pharmacophore Points CATS (Chemically Advanced Template Search) pharmacophore. This version differs from `pharm2D-cats` on the process to make the descriptors fuzzy, which is closer to the original paper implementation. Implementation is based on work by Rajarshi Guha (08/26/07) and Chris Arthur (1/11/2015)', representation='vector', require_3D=False, tags=['CATS', 'hashed', '2D', 'pharmacophore', 'search'], authors=['Michael Reutlinger', 'Christian P Koch', 'Daniel Reker', 'Nickolay Todoroff', 'Petra Schneider', 'Tiago Rodrigues', 'Gisbert Schneider', 'Rajarshi Guha', 'Chris Arthur'], reference='https://doi.org/10.1021/ci050413p', created_at=datetime.datetime(2023, 7, 20, 9, 40, 19, 315784), sha256sum='9c298d589a2158eb513cb52191144518a2acab2cb0c04f1df14fca0f712fa4a1', model_usage=None),
 ModelInfo(name='cats3d', inputs='mol', type='hashed', 

In [9]:
# featurizer = GraphormerTransformer(kind='pcqm4mv2_graphormer_base', dtype=np.float32, pooling='mean', max_length=None, concat_layers=-1, ignore_padding=True, version=None)
featurizer = PretrainedMolTransformer(kind='ChemGPT-1.2B', dtype=np.float32, pooling='mean', max_length=None, concat_layers=-1, ignore_padding=True, version=None)

In [10]:
featurizer

In [11]:
data = dm.data.freesolv().smiles.values

In [12]:
len(data)

642

In [13]:
len(mols)

174618

In [24]:
calc = FPCalculator(replace_nan=True)

TypeError: FPCalculator.__init__() missing 1 required positional argument: 'method'

In [25]:
calc = FPCalculator("ecfp")

In [26]:
calc

<molfeat.calc.fingerprints.FPCalculator at 0x364f42ea0>

In [27]:
featurizer = MoleculeTransformer(calc, dtype=np.float32)

with dm.without_rdkit_log():
    feats = np.stack(featurizer(mols[:100]))

feats.dtype

dtype('float32')

In [28]:
len(feats[30])

2048

In [29]:
feats

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [30]:
id_list = [f'Molecule {i}' for i, _ in enumerate(feats)]

In [31]:
payload = zip(id_list, feats)

In [23]:
payload

<zip at 0x36531f500>

## Configure Pinecone index

In [37]:
pc = Pinecone(api_key="c2c9ba1d-9710-472b-a950-a3db5b40a67c")

In [38]:
index = pc.Index('molsearch')

In [None]:
## Upsert vectors

In [49]:
def chunks(iterable, batch_size=100):
    """A helper function to break an iterable into chunks of size batch_size."""
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

In [50]:
for ids_vectors_chunk in chunks(payload, batch_size=10):
    index.upsert(vectors=ids_vectors_chunk) 

## Similarity search

In [None]:
## Get 101st molecule and search

In [32]:
with dm.without_rdkit_log():
    feats = np.stack(featurizer(mols[100]))

feats.dtype

dtype('float32')

In [33]:
feats

array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [34]:
feats.tolist()[0]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0

In [39]:
query_results = index.query(vector = feats.tolist()[0], top_k = 100)

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 27 Mar 2024 00:19:13 GMT', 'Content-Type': 'application/json', 'Content-Length': '103', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '5', 'x-pinecone-request-id': '6781508509351382494', 'x-envoy-upstream-service-time': '1', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Vector dimension 2048 does not match the dimension of the index 216","details":[]}


In [40]:
query_results

NameError: name 'query_results' is not defined

In [86]:
w1 = "abcd"
w2 = "pq"

word1, word2 = list(w1), list(w2)

In [88]:
min(word1, word2)

['a', 'b', 'c', 'd']

In [89]:
word2

['p', 'q']

In [93]:
short_arr, long_arr = sorted([list(w1), list(w2)], key=len)

In [94]:
short_arr

['p', 'q']

In [95]:
long_arr

['a', 'b', 'c', 'd']

In [96]:
long_arr > short_arr

False

In [112]:
candies = [4,2,1,1,2]

In [104]:
max(candies)

4

In [113]:
candies.remove(2)

In [114]:
candies

[4, 1, 1, 2]

In [107]:
elem

2

In [116]:
d = {'key': 2}

In [118]:
list(d.values())

[2]

In [127]:
ransomNote, magazine = "aa",  "aab"

In [122]:
from collections import Counter

In [128]:
st1, st2 = Counter(ransomNote), Counter(magazine)

In [129]:
st1

Counter({'a': 2})

In [130]:
st2

Counter({'a': 2, 'b': 1})

In [131]:
st1 & st2

Counter({'a': 2})

In [132]:
 s = "dog cat cat dog"

In [133]:
s.split()

['dog', 'cat', 'cat', 'dog']

In [134]:
pattern = "abba"

In [135]:
list(pattern)

['a', 'b', 'b', 'a']

In [158]:
n = 96

In [151]:
str(n)[0]

'9'

In [159]:
digits = list(map(int, str(n)))

In [162]:
digits

[9, 6]

In [163]:
sum

387467145

In [164]:
del sum

In [165]:
res = sum(list(map(lambda x: x**2, digits)))

In [166]:
res

117

In [168]:
nums = ['a', 'b', 'c']
freq = [1, 1, 3]

In [169]:
nums

['a', 'b', 'c']

In [171]:
res =  list(map(lambda x, y: x*y, nums, freq))

In [186]:
new_str = ''
for i in range(len(nums)):
    new_str += nums[i] * freq[i]
    

In [188]:
import random

In [195]:
random.randint(0, len(nums))

1

In [27]:
positive =["Dogecoin to the moon", "Dogecoin to the moon"]

In [28]:
negative = ["I will short Tesla today", "I will short Tesla today"]

In [3]:
positive_words = [entry.split() for entry in positive]

[['Dogecoin', 'to', 'the', 'moon'], ['Dogecoin', 'to', 'the', 'moon']]

In [8]:
positive_words_single = [word for sublist in positive_words for word in sublist]

In [9]:
positive_words_single

['Dogecoin', 'to', 'the', 'moon', 'Dogecoin', 'to', 'the', 'moon']

In [12]:
import numpy as np

In [26]:
np.arange(1, 5, 1)

array([1, 2, 3, 4])

In [30]:
import torch

In [33]:
vocab_dict

{'Dogecoin': 1, 'I': 2, 'Tesla': 3, 'moon': 4, 'short': 5, 'the': 6, 'to': 7}

In [34]:
combined

['Dogecoin to the moon',
 'Dogecoin to the moon',
 'I will short Tesla today',
 'I will short Tesla today']

In [35]:
combined_words

['Dogecoin',
 'to',
 'the',
 'moon',
 'Dogecoin',
 'to',
 'the',
 'moon',
 'I',
 'will',
 'short',
 'Tesla',
 'today',
 'I',
 'will',
 'short',
 'Tesla',
 'today']

In [38]:
len(vocab)

9

In [36]:
vocab

['Dogecoin', 'I', 'Tesla', 'moon', 'short', 'the', 'to', 'today', 'will']

In [39]:
np.arange(1, len(vocab) + 1, 1)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [40]:
combined = positive + negative
        
## Create vocab dictionary
combined_words_list = [entry.split() for entry in combined]
combined_words = [word for sublist in combined_words_list for word in sublist]

vocab = sorted(list(set(combined_words)))
vocab_dict = dict(zip(vocab, np.arange(1, len(vocab) + 1, 1)))

## Encode sentences
tensors = []
for sentence in combined:
    values = []
    for word in sentence.split():
        values.append(vocab_dict[word])

    tensors.append(torch.tensor(values))

In [41]:
tensors

[tensor([1, 7, 6, 4]),
 tensor([1, 7, 6, 4]),
 tensor([2, 9, 5, 3, 8]),
 tensor([2, 9, 5, 3, 8])]