In [1]:
#Import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

In [2]:
#Import dataset
df = pd.read_csv("PI1M_v2.csv")

#Generate molecules
# Map over the SMILES column directly instead of DataFrame.apply(axis=1):
# the row-wise form materialises a Series for every record and returns a
# DataFrame (not a Series) when the frame is empty. The parser is still called
# once per SMILES string, so the resulting 'ROMol' column is the same.
df['ROMol'] = df['SMILES'].map(Chem.MolFromSmiles)

#Generate Morgan Fingerprints with Frequency (MFF) and convert RDKit object to numpy array
def _compute_mff(mol, depth, nBits):
    """Shared implementation behind computeMFF_tgc and computeMFF_egc.

    Parameters
    ----------
    mol : RDKit molecule or None
        Molecule to fingerprint. None is treated as a failed molecule.
    depth : int
        Second positional argument passed to GetHashedMorganFingerprint.
    nBits : int
        Third positional argument passed to GetHashedMorganFingerprint; also
        the length of the zero array handed to ConvertToNumpyArray.

    Returns
    -------
    numpy.ndarray or None
        The fingerprint as a float array, or None when the molecule is missing
        or RDKit raises while fingerprinting/converting.
    """
    # Nothing to fingerprint: report failure without calling into RDKit.
    if mol is None:
        return None
    arr = np.zeros(nBits)
    try:
        fingerprint = AllChem.GetHashedMorganFingerprint(mol, depth, nBits)
        DataStructs.ConvertToNumpyArray(fingerprint, arr)
    except Exception:
        # Best-effort: a molecule RDKit cannot fingerprint yields None.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return None
    return arr


def computeMFF_tgc(mol, depth=2, nBits=98):
    """Fingerprint `mol` for the Tg_C predictor (default length 98).

    Returns a numpy array, or None if the fingerprint could not be computed.
    """
    return _compute_mff(mol, depth, nBits)


def computeMFF_egc(mol, depth=2, nBits=2048):
    """Fingerprint `mol` for the Eg_C predictor (default length 2048).

    Returns a numpy array, or None if the fingerprint could not be computed.
    """
    return _compute_mff(mol, depth, nBits)

# Fingerprint every molecule for both predictors. Mapping over the 'ROMol'
# column avoids building a full row object per record, as
# DataFrame.apply(axis=1) does.
df['MFF_tgc'] = df['ROMol'].map(computeMFF_tgc)
df['MFF_egc'] = df['ROMol'].map(computeMFF_egc)

# computeMFF_* return None when a molecule cannot be fingerprinted. A single
# None makes np.array(...) below ragged/object-typed, which only shows up later
# as an opaque error, so fail here with an explicit message instead.
for fp_column in ('MFF_tgc', 'MFF_egc'):
    n_failed = sum(not isinstance(fp, np.ndarray) for fp in df[fp_column])
    if n_failed:
        raise ValueError(
            f"{n_failed} row(s) have no fingerprint in column '{fp_column}'; "
            "check the SMILES strings before predicting."
        )

# Stack the per-row fingerprints into 2-D feature matrices (one row per molecule).
X_tgc = np.array(df['MFF_tgc'].tolist())
X_egc = np.array(df['MFF_egc'].tolist())

In [3]:
#import model
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only load model files that come from a trusted source.
# The files are opened in `with` blocks so the handles are closed right after
# loading instead of being left for the garbage collector.
with open("Tg_C_Predictor.model", "rb") as model_file:
    tgc = pickle.load(model_file) #taken from Tao 2021 (https://doi.org/10.1021/acs.jcim.1c01031)
df['Tg_C'] = tgc.predict(X_tgc)

with open("Eg_C_Predictor.model", "rb") as model_file:
    egc = pickle.load(model_file) #taken from generated model in model folder - Note: Remember to change the filename to match the saved model
df['Eg_C'] = egc.predict(X_egc)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
# Display the first row of df.
df.head(n=1)

Unnamed: 0,SMILES,SA Score,ROMol,MFF_tgc,MFF_egc,Tg_C,Eg_C
0,*CCC[Fe]CCCC(=O)OCCCCOCCCNCC(*)=O,4.174851,<rdkit.Chem.rdchem.Mol object at 0x17baaa810>,"[1.0, 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 1.0, 2.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",58.654611,5.993917


In [5]:
#Full dataset
# Drop the fingerprint arrays and RDKit molecule objects before writing the CSV.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a
# second run the columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['MFF_tgc', 'MFF_egc', 'ROMol'], errors='ignore')
df.to_csv("PI1M_Processed.csv", index=False)


In [6]:
#Filtered dataset
# Keep only the rows whose predicted Eg_C is at most 2.0, then write them out
# without the index column.
filtered_df = df.loc[df['Eg_C'] <= 2.0]
filtered_df.to_csv(
    "Filtered_Polymers.csv",
    index=False,
)