In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors

In [2]:
train_data = pd.read_csv('train_II.csv')
test_data = pd.read_csv('test_II.csv')

In [3]:
test_data = test_data.rename(columns = {"x": "Id"})
test_data.head()

Unnamed: 0,Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...


In [4]:
test_data['type'] = "test"
train_data['type'] = "train"

In [5]:
df = pd.concat([test_data, train_data])
df.head()

Unnamed: 0,Id,type,Expected
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,test,
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,test,
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,test,
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,test,
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,test,


In [6]:
df[["Chemical_Id", "Assay_Id"]] = df.Id.str.split(";", expand = True)
df.head()

Unnamed: 0,Id,type,Expected,Chemical_Id,Assay_Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,test,,CC1=CC(=C(C=C1)C(C)(C)C)O,1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,test,,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,test,,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,test,,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,test,,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30


In [None]:
from rdkit.Chem import Descriptors
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
        # add hydrogens to molecules
            mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(df['Chemical_Id'])

In [59]:
df_descriptors = pd.DataFrame(Mol_descriptors,columns=desc_names)
df_descriptors

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,164.120115,66,0,...,0,0,0,0,0,0,0,0,0,0
1,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,431.056940,152,0,...,0,1,1,0,0,0,0,0,0,1
2,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,695.250845,254,0,...,0,1,0,0,0,0,0,0,0,0
3,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,200.949810,56,0,...,0,0,0,0,0,0,0,0,0,0
4,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,418.271924,168,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86372,12.226619,-5.339907,12.226619,0.265375,0.712426,230.245,220.165,230.036128,82,0,...,0,1,0,0,0,0,0,0,0,0
86373,8.048957,-5.193498,8.048957,0.110750,0.720533,313.747,296.611,313.041677,104,0,...,0,0,0,0,0,0,0,0,0,0
86374,7.612361,-0.332130,7.612361,0.131389,0.596343,167.258,162.218,166.986341,50,0,...,0,0,0,0,0,1,0,0,0,0
86375,11.548424,-4.352546,11.548424,2.482554,0.519485,128.215,112.087,128.120115,54,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
df_descriptors.columns[df_descriptors.isnull().any()].tolist()

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [61]:
df_descriptors = df_descriptors.replace([np.inf, -np.inf], np.nan)

In [62]:
df_descriptors = df_descriptors.fillna(df_descriptors.mean())

In [63]:
df_descriptors.columns[df_descriptors.isnull().any()].tolist()

[]

In [64]:
df_descriptors['MaxEStateIndex'] = df_descriptors['MaxEStateIndex'].apply(np.int64)
df_descriptors['MinEStateIndex'] = df_descriptors['MinEStateIndex'].apply(np.int64)
df_descriptors['MaxAbsEStateIndex'] = df_descriptors['MaxAbsEStateIndex'].apply(np.int64)
df_descriptors['MinAbsEStateIndex'] = df_descriptors['MinAbsEStateIndex'].apply(np.int64)
df_descriptors['qed'] = df_descriptors['qed'].apply(np.int64)
df_descriptors['MolWt'] = df_descriptors['MolWt'].apply(np.int64)
df_descriptors['HeavyAtomMolWt'] = df_descriptors['HeavyAtomMolWt'].apply(np.int64)
df_descriptors['ExactMolWt'] = df_descriptors['ExactMolWt'].apply(np.int64)

In [65]:
display(df_descriptors.dtypes)

MaxEStateIndex       int64
MinEStateIndex       int64
MaxAbsEStateIndex    int64
MinAbsEStateIndex    int64
qed                  int64
                     ...  
fr_thiazole          int64
fr_thiocyan          int64
fr_thiophene         int64
fr_unbrch_alkane     int64
fr_urea              int64
Length: 208, dtype: object

In [66]:
df = df.reset_index(drop=True)
df = pd.concat([df, df_descriptors], axis = 1)

In [67]:
df.head()

Unnamed: 0,Id,type,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,test,,CC1=CC(=C(C=C1)C(C)(C)C)O,1682,8,-3,8,1,0,...,0,0,0,0,0,0,0,0,0,0
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,test,,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656,13,-6,13,0,0,...,0,1,1,0,0,0,0,0,0,1
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,test,,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36,16,-6,16,0,0,...,0,1,0,0,0,0,0,0,0,0
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,test,,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10,-4,10,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,test,,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30,14,-5,14,2,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
# from rdkit.Chem import AllChem
# def smiles_to_fingerprints(smiles_list):
#     fingerprints = []
#     for smiles in smiles_list:
#         m = Chem.MolFromSmiles(smiles)
#         if m is None:
#             continue
#         fp = AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048)
#         mfpts = np.array(fp)
#         fingerprints.append(mfpts)
#     return np.array(fingerprints)

In [69]:
# chemical_Id_list = df['Chemical_Id'].to_list()

In [70]:
# fingerprints = smiles_to_fingerprints(chemical_Id_list)

In [71]:
# Mfingerprints = pd.DataFrame(fingerprints,columns=['Col_{}'.format(i) for i in range(fingerprints.shape[1])])

In [72]:
# Mfingerprints

In [73]:
# Mfingerprints.columns[Mfingerprints.isnull().any()].tolist()

In [74]:
# df = df.reset_index(drop=True)
# df = pd.concat([df, Mfingerprints], axis = 1)

In [75]:
# df.head()

In [76]:
train = df[df["type"] == "train"]
train.shape

(75383, 213)

In [77]:
train.columns[train.isnull().any()].tolist()

[]

In [78]:
# train = train.replace([np.inf, -np.inf], np.nan)
# train = train.fillna(train.mean())

In [79]:
train.columns[train.isnull().any()].tolist()

[]

In [80]:
test = df[df["type"] == "test"]
test.shape

(10994, 213)

In [81]:
test.columns[test.isnull().any()].tolist()

['Expected']

In [82]:
train_X = train.drop(["Id","type", "Chemical_Id", "Expected"], axis=1)
train_X.shape

(75383, 209)

In [83]:
test_X = test.drop(["Id", "type", "Expected", "Chemical_Id"], axis=1)
test_X.shape

(10994, 209)

In [84]:
train_Y = train["Expected"]
train_Y.shape

(75383,)

In [85]:
test_X['Assay_Id'] = test_X['Assay_Id'].astype(str).astype(float).astype(int)
train_X['Assay_Id'] = train_X['Assay_Id'].astype(str).astype(float).astype(int)

In [86]:
train_Y = train_Y.astype(int)

In [87]:
train_X.columns[train_X.isnull().any()].tolist()

[]

In [88]:
test_X.columns[test_X.isnull().any()].tolist()

[]

In [94]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(train_X)
train_X_imp = imp.transform(train_X)

In [95]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(test_X)
test_X_imp = imp.transform(test_X)

In [96]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(train_X_imp, train_Y, test_size=0.2, random_state=0)

In [106]:
np.any(np.isnan(X_train))

False

In [107]:
np.all(np.isfinite(X_train))

True

In [101]:
from sklearn.ensemble import HistGradientBoostingClassifier
clf = HistGradientBoostingClassifier(learning_rate=0.1, max_depth=8, random_state=11)
clf.fit(X_train, Y_train)

In [109]:
np.where(X_train.values >= np.finfo(np.float32).max)

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [108]:
from sklearn.ensemble import RandomForestClassifier
# RandomForestClassifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, Y_train)
pred = model.predict(X_test)
print("Accuracy: ", accuracy_score(Y_test, pred)*100, "%")

  array = numpy.asarray(array, order=order, dtype=dtype)


ValueError: Input X contains infinity or a value too large for dtype('float32').

In [103]:
pred = clf.predict(X_test)
acc_score = clf.score(X_train,Y_train)
print(acc_score)

0.9060126687228468


In [104]:
prd = clf.predict(test_X)
print(prd)
#prd1 = [int(s) for s in prd]

[2 2 2 ... 2 2 2]




In [105]:
op = test_data[["Id"]]
op['Predicted'] = prd
op.to_csv("Output.csv", index=False)