In [68]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors

In [69]:
train_data = pd.read_csv('train_II.csv')
test_data = pd.read_csv('test_II.csv')

In [70]:
test_data = test_data.rename(columns = {"x": "Id"})
test_data.head()

Unnamed: 0,Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...


In [71]:
test_data['type'] = "test"
train_data['type'] = "train"

In [72]:
df = pd.concat([test_data, train_data])
df.head()

Unnamed: 0,Id,type,Expected
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,test,
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,test,
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,test,
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,test,
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,test,


In [73]:
df[["Chemical_Id", "Assay_Id"]] = df.Id.str.split(";", expand = True)
df.head()

Unnamed: 0,Id,type,Expected,Chemical_Id,Assay_Id
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,test,,CC1=CC(=C(C=C1)C(C)(C)C)O,1682
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,test,,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,test,,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,test,,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,test,,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30


In [74]:
from rdkit.Chem import Descriptors
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        if mol:
        # add hydrogens to molecules
            mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
            descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(df['Chemical_Id'])

[15:05:42] Explicit valence for atom # 1 Si, 8, is greater than permitted
[15:05:44] Explicit valence for atom # 1 Si, 8, is greater than permitted
[15:05:45] Explicit valence for atom # 1 Si, 8, is greater than permitted
[15:05:47] Explicit valence for atom # 1 Si, 8, is greater than permitted
[15:05:50] Explicit valence for atom # 1 Si, 8, is greater than permitted
[15:05:51] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [75]:
df_descriptors = pd.DataFrame(Mol_descriptors,columns=desc_names)
df_descriptors

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.120,164.120115,66,0,...,0,0,0,0,0,0,0,0,0,0
1,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,431.056940,152,0,...,0,1,1,0,0,0,0,0,0,1
2,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,695.250845,254,0,...,0,1,0,0,0,0,0,0,0,0
3,10.745579,-4.559583,10.745579,0.000000,0.393203,201.244,197.212,200.949810,56,0,...,0,0,0,0,0,0,0,0,0,0
4,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.270,418.271924,168,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86372,12.226619,-5.339907,12.226619,0.265375,0.712426,230.245,220.165,230.036128,82,0,...,0,1,0,0,0,0,0,0,0,0
86373,8.048957,-5.193498,8.048957,0.110750,0.720533,313.747,296.611,313.041677,104,0,...,0,0,0,0,0,0,0,0,0,0
86374,7.612361,-0.332130,7.612361,0.131389,0.596343,167.258,162.218,166.986341,50,0,...,0,0,0,0,0,1,0,0,0,0
86375,11.548424,-4.352546,11.548424,2.482554,0.519485,128.215,112.087,128.120115,54,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# df_descriptors = df_descriptors.astype('float32')

In [76]:
df_descriptors.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,8.018449,-3.840318,8.018449,1.098125,0.624614,164.248,148.12,164.120115,66,0,...,0,0,0,0,0,0,0,0,0,0
1,13.253248,-6.190954,13.253248,0.749312,0.617511,431.452,414.316,431.05694,152,0,...,0,1,1,0,0,0,0,0,0,1
2,16.485402,-6.427849,16.485402,0.554422,0.224134,696.264,655.944,695.250845,254,0,...,0,1,0,0,0,0,0,0,0,0
3,10.745579,-4.559583,10.745579,0.0,0.393203,201.244,197.212,200.94981,56,0,...,0,0,0,0,0,0,0,0,0,0
4,14.743838,-5.881101,14.743838,2.392476,0.639062,418.574,380.27,418.271924,168,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
df_descriptors.columns[df_descriptors.isnull().any()].tolist()

['MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'BCUT2D_MWHI',
 'BCUT2D_MWLOW',
 'BCUT2D_CHGHI',
 'BCUT2D_CHGLO',
 'BCUT2D_LOGPHI',
 'BCUT2D_LOGPLOW',
 'BCUT2D_MRHI',
 'BCUT2D_MRLOW']

In [78]:
df_descriptors = df_descriptors.replace([np.inf, -np.inf], np.nan)

In [79]:
df_descriptors = df_descriptors.fillna(df_descriptors.mean())

In [80]:
df_descriptors.columns[df_descriptors.isnull().any()].tolist()

[]

In [81]:
df_descriptors['MaxEStateIndex'] = df_descriptors['MaxEStateIndex'].apply(np.int64)
df_descriptors['MinEStateIndex'] = df_descriptors['MinEStateIndex'].apply(np.int64)
df_descriptors['MaxAbsEStateIndex'] = df_descriptors['MaxAbsEStateIndex'].apply(np.int64)
df_descriptors['MinAbsEStateIndex'] = df_descriptors['MinAbsEStateIndex'].apply(np.int64)
df_descriptors['qed'] = df_descriptors['qed'].apply(np.int64)
df_descriptors['MolWt'] = df_descriptors['MolWt'].apply(np.int64)
df_descriptors['HeavyAtomMolWt'] = df_descriptors['HeavyAtomMolWt'].apply(np.int64)
df_descriptors['ExactMolWt'] = df_descriptors['ExactMolWt'].apply(np.int64)

In [82]:
df = df.reset_index(drop=True)
df = pd.concat([df, df_descriptors], axis = 1)

In [83]:
from rdkit.Chem import AllChem
def smiles_to_fingerprints(smiles_list):
    fingerprints = []
    for smiles in smiles_list:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(m, 2, 2048)
        mfpts = np.array(fp)
        fingerprints.append(mfpts)
    return np.array(fingerprints)

In [84]:
chemical_Id_list = df['Chemical_Id'].to_list()

In [85]:
fingerprints = smiles_to_fingerprints(chemical_Id_list)

[15:23:21] Explicit valence for atom # 1 Si, 8, is greater than permitted
[15:23:35] Explicit valence for atom # 1 Si, 8, is greater than permitted
[15:23:49] Explicit valence for atom # 1 Si, 8, is greater than permitted
[15:23:53] Explicit valence for atom # 1 Si, 8, is greater than permitted
[15:24:07] Explicit valence for atom # 1 Si, 8, is greater than permitted
[15:24:15] Explicit valence for atom # 1 Si, 8, is greater than permitted


In [86]:
Mfingerprints = pd.DataFrame(fingerprints,columns=['Col_{}'.format(i) for i in range(fingerprints.shape[1])])

In [87]:
Mfingerprints

Unnamed: 0,Col_0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,...,Col_2038,Col_2039,Col_2040,Col_2041,Col_2042,Col_2043,Col_2044,Col_2045,Col_2046,Col_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86366,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86367,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86368,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
86369,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
Mfingerprints.columns[Mfingerprints.isnull().any()].tolist()

[]

In [89]:
df = df.reset_index(drop=True)
df = pd.concat([df, Mfingerprints], axis = 1)

In [90]:
df.head()

Unnamed: 0,Id,type,Expected,Chemical_Id,Assay_Id,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,...,Col_2038,Col_2039,Col_2040,Col_2041,Col_2042,Col_2043,Col_2044,Col_2045,Col_2046,Col_2047
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,test,,CC1=CC(=C(C=C1)C(C)(C)C)O,1682,8,-3,8,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,test,,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,1656,13,-6,13,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,test,,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,36,16,-6,16,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,test,,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+],1850,10,-4,10,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,test,,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,30,14,-5,14,2,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
train = df[df["type"] == "train"]
train.shape

(75383, 213)

In [20]:
train.columns[train.isnull().any()].tolist()

[]

In [21]:
train = train.replace([np.inf, -np.inf], np.nan)
train = train.fillna(train.mean())

  train = train.fillna(train.mean())


In [23]:
train.columns[train.isnull().any()].tolist()

[]

In [24]:
test = df[df["type"] == "test"]
test.shape

(10994, 213)

In [25]:
test.columns[test.isnull().any()].tolist()

['Expected']

In [28]:
train_X = train[['Assay_Id', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex']]
train_X.shape

(75383, 5)

In [29]:
test_X = test[['Assay_Id', 'MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex']]
test_X.shape

(10994, 5)

In [30]:
train_Y = train["Expected"]
train_Y.shape

(75383,)

In [31]:
test_X['Assay_Id'] = test_X['Assay_Id'].astype(str).astype(float).astype(int)
train_X['Assay_Id'] = train_X['Assay_Id'].astype(str).astype(float).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_X['Assay_Id'] = test_X['Assay_Id'].astype(str).astype(float).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_X['Assay_Id'] = train_X['Assay_Id'].astype(str).astype(float).astype(int)


In [32]:
train_Y = train_Y.astype(int)

In [33]:
train_X.columns[train_X.isnull().any()].tolist()

[]

In [34]:
test_X.columns[test_X.isnull().any()].tolist()

[]

In [35]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(train_X)
train_X_imp = imp.transform(train_X)

In [36]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(test_X)
test_X_imp = imp.transform(test_X)

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(train_X, train_Y, test_size=0.2, random_state=0)

In [38]:
np.any(np.isnan(X_train))

False

In [39]:
np.all(np.isfinite(X_train))

True

In [40]:
# from sklearn.ensemble import HistGradientBoostingClassifier
# clf = HistGradientBoostingClassifier(learning_rate=0.1, max_depth=8, random_state=11)
# clf.fit(X_train, Y_train)

In [44]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score
# # RandomForestClassifier
# clf = RandomForestClassifier(random_state=42)
# clf.fit(X_train, Y_train)
# pred = clf.predict(X_test)
# print("Accuracy: ", accuracy_score(Y_test, pred)*100, "%")

Accuracy:  86.27711083106718 %


In [55]:
# Define mapping dictionary
mapping = {1: 0, 2: 1}

In [57]:
Y_train = Y_train.map(lambda x: mapping.get(x, x))

In [58]:
# Y_train['Expected'] = Y_train['Expected'].map(1:0, 2:1)

In [59]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
#XGBClassifier
clf = XGBClassifier(max_depth = 8,n_estimators = 500, random_state = 11)
clf.fit(X_train, Y_train)
pred = clf.predict(X_test)
print("Accuracy: ", accuracy_score(Y_test, pred)*100, "%")

Accuracy:  9.305564767526695 %


In [60]:
pred = clf.predict(X_test)
acc_score = clf.score(X_train,Y_train)
print(acc_score)

0.8998607103770769


In [61]:
prd = clf.predict(test_X)
print(prd)
#prd1 = [int(s) for s in prd]

[1 1 1 ... 1 1 1]


In [64]:
arr = np.array(prd)

# Define mapping function
def transform_data(x):
    if x == 0:
        return 1
    elif x == 1:
        return 2
    else:
        return x

# Use vectorize() to apply mapping function to each element of array
mapped_arr = np.vectorize(transform_data)(arr)
print(mapped_arr)

[2 2 2 ... 2 2 2]


In [67]:
op = test_data[["Id"]]
op['Predicted'] = mapped_arr
op.to_csv("Output.csv", index=False)