In [3]:
import numpy as np
import deepchem as dc
from pathlib import Path
import pandas as pd

In [112]:
# Locate the dataset directory relative to the current working directory and
# load the raw train / test tables.
base_path = Path.cwd() / 'content/datasets/'
train_df = pd.read_csv(base_path / 'train_II.csv')
test_df = pd.read_csv(base_path / 'test_II.csv')

In [113]:
# The raw "Id" field packs two values separated by ";" (chemical first, assay
# last). Split on the LAST ";" only so the result always has exactly two parts.
train_df[["Chemical Id", "Assay Id"]] = train_df["Id"].str.rsplit(
    ";", n=1, expand=True
)
# Selecting the wanted columns also discards the original "Id" column.
train_df = train_df[["Assay Id", "Chemical Id", "Expected"]]

# Dropping silicon-based elements (identified by their row labels in the raw file).
train_df = train_df.drop(
    index=[10135, 26306, 42332, 47225, 62942, 72002]
).reset_index(drop=True)

# Remap the labels 1 -> 0 and 2 -> 1.
# The result is assigned back instead of calling `.replace(..., inplace=True)`
# on a column selection: the in-place form operates on a temporary object and
# does not update `train_df` when pandas Copy-on-Write is active.
train_df["Expected"] = train_df["Expected"].replace({1: 0, 2: 1})

# Same split for the test table, whose combined field is named "x".
test_df[["Chemical Id", "Assay Id"]] = test_df["x"].str.rsplit(";", n=1, expand=True)
test_df = test_df[["Assay Id", "Chemical Id"]]

In [114]:
# Write the reshaped tables to CSV (without the index column) so the DeepChem
# CSV loaders can read them.
train_df.to_csv(path_or_buf='train_deep.csv', index=False)
test_df.to_csv(path_or_buf='test_deep.csv', index=False)

In [115]:
# CSV loader for the training file: "Expected" is the single task column,
# "Chemical Id" is featurized with ConvMolFeaturizer and "Assay Id" is used as
# the sample id.
train_loader = dc.data.CSVLoader(
    tasks=['Expected'],
    feature_field='Chemical Id',
    id_field='Assay Id',
    featurizer=dc.feat.ConvMolFeaturizer(),
)

In [116]:
# Build the training dataset from the 'train_deep.csv' file written earlier,
# using the loader configured above.
dataset = train_loader.create_dataset('train_deep.csv')

In [117]:
# Show the featurized dataset as a DataFrame for a quick sanity check.
dataset.to_dataframe()

Unnamed: 0,X,y,w,ids
0,<deepchem.feat.mol_graphs.ConvMol object at 0x...,1.0,1.0,1644
1,<deepchem.feat.mol_graphs.ConvMol object at 0x...,1.0,1.0,2451
2,<deepchem.feat.mol_graphs.ConvMol object at 0x...,1.0,1.0,1384
3,<deepchem.feat.mol_graphs.ConvMol object at 0x...,1.0,1.0,16
4,<deepchem.feat.mol_graphs.ConvMol object at 0x...,1.0,1.0,1856
...,...,...,...,...
75372,<deepchem.feat.mol_graphs.ConvMol object at 0x...,1.0,1.0,33
75373,<deepchem.feat.mol_graphs.ConvMol object at 0x...,0.0,1.0,1632
75374,<deepchem.feat.mol_graphs.ConvMol object at 0x...,0.0,1.0,1373
75375,<deepchem.feat.mol_graphs.ConvMol object at 0x...,1.0,1.0,2


In [118]:
# Persist the dataset. No destination is passed here.
# NOTE(review): presumably this writes to the dataset's own data directory — confirm.
dataset.save_to_disk()

In [126]:
# Create a BalancingTransformer from the training dataset and apply it.
# NOTE(review): `dataset` is rebound to its transformed version, so re-running
# this cell feeds already-transformed data back in. Re-create `dataset` from the
# loader before executing this cell again.
transformer = dc.trans.BalancingTransformer(dataset=dataset)
dataset = transformer.transform(dataset)

In [127]:
# Prepare 5 stratified folds of the dataset with a fixed seed.
splitter = dc.splits.RandomStratifiedSplitter()
kfold_splits = splitter.k_fold_split(
    dataset=dataset,
    k=5,
    seed=10,
)

In [135]:
def kfold_cross_val(epochs = 10, splits=None, transformers=None):
    """Train a fresh GraphConvModel on every fold and collect its F1 evaluation.

    Parameters
    ----------
    epochs : int
        Number of epochs each per-fold model is trained for.
    splits : iterable of (train, valid) pairs, optional
        Folds to iterate over. Defaults to the notebook-level ``kfold_splits``.
    transformers : list, optional
        Transformers handed to ``evaluate``. Defaults to ``[transformer]``,
        the notebook-level transformer.

    Returns
    -------
    list
        One evaluation result per fold, in fold order (whatever
        ``GraphConvModel.evaluate`` returns for the F1 metric).
    """
    # Fall back to the notebook globals only when the caller passes nothing, so
    # existing calls such as ``kfold_cross_val()`` / ``kfold_cross_val(50)``
    # behave exactly as before.
    folds = kfold_splits if splits is None else splits
    eval_transformers = [transformer] if transformers is None else transformers

    f1_metric = dc.metrics.Metric(dc.metrics.f1_score)
    f1_scores = []
    for train, valid in folds:
        # A new model per fold, so nothing learned on one fold carries over.
        gcn_model = dc.models.GraphConvModel(n_tasks=1, mode='classification', n_classes=2)
        # Pass the epoch count by keyword so it cannot land in a wrong positional slot.
        gcn_model.fit(train, nb_epoch=epochs)
        f1_scores.append(gcn_model.evaluate(valid, [f1_metric], eval_transformers))
    return f1_scores

In [136]:
def print_f1_scores(f1):
    """Print the mean of the ``'f1_score'`` entries of the given results.

    Parameters
    ----------
    f1 : iterable of mappings
        Each item must provide an ``'f1_score'`` key.
    """
    fold_values = [fold_result['f1_score'] for fold_result in f1]
    print(np.mean(fold_values))

In [128]:
# First cross-validation run with the default epoch count, followed by the
# mean F1 across folds.
# NOTE(review): this cell's execution count is older than the cells defining
# kfold_cross_val / print_f1_scores and it has no recorded result — re-run it
# after the definitions.
v1_scores = kfold_cross_val()
print_f1_scores(v1_scores)













In [None]:
# Model 1: a GraphConvModel fitted on the whole training dataset.
# No epoch count is passed, so fit() uses its own default.
gcn_model_1 = dc.models.GraphConvModel(n_tasks=1, mode='classification', n_classes=2)
gcn_model_1.fit(dataset)

In [137]:
# Second cross-validation run, this time with 50 epochs per fold; the mean F1
# across folds is printed below.
v2_scores = kfold_cross_val(50)
print_f1_scores(v2_scores)













0.7537214594004455


In [138]:
# Model 2: same architecture settings as used during cross-validation.
gcn_model_2 = dc.models.GraphConvModel(
    n_tasks=1,
    mode='classification',
    n_classes=2,
)

In [139]:
# Train model 2 on the whole training dataset for 50 epochs.
# The number echoed below is fit()'s return value — presumably a training loss; confirm.
gcn_model_2.fit(dataset, nb_epoch=50)





1.1315798950195313

In [140]:
# Predict on the test dataset with model 2.
# NOTE(review): `test_ds` is only created in a cell further down this notebook
# (test_loader.create_dataset('test_deep.csv')); on a fresh kernel that cell
# must run before this one.
y2 = gcn_model_2.predict(dataset=test_ds, transformers=[transformer])

In [142]:
# Pick the highest-scoring entry along the last axis and flatten to 1-D.
preds_2 = y2.argmax(axis=-1).ravel()

In [144]:
# Build the submission file for model 2 from the raw test table.
# NOTE(review): `submission_path` is defined in a cell further down this
# notebook; on a fresh kernel that cell must run before this one.
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = preds_2
# Map the predicted class indices back to the original labels (0 -> 1, 1 -> 2),
# i.e. the inverse of the remapping applied to the training labels.
# The result is assigned back instead of using `.replace(..., inplace=True)` on
# a column selection, which acts on a temporary object and leaves `final_df`
# unchanged when pandas Copy-on-Write is active.
final_df['Predicted'] = final_df['Predicted'].replace({0: 1, 1: 2})
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path.joinpath('submission23.csv'), index=False)

In [78]:
# CSV loader for the test file: same feature / id columns and featurizer as
# the training loader, but with an empty task list.
test_loader = dc.data.CSVLoader(
    tasks=[],
    feature_field='Chemical Id',
    id_field='Assay Id',
    featurizer=dc.feat.ConvMolFeaturizer(),
)

In [79]:
# Build the test dataset from the 'test_deep.csv' file written earlier.
test_ds = test_loader.create_dataset('test_deep.csv')

In [94]:
# Predict on the test dataset.
# NOTE(review): no notebook-level `gcn_model` is defined in the visible cells
# (the only `gcn_model` is a local inside kfold_cross_val). This presumably
# relied on a since-removed cell or was meant to be `gcn_model_1` — confirm.
y_pred = gcn_model.predict(dataset=test_ds, transformers=[transformer])

In [101]:
# Pick the highest-scoring entry along the last axis and flatten to 1-D.
preds = y_pred.argmax(axis=-1).ravel()

In [108]:
# Directory (relative to the current working directory) that receives submission files.
submission_path = Path.cwd() / 'content/submissions/'

In [109]:
# Build the submission file from `preds` using the raw test table.
final_df = pd.read_csv(base_path.joinpath('test_II.csv'))
final_df['Predicted'] = preds
# Map the predicted class indices back to the original labels (0 -> 1, 1 -> 2),
# i.e. the inverse of the remapping applied to the training labels.
# The result is assigned back instead of using `.replace(..., inplace=True)` on
# a column selection, which acts on a temporary object and leaves `final_df`
# unchanged when pandas Copy-on-Write is active.
final_df['Predicted'] = final_df['Predicted'].replace({0: 1, 1: 2})
final_df.columns = ['Id', 'Predicted']
final_df.to_csv(submission_path.joinpath('submission22.csv'), index=False)

Unnamed: 0,Id,Predicted
0,CC1=CC(=C(C=C1)C(C)(C)C)O;1682,1
1,CCS(=O)(=O)C1=C(N=CC=C1)S(=O)(=O)NC(=O)NC2=NC(...,2
2,CC1=NC2=CC=CC=C2N1C3CC4CCC(C3)N4CCC5(CCN(CC5)C...,1
3,CC1=CC(=O)[N-]S(=O)(=O)O1.[K+];1850,2
4,CCC(C)(C)C(=O)O[C@H]1C[C@H](C=C2[C@H]1[C@H]([C...,1
...,...,...
10989,CC(=NOCC1=CC=CC=C1C(=NOC)C(=O)OC)C2=CC(=CC=C2)...,1
10990,C1=CC=C(C(=C1)C(C2=CC=C(C=C2)O)C3=CC=C(C=C3)O)...,2
10991,CC1=NC(=NC(=N1)OC)NC(=O)NS(=O)(=O)C2=CC=CC=C2C...,2
10992,COP(=O)(OC)OC=C(Cl)Cl;28,2
