## Setup (autoreload)

In [1]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before
# executing each cell.
%autoreload 2

## Oversample Data

In [90]:
import pandas as pd

# Load the raw HIV training set and mirror the "index" column into the
# DataFrame index (the column itself is kept).
df = pd.read_csv("data/raw/HIV_train.csv")
df.index = df["index"]

# Show the class balance explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed, so the counts were silently dropped.
print(df["HIV_active"].value_counts())

# Remember the ID of the first row; it is the starting point when the rows
# are re-numbered later. Reading from the column avoids materialising the
# whole first row as an object-dtype Series just to pick one scalar.
start_index = df["index"].iloc[0]

df.head()

Unnamed: 0_level_0,index,smiles,activity,HIV_active
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3999,3999,CC1c2nc(N)nc(N)c2CN1C(=O)c1ccccc1,CI,0
4000,4000,Cc1nc(N)c2c(n1)C(C)N(C(=O)c1ccccc1)C2,CI,0
4001,4001,NC(=S)NN=Cc1ccc(O)cn1,CI,0
4002,4002,COC1C(OC(=O)c2ccc(C)[nH]2)C(O)C(Oc2ccc3c(O)c(N...,CM,1
4003,4003,O=C1C=C2C=CC3CC2(O1)C1CCCCN31,CI,0


In [92]:
# Oversample the positive class (HIV_active == 1) by whole-copy replication
# until it is roughly as frequent as the negative class, then shuffle the rows
# and re-number them starting from `start_index`.
SEED = 42  # fixed seed so the shuffle below gives the same order on every run

# Check how many additional samples we need (count the classes only once)
class_counts = df["HIV_active"].value_counts()
neg_class = int(class_counts.get(0, 0))
pos_class = int(class_counts.get(1, 0))
if pos_class == 0:
    # Fail with a clear message instead of an obscure overflow from int(inf).
    raise ValueError("No positive (HIV_active == 1) samples available to oversample")

# Number of *extra* copies of the positive rows. Clamped at 0 so an already
# balanced (or positive-heavy) frame is left without replication.
multiplier = max(neg_class // pos_class - 1, 0)

# Replicate the dataset for the positive class
replicated_pos = [df[df["HIV_active"] == 1]] * multiplier

# Append replicated data
df = pd.concat([df] + replicated_pos, ignore_index=True)

# Shuffle dataset (seeded for reproducibility)
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Re-assign index (This is our ID later)
index = range(start_index, start_index + df.shape[0])
df.index = index
df["index"] = df.index
df.head()

Unnamed: 0,index,smiles,activity,HIV_active
3999,3999,c1ccc(Nc2nnc(CCCCCCCCc3nnc(Nc4ccccc4)o3)o2)cc1,CI,0
4000,4000,O=C(C=Cc1ccc(O)c(O)c1)OC1CCCCC1OC(=O)C=Cc1ccc(...,CI,0
4001,4001,CCOC12CC(OC)C3(O)CC(C1C3OCc1ccccc1)C13C(OC)CC(...,CM,1
4002,4002,CCOC(=O)CCc1cn2cc(Br)nc(OC)c2n1,CI,0
4003,4003,Oc1nc2[nH]c(-c3ccc(Br)cc3)cc2c2ccccc12,CI,0


In [None]:
# Export is currently disabled (commented out). Uncommenting the line below
# writes `df` to CSV without the DataFrame index.
# df.to_csv("data/raw/HIV_train_oversampled.csv", index=False)

## Training

In [1]:
from project.config import TrainingConfig
from project.train import train_model

import warnings
warnings.filterwarnings(action="ignore")

2023-07-23 20:36:39.348847: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[32m2023-07-23 20:36:40.477[0m | [1mINFO    [0m | [36mproject.train[0m:[36m<module>[0m:[36m22[0m - [1mUsing device: cuda:0[0m


In [None]:
# Build a training configuration with its default values and start a run.
N_EPOCHS = 100  # number of epochs passed to train_model
config = TrainingConfig()

# Kept as the last expression so any return value is still shown by the cell.
train_model(config, n_epochs=N_EPOCHS)

[32m2023-07-23 20:36:42.911[0m | [1mINFO    [0m | [36mproject.train[0m:[36mtrain_model[0m:[36m129[0m - [1mLoading the dataset...[0m
[32m2023-07-23 20:36:44.771[0m | [1mINFO    [0m | [36mproject.train[0m:[36mtrain_model[0m:[36m146[0m - [1mConstructed model with 365633 parameters[0m
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 560/560 [00:30<00:00, 18.39it/s]
[32m2023-07-23 20:37:15.289[0m | [1mINFO    [0m | [36mproject.train[0m:[36mtrain_model[0m:[36m168[0m - [1mEpoch 0 | Train Loss 0.7824822069278785[0m
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 20.42it/s]
[32m2023-07-23 20:37:16.875[0m | [1mINFO    [0m | [36mproject.train[0m:[36mtrain_model[0m:[36m176[0m - [1mEpoch 0 | Test Loss 0.7187392804771662[0m
100%|██████████████████████████████████████████████████████████████████████████████████████████