In [2]:
pip install torch

Collecting torchNote: you may need to restart the kernel to use updated packages.





  Downloading torch-1.10.0-cp38-cp38-win_amd64.whl (226.6 MB)
Installing collected packages: torch
Successfully installed torch-1.10.0





In [4]:
import argparse
import torch
import torch.nn as nn
#import tensorboard_logger
from nets import UnitedNet
from torch.utils.data import dataloader
from dataset import EGFRDataset, train_validation_split
import torch.optim as optim
from metrics import *
import utils
import matplotlib.pyplot as plt
import warnings
from sklearn.metrics import precision_recall_curve
import pandas as pd
import torch.utils.data as data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np
import os, glob

plt.switch_backend('agg')
warnings.filterwarnings('ignore')

In [5]:
def train_validation_split(data_path):
    if os.path.isdir(data_path):
        train_path = os.path.join(data_path, 'train.json')
        val_path = os.path.join(data_path, 'val.json')
    else:
        train_path = data_path.split('.')[0] + '_' + 'train.json'
        val_path = data_path.split('.')[0] + '_' + 'val.json'
    if os.path.exists(train_path) and os.path.exists(val_path):
        # return read_data(train_path), read_data(val_path)
        return pd.read_json(train_path, lines=True), pd.read_json(val_path, lines=True)
    data = read_data(data_path)
    train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
    train_data.to_json(train_path, orient='records', lines=True)
    val_data.to_json(val_path, orient='records', lines=True)
    return train_data, val_data

In [6]:
class EGFRDataset(data.Dataset):
    def __init__(self, data, infer=False):
        if isinstance(data, pd.DataFrame):
            self.data = data
        elif isinstance(data, str):
            self.data = read_data(data)
        self.NON_MORD_NAMES = ['smile_ft', 'id', 'subset', 'quinazoline', 'pyrimidine', 'smiles', 'active']
        self.infer = infer

        # Standardize mord features
        scl = StandardScaler()
        self.mord_ft = scl.fit_transform(
            self.data.drop(columns=self.NON_MORD_NAMES).astype(np.float64)).tolist()
        self.non_mord_ft = self.data['smile_ft'].values.tolist()
        self.smiles = self.data['smiles'].values.tolist()
        self.label = self.data['active'].values.tolist()

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        if self.infer:
            return self.smiles[idx], self.mord_ft[idx], self.non_mord_ft[idx], self.label[idx]
        else:
            return self.mord_ft[idx], self.non_mord_ft[idx], self.label[idx]

    def get_dim(self, ft):
        if ft == 'non_mord':
            return len(self.non_mord_ft[0])
        if ft == 'mord':
            return len(self.mord_ft[0])

    def get_smile_ft(self):
        return self.non_mord_ft

In [36]:
def read_data(data_path):
    data = None
    if data_path.endswith('.json'):
        try:
            data = pd.read_json(data_path, lines=True)
        except ValueError:
            data = pd.read_json(data_path)
    if data_path.endswith('.zip'):
        try:
            data = pd.read_json(data_path, compression='zip', lines=True)
        except ValueError:
            data = pd.read_json(data_path, compression='zip')
    return data


In [34]:
dataset = 'data/tyk2_1.json'

In [37]:
train_data, val_data = train_validation_split(dataset) # train 80%/ test 20% 으로 split해서 json으로 저장

In [16]:
train_data.drop(columns=['smile_ft', 'id', 'subset', 'quinazoline', 'pyrimidine', 'smiles', 'active']).dtypes.value_counts()

float64    651
int64      206
object       3
bool         2
dtype: int64

In [31]:
train_data.drop(columns=['smile_ft', 'id', 'subset', 'quinazoline', 'pyrimidine', 'smiles', 'active']).select_dtypes(object)

Unnamed: 0,AXp-0dv,MDEC-33,Xp-0dv
0,0.585284,8.813214,12.290961
1,0.566462,7.540357,16.427388
2,0.554092,14.771312,15.514575
3,0.547562,6.100588,10.951246
4,0.565457,5.092906,13.570979
...,...,...,...
1231,0.532986,16.987293,21.852432
1232,0.543126,13.67501,19.552553
1233,0.549541,5.946115,14.837618
1234,0.564558,7.515625,12.984842


In [26]:
train_data.drop(columns=['smile_ft', 'id', 'subset', 'quinazoline', 'pyrimidine', 'smiles', 'active']).select_dtypes(object)

Unnamed: 0,AXp-0dv,MDEC-33,Xp-0dv
0,0.585284,8.813214,12.290961
1,0.566462,7.540357,16.427388
2,0.554092,14.771312,15.514575
3,0.547562,6.100588,10.951246
4,0.565457,5.092906,13.570979
...,...,...,...
1231,0.532986,16.987293,21.852432
1232,0.543126,13.67501,19.552553
1233,0.549541,5.946115,14.837618
1234,0.564558,7.515625,12.984842


In [33]:
train_data['Xp-0dv'][60]

{'error': {'args': ['some properties less then or equal to 0']},
 'stack': [{'chi_types': ['path', 'cluster', 'path_cluster', 'chain'],
   'explicit_hydrogens': False,
   'kekulize': False,
   'parameter_names': ['type', 'order', 'prop', 'averaged'],
   'require_3D': False,
   'since': {'prerelease': None,
    'version_re': {'flags': 320,
     'groupindex': {},
     'pattern': '^(\\d+) \\. (\\d+) (\\. (\\d+))? ([ab](\\d+))?$'}}}]}

In [27]:
pd.to_numeric(train_data['AXp-0dv'])

TypeError: Invalid object type at position 60

In [8]:
train_dataset = EGFRDataset(train_data) # 정규화 & 필요없는 col drops

TypeError: float() argument must be a string or a number, not 'dict'