In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 500)

from collections import OrderedDict
import numpy as np
from matplotlib.pylab import plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from catalyst.contrib.nn.schedulers import OneCycleLRWithWarmup
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from catalyst.dl.callbacks import CriterionCallback
import catalyst.dl.utils as utils

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from catalyst.dl import SupervisedRunner
from catalyst.utils import set_global_seed

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition tables from the lish-moa input directory.
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')

# Sample submission; every column after the first is overwritten with the
# model predictions further down before the file is written out.
ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
def preprocess(df):
    """Return a model-ready copy of a raw feature table.

    ``cp_type`` is encoded as 0 (``trt_cp``) / 1 (``ctl_vehicle``), ``cp_dose``
    as 0 (``D1``) / 1 (``D2``), and the ``sig_id`` identifier column is removed.
    Values outside those mappings become NaN (``Series.map`` behaviour).

    The input frame is left untouched, so the function can safely be called
    again on the same frame (for example when the cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``cp_type`` and ``cp_dose`` columns; ``sig_id`` is
        dropped when present.

    Returns
    -------
    pandas.DataFrame
        New frame with the two encoded columns and without ``sig_id``; the
        remaining columns keep their original order.
    """
    cp_type_codes = {'trt_cp': 0, 'ctl_vehicle': 1}
    cp_dose_codes = {'D1': 0, 'D2': 1}

    # assign() replaces the columns outright, so they take the numeric dtype of
    # the codes. Writing through df.loc[:, col] can instead keep the original
    # object dtype, which later turns `.values` into an object array.
    encoded = df.assign(
        cp_type=df['cp_type'].map(cp_type_codes),
        cp_dose=df['cp_dose'].map(cp_dose_codes),
    )
    return encoded.drop(columns=['sig_id'], errors='ignore')

# Encode the categorical columns and remove the identifier from both feature tables.
train = preprocess(train_features)
test = preprocess(test_features)

# Remove the identifier from the targets as well. errors='ignore' keeps this
# statement from raising KeyError when `sig_id` has already been removed
# (for example when the cell is executed a second time).
train_targets = train_targets.drop(columns=['sig_id'], errors='ignore')

In [None]:
# Preview the first rows of the preprocessed training features.
train.head()

In [None]:
# Disabled step kept as a bare string literal, so none of it is executed:
# min-max scaling of the columns from position 3 onward, fitted on train and
# then applied to test.
'''mscl = MinMaxScaler()
scale_cols = list(train.columns[3:])
train[scale_cols] = mscl.fit_transform(train[scale_cols].values)
test[scale_cols] = mscl.transform(test[scale_cols].values)'''

In [None]:
def plot_pca(train, text="PCA", algo = 'PCA', size = 2, labels = None):
    """Project `train` onto two components and draw a scatter plot of the result.

    Parameters
    ----------
    train : array-like of shape (n_samples, n_features)
        Numeric data handed to ``fit_transform`` of the chosen reducer.
        The array is not modified.
    text : str
        Title of the figure.
    algo : str
        'PCA' or 'TSNE'. Any other value prints a notice and falls back to PCA.
    size : int
        Number of discrete colours taken from the 'seismic' colormap. Only
        used when `labels` is given.
    labels : array-like of shape (n_samples,), optional
        Per-point values used to colour the scatter. When omitted, the points
        are drawn in a single colour and no colorbar is added, because there
        is no colour data for a colormap or colorbar to represent.
    """
    if algo == 'TSNE':
        reducer = TSNE(n_components = 2)
    else:
        if algo != 'PCA':
            print('Unknown algo, using PCA...')
        # Keep PCA's default copy=True: with copy=False the data passed to
        # fit_transform is overwritten, silently altering the caller's array.
        reducer = PCA(n_components = 2)

    embedded = reducer.fit_transform(train)

    plt.figure(figsize=(20,8))
    if labels is None:
        plt.scatter(embedded[:,0], embedded[:,1], edgecolor='none', alpha=0.9)
    else:
        plt.scatter(embedded[:,0], embedded[:,1], c=labels, edgecolor='none', alpha=0.9,
                cmap=plt.cm.get_cmap('seismic', size))
        # A colorbar is only meaningful when the points carry colour data
        plt.colorbar()
    plt.title(text)
    plt.xlabel('component 1')
    plt.ylabel('component 2')

# Pass the title explicitly: `text` defaults to "PCA", which would mislabel a t-SNE projection.
plot_pca(train.values, text = "TSNE", algo = "TSNE")

In [None]:
# Names of every column from position 3 onward
# (presumably the first three are the cp_* descriptor columns; confirm against train.head()).
scale_cols = train.columns[3:].tolist()

In [None]:
def plot_data(train, test):
    """Compare per-row summary statistics of two data frames.

    For each of mean, std, max and min (taken across columns, so one value per
    row) a separate 20x8 figure is drawn, showing the distribution for `train`
    in green and for `test` in red, each with 100 bins and a KDE curve.
    """
    train_cols = train.columns.values
    test_cols = test.columns.values

    series_specs = (
        (train, train_cols, "green", 'train'),
        (test, test_cols, 'red', 'test'),
    )

    for stat_name in ("mean", "std", "max", "min"):
        plt.figure(figsize=(20,8))
        plt.title("Distribution of {} values per row in the train and test set".format(stat_name))
        for frame, cols, colour, legend_name in series_specs:
            # Look up the reduction by name and apply it row-wise
            per_row = getattr(frame[cols], stat_name)(axis=1)
            sns.distplot(per_row, color=colour, kde=True, bins=100, label=legend_name)
        plt.legend()
        plt.show()
    
# Compare train and test row statistics, restricted to the columns listed in scale_cols.
plot_data(train[scale_cols], test[scale_cols])

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(train, train_targets, test_size = 0.2, random_state = 42)

In [None]:
class TrainDataset(Dataset):
    """Map-style dataset pairing feature rows with their target rows.

    Both inputs are converted once, at construction time, to float32 NumPy
    arrays. This accepts plain arrays, object-dtype arrays holding numbers and
    DataFrames alike, and avoids repeating the dtype conversion for every item.

    Parameters
    ----------
    train_dataframe : array-like of shape (n_samples, n_features)
        Feature matrix.
    train_target : array-like of shape (n_samples, n_targets)
        Target matrix, row-aligned with `train_dataframe`.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of rows.
    """

    def __init__(self, train_dataframe, train_target):
        features = np.asarray(train_dataframe, dtype=np.float32)
        targets = np.asarray(train_target, dtype=np.float32)
        if len(features) != len(targets):
            raise ValueError(
                "features and targets must have the same number of rows "
                "(got {} and {})".format(len(features), len(targets))
            )
        self.train_df = features
        self.train_target = targets

    def __len__(self):
        return len(self.train_df)

    def __getitem__(self, index):
        """Return ``{"label": features, "target": targets}`` as float32 tensors.

        Note that the "label" key carries the model *input*; the key names are
        kept because consumers of this dataset look them up by name.
        """
        # torch.tensor copies, so the returned tensors never alias the stored arrays
        features = torch.tensor(self.train_df[index])
        target = torch.tensor(self.train_target[index])

        return {"label": features, "target": target}
        
class TestDataset(Dataset):
    """Map-style dataset over feature rows only (no targets).

    The input is converted once, at construction time, to a float32 NumPy
    array. This accepts plain arrays, object-dtype arrays holding numbers and
    DataFrames alike, and avoids repeating the dtype conversion for every item.

    Parameters
    ----------
    test_df : array-like of shape (n_samples, n_features)
        Feature matrix.
    """

    def __init__(self, test_df):
        self.test_df = np.asarray(test_df, dtype=np.float32)

    def __len__(self):
        return len(self.test_df)

    def __getitem__(self, index):
        """Return ``{"label": features}`` with the row as a float32 tensor.

        The "label" key carries the model *input*; the key name is kept
        because consumers of this dataset look it up by name.
        """
        # torch.tensor copies, so the returned tensor never aliases the stored array
        features = torch.tensor(self.test_df[index])

        return {"label": features}

In [None]:
class Classifier(nn.Module):
    """Fully connected network: three Linear -> BatchNorm1d -> PReLU stages and a linear head.

    The head applies no activation, so `forward` returns raw scores.

    Parameters
    ----------
    num_features : int
        Width of the input vector (default 875).
    num_targets : int
        Width of the output vector (default 206).
    hidden_sizes : sequence of three ints
        Output widths of the three hidden stages (default (875, 512, 256)).

    With the default arguments the architecture is identical to the
    hard-coded version this replaces.
    """

    def __init__(self, num_features=875, num_targets=206, hidden_sizes=(875, 512, 256)):
        super().__init__()

        hidden1, hidden2, hidden3 = hidden_sizes

        # Attribute names (including the gaps in their numbering) are kept
        # unchanged so that state_dict keys stay the same.
        self.linear1  = nn.Linear(num_features, hidden1)
        self.batchn1  = nn.BatchNorm1d(hidden1)
        self.prelu1   = nn.PReLU()

        self.linear2  = nn.Linear(hidden1, hidden2)
        self.batchn2  = nn.BatchNorm1d(hidden2)
        self.prelu3   = nn.PReLU()

        self.linear3  = nn.Linear(hidden2, hidden3)
        self.batchn3  = nn.BatchNorm1d(hidden3)
        self.prelu4   = nn.PReLU()

        self.linear5  = nn.Linear(hidden3, num_targets)

    def forward(self, x):
        """Map a (batch, num_features) tensor to (batch, num_targets) raw scores."""
        x = self.prelu1(self.batchn1(self.linear1(x)))
        x = self.prelu3(self.batchn2(self.linear2(x)))
        x = self.prelu4(self.batchn3(self.linear3(x)))

        return self.linear5(x)


In [None]:
# Build the network; it is not moved to the GPU because the CUDA transfer below is commented out.
model = Classifier()
#model.to("cuda")

In [None]:
# Loss functions are looked up by key; "bce" is the key the criterion callback asks for.
criterion = dict(bce=nn.BCEWithLogitsLoss())

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

# One-cycle learning-rate schedule with warm-up.
# Alternative left disabled: torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
scheduler = OneCycleLRWithWarmup(
    optimizer,
    num_steps=2,
    warmup_steps=2,
    lr_range=(0.05, 0.0005),
    momentum_range=(0.85, 0.95),
)

In [None]:
# Catalyst runner wired to the batch dictionaries produced by the datasets:
# "label" is the key holding the model input, "target" the key holding the
# ground truth, and the model output is stored under "pred".
runner = SupervisedRunner(
    input_key="label",
    output_key = "pred",
    input_target_key = "target"
    #device = "cuda"
    )

In [None]:
# K-fold training: one independent model per fold; the test predictions of
# all folds are averaged into `predictions`.
kf = KFold(n_splits=10)
batch = 64

test_ds = TestDataset(test.values)

test_dict = OrderedDict()
# Inference does not need one-row batches; reusing the training batch size
# gives the same stacked output shape with far fewer forward passes.
test_dl = DataLoader(test_ds, batch_size = batch, shuffle = False, num_workers = 1)

test_dict["test"] = test_dl

# 206 must match the output width of Classifier
predictions = np.zeros((len(test), 206))

for fold_, (train_index, test_index) in enumerate(kf.split(train)):
    train_ds = TrainDataset(train.iloc[train_index].values, train_targets.iloc[train_index].values)
    valid_ds = TrainDataset(train.iloc[test_index].values, train_targets.iloc[test_index].values)

    train_dl = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=1)
    valid_dl = DataLoader(valid_ds, batch_size=batch, shuffle=False, num_workers=1)

    data = OrderedDict()
    data["train"] = train_dl
    data["valid"] = valid_dl

    print("Fold idx:{}".format(fold_ + 1))

    # Start every fold from a freshly initialised network with its own
    # optimizer and scheduler. Reusing one model across folds means each fold
    # begins with weights already trained on its own validation rows, which
    # inflates the validation score and makes the averaged fold predictions
    # non-independent. Hyper-parameters mirror the notebook's module-level
    # optimizer/scheduler configuration.
    fold_model = Classifier()
    fold_optimizer = torch.optim.Adam(fold_model.parameters(), lr=1e-2)
    fold_scheduler = OneCycleLRWithWarmup(
        fold_optimizer,
        num_steps=2,
        lr_range=(0.05, 0.0005),
        warmup_steps=2,
        momentum_range=(0.85, 0.95))

    runner.train(
    model=fold_model,
    criterion=criterion,
    optimizer=fold_optimizer,
    scheduler = fold_scheduler,
    callbacks=[
        CriterionCallback(
            #fields = ['image'],
            input_key="target",
            output_key="pred",
            criterion_key='bce',
            prefix='loss',
        ),],
    loaders=data,
    logdir="run",
    load_best_on_end=True,
    num_epochs=30,
    verbose=True,)

    # The model emits raw scores; sigmoid turns them into probabilities
    pred = np.vstack(list(map(
    lambda x: x["pred"].sigmoid().cpu().numpy(),
    runner.predict_loader(loader=test_dict["test"])
    )))

    # Running mean over folds
    predictions += pred / kf.n_splits

In [None]:
# Plot the logged training curves.
# NOTE(review): training writes its logs to logdir="run" (relative to the
# current directory) while this reads from "../working"; the two only line up
# when the current directory is itself named "working" — confirm.
# NOTE(review): only a criterion callback with prefix 'loss' is registered
# during training; whether a series called "metric" exists is not visible here.
utils.plot_metrics(
    logdir="../working", 
    # specify which metrics we want to plot
    metrics=["loss", "metric"]
)

In [None]:
# Sanity check: the array was allocated as (number of test rows, 206).
predictions.shape

In [None]:
# Overwrite every column after the first (presumably the id column — confirm)
# with the fold-averaged predictions.
ss[ss.columns[1:]] = predictions

In [None]:
# Preview the first rows of the filled-in submission table.
ss.head()

In [None]:
# Write the submission file to the current directory, without the index column.
ss.to_csv("submission.csv", index = False)