In [None]:
import numpy as np
import torch as tc
import pandas as pd

import scipy.stats as stats
import statsmodels.api as sm

from tqdm import tqdm as tqdm

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Scale factor applied to every matplotlib font size below.
# NOTE: `f` is a very short global name and later cells rebind it as a loop variable;
# only the derived *_SIZE constants should be relied on after this cell.
f = 3

SMALL_SIZE = 8*f
MEDIUM_SIZE = 10*f
BIGGER_SIZE = 12*f

# matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# releases dropped the old alias, so pick whichever name this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
plt.rc('font', size=SMALL_SIZE) # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE) # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE) # fontsize of the figure title
plt.rc('figure', figsize=(15, 10))

In [None]:
from functools import partial as p
from functools import reduce as r

def c(*fs):
    """Compose `fs` right-to-left: ``c(f, g)(x)`` evaluates ``f(g(x))``.

    Returns a partial of ``reduce`` that threads its argument through the
    functions, starting with the last one given.
    """
    def apply(value, fn):
        return fn(value)

    return p(r, apply, fs[::-1])
def lmap(f, a):
    """Apply `f` to every item of `a` and return the results as a list."""
    return [f(item) for item in a]
def lfilter(f, a):
    """Return a list of the items of `a` kept by the built-in ``filter(f, a)``."""
    kept = filter(f, a)
    return [*kept]
def nth(n):
    """Build a getter that returns ``v[n]`` for whatever `v` it is called with."""
    def pick(v):
        return v[n]

    return pick
def div(d):
    """Build a function that divides its argument by `d` (``x / d``)."""
    def divide(x):
        return x / d

    return divide
def fst(v):
    """Return the first item produced by iterating over `v`.

    Raises StopIteration when `v` yields nothing.
    """
    iterator = iter(v)
    return next(iterator)
# NOTE: despite its name, `snd` selects the LAST element (index -1); that is the
# second element only for length-2 sequences.
snd = nth(-1)

def infrange(start=0):
    """Endless generator yielding start, start + 1, start + 2, ..."""
    current = start
    while True:
        yield current
        current += 1
        
def srange(n):
    """Return a Series holding 0..n-1, indexed by the same 0..n-1 range."""
    positions = range(n)
    return pd.Series(positions, index=positions)

In [None]:
# Read the three input tables.
features = pd.read_csv('../input/lish-moa/train_features.csv')
test_features = pd.read_csv('../input/lish-moa/test_features.csv')
target = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

# sig_id is about to become the index of every table, so it must be a unique key.
for table in (features, test_features, target):
    assert not table.sig_id.duplicated().any()

# Index by sig_id and sort so rows of different tables line up by position.
features, test_features, target = (
    table.set_index('sig_id').sort_index()
    for table in (features, test_features, target)
)

In [None]:
def prefix_filter(prefix, a):
    """Return the strings in `a` that start with `prefix`, as a list.

    Args:
        prefix: leading substring to match.
        a: iterable of strings to search (e.g. a DataFrame's columns).

    The earlier version ignored `a` and always filtered the global
    `features.columns`; the argument is now honoured, so the helper gives the
    right answer for any collection passed in.
    """
    return [s for s in a if s.startswith(prefix)]

# Split the feature columns into three groups by name prefix.
cp_columns, g_columns, c_columns = (
    prefix_filter(prefix, features.columns) for prefix in ('cp_', 'g-', 'c-')
)

# The three groups together must account for every feature column.
assert len(cp_columns) + len(g_columns) + len(c_columns) == len(features.columns)
# Features and targets must describe the same samples in the same order.
assert (features.index == target.index).all()

# Side-by-side view of features and targets, joined on the shared sig_id index.
body = features.merge(target, left_index=True, right_index=True)

assert len(features) == len(body) and len(body) == len(target)

In [None]:
# Display the column names that matched the 'cp_' prefix.
cp_columns

In [None]:
# Display the distinct values found in the cp_type column.
features.cp_type.unique()

In [None]:
# Model inputs: a copy of the g-/c- columns plus one 0/1 indicator column for each
# distinct value of cp_time and cp_dose.
transformed_features = features[g_columns + c_columns].copy()

for f in ['cp_time', 'cp_dose']:
    for val in features[f].unique():
        indicator = features[f].eq(val).astype(int)
        transformed_features['{}_is_{}'.format(f, str(val))] = indicator

In [None]:
# Test-set model inputs, built to mirror the training matrix column-for-column.
transformed_test_features = test_features[g_columns + c_columns].copy()

for f in ['cp_time', 'cp_dose']:
    # Enumerate the categories seen in the TRAINING data, not the test data.
    # `unique()` returns values in order of first appearance, so deriving them from
    # test_features could yield a different column order (or a different set of
    # columns) than the training matrix, and the network consumes columns by position.
    # A value that occurs only in the test set gets no indicator column; such rows
    # simply have all indicators of that feature equal to 0.
    for val in features[f].unique():
        transformed_test_features['{}_is_{}'.format(f, str(val))] = (test_features[f] == val).astype(int)

# Guard against any train/test layout drift before the values are fed to the model.
assert list(transformed_test_features.columns) == list(transformed_features.columns)

In [None]:
# Preview the first rows of the transformed test matrix.
transformed_test_features.head(5)

In [None]:
# Preview the first rows of the transformed training matrix.
transformed_features.head(5)

In [None]:
import pytorch_lightning as pl

In [None]:
# Keep only the training rows whose cp_type equals 'trt_cp', then convert to tensors:
# float32 inputs and integer (long) targets, which `score` uses as indices.
mask = features['cp_type'].eq('trt_cp').to_numpy()
x_train = tc.from_numpy(transformed_features.to_numpy()[mask]).float()
y_train = tc.from_numpy(target.to_numpy()[mask]).long()

In [None]:
# Row mask of test samples whose cp_type equals 'trt_cp'. Note that x_test keeps ALL
# rows; the mask is applied later, when predictions are written.
test_mask = test_features['cp_type'].eq('trt_cp').to_numpy()
x_test = tc.from_numpy(transformed_test_features.to_numpy()).float()

In [None]:
def score(prob, true):
    """Mean binary log loss between predicted probabilities and 0/1 labels.

    Args:
        prob: tensor of predicted probabilities.
        true: tensor of labels with the same number of elements as `prob`;
            non-zero entries are treated as the positive class.

    Returns:
        Scalar tensor: the mean over all elements of -log(prob) where the label
        is positive and -log(1 - prob) otherwise.

    Raises:
        ValueError: if `prob` and `true` hold a different number of elements.

    Probabilities are clamped away from 0 and 1 before taking logs. The bounds
    are 1e-15 and 1 - 1e-15 whenever the dtype can represent them (float64).
    In float32, ``1 - 1e-15`` rounds to exactly 1.0, so the upper bound is
    tightened to the largest value below 1 that the dtype can hold; otherwise a
    saturated sigmoid output produces log(0) = -inf and NaN gradients.
    """
    if not prob.is_floating_point():
        prob = prob.to(tc.get_default_dtype())

    flat_prob = prob.reshape(-1)
    flat_true = true.reshape(-1)
    if flat_prob.numel() != flat_true.numel():
        raise ValueError(
            'prob and true must have the same number of elements, got {} and {}'.format(
                flat_prob.numel(), flat_true.numel()))

    info = tc.finfo(flat_prob.dtype)
    lower = max(1e-15, info.tiny)
    upper = 1.0 - max(1e-15, info.eps / 2)
    flat_prob = tc.clamp(flat_prob, lower, upper)

    # Select the log-likelihood term matching each label, on prob's own device.
    is_positive = flat_true.to(device=flat_prob.device, dtype=tc.bool)
    log_lik = tc.where(is_positive, tc.log(flat_prob), tc.log(1 - flat_prob))
    return -log_lik.mean()

In [None]:
class HiddenBlock(tc.nn.Module):
    """Residual block: ``x + ReLU(Linear_k->w(ReLU(Linear_w->k(x))))``.

    Args:
        w: width of the block's input and output.
        k: width of the inner layer.
    """

    def __init__(self, w, k):
        super().__init__()
        first = tc.nn.Linear(w, k)
        second = tc.nn.Linear(k, w)
        self.features = tc.nn.Sequential(first, tc.nn.ReLU(), second, tc.nn.ReLU())

    def forward(self, x):
        """Return the transformed input added back onto `x` (skip connection)."""
        transformed = self.features(x)
        return transformed + x


class Model(pl.LightningModule):
    """Feed-forward multi-label classifier trained with log loss plus an L2 penalty.

    Architecture: Linear -> ReLU -> `hidden_h` HiddenBlocks -> Linear, with a
    sigmoid applied in `forward`. The input width is read from the global
    `x_train` and the output width from the global `y_train`.

    Args:
        c: weight of the L2 penalty added to the training loss.
        hidden_w: width of the main hidden representation.
        hidden_k: inner width of each HiddenBlock (unused when hidden_h == 0).
        hidden_h: number of HiddenBlocks stacked between input and output layers.
    """

    def __init__(self, c=0.0, hidden_w=400, hidden_k=400, hidden_h=1):
        super().__init__()

        # Hidden blocks are built before the input/output layers; keep this order so
        # a fixed seed keeps producing the same initial weights.
        hidden = [
            HiddenBlock(hidden_w, hidden_k)
            for _ in range(hidden_h)
        ]

        layers = [
            tc.nn.Linear(x_train.shape[1], hidden_w),
            tc.nn.ReLU()
        ] + hidden + [
            tc.nn.Linear(hidden_w, y_train.shape[1])
        ]

        self.c = c
        # Per-target mean of the training labels, stored frozen alongside the weights.
        # It is not used in forward().
        self.baseline = tc.nn.Parameter(y_train.to(tc.float32).mean(dim=0), requires_grad=False)
        self.features = tc.nn.Sequential(*layers)

    def forward(self, x):
        """Return per-target probabilities for a batch of inputs."""
        return tc.sigmoid(self.features(x))

    def training_step(self, batch, batch_idx):
        """Return log loss + c * (sum of squared trainable weights); log the bare loss."""
        x, y = batch

        loss = score(self(x), y)
        # Penalise only trainable tensors: the frozen `baseline` would merely add a
        # constant that cannot influence any gradient.
        reg = sum((w**2).sum() for w in self.parameters() if w.requires_grad)

        self.log('train_loss', loss)
        return loss + reg*self.c

    def validation_step(self, batch, batch_idx):
        """Log the log loss of one validation batch under the key 'test_loss'."""
        x, y = batch
        # Single forward pass (the batch used to be pushed through the network twice).
        loss = score(self(x), y)

        self.log('test_loss', loss)

    def configure_optimizers(self):
        """Use Adam with a fixed learning rate of 1e-4 over all parameters."""
        return tc.optim.Adam(self.parameters(), lr=1e-4)

In [None]:
# Number of input columns the network's first layer receives.
x_train.shape[1]

In [None]:
!rm -r nn_sub/
tc.manual_seed(21)

train_loader = tc.utils.data.DataLoader(tc.utils.data.TensorDataset(x_train, y_train), batch_size=100)

model = Model(c=5e-5, hidden_w=800, hidden_k=1, hidden_h=0)
trainer = pl.Trainer(gpus=1, max_epochs=20, logger=pl.loggers.TensorBoardLogger(save_dir='nn_sub', name='nn sub'))
trainer.fit(model, train_loader, train_loader)

In [None]:
model.eval()
with tc.no_grad():
    # Feed the inputs on whatever device the trainer left the model on, and bring
    # the probabilities back to the CPU where y_train lives.
    train_prob = model(x_train.to(model.device)).cpu()
    print(float(score(train_prob, y_train)))

In [None]:
# Prediction matrix, one row per test sample. Rows outside `test_mask` are never
# written and therefore keep all-zero predictions.
pred = np.zeros((len(x_test), len(target.columns)))

model.eval()
with tc.no_grad():
    # Move inputs to the model's device and results back to the CPU so this works
    # whether or not the trainer left the model on the GPU.
    pred[test_mask] = model(x_test[test_mask].to(model.device)).cpu().numpy()

res = pd.DataFrame(data=pred, index=transformed_test_features.index, columns=target.columns)
res.head(5)

In [None]:
# Write the predictions to disk; the DataFrame index is written as the first column.
res.to_csv('submission.csv')