In [None]:
import numpy as np 
import pandas as pd 

import os

import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
%config InlineBackend.figure_format = 'svg'

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# matplotlib.style.use('seaborn') 

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, file_name)
    for root, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the four competition tables in one pass; the tuple order on the left
# matches the order of the paths on the right.
train_features, train_targets_scored, train_targets_nonscored, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/lish-moa/train_features.csv',
        '/kaggle/input/lish-moa/train_targets_scored.csv',
        '/kaggle/input/lish-moa/train_targets_nonscored.csv',
        '/kaggle/input/lish-moa/test_features.csv',
    )
)

In [None]:
train_features.head()

In [None]:
train_targets_scored.head()

In [None]:
# Shapes of the feature table and of both target tables (scored and non-scored).
# The scored table was previously listed twice and the non-scored one never shown.
train_features.shape, train_targets_scored.shape, train_targets_nonscored.shape

In [None]:
train_features.isnull().sum().sum()

Проверим, все ли id уникальны и совпадают ли id для признаков и таргетов

In [None]:
train_features.sig_id.nunique()

In [None]:
(train_features.sig_id != train_targets_scored.sig_id).sum()

In [None]:
# Use the signature id as the row index of every table.
train_features = train_features.set_index('sig_id')
test_features = test_features.set_index('sig_id')
train_targets_scored = train_targets_scored.set_index('sig_id')
train_targets_nonscored = train_targets_nonscored.set_index('sig_id')

### Исследуем таргеты

Проверим, может ли один объект иметь несколько положительных таргетов одновременно (т.е. является ли задача multi-label).

In [None]:
train_targets_scored.sum(axis=1).value_counts()

In [None]:
train_targets_scored.sum(axis=0)

Посмотрим на кол-во объектов с положительным значением для каждого таргета.

In [None]:
# Scatter of the number of positive samples per scored target, in column order.
target_positions = np.arange(train_targets_scored.shape[1])
positives_per_target = train_targets_scored.sum(axis=0)

fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(target_positions, positives_per_target)
ax.grid(True)
ax.set_ylabel('Кол-во положительных исходов')
ax.set_xlabel('Номер целевой переменной')
ax.set_xticks(target_positions[::10])
plt.show()

In [None]:
def _columns_with_prefix(prefix):
    """Return the train_features column names that start with `prefix`, in order."""
    return [name for name in train_features.columns if name.startswith(prefix)]


# Columns prefixed 'g-' and 'c-' form the two feature groups used below.
g_features = _columns_with_prefix('g-')
c_features = _columns_with_prefix('c-')

In [None]:
len(g_features), len(c_features)

In [None]:
# Per-column mean of every g- feature, train vs. test.
gene_positions = np.arange(len(g_features))

fig, ax = plt.subplots(figsize=(10, 5))
for split_label, split_frame in (('train', train_features), ('test', test_features)):
    ax.scatter(gene_positions, split_frame[g_features].mean(axis=0), label=split_label)
ax.grid(True)
ax.legend()
ax.set_ylabel('Среднее значение')
ax.set_xlabel('Номер генной переменной')
ax.set_xticks(gene_positions[::40])
plt.show()

In [None]:
# Per-column variance of every g- feature, train vs. test.
gene_positions = np.arange(len(g_features))

fig, ax = plt.subplots(figsize=(10, 5))
for split_label, split_frame in (('train', train_features), ('test', test_features)):
    ax.scatter(gene_positions, split_frame[g_features].var(axis=0), label=split_label)
ax.grid(True)
ax.legend()
ax.set_ylabel('Разброс')
ax.set_xlabel('Номер генной переменной')
ax.set_xticks(gene_positions[::40])
plt.show()

In [None]:
# Per-column mean of every c- feature, train vs. test.
cell_positions = np.arange(len(c_features))

fig, ax = plt.subplots(figsize=(10, 5))
for split_label, split_frame in (('train', train_features), ('test', test_features)):
    ax.scatter(cell_positions, split_frame[c_features].mean(axis=0), label=split_label)
ax.grid(True)
ax.legend()
ax.set_ylabel('Среднее значение')
ax.set_xlabel('Номер клеточной переменной')
ax.set_xticks(cell_positions[::10])
plt.show()

In [None]:
# Per-column variance of every c- feature, train vs. test.
cell_positions = np.arange(len(c_features))

fig, ax = plt.subplots(figsize=(10, 5))
for split_label, split_frame in (('train', train_features), ('test', test_features)):
    ax.scatter(cell_positions, split_frame[c_features].var(axis=0), label=split_label)
ax.grid(True)
ax.legend()
ax.set_ylabel('Разброс')
ax.set_xlabel('Номер клеточной переменной')
ax.set_xticks(cell_positions[::10])
plt.show()

Выделяются три группы таргетов — редкие (число объектов от 0 до 200), частые (число объектов от 200 до 600) и популярные (число объектов > 600). Может пригодиться в дальнейшем.

In [None]:
train_targets_scored.loc[:, train_targets_scored.sum(axis=0) > 600]

In [None]:
from collections import Counter

# Count target columns by the token that follows the last underscore in the name.
moa_types = Counter(
    column_name.rsplit('_', 1)[-1] for column_name in train_targets_scored.columns
)

In [None]:
moa_types

### Исследуем признаки

In [None]:
train_features.cp_type.value_counts()

In [None]:
pd.crosstab(train_features.cp_type, train_targets_scored.sum(axis=1))

Проверим, что объекты с cp_type == 'ctl_vehicle' не имеют MoAs.

In [None]:
train_targets_scored.loc[train_features[train_features.cp_type == 'ctl_vehicle'].index].sum(axis=0).sum()

В дальнейшем для обучения удалим объекты с cp_type == 'ctl_vehicle' и будем выдавать для них нулевые вероятности для всех MoAs.

In [None]:
train_features.cp_time.value_counts()

In [None]:
pd.crosstab(train_features.cp_time, train_targets_scored.sum(axis=1))

In [None]:
pd.crosstab(train_features.cp_dose, train_targets_scored.sum(axis=1))

In [None]:
train_features.cp_dose.value_counts()

### Подготовка данных

In [None]:
len(g_features), len(c_features)

In [None]:
def add_statistics(df, g_cols=None, c_cols=None):
    """Append row-wise summary statistics of the g-/c- feature groups to `df` in place.

    For each of three column groups -- `g_cols`, `c_cols`, and their
    concatenation -- five new columns are written: ``<prefix>_sum``,
    ``<prefix>_mean``, ``<prefix>_std``, ``<prefix>_kurt`` and ``<prefix>_skew``,
    with prefixes ``g``, ``c`` and ``gc`` respectively (15 columns in total).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in `g_cols` and `c_cols`.
        It is modified in place.
    g_cols, c_cols : list of str, optional
        Column names of the two groups. When omitted, the notebook-level
        `g_features` / `c_features` lists are used, so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    None
    """
    if g_cols is None:
        g_cols = g_features
    if c_cols is None:
        c_cols = c_features

    column_groups = (
        ('g', list(g_cols)),
        ('c', list(c_cols)),
        ('gc', list(g_cols) + list(c_cols)),
    )
    for prefix, columns in column_groups:
        # Slice the group once and reuse it for all five statistics.
        group_values = df[columns]
        df[prefix + '_sum'] = group_values.sum(axis=1)
        df[prefix + '_mean'] = group_values.mean(axis=1)
        df[prefix + '_std'] = group_values.std(axis=1)
        df[prefix + '_kurt'] = group_values.kurtosis(axis=1)
        df[prefix + '_skew'] = group_values.skew(axis=1)

In [None]:
# Project the g- features onto 70 principal components.
# NOTE(review): the PCA is fitted on train and test rows together, so test data
# influences the learned projection -- confirm this is intended.
# random_state is fixed so the projection is reproducible when scikit-learn
# selects its randomized solver.
g_pca = PCA(n_components=70, random_state=42).fit_transform(
    pd.concat((train_features[g_features], test_features[g_features])).values)

# Explicit, prefixed column names keep these components from sharing the default
# integer labels (0, 1, 2, ...) with any other component frame.
g_pca_columns = ['g_pca_{}'.format(i) for i in range(g_pca.shape[1])]
n_train_rows = train_features.shape[0]
g_pca_train = pd.DataFrame(g_pca[:n_train_rows], index=train_features.index,
                           columns=g_pca_columns)
g_pca_test = pd.DataFrame(g_pca[n_train_rows:], index=test_features.index,
                          columns=g_pca_columns)

In [None]:
# Project the c- features onto 10 principal components.
# NOTE(review): the PCA is fitted on train and test rows together, so test data
# influences the learned projection -- confirm this is intended.
# random_state is fixed so the projection is reproducible when scikit-learn
# selects its randomized solver.
c_pca = PCA(n_components=10, random_state=42).fit_transform(
    pd.concat((train_features[c_features], test_features[c_features])).values)

# Explicit, prefixed column names keep these components from sharing the default
# integer labels (0, 1, 2, ...) with any other component frame.
c_pca_columns = ['c_pca_{}'.format(i) for i in range(c_pca.shape[1])]
n_train_rows = train_features.shape[0]
c_pca_train = pd.DataFrame(c_pca[:n_train_rows], index=train_features.index,
                           columns=c_pca_columns)
c_pca_test = pd.DataFrame(c_pca[n_train_rows:], index=test_features.index,
                          columns=c_pca_columns)

In [None]:
train_features.shape, c_pca_train.shape, g_pca_train.shape, 

In [None]:
train_features = pd.concat([train_features, g_pca_train, c_pca_train], axis=1)
test_features = pd.concat([test_features, g_pca_test, c_pca_test], axis=1)

In [None]:
# Remove the 'ctl_vehicle' rows from the training features and from the scored
# targets, then discard the now-constant cp_type column.
drop_index = train_features.index[train_features.cp_type == 'ctl_vehicle']
train_features_df = train_features.drop(index=drop_index).drop(columns='cp_type')
train_target_df = train_targets_scored.drop(index=drop_index)

# Same filtering for the test features.
drop_index = test_features.index[test_features.cp_type == 'ctl_vehicle']
test_features_df = test_features.drop(index=drop_index).drop(columns='cp_type')

In [None]:
# One-hot encode the two categorical columns (first level dropped).
# dtype=float keeps the frame purely numeric; recent pandas versions would
# otherwise emit bool dummies and `.values` would become an object array.
train_features_df = pd.get_dummies(train_features_df, columns=['cp_time', 'cp_dose'],
                                   drop_first=True, dtype=float)
test_features_df = pd.get_dummies(test_features_df, columns=['cp_time', 'cp_dose'],
                                  drop_first=True, dtype=float)

# Train and test are encoded independently, and later steps treat the two
# matrices as positionally aligned -- fail loudly if the columns ever diverge.
assert list(train_features_df.columns) == list(test_features_df.columns), \
    'train/test feature columns differ after one-hot encoding'

In [None]:
add_statistics(train_features_df)
add_statistics(test_features_df)

In [None]:
train_features_df.shape, test_features_df.shape

In [None]:
X_train_all = train_features_df.values
y_train_all = train_target_df.values
X_test = test_features_df.values

In [None]:
scaler = StandardScaler()
X_train_all = scaler.fit_transform(X_train_all)
X_test = scaler.transform(X_test)

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, test_size=0.2, random_state=42)

In [None]:
# Full training set, used for the final fit.
train_all_dataset = TensorDataset(torch.tensor(X_train_all).float(), torch.tensor(y_train_all).float())
# Training loaders reshuffle every epoch so mini-batches are not always built
# from the same consecutive rows.
train_all_loader = DataLoader(train_all_dataset, batch_size=128, shuffle=True)

# Train / validation split.
train_dataset = TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).float())
val_dataset = TensorDataset(torch.tensor(X_val).float(), torch.tensor(y_val).float())

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
# Validation order does not affect the reported loss, so it stays sequential.
val_loader = DataLoader(val_dataset, batch_size=128)

In [None]:
x, y = next(iter(train_loader))
x.shape, y.shape

### FFNN model

In [None]:
class FFNN(nn.Module):
    """Three-layer feed-forward network that outputs per-target probabilities.

    Each of the three stages applies BatchNorm1d, then Dropout, then a
    weight-normalised Linear layer. The first two stages are followed by ELU,
    the last one by a sigmoid, so every output lies in [0, 1].

    Parameters
    ----------
    input_size : int
        Number of input features.
    output_size : int
        Number of outputs.
    hidden_sizes : sequence of two ints, optional
        Widths of the first and second hidden layers. Default (2048, 1024).
    dropouts : sequence of three floats, optional
        Dropout probabilities applied before the first, second and third
        linear layers. Default (0.2, 0.5, 0.5).

    Raises
    ------
    ValueError
        If `hidden_sizes` does not have exactly two entries or `dropouts`
        does not have exactly three.
    """

    def __init__(self, input_size, output_size, hidden_sizes=(2048, 1024),
                 dropouts=(0.2, 0.5, 0.5)):
        super().__init__()

        if len(hidden_sizes) != 2:
            raise ValueError('hidden_sizes must contain exactly two layer widths')
        if len(dropouts) != 3:
            raise ValueError('dropouts must contain exactly three probabilities')
        hidden1, hidden2 = hidden_sizes
        drop1, drop2, drop3 = dropouts

        # Attribute names are kept stable so state_dict keys do not change.
        self.bn1 = nn.BatchNorm1d(input_size)
        self.dropout1 = nn.Dropout(drop1)
        self.l1 = nn.utils.weight_norm(nn.Linear(input_size, hidden1))
        self.bn2 = nn.BatchNorm1d(hidden1)
        self.dropout2 = nn.Dropout(drop2)
        self.l2 = nn.utils.weight_norm(nn.Linear(hidden1, hidden2))
        self.bn3 = nn.BatchNorm1d(hidden2)
        self.dropout3 = nn.Dropout(drop3)
        self.l3 = nn.utils.weight_norm(nn.Linear(hidden2, output_size))

    def forward(self, x):
        """Map a float batch of `input_size` features to sigmoid outputs of width `output_size`."""
        x = self.bn1(x)
        x = self.dropout1(x)
        x = F.elu(self.l1(x))

        x = self.bn2(x)
        x = self.dropout2(x)
        x = F.elu(self.l2(x))

        x = self.bn3(x)
        x = self.dropout3(x)
        # Sigmoid is applied here, so the output is already a probability.
        x = torch.sigmoid(self.l3(x))

        return x

In [None]:
# Derive the layer sizes from the data instead of hard-coding 970 / 206, so the
# model stays consistent if the feature engineering above changes.
model = FFNN(X_train_all.shape[1], y_train_all.shape[1])

In [None]:
model(x).shape

In [None]:
def train_model(model, optimizer, loss_function, train_loader,
                val_loader=None, scheduler=None, epochs=1):
    """Train `model` and print the mean per-batch loss after every epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; updated in place.
    optimizer : torch.optim.Optimizer
        Optimiser bound to the model's parameters.
    loss_function : callable
        Called as ``loss_function(y_pred, y)``; must return a scalar tensor.
    train_loader : iterable of (x, y) batches
        Training batches.
    val_loader : iterable of (x, y) batches, optional
        When given, the model is evaluated on it after each epoch (in eval mode,
        without gradients) and the validation loss is printed as well.
    scheduler : optional
        Object with a ``step()`` method; stepped once at the end of each epoch.
    epochs : int
        Number of passes over `train_loader`.

    Returns
    -------
    None

    Notes
    -----
    Reported losses are means over batches. A loader that yields no batches
    gives ``nan`` instead of raising ZeroDivisionError.
    """
    # Send batches to wherever the model's parameters live; the notebook-level
    # `device` is only used for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    for epoch in range(epochs):
        # Switch to training mode once per epoch (validation below flips it to eval).
        model.train()
        running_loss = 0.0
        n_train_batches = 0
        for x, y in train_loader:
            x = x.to(target_device)
            y = y.to(target_device)
            optimizer.zero_grad()
            y_pred = model(x)
            loss = loss_function(y_pred, y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            n_train_batches += 1
        running_loss = running_loss / n_train_batches if n_train_batches else float('nan')

        if val_loader is not None:
            model.eval()
            val_loss = 0.0
            n_val_batches = 0
            with torch.no_grad():
                for x, y in val_loader:
                    x = x.to(target_device)
                    y = y.to(target_device)
                    y_pred = model(x)
                    val_loss += loss_function(y_pred, y).item()
                    n_val_batches += 1
            val_loss = val_loss / n_val_batches if n_val_batches else float('nan')

            print("Epoch: [{}/{}] ".format(epoch + 1, epochs),
                  "Train loss: {:.6f}".format(running_loss),
                  "Val loss: {:.6f} ".format(val_loss))
        else:
            print("Epoch: [{}/{}] ".format(epoch + 1, epochs),
                  "Train loss: {:.6f}".format(running_loss))
        if scheduler is not None:
            scheduler.step()

In [None]:
# The model already ends in a sigmoid, so plain BCELoss is applied to its outputs.
loss_function = nn.BCELoss()
# Layer sizes come from the data rather than the hard-coded 970 / 206.
model = FFNN(X_train.shape[1], y_train.shape[1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# Halve the learning rate every 10 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

In [None]:
train_model(model, optimizer, loss_function, train_loader, val_loader, epochs=50, scheduler=scheduler)

In [None]:
# Final fit: a fresh model trained on the full training set (no validation).
loss_function = nn.BCELoss()
# Layer sizes come from the data rather than the hard-coded 970 / 206.
model = FFNN(X_train_all.shape[1], y_train_all.shape[1]).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# Halve the learning rate every 10 epochs.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
train_model(model, optimizer, loss_function, train_all_loader, epochs=50, scheduler=scheduler)

In [None]:
def predict(model, X, batch_size=None):
    """Run `model` on `X` in eval mode without gradients and return a NumPy array.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate. It is switched to eval mode and left in it.
    X : torch.Tensor
        Input batch; moved to the model's device before the forward pass.
    batch_size : int, optional
        When given, `X` is processed in consecutive chunks of this many rows and
        the outputs are concatenated, which bounds peak memory. The default
        (None) does a single forward pass, as before.

    Returns
    -------
    numpy.ndarray
        The model outputs, returned as-is (no extra activation is applied).

    Raises
    ------
    ValueError
        If `batch_size` is given but is not positive.
    """
    if batch_size is not None and batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    model.eval()
    # Send inputs to wherever the model's parameters live; the notebook-level
    # `device` is only used for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    with torch.no_grad():
        if batch_size is None or len(X) == 0:
            preds = model(X.to(target_device))
        else:
            preds = torch.cat([
                model(X[start:start + batch_size].to(target_device))
                for start in range(0, len(X), batch_size)
            ])
    return preds.cpu().numpy()

In [None]:
y_pred = predict(model, torch.tensor(X_test).float())

###  Submission

In [None]:
# All-zero float table: one row per test id, one column per scored target.
submission = pd.DataFrame(
    0.0,
    index=test_features.index,
    columns=train_targets_scored.columns,
)

In [None]:
sample_submission = pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')

In [None]:
pred_index = test_features[test_features.cp_type != 'ctl_vehicle'].index

In [None]:
len(pred_index)

In [None]:
y_pred.shape

In [None]:
submission.shape

In [None]:
submission.loc[pred_index, :] = y_pred

In [None]:
submission.reset_index(inplace=True)

In [None]:
submission

In [None]:
sample_submission.shape

In [None]:
submission.shape

In [None]:
submission.to_csv('/kaggle/working/submission.csv', index=False)