In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import gc
import random
import math
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.metrics import log_loss

import category_encoders as ce

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F

import warnings
warnings.filterwarnings("ignore")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
from sklearn.decomposition import PCA

In [None]:
!pip  install iterative-stratification

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
def seed_everything(seed_value):
    """Seed Python's, NumPy's and PyTorch's random generators with ``seed_value``.

    When CUDA is available the CUDA generators are seeded as well and cuDNN is
    switched to deterministic mode with benchmarking disabled.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
seed_everything(42)

In [None]:
import plotly.express as px

# Data loading

# About this Competition
In this competition, you will be predicting multiple targets of the Mechanism of Action (MoA) response(s) of different samples (sig_id), given various inputs such as gene expression data and cell viability data.

Two notes:

* the training data has an additional (optional) set of MoA labels that are not included in the test data and not used for scoring.
* the re-run dataset has approximately 4x the number of examples seen in the Public test.
# Files
* train_features.csv - Features for the training set. Features g- signify gene expression data, and c- signify cell viability data. cp_type indicates samples treated with a compound (trt_cp) or with a control perturbation (ctl_vehicle); control perturbations have no MoAs; cp_time and cp_dose indicate treatment duration (24, 48, 72 hours) and dose (high or low).
* train_drug.csv - This file contains an anonymous drug_id for the training set only.
* train_targets_scored.csv - The binary MoA targets that are scored.
* train_targets_nonscored.csv - Additional (optional) binary MoA responses for the training data. These are not predicted nor scored.
* test_features.csv - Features for the test data. You must predict the probability of each scored MoA for each row in the test data.
* sample_submission.csv - A submission file in the correct format.

In [None]:
# Read the competition tables from the Kaggle input mount.
train_features = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
test_features = pd.read_csv('/kaggle/input/lish-moa/test_features.csv')
train_targets_scored = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('/kaggle/input/lish-moa/train_targets_nonscored.csv')
train_drug = pd.read_csv('/kaggle/input/lish-moa/train_drug.csv')

In [None]:
train_features.head(5)

In [None]:
train_targets_scored.head(5)

In [None]:
train_targets_nonscored.head(5)

In [None]:
sample_submission=pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')


# EDA

Check for empty values

In [None]:
# Number of columns that contain at least one missing value. Both results are
# printed because a notebook cell only displays its last bare expression.
print('Train columns with missing values:', train_features.isnull().any().sum())
print('Test columns with missing values:', test_features.isnull().any().sum())

In [None]:
print('Train dataset',train_features.shape)
print('Test dataset',test_features.shape)

In [None]:
cat_features=train_features.select_dtypes(include=["object"])


In [None]:
len(cat_features.columns)

In [None]:
cat_features=train_features.select_dtypes(include=["object"])
num_features=train_features.select_dtypes(exclude=["object"])
print(f'Categorical features {len(cat_features.columns)}, Number features {len(num_features.columns)} ' )

# Categorical Features

In [None]:
import seaborn as sns
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt

In [None]:
# Count plots of the three categorical features: train on the top row, test below.
fig, ax = plt.subplots(2, 3, figsize=(8, 4), dpi=100)

panel_titles = {
    "cp_dose": 'Doses Low/High',
    "cp_type": 'Compound / control treatment',
    "cp_time": 'duration(hour)',
}
for row_idx, (split_name, frame) in enumerate([('Train', train_features), ('Test', test_features)]):
    for col_idx, (feature, panel_title) in enumerate(panel_titles.items()):
        # Pass the column through x= explicitly: seaborn treats a lone positional
        # argument as `data`, which is not what the original positional call meant.
        sns.countplot(x=frame[feature], ax=ax[row_idx, col_idx])
        ax[row_idx, col_idx].set_title(f'{split_name} {panel_title}')

plt.tight_layout()

In [None]:
train_features.groupby(["cp_type"])["sig_id"].count()

We see that the train and test distributions are very similar to each other. Control-perturbation samples (ctl_vehicle) make up less than 8% of the data.

In [None]:
# Split the feature names by prefix: 'g-' columns and 'c-' columns.
gens = [name for name in train_features.columns if name.startswith('g-')]
cells = [name for name in train_features.columns if name.startswith('c-')]
print(f'gen len {len(gens)}, cells len {len(cells)}')

Some distribution of randomly selected columns.

In [None]:
def plot_list(plot_lists, name):
    """Draw one histogram per column of ``train_features`` listed in ``plot_lists``.

    Args:
        plot_lists: column names to plot, one subplot each, left to right.
        name: figure title.

    The number of subplots follows ``len(plot_lists)`` (previously fixed at four,
    which raised IndexError for longer lists). An empty list draws nothing.
    """
    n_plots = len(plot_lists)
    if n_plots == 0:
        return

    # squeeze=False keeps `ax` two-dimensional even for a single subplot.
    fig, ax = plt.subplots(1, n_plots, figsize=(10, 4), dpi=100, squeeze=False)
    fig.suptitle(name, fontsize=16)
    for axis, column in zip(ax[0], plot_lists):
        train_features.hist(column, ax=axis)
    # Lay the figure out once, after every histogram has been drawn.
    plt.tight_layout()

In [None]:
# Upper bound comes from the list itself instead of the hard-coded 772.
plot_list([gens[np.random.randint(0, len(gens))] for _ in range(4)], 'GENE DISTRIBUTION')

In [None]:
# randint's upper bound is exclusive, so len(cells) makes every column eligible
# (the former bound of 99 could never select the last of 100 columns).
plot_list([cells[np.random.randint(0, len(cells))] for _ in range(4)], 'CELLS DISTRIBUTION')

As the randomly selected plots above show, the gene and cell features look approximately normally distributed with a mean close to zero.

# Training features correlation

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(train_features[cells].corr(),cmap='viridis')

There is a clear high correlation between cell viabilities that has to be examined.

In [None]:
# Correlation heatmap of 90 randomly drawn gene/cell columns.
all_list = cells + gens
plt.figure(figsize=(8, 6))
# randint(0, len) covers the whole list; the previous (1, len - 1) bounds could
# never pick the first column or the last two.
sampled_columns = [all_list[np.random.randint(0, len(all_list))] for _ in range(90)]
sns.heatmap(train_features[sampled_columns].corr(), cmap='viridis')

# Target analysis

In [None]:
target_non_zero=pd.DataFrame(train_targets_scored.drop(["sig_id"],axis=1).sum(axis=0).sort_values().reset_index())
target_non_zero.columns=['name','count_z']

In [None]:
plt.figure(figsize=(7,8))
sns.barplot(data=target_non_zero[-50:],x='count_z',y='name')

In [None]:
plt.figure(figsize=(7,8))
sns.barplot(data=target_non_zero[:50],x='count_z',y='name')

In [None]:
print(f'it is {target_non_zero[target_non_zero.count_z.values<20].count().values[0]} value that number of positive sample <20 , it is less than 0.1%')


In [None]:
plt.figure(figsize=(17,8))
sns.barplot(y=target_non_zero.name[-20:],x=(target_non_zero.count_z.values/train_targets_scored.shape[0]*100)[-20:])


Even the most frequent target is positive in only about 3.5% of the samples.

In [None]:
data=train_targets_scored.drop(['sig_id'], axis=1).astype(bool).sum(axis=1).reset_index()
data.columns=['row','count']
data=data.groupby("count")['row'].count().reset_index()
plt.figure(figsize=(10,5))
sns.barplot(data=data,x='count',y='row')

In [None]:
data['count'].values

In [None]:

# Pie chart: share of samples by number of active targets per sample.
labels = data['count'].values
fracs = data['row'].values / train_targets_scored.shape[0] * 100

# Pull out the 2nd and 6th wedges as before, but size the vector from the data so
# the plot does not fail when the number of distinct counts is not exactly seven.
explode = [0.1 if i in (1, 5) else 0 for i in range(len(fracs))]

# A single figure: the former plt.figure(...) call left an extra empty figure behind.
fig, axs = plt.subplots(figsize=(10, 10))
axs.pie(fracs, autopct='%1.1f%%', shadow=True, explode=explode)

# "left" is not a valid legend location string; "center left" is.
axs.legend(loc="center left", labels=labels)
plt.show()

We see that about 40% of the samples have no active target at all, and about 53% have exactly one active target.

In [None]:
train_targets_scored.describe()

# TRAIN & TARGET CORRELATION

In [None]:
columns=gens+cells

In [None]:
# Correlation (Series.corr) of every gene/cell feature with every scored target.
# One DataFrame column per target; rows follow the order of `columns`.
correlation_matrix = pd.DataFrame({
    t_col: [train_features[col].corr(train_targets_scored[t_col]) for col in columns]
    for t_col in train_targets_scored.columns
    if t_col != 'sig_id'
})

In [None]:
correlation_matrix['train_features']=columns
correlation_matrix = correlation_matrix.set_index('train_features')
correlation_matrix


For every target column, let's look at the correlation with the largest absolute value across all feature columns of the train set.

In [None]:
def maxCol(x):
    """Return whichever of x.min() and x.max() has the larger absolute value."""
    lowest, highest = x.min(), x.max()
    return max(lowest, highest, key=abs)


high_scores = correlation_matrix.apply(maxCol, axis=0).reset_index()
high_scores.columns = ["column", "corr"]

In [None]:
fig = px.bar(
    high_scores, 
    x='column', 
    y="corr", 
    orientation='v', 
    title='Best correlation with train columns for every target column', 
    width=1200,
    height=800
)

fig.show()

In [None]:
# For every target column, find the feature whose correlation has the largest
# absolute value, then attach it to the strongest-correlation table.
tar_cols = list(correlation_matrix.columns)
tr_cols = [
    correlation_matrix[target].abs().sort_values(ascending=False).index[0]
    for target in tar_cols
]

col_df = pd.DataFrame({'column': tar_cols, 'train_best_column': tr_cols})

total_scores = pd.merge(high_scores, col_df)

total_scores

Let's take some random columns from target and see their correlation

In [None]:
target_columns = train_targets_scored.columns.tolist()
target_columns.remove('sig_id')

# Draw five *distinct* targets. Sampling with replacement could repeat a name,
# and selecting duplicate column labels makes `current_corr[col]` a DataFrame in
# the following cell, where `.sort_values(ascending=False)` then fails.
# choice() also covers the whole list; randint(0, len - 1) never reached the last target.
for_analysis = np.random.choice(target_columns, size=5, replace=False).tolist()

current_corr = correlation_matrix[for_analysis]

In [None]:
current_corr

In [None]:
# For each sampled target, keep the two features with the largest |correlation|.
tar_cols, first_col, second_col = [], [], []
for target in current_corr.columns:
    ranking = current_corr[target].abs().sort_values(ascending=False).reset_index()['train_features'].values
    tar_cols.append(target)
    first_col.append(ranking[0])
    second_col.append(ranking[1])

col_df = pd.DataFrame({
    'column': tar_cols,
    'train_1_column': first_col,
    'train_2_column': second_col,
})
col_df

In [None]:
# One scatter plot per sampled target: its two most correlated features on the
# axes, with positive samples drawn larger and in a different hue.
for i in range(col_df.shape[0]):
    picked = col_df.iloc[i]
    x_name, y_name = picked['train_1_column'], picked['train_2_column']

    analysis = pd.DataFrame({
        'color': train_targets_scored[picked['column']],
        x_name: train_features[x_name],
        y_name: train_features[y_name],
    })
    analysis['size'] = 1
    analysis.loc[analysis['color'] == 1, 'size'] = 12

    plt.figure(figsize=(8,7))
    plt.title(picked['column'])
    sns.scatterplot(x=x_name, y=y_name, data=analysis, hue='color', size='size')

If we look at the names of the target columns, it looks like the last term of each name defines a group.

In [None]:
target_columns

In [None]:
# Count how many target names share the same final "_"-separated term.
last_term = {}
for col in target_columns:
    suffix = col.split('_')[-1]
    # dict.get replaces the former try / bare `except:` counter, which would have
    # hidden any unrelated error raised inside the try block.
    last_term[suffix] = last_term.get(suffix, 0) + 1

last_term = pd.DataFrame(last_term.items(), columns=['name', 'count'])
# Keep only suffixes shared by more than one target.
last_term = last_term[last_term['count'] > 1].sort_values('count')
fig = px.bar(
    last_term,
    x='name',
    y="count",
    orientation='v',
    title='Group of target columns',
    width=800,
    height=500
)

fig.show()
    

In [None]:
# For every suffix group, the largest number of targets from that group that are
# active at the same time in a single sample.
group_names = last_term.name.tolist()
answer = []

for group in group_names:
    agent_list = [item for item in target_columns if item.split('_')[-1] == group]
    active_per_sample = train_targets_scored[agent_list].astype(bool).sum(axis=1)
    answer.append(active_per_sample.max())

answer_df = pd.DataFrame({'columns': group_names, 'value': answer})
fig = px.bar(
    answer_df,
    x='columns',
    y="value",
    orientation='v',
    title='Maximum number of active columns in one sample in every group',
    width=800,
    height=500
)

fig.show()
    

# Target and categorical column dependencies

In [None]:
cat_var=['cp_type','cp_time','cp_dose']
cat_df=pd.concat([train_features[cat_var],train_targets_scored.drop('sig_id',axis=1)],axis=1)
cat_df

In [None]:
def find_targets_zero(cat_var):
    """Find, per category value, the target columns that take a single value.

    Reads the notebook-level ``cat_df``. For every column named in ``cat_var``
    and every distinct value of that column, the rows with that value are
    selected and each remaining column is checked for having exactly one
    distinct value.

    Args:
        cat_var: names of the categorical columns of ``cat_df`` to group by.

    Returns:
        dict mapping each category value to the list of constant column names.
    """
    candidate_cols = [col for col in cat_df.columns if col not in cat_var]
    dict_cat = {}
    for feature in cat_var:
        for cat in np.unique(cat_df[feature]):
            subset = cat_df[cat_df[feature] == cat]
            dict_cat[cat] = [
                col for col in candidate_cols
                if len(subset[col].value_counts()) == 1
            ]
    return dict_cat
                    
    

In [None]:
result=find_targets_zero(cat_var)


Let's look at the targets that are always zero.

In [None]:
for key,value in result.items():
    if len(value) >10 :
        print(f'the number of zero target for {key} is {len(value)}')
    else:
        print(f'{key}:{value}')


We see that:
1. for cp_type = ctl_vehicle, every target is zero in all records;
2. for cp_time = 24, the targets 'atp-sensitive_potassium_channel_antagonist' and 'erbb2_inhibitor' are zero in all records;
3. for cp_time = 72, the targets 'atp-sensitive_potassium_channel_antagonist' and 'erbb2_inhibitor' are zero in all records;
4. for cp_dose = D2, the targets 'atp-sensitive_potassium_channel_antagonist' and 'erbb2_inhibitor' are zero in all records.
 

# QuantileTransformer

In [None]:
from sklearn.preprocessing import QuantileTransformer

In [None]:
SEED_VALUE = 42

In [None]:
# Map every gene/cell column to a normal output distribution. The quantiles are
# fitted on the train column only and then applied to both train and test.
vec_len = train_features.shape[0]
vec_len_test = test_features.shape[0]
for col in (gens + cells):
    transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")

    train_column = train_features[col].values.reshape(-1, 1)
    test_column = test_features[col].values.reshape(-1, 1)

    transformer.fit(train_column)
    train_features[col] = transformer.transform(train_column).ravel()
    test_features[col] = transformer.transform(test_column).ravel()

# PCA

In [None]:
# GENES: PCA fitted on the train and test gene columns stacked together;
# the components are appended to both frames as extra 'pca_G-<i>' columns.
n_comp = 600
pca_names = [f'pca_G-{i}' for i in range(n_comp)]

data = pd.concat([train_features[gens], test_features[gens]])
data2 = PCA(n_components=n_comp, random_state=SEED_VALUE).fit_transform(data[gens])

# First rows belong to train, last rows to test (concat order above).
train2 = pd.DataFrame(data2[:train_features.shape[0]], columns=pca_names)
test2 = pd.DataFrame(data2[-test_features.shape[0]:], columns=pca_names)

train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

print('train_features: {}'.format(train_features.shape))
print('test_features: {}'.format(test_features.shape))


In [None]:
# CELLS: PCA fitted on the train and test cell columns stacked together;
# the components are appended to both frames as extra 'pca_C-<i>' columns.
n_comp = 50
pca_names = [f'pca_C-{i}' for i in range(n_comp)]

data = pd.concat([train_features[cells], test_features[cells]])
data2 = PCA(n_components=n_comp, random_state=SEED_VALUE).fit_transform(data[cells])

# First rows belong to train, last rows to test (concat order above).
train2 = pd.DataFrame(data2[:train_features.shape[0]], columns=pca_names)
test2 = pd.DataFrame(data2[-test_features.shape[0]:], columns=pca_names)

train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

print('train_features: {}'.format(train_features.shape))
print('test_features: {}'.format(test_features.shape))

# Variance Threshold (feature selection)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Drop low-variance feature columns (variance <= 0.8), fitted on train and test
# stacked together so both frames keep the same columns.
var_thresh = VarianceThreshold(0.8)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([train_features, test_features])
# The first four columns (sig_id, cp_type, cp_time, cp_dose) are not numeric
# features and are excluded from the selection.
data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])

train_features_transformed = data_transformed[ : train_features.shape[0]]
test_features_transformed = data_transformed[-test_features.shape[0] : ]

# Rebuild both frames: the four id/categorical columns followed by the selected
# features, which now carry integer column labels 0..k-1.
train_features = pd.DataFrame(train_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                              columns=['sig_id','cp_type','cp_time','cp_dose'])

train_features = pd.concat([train_features, pd.DataFrame(train_features_transformed)], axis=1)

test_features = pd.DataFrame(test_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                             columns=['sig_id','cp_type','cp_time','cp_dose'])

test_features = pd.concat([test_features, pd.DataFrame(test_features_transformed)], axis=1)

print('train_features: {}'.format(train_features.shape))
print('test_features: {}'.format(test_features.shape))

# ___________________________________

In [None]:
train_features.shape

In [None]:
# Join features with scored targets, non-scored targets and drug ids on sig_id,
# then drop the control-perturbation rows from both train and test.
train = (
    train_features
    .merge(train_targets_scored, on='sig_id')
    .merge(train_targets_nonscored, on='sig_id')
    .merge(train_drug, on='sig_id')
)
is_treated_train = train['cp_type'] != 'ctl_vehicle'
train = train[is_treated_train].reset_index(drop=True)

is_treated_test = test_features['cp_type'] != 'ctl_vehicle'
test = test_features[is_treated_test].reset_index(drop=True)

In [None]:
train = train.drop('cp_type',axis=1)
test = test.drop('cp_type',axis=1)

In [None]:
train.head(5)

In [None]:
target_cols = [x for x in train_targets_scored.columns if x != 'sig_id']
aux_target_cols = [x for x in train_targets_nonscored.columns if x != 'sig_id']
all_target_cols = target_cols + aux_target_cols

num_targets = len(target_cols)
num_aux_targets = len(aux_target_cols)
num_all_targets = len(all_target_cols)

print('num_targets: {}'.format(num_targets))
print('num_aux_targets: {}'.format(num_aux_targets))
print('num_all_targets: {}'.format(num_all_targets))

In [None]:
print(train.shape)
print(test.shape)
print(sample_submission.shape)

# Dataset Classes

In [None]:
class MoADataset:
    """Indexable dataset pairing a 2-D feature array with a 2-D target array.

    Each item is a dict with float tensors under the keys 'x' (features row)
    and 'y' (targets row).
    """

    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        x = torch.tensor(self.features[idx, :], dtype=torch.float)
        y = torch.tensor(self.targets[idx, :], dtype=torch.float)
        return {'x': x, 'y': y}
    
class TestDataset:
    """Indexable dataset over a 2-D feature array, without targets.

    Each item is a dict holding the features row as a float tensor under 'x'.
    """

    def __init__(self, features):
        self.features = features

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        row = torch.tensor(self.features[idx, :], dtype=torch.float)
        return {'x': row}

In [None]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    The model is put in train mode. For every batch the gradients are reset,
    the loss is back-propagated, and both the optimizer and the scheduler are
    stepped (the scheduler therefore advances once per batch).

    Args:
        model: module called on the batch's 'x' tensor.
        optimizer: optimizer stepped after each backward pass.
        scheduler: learning-rate scheduler stepped after each optimizer step.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        dataloader: iterable of dicts with 'x' and 'y' tensors; must support len().
        device: device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        optimizer.zero_grad()
        inputs = batch['x'].to(device)
        targets = batch['y'].to(device)
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)

def valid_fn(model, loss_fn, dataloader, device):
    """Evaluate ``model`` on ``dataloader``; return the mean loss and predictions.

    The model is put in eval mode and the whole pass runs under
    ``torch.no_grad()``: validation never calls backward, so tracking gradients
    only kept an autograd graph alive per batch and wasted memory.

    Args:
        model: module called on the batch's 'x' tensor; expected to return logits.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        dataloader: iterable of dicts with 'x' and 'y' tensors; must support len().
        device: device the batch tensors are moved to.

    Returns:
        (final_loss, valid_preds): the sum of per-batch losses divided by
        ``len(dataloader)``, and a NumPy array with the sigmoid of the model
        outputs for all batches, concatenated in iteration order.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Predict with ``model`` over ``dataloader`` and return sigmoid outputs.

    The model is put in eval mode and each forward pass runs without gradient
    tracking.

    Args:
        model: module called on the batch's 'x' tensor; expected to return logits.
        dataloader: iterable of dicts with an 'x' tensor.
        device: device the batch tensors are moved to.

    Returns:
        NumPy array with the sigmoid of the model outputs for all batches,
        concatenated in iteration order.
    """
    model.eval()
    chunks = []

    for batch in dataloader:
        with torch.no_grad():
            logits = model(batch['x'].to(device))
        chunks.append(logits.sigmoid().detach().cpu().numpy())

    return np.concatenate(chunks)


In [None]:
import torch
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are moved towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``.

    Args:
        weight: optional rescaling weight passed to
            ``F.binary_cross_entropy_with_logits``.
        reduction: 'mean' (default), 'sum', or any other value (e.g. 'none')
            to get the unreduced element-wise loss.
        smoothing: smoothing factor in [0, 1).
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return the smoothed targets; ``n_labels`` is accepted but not used."""
        assert 0 <= smoothing < 1

        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing

        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE-with-logits loss with ``self.reduction`` applied."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss and reduce it here. The functional call
        # defaults to reduction='mean', so the branches below used to receive an
        # already-averaged scalar and 'sum' / 'none' were silently ignored.
        # The default 'mean' result is unchanged.
        loss = F.binary_cross_entropy_with_logits(inputs, targets, self.weight,
                                                  reduction='none')

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss

# Model

In [None]:
class Model(nn.Module):
    """Feed-forward network: five batch-norm/dense blocks ending in logits.

    Block 1 is batch norm + linear; blocks 2-5 are batch norm + dropout +
    linear. Leaky ReLU follows every linear layer except the last, which is
    weight-normalised and returns raw outputs of size ``num_targets``.
    Layer attributes carry their block number as a suffix (``dense1`` ...
    ``dense5``).
    """

    def __init__(self, num_features, num_targets):
        super().__init__()
        self.hidden_size = [1500, 1250, 1000, 750]
        self.dropout_value = [0.5, 0.35, 0.3, 0.25]
        h1, h2, h3, h4 = self.hidden_size
        p2, p3, p4, p5 = self.dropout_value

        # Block 1: input normalisation and first projection.
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.Linear(num_features, h1)

        # Block 2
        self.batch_norm2 = nn.BatchNorm1d(h1)
        self.dropout2 = nn.Dropout(p2)
        self.dense2 = nn.Linear(h1, h2)

        # Block 3
        self.batch_norm3 = nn.BatchNorm1d(h2)
        self.dropout3 = nn.Dropout(p3)
        self.dense3 = nn.Linear(h2, h3)

        # Block 4
        self.batch_norm4 = nn.BatchNorm1d(h3)
        self.dropout4 = nn.Dropout(p4)
        self.dense4 = nn.Linear(h3, h4)

        # Block 5: output head.
        self.batch_norm5 = nn.BatchNorm1d(h4)
        self.dropout5 = nn.Dropout(p5)
        self.dense5 = nn.utils.weight_norm(nn.Linear(h4, num_targets))

    def forward(self, x):
        """Return the raw (pre-sigmoid) outputs for a batch ``x``."""
        x = F.leaky_relu(self.dense1(self.batch_norm1(x)))
        x = F.leaky_relu(self.dense2(self.dropout2(self.batch_norm2(x))))
        x = F.leaky_relu(self.dense3(self.dropout3(self.batch_norm3(x))))
        x = F.leaky_relu(self.dense4(self.dropout4(self.batch_norm4(x))))
        return self.dense5(self.dropout5(self.batch_norm5(x)))
    
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy over log-softmax outputs with a smoothed target distribution.

    The true class receives probability ``1 - smoothing``; every other class
    receives ``smoothing / (classes - 1)``.

    Args:
        classes: number of classes.
        smoothing: probability mass taken away from the true class.
        dim: dimension over which the softmax and the final sum are taken.
    """

    def __init__(self, classes, smoothing=0.0, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the mean smoothed cross-entropy of ``pred`` against class indices ``target``."""
        log_probs = pred.log_softmax(dim=self.dim)

        with torch.no_grad():
            off_value = self.smoothing / (self.cls - 1)
            true_dist = torch.full_like(log_probs, off_value)
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)

        per_sample = torch.sum(-true_dist * log_probs, dim=self.dim)
        return torch.mean(per_sample)

In [None]:
class FineTuneScheduler:
    """Progressively unfreezes a copied network during fine-tuning.

    ``copy_without_top`` clones a trained ``Model``, freezes every block below
    the top one and replaces the top block with a freshly initialised head.
    ``step`` then unfreezes one block every ``epochs_per_step`` epochs, starting
    with the block closest to the head.

    Blocks are identified by the last character of a parameter's top-level
    module name (e.g. 'dense4.weight' -> '4').

    Args:
        epochs: total number of fine-tuning epochs the unfreezing is spread over.
    """

    # Block suffix of the output head; it is replaced, never frozen.
    TOP_LAYER = '5'

    def __init__(self, epochs):
        self.epochs = epochs
        self.epochs_per_step = 0
        self.frozen_layers = []

    @staticmethod
    def _layer_index(param_name):
        """Return the block suffix (a one-character string) of a parameter name."""
        return param_name.split('.')[0][-1]

    def _freeze_below_top(self, model):
        """Freeze all parameters outside the top block and record the frozen blocks."""
        self.frozen_layers = []

        for name, param in model.named_parameters():
            layer_index = self._layer_index(name)

            # The suffix is a string. The previous comparison against the int 5
            # was never true, so the top block was frozen and listed as well:
            # that shortened every unfreezing step and made the first one a no-op.
            if layer_index == self.TOP_LAYER:
                continue

            param.requires_grad = False

            # Save frozen layer names
            if layer_index not in self.frozen_layers:
                self.frozen_layers.append(layer_index)

        # At least one epoch per step; a value of 0 would make `epoch % 0` raise in step().
        self.epochs_per_step = max(1, self.epochs // max(1, len(self.frozen_layers)))

    def copy_without_top(self, model, num_features, num_targets, num_targets_new):
        """Return a copy of ``model`` with frozen lower blocks and a new output head.

        Args:
            model: trained ``Model`` whose weights are copied.
            num_features: input size used to build the copy.
            num_targets: output size of ``model`` (needed to load its state dict).
            num_targets_new: output size of the replacement head.

        Returns:
            The new model, moved to ``DEVICE``.
        """
        model_new = Model(num_features, num_targets)
        model_new.load_state_dict(model.state_dict())

        self._freeze_below_top(model_new)

        # Replace the top layers with another ones
        model_new.batch_norm5 = nn.BatchNorm1d(model_new.hidden_size[3])
        model_new.dropout5 = nn.Dropout(model_new.dropout_value[3])
        model_new.dense5 = nn.utils.weight_norm(nn.Linear(model_new.hidden_size[-1], num_targets_new))
        model_new.to(DEVICE)
        return model_new

    def step(self, epoch, model):
        """Unfreeze the most recently frozen block when ``epoch`` hits a step boundary.

        Does nothing once every block has been unfrozen.
        """
        if len(self.frozen_layers) == 0:
            return

        if epoch % self.epochs_per_step == 0:
            last_frozen_index = self.frozen_layers[-1]

            # Unfreeze parameters of the last frozen layer
            for name, param in model.named_parameters():
                if self._layer_index(name) == last_frozen_index:
                    param.requires_grad = True

            del self.frozen_layers[-1]  # Remove the last layer as unfrozen

# Preprocessing steps

In [None]:
def process_data(data):
    """Return ``data`` with the cp_time and cp_dose columns one-hot encoded."""
    return pd.get_dummies(data, columns=['cp_time', 'cp_dose'])

In [None]:
# Feature columns: everything that is not a target, an identifier or a CV-fold
# assignment. Fold columns are named 'kfold_<seed>', so they are matched by
# prefix; the old exact name 'kfold' never matched, which let fold ids leak into
# the features whenever this cell was re-run after the folds had been created.
# Column labels can be integers here, hence str(c).
non_feature_cols = set(all_target_cols) | {'sig_id', 'drug_id'}
feature_cols = [
    c for c in process_data(train).columns
    if c not in non_feature_cols and not str(c).startswith('kfold')
]
num_features = len(feature_cols)
num_features

In [None]:
# HyperParameters

# Device string used for models and batches during training and inference.
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
# Epochs per training stage; also the horizon of the one-cycle LR schedule.
EPOCHS = 24
BATCH_SIZE = 128

# The dicts below are keyed by training stage tag:
#   'ALL_TARGETS' - training on scored + non-scored targets,
#   'SCORED_ONLY' - fine-tuning on the scored targets only.
WEIGHT_DECAY = {'ALL_TARGETS': 1e-5, 'SCORED_ONLY': 3e-6}  # Adam weight decay
MAX_LR = {'ALL_TARGETS': 1e-2, 'SCORED_ONLY': 3e-3}  # OneCycleLR max_lr
DIV_FACTOR = {'ALL_TARGETS': 1e3, 'SCORED_ONLY': 1e2}  # OneCycleLR div_factor
PCT_START = 0.1  # OneCycleLR pct_start

In [None]:
# Show model architecture
model = Model(num_features, num_all_targets)
model

# Single fold training

In [None]:
from sklearn.model_selection import KFold

def make_cv_folds(train, SEEDS, NFOLDS, DRUG_THRESH):
    """Add one fold-assignment column per seed to ``train`` and return it.

    For every ``seed_id`` in ``range(SEEDS)`` a column ``kfold_<seed_id>`` is
    written (``train`` is modified in place):

    * drugs occurring at most ``DRUG_THRESH`` times are split drug by drug, so
      all samples of such a drug share a fold;
    * samples of the remaining drugs are split sample by sample.

    Both splits use ``MultilabelStratifiedKFold`` on the notebook-level
    ``target_cols`` with ``random_state=seed_id``.

    Args:
        train: frame with ``drug_id``, ``sig_id`` and the target columns.
        SEEDS: number of fold columns to create.
        NFOLDS: number of folds per column.
        DRUG_THRESH: occurrence count separating the two drug groups.

    Returns:
        ``train`` with the added int8 fold columns.
    """
    drug_counts = train.drug_id.value_counts()
    rare_drugs = drug_counts.loc[drug_counts <= DRUG_THRESH].index.sort_values()
    common_drugs = drug_counts.loc[drug_counts > DRUG_THRESH].index.sort_values()

    for seed_id in range(SEEDS):
        kfold_col = 'kfold_{}'.format(seed_id)

        # Drugs at or below the threshold: stratify on the per-drug mean targets.
        drug_to_fold = {}
        splitter = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
        per_drug = train.groupby('drug_id')[target_cols].mean().loc[rare_drugs]

        for fold, (_, valid_pos) in enumerate(splitter.split(per_drug, per_drug[target_cols])):
            for drug in per_drug.index[valid_pos].values:
                drug_to_fold[drug] = fold

        # Drugs above the threshold: stratify their individual samples.
        sample_to_fold = {}
        splitter = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed_id)
        per_sample = train.loc[train.drug_id.isin(common_drugs)].reset_index(drop=True)

        for fold, (_, valid_pos) in enumerate(splitter.split(per_sample, per_sample[target_cols])):
            for sig in per_sample.sig_id[valid_pos].values:
                sample_to_fold[sig] = fold

        # Drug-level folds first; rows still unassigned get their sample-level fold.
        train[kfold_col] = train.drug_id.map(drug_to_fold)
        unassigned = train[kfold_col].isna()
        train.loc[unassigned, kfold_col] = train.loc[unassigned, 'sig_id'].map(sample_to_fold)
        train[kfold_col] = train[kfold_col].astype('int8')

    return train

SEEDS = 7
NFOLDS = 7
DRUG_THRESH = 18

train = make_cv_folds(train, SEEDS, NFOLDS, DRUG_THRESH)
train.head()

In [None]:
def run_training(fold_id, seed_id):
    """Train one CV fold in two stages and predict the test set.

    Stage 1 ('ALL_TARGETS') trains a fresh ``Model`` on the scored and
    non-scored targets. Stage 2 ('SCORED_ONLY') reloads the best stage-1
    checkpoint, gives it a new output head through ``FineTuneScheduler`` and
    fine-tunes it on the scored targets. The best stage-2 checkpoint is then
    used to predict the test rows.

    Reads the notebook-level ``train``, ``test``, ``feature_cols``, the target
    column lists and the hyperparameter constants. Writes the checkpoints
    ``ALL_TARGETS_FOLD<fold_id>_.pth`` and ``SCORED_ONLY_FOLD<fold_id>_.pth``
    into the working directory.

    Args:
        fold_id: value of the ``kfold_<seed_id>`` column used for validation.
        seed_id: seed passed to ``seed_everything``; also selects the fold column.

    Returns:
        (oof, predictions): ``oof`` is a (len(train), number of scored targets)
        array that is zero except for the validation rows, which hold the
        stage-2 predictions of the best epoch; ``predictions`` holds the
        sigmoid outputs for the test rows.
    """
    seed_everything(seed_id)
    
    train_ = process_data(train)
    test_ = process_data(test)
    
    kfold_col = f'kfold_{seed_id}'
    # NOTE(review): trn_idx is never used below.
    trn_idx = train_[train_[kfold_col] != fold_id].index
    # Index labels of the validation rows, used to place predictions into `oof`.
    # NOTE(review): `oof[val_idx]` treats these labels as row positions, which
    # presumably relies on `train` having a default RangeIndex - confirm.
    val_idx = train_[train_[kfold_col] == fold_id].index
    
    train_df = train_[train_[kfold_col] != fold_id].reset_index(drop=True)
    valid_df = train_[train_[kfold_col] == fold_id].reset_index(drop=True)
    
    def train_model(model, tag_name, target_cols_now, fine_tune_scheduler=None):
        """Train ``model`` on ``target_cols_now`` for EPOCHS epochs.

        ``tag_name`` selects the per-stage hyperparameters and the checkpoint
        file name. The state dict is saved whenever the validation loss
        improves. Returns the out-of-fold array of the best epoch.
        """
        x_train, y_train  = train_df[feature_cols].values, train_df[target_cols_now].values
        x_valid, y_valid =  valid_df[feature_cols].values, valid_df[target_cols_now].values
        
        train_dataset = MoADataset(x_train, y_train)
        valid_dataset = MoADataset(x_valid, y_valid)

        trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
        
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=WEIGHT_DECAY[tag_name])
        # One-cycle schedule over all batches of all epochs; train_fn steps it once per batch.
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                                  steps_per_epoch=len(trainloader),
                                                  pct_start=PCT_START,
                                                  div_factor=DIV_FACTOR[tag_name], 
                                                  max_lr=MAX_LR[tag_name],
                                                  epochs=EPOCHS)
        
        # Validation uses plain BCE; training uses the label-smoothed variant.
        loss_fn = nn.BCEWithLogitsLoss()
        loss_tr = SmoothBCEwLogits(smoothing=0.001)

        oof = np.zeros((len(train), len(target_cols_now)))
        best_loss = np.inf
        
        for epoch in range(EPOCHS):
            # During fine-tuning, possibly unfreeze another block before this epoch.
            if fine_tune_scheduler is not None:
                fine_tune_scheduler.step(epoch, model)

            train_loss = train_fn(model, optimizer, scheduler, loss_tr, trainloader, DEVICE)
            valid_loss, valid_preds = valid_fn(model, loss_fn, validloader, DEVICE)
            print(f"SEED: {seed_id}, FOLD: {fold_id}, {tag_name}, EPOCH: {epoch}, train_loss: {train_loss:.6f}, valid_loss: {valid_loss:.6f}")

            # Stop early if training diverged.
            if np.isnan(valid_loss):
                break
            
            if valid_loss < best_loss:
                best_loss = valid_loss
                oof[val_idx] = valid_preds
                # NOTE(review): the file name carries the fold but not the seed,
                # so runs with different seeds overwrite each other's checkpoints.
                torch.save(model.state_dict(), f"{tag_name}_FOLD{fold_id}_.pth")

        return oof

    fine_tune_scheduler = FineTuneScheduler(EPOCHS)

    pretrained_model = Model(num_features, num_all_targets)
    pretrained_model.to(DEVICE)

    # Train on scored + nonscored targets
    train_model(pretrained_model, 'ALL_TARGETS', all_target_cols)

    # Load the pretrained model with the best loss
    pretrained_model = Model(num_features, num_all_targets)
    pretrained_model.load_state_dict(torch.load(f"ALL_TARGETS_FOLD{fold_id}_.pth"))
    pretrained_model.to(DEVICE)

    # Copy model without the top layer
    final_model = fine_tune_scheduler.copy_without_top(pretrained_model, num_features, num_all_targets, num_targets)

    # Fine-tune the model on scored targets only
    oof = train_model(final_model, 'SCORED_ONLY', target_cols, fine_tune_scheduler)

    # Load the fine-tuned model with the best loss
    model = Model(num_features, num_targets)
    model.load_state_dict(torch.load(f"SCORED_ONLY_FOLD{fold_id}_.pth"))
    model.to(DEVICE)

    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    # NOTE(review): this zero array is replaced by the next line and has no effect.
    predictions = np.zeros((len(test_), num_targets))
    predictions = inference_fn(model, testloader, DEVICE)
    return oof, predictions
def run_k_fold(NFOLDS, seed_id):
    """Run ``run_training`` for every fold of one seed and combine the results.

    Args:
        NFOLDS: number of folds to train (fold ids ``0 .. NFOLDS - 1``).
        seed_id: seed forwarded to ``run_training``.

    Returns:
        (oof, predictions): the sum of the per-fold out-of-fold arrays, and the
        mean of the per-fold test predictions.
    """
    n_targets = len(target_cols)
    oof = np.zeros((len(train), n_targets))
    predictions = np.zeros((len(test), n_targets))

    for fold_id in range(NFOLDS):
        fold_oof, fold_pred = run_training(fold_id, seed_id)
        predictions += fold_pred / NFOLDS
        oof += fold_oof

    return oof, predictions

In [None]:
# Import the module rather than `from time import time`: the latter rebinds the
# name `time` (imported as a module at the top of the notebook) to a function.
import time

# Averaging on multiple SEEDS
SEED = [0]
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

time_begin = time.time()

for seed_id in SEED:
    oof_, predictions_ = run_k_fold(NFOLDS, seed_id)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)

time_diff = time.time() - time_begin
# The elapsed time was computed but never shown.
print(f"Training time: {time_diff:.1f} s")

# WARNING: this overwrites the true labels in `train` with out-of-fold
# predictions. Training reads its labels from `train`, so reload the data
# before running the training cells again.
train[target_cols] = oof
test[target_cols] = predictions

In [None]:
# Align the out-of-fold predictions with every training row. Rows that were
# removed before training (control perturbations) have no prediction and get 0.
valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

# Mean of the per-target log losses. labels=[0, 1] keeps log_loss from raising
# when a target column happens to contain a single class.
score = sum(
    log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
    for i in range(len(target_cols))
)

print("CV log_loss: ", score / y_pred.shape[1])

In [None]:
sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission.csv', index=False)