# Notebook goal

The goal of this notebook is to provide basic information about the model predictions for the training dataset. 

It should be easy to extend: just plug in your train_predictions.csv file, which must contain all the **target columns** + **sig_id**, and you should get all the visualizations.

I am using [this](https://www.kaggle.com/yasufuminakama/moa-pytorch-nn-starter) public notebook as an example.

This notebook tries to address the following questions:

* Average predictions per sample, for samples with the same number of activated MOAs
* Which MOAs have the highest loss ?
* Grouping the predictions by the number of activated MOAs, how much is the influence of each group in the final loss ?
* What is the relation between the predicted sum of MOA activations and the actual one ?
* What is the relation between the loss of the most active MOAs and the number of training samples for each MOA ?
* How is the loss distributed through the most active MOAs ?

In [None]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

import torch
import torch.nn as nn
from torch.nn.modules.loss import _WeightedLoss
import torch.nn.functional as F

import copy

import plotly.express as px
import plotly.graph_objects as go

import seaborn as sns

from tensorboardX import SummaryWriter

from collections import Counter

In [None]:
# Location of the train predictions CSV -- change this to point at your own file.
# The functions below read a `sig_id` column plus one column per target from it.
train_pred_path = '../input/moa-pytorch-nn-starter/oof.csv'
train_pred = pd.read_csv(train_pred_path)

In [None]:
# NOTE: `train_pred` is loaded in the previous cell. It is deliberately NOT re-read
# here, so that a user-supplied predictions file is not silently replaced by the
# hard-coded example path.

# Competition training features and scored targets, joined on the sample id.
train_data = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')

train_data = pd.merge(train_data, train_targets, on=['sig_id'])

# Keep only the rows whose cp_type is 'trt_cp'.
train_data = train_data[train_data.cp_type == 'trt_cp'].reset_index(drop=True)

# Column-name families used throughout the notebook: ground truth, prediction, loss.
target_columns = [c for c in train_targets.columns if c != 'sig_id']
target_columns_pred = [f'{c}_pred' for c in target_columns]
target_columns_loss = [f'{c}_loss' for c in target_columns]

# Number of targets equal to 1 in each sample (row-wise sum of the target columns).
train_data['activated_moas'] = train_data.loc[:, target_columns].values.sum(axis=1)

In [None]:
def calculate_log_loss(predicted_df, train_df, target_columns):
    """Return the mean column-wise binary log loss and each target's share of it.

    Both frames are ordered by ``sig_id`` before being compared, so their
    original row order does not matter.

    The loss is computed directly with NumPy (clipping probabilities to
    ``[1e-15, 1 - 1e-15]``) rather than through ``sklearn.metrics.log_loss``:
    the ``eps`` argument the notebook relied on was removed from scikit-learn
    and ``np.float`` was removed from NumPy, so the previous call no longer
    runs on current versions.

    Args:
        predicted_df: DataFrame with a ``sig_id`` column and one predicted
            probability column per entry of ``target_columns``.
        train_df: DataFrame with a ``sig_id`` column and one 0/1 ground-truth
            column per entry of ``target_columns``.
        target_columns: Names of the target columns to score.

    Returns:
        ``(score, loss_per_class)`` where ``loss_per_class[k]`` is the mean log
        loss of target ``k`` divided by the number of targets, and ``score`` is
        the sum of ``loss_per_class``.

    Raises:
        ValueError: If the two frames do not contain the same ``sig_id`` values.
    """
    columns = list(target_columns)

    pred_sorted = predicted_df[columns + ['sig_id']].sort_values(by=['sig_id'])
    true_sorted = train_df[columns + ['sig_id']].sort_values(by=['sig_id'])

    # Sorting each frame independently only aligns rows when both hold the same ids;
    # fail loudly instead of scoring predictions against the wrong samples.
    if not np.array_equal(pred_sorted['sig_id'].values, true_sorted['sig_id'].values):
        raise ValueError('predicted_df and train_df must contain the same sig_id values')

    predicted_values = pred_sorted[columns].values.astype(np.float64)
    true_values = true_sorted[columns].values.astype(np.float64)

    # Clip so that log() stays finite for predictions of exactly 0 or 1.
    eps = 1e-15
    clipped = np.clip(predicted_values, eps, 1 - eps)
    cell_loss = -(true_values * np.log(clipped) + (1 - true_values) * np.log(1 - clipped))

    # Mean over samples per target, scaled by the number of targets so the
    # per-target values add up to the overall score.
    loss_per_class = (cell_loss.mean(axis=0) / len(columns)).tolist()
    score = float(sum(loss_per_class))

    return score, loss_per_class


def calculate_log_loss_per_row(predicted_df, train_df, target_columns):
    """Break the overall log loss down into one contribution per sample and target.

    Every cell holds ``-log(p_true) / (n_samples * n_targets)``, where ``p_true``
    is the (clipped) probability assigned to the true label, so summing all
    cells gives the mean column-wise log loss.

    Args:
        predicted_df: DataFrame with a ``sig_id`` column and one predicted
            probability column per entry of ``target_columns``.
        train_df: DataFrame with a ``sig_id`` column and one ground-truth
            column per entry of ``target_columns``; a value of 0 is treated as
            the negative class and anything else as the positive class.
        target_columns: Names of the target columns to score.

    Returns:
        DataFrame ordered by ``sig_id`` with one ``<target>_loss`` column per
        target, followed by ``sig_id`` and ``sample_loss`` (the row-wise sum of
        the loss columns).

    Raises:
        ValueError: If the two frames do not contain the same ``sig_id`` values.
    """
    columns = list(target_columns)

    pred_sorted = predicted_df[columns + ['sig_id']].sort_values(by=['sig_id'])
    true_sorted = train_df[columns + ['sig_id']].sort_values(by=['sig_id'])

    sig_ids = pred_sorted['sig_id'].values
    # Sorting each frame independently only aligns rows when both hold the same ids;
    # fail loudly instead of attributing losses to the wrong samples.
    if not np.array_equal(sig_ids, true_sorted['sig_id'].values):
        raise ValueError('predicted_df and train_df must contain the same sig_id values')

    predicted_values = pred_sorted[columns].values.astype(np.float64)
    true_values = true_sorted[columns].values

    # Clip so that log() stays finite for predictions of exactly 0 or 1.
    eps = 1e-15
    clipped = np.clip(predicted_values, eps, 1 - eps)

    # Probability the model gave to the label that is actually true.
    prob_of_truth = np.where(true_values == 0, 1 - clipped, clipped)

    # Scale by the total number of cells so the whole table sums to the mean loss.
    n_cells = true_values.shape[0] * true_values.shape[1]
    loss_per_prediction = -np.log(prob_of_truth) / n_cells

    loss_per_prediction_df = pd.DataFrame(
        data=loss_per_prediction,
        columns=[f'{c}_loss' for c in columns],
    )
    loss_per_prediction_df['sig_id'] = sig_ids
    loss_per_prediction_df['sample_loss'] = loss_per_prediction.sum(axis=1)

    return loss_per_prediction_df

In [None]:
# Overall log loss together with each target's share of it.
total_loss, loss_per_class = calculate_log_loss(train_pred, train_data, target_columns)

# Loss contribution of every (sample, target) cell, keyed by sig_id.
train_losses = calculate_log_loss_per_row(train_pred, train_data, target_columns)

In [None]:
# Report the overall log loss returned by calculate_log_loss for the train predictions.
print(f'Train loss: {total_loss}')

In [None]:
# Suffix every prediction column with `_pred` (the join key stays untouched) so the
# predictions do not clash with the ground-truth target columns when merged.
train_pred.columns = [name if name == 'sig_id' else f'{name}_pred' for name in train_pred.columns]

# Attach the per-sample losses first, then the renamed predictions, joining on sig_id.
train_data = train_data.merge(train_losses, on=['sig_id'])
train_data = train_data.merge(train_pred, on=['sig_id'])

# Average predictions per samples with same number of activated moas



In [None]:
# For every sample, add up the predicted probabilities over all targets, then average
# those sums within each group of samples sharing the same true number of active MOAs.
# The groups come from the data itself rather than from a hard-coded list, so the cell
# keeps working when the set of observed activation counts differs.
prediction_sum_per_sample = train_data[target_columns_pred].sum(axis=1)

df = (
    prediction_sum_per_sample
    .groupby(train_data['activated_moas'])
    .mean()
    .rename('average_prediction_per_sample')
    .reset_index()
)

fig = px.line(df, x="activated_moas", y="average_prediction_per_sample", title='Average prediction sum per samples')
fig.show()

My intuition here is that the trend should be linear and the difference between two consecutive *activated_moas* points should be roughly the same.
* We can see that the model is having a hard time predicting samples that have 4 or 7 activated MOAs.
* Maybe the difference between the average prediction with 0 and 1 MOA activations is too small?



# Which MOAs have the highest loss ?

In [None]:
# Total loss contribution of each MOA, summed over all samples.
loss_per_moa = train_data[target_columns_loss].values.sum(axis=0)
# Positions of the MOAs ordered from the largest loss to the smallest.
loss_per_moa_idx = loss_per_moa.argsort()[::-1]

moa_names_by_loss = [target_columns[pos] for pos in loss_per_moa_idx]
losses_descending = [loss_per_moa[pos] for pos in loss_per_moa_idx]

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=moa_names_by_loss,
        y=losses_descending,
        mode='lines',
        name='Loss per moa',
    )
)
fig.show()

# Grouping the predictions by the number of activated MOAs, how much is the influence of each group in the final loss ?

In [None]:
# Summed sample loss for each group of samples with the same number of active MOAs.
loss_per_moa_activations = train_data.groupby('activated_moas')['sample_loss'].sum()

# Express every group's loss as a percentage of the overall loss.
loss_rows = []
for n_active, group_loss in loss_per_moa_activations.items():
    loss_rows.append([n_active, (group_loss / total_loss) * 100])

df = pd.DataFrame(data=loss_rows, columns=['activated_moas', 'loss_percentage'])
fig = px.bar(df, x="activated_moas", y="loss_percentage")
fig.show()

# What is the relation between the predicted sum of MOA activations and the actual one ?

In [None]:
# Per activation-count group: summed predictions and summed losses of every MOA.
grouped_by_activations = train_data.groupby(['activated_moas'])[target_columns_pred + target_columns_loss].sum()

# Number of samples in which each MOA equals 1, per activation-count group. Computed in
# a single grouped pass instead of filtering the whole frame once per (group, MOA) pair.
positives_by_activations = train_data[target_columns].eq(1).groupby(train_data['activated_moas']).sum()

# One row per (group, MOA): active moas, moa name, no. of train samples, pred sum, loss sum
data = [
    [
        n_active,
        moa_name,
        int(positives_by_activations.at[n_active, moa_name]),
        group_sums[pred_col],
        group_sums[loss_col],
    ]
    for n_active, group_sums in grouped_by_activations.iterrows()
    for moa_name, pred_col, loss_col in zip(target_columns, target_columns_pred, target_columns_loss)
]

grouped_by_activations_df = pd.DataFrame(data=data, columns=['activated_moas', 'moa_name', 'train_examples', 'pred_value', 'loss_value'])

# Despite its name, this holds the sum of predicted probabilities per MOA over all samples.
most_active_moas = train_data[target_columns_pred].values.sum(axis=0)

# True number of positive samples per MOA, and the MOAs ordered from most to least frequent.
train_samples_per_moa = train_data[target_columns].values.sum(axis=0)
most_active_moa_samples_ids = train_samples_per_moa.argsort()[::-1]
most_active_moa_names = [target_columns[i] for i in most_active_moa_samples_ids]

fig = go.Figure()

fig.add_trace(go.Scatter(x=most_active_moa_names, y=[most_active_moas[i] for i in most_active_moa_samples_ids],
                    mode='lines',
                    name='Sum of MOA activations'))

fig.add_trace(go.Scatter(x=most_active_moa_names, y=[train_samples_per_moa[i] for i in most_active_moa_samples_ids],
                    mode='lines',
                    name='Number of train samples'))

fig.show()

Here I think it is better for the **'Sum of MOA activations'** line to stay below the **'Number of train samples'** line, because the loss heavily penalizes confident incorrect predictions.

# What is the relation between the loss of the most active MOAs and the number of training samples for each MOA ?

In [None]:
# Loss of each MOA, laid out in the same order as most_active_moa_names.
losses_by_frequency = [loss_per_moa[pos] for pos in most_active_moa_samples_ids]

loss_trace = go.Scatter(
    x=most_active_moa_names,
    y=losses_by_frequency,
    mode='lines',
    name='Loss per moa',
)

fig = go.Figure()
fig.add_trace(loss_trace)
fig.show()

In [None]:
# Leave out the group of samples with no active MOA at all.
has_active_moa = grouped_by_activations_df.activated_moas > 0
samples_with_active_moa = grouped_by_activations_df[has_active_moa]

fig = px.scatter(
    samples_with_active_moa,
    x="train_examples",
    y="loss_value",
    size="pred_value",
    color='moa_name',
)
fig.show()

# How is the validation loss distributed through the most active MOAs ?

Let's see the losses of the most active MOAs with respect to the number of activated MOAs in the samples.

In [None]:
# One constant drives both the filter and the title, so the number of MOAs shown
# always matches what the title claims.
top_n = 15
top_moa_names = most_active_moa_names[:top_n]

df = grouped_by_activations_df[grouped_by_activations_df.moa_name.isin(top_moa_names)].reset_index(drop=True)
fig = px.bar(df, x="moa_name", y="loss_value", color="activated_moas", title=f'{top_n} most active MOAs')
fig.show()

Let's plot the distributions of the predictions for **proteasome_inhibitor** and **cyclooxygenase_inhibitor**, since those two MOAs have the lowest and highest loss values respectively.

In [None]:
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat='density')` is the
# replacement seaborn recommends (requires seaborn >= 0.11). Both series are drawn
# on the same axes and labelled so they can be told apart.
ax = sns.histplot(train_data['proteasome_inhibitor_pred'], color='Red', kde=True, stat='density',
                  label='proteasome_inhibitor')
sns.histplot(train_data['cyclooxygenase_inhibitor_pred'], color='Blue', kde=True, stat='density',
             label='cyclooxygenase_inhibitor', ax=ax)
ax.set_xlabel('predicted probability')
ax.legend()

* We observe that the model is pretty sure when it is predicting **proteasome_inhibitor**. This can be a dangerous property, since confident mistakes are penalized heavily by the loss function.
* On the other hand, when predicting **cyclooxygenase_inhibitor** the model never predicts values near 1. Honestly, I don't know how to interpret this, but one guess is that the loss for this MOA is the highest because the model thinks that **cyclooxygenase_inhibitor** is slightly active in every sample, and summed across all the samples this adds up to a lot.

# Submission Analysis

In [None]:
# Location of the submission CSV -- change this to point at your own submission.
submission_path = '../input/moa-pytorch-nn-starter/submission.csv'
submission = pd.read_csv(submission_path)

In [None]:

# Raw prediction matrices: submission (test) rows and train-prediction rows.
submission_predictions = submission[target_columns].values
train_predictions = train_data[target_columns_pred].values

# Column-wise totals of the predicted probabilities for every MOA.
predictions_per_moa = submission_predictions.sum(axis=0)
valid_predictions_per_moa = train_predictions.sum(axis=0)

# Column-wise means of the predicted probabilities for every MOA.
predictions_per_moa_mean = submission_predictions.mean(axis=0)
valid_predictions_per_moa_mean = train_predictions.mean(axis=0)


In [None]:
# Sum of the predicted probabilities of every submission row, sorted ascending.
# The y column is named after what it holds (a sum, not an average) so the axis
# label agrees with the plot title.
sorted_prediction_sums = sorted(submission[target_columns].sum(axis=1))

df = pd.DataFrame({
    'row_number': range(len(sorted_prediction_sums)),
    'prediction_sum_per_sample': sorted_prediction_sums,
})
fig = px.line(df, x="row_number", y="prediction_sum_per_sample", title='Sum prediction per samples')
fig.show()

In [None]:
# Summed predictions per MOA on the test data versus the train predictions, with the
# MOAs laid out in the same order as most_active_moa_names.
test_sums = [predictions_per_moa[pos] for pos in most_active_moa_samples_ids]
valid_sums = [valid_predictions_per_moa[pos] for pos in most_active_moa_samples_ids]

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=most_active_moa_names, y=test_sums, mode='lines', name='Predictions in test data')
)
fig.add_trace(
    go.Scatter(x=most_active_moa_names, y=valid_sums, mode='lines', name='Predictions in valid data')
)
fig.show()

In [None]:
# Mean prediction per MOA on the test data versus the train predictions, with the
# MOAs laid out in the same order as most_active_moa_names.
test_means = [predictions_per_moa_mean[pos] for pos in most_active_moa_samples_ids]
valid_means = [valid_predictions_per_moa_mean[pos] for pos in most_active_moa_samples_ids]

fig = go.Figure()
fig.add_trace(
    go.Scatter(x=most_active_moa_names, y=test_means, mode='lines', name='Mean predictions in test data per moa')
)
fig.add_trace(
    go.Scatter(x=most_active_moa_names, y=valid_means, mode='lines', name='Mean predictions in valid data per moa')
)
fig.show()