As you might already know, our targets (including the non-scored ones) are very sparse. They are all binary, so I'm interested in how they co-occur, in the hope of finding some patterns.

# Libraries

In [None]:
import numpy as np
import pandas as pd
import pickle
import os, sys
import gc
import math
import random
from tqdm import tqdm
from typing import List, NoReturn, Union, Tuple, Optional, Text, Generic, Callable, Dict

from tqdm import tqdm

# visualize
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
from matplotlib_venn import venn2
from matplotlib import pyplot
from matplotlib.ticker import ScalarFormatter
sns.set_context("talk")
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names, so use the legacy name only when this
# matplotlib still ships it and fall back to the renamed sheet otherwise.
style.use('seaborn-colorblind' if 'seaborn-colorblind' in style.available
          else 'seaborn-v0_8-colorblind')
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

import warnings
warnings.filterwarnings('ignore')

# Load data

In [None]:
%%time
# Read the scored and non-scored target tables from the competition input
# directory; both frames are used by every later cell.
train_targets = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_non = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
# The feature tables and the sample submission are not needed by the cells
# shown here, so their loads are left disabled.
# train_features = pd.read_csv('../input/lish-moa/train_features.csv')
# test_features = pd.read_csv('../input/lish-moa/test_features.csv')
    
# ss = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [None]:
# Print the (rows, columns) shape, then display the first rows of the scored targets.
print(train_targets.shape)
train_targets.head()

In [None]:
# Print the (rows, columns) shape, then display the first rows of the non-scored targets.
print(train_targets_non.shape)
train_targets_non.head()

# Targets Sparseness
Let's visualize how sparse our targets are.

In [None]:
# Every column after the first one is treated as a target name; the first
# column is skipped (presumably a sample identifier - confirm against the CSV).
scored_targets = train_targets.columns[1:].tolist()
non_scored_targets = train_targets_non.columns[1:].tolist()

print(
    'There are {:,} scored targets and {:,} non-scored targets.'.format(
        len(scored_targets),
        len(non_scored_targets),
    )
)

In [None]:
# One row with two panels: scored targets on the left, non-scored on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
ax = ax.flatten()

# Each heatmap draws the raw target matrix (rows = samples, columns = targets)
# without a colour bar.
sns.heatmap(data=train_targets.loc[:, scored_targets], ax=ax[0], cbar=False)
sns.heatmap(data=train_targets_non.loc[:, non_scored_targets], ax=ax[1], cbar=False)

plt.tight_layout()

They look like a beautiful starry sky, but they aren't: they are our targets. The non-scored targets are even sparser than the scored ones; in fact, many of them contain only zeros.

Our targets have some naming patterns --- let's categorize them manually for now.

In [None]:
def target_category_maker(targets):
    """Group target names into coarse categories based on name fragments.

    Each name is checked against the fragment groups in a fixed order
    (agonist-like, then antagonist-like, then agent-like) and is placed in the
    first group that has a fragment occurring anywhere in the name. Names that
    match no fragment are placed in 'others'.

    Args:
        targets: iterable of target names (strings).

    Returns:
        dict with the keys 'agonist', 'antagonist', 'agent' and 'others', each
        mapping to the list of matching names in input order.
    """
    # Order matters: a name containing fragments from several groups is
    # assigned to the earliest group listed here.
    markers_by_category = (
        ('agonist', ('_agonist', '_activator', '_stimulant', '_secretagogue', '_sensitizer')),
        ('antagonist', ('_antagonist', '_inhibitor', '_blocker')),
        ('agent', ('_agent', '_medium')),
    )
    targets_category = {'agonist': [], 'antagonist': [], 'agent': [], 'others': []}
    for name in targets:
        bucket = next(
            (
                category
                for category, markers in markers_by_category
                if any(marker in name for marker in markers)
            ),
            'others',
        )
        targets_category[bucket].append(name)
    return targets_category

def show_positive_ratio(train_targets, targets_category):
    """Print, category by category, how many positives each target has.

    For every category a banner with the category name and its number of
    targets is printed, followed by one line per target giving the column sum
    and that sum as a percentage of the number of rows.

    Args:
        train_targets: DataFrame holding one column per target name.
        targets_category: dict mapping a category name to a list of target
            names (column names of ``train_targets``).

    Returns:
        None; the report is written to standard output.
    """
    n_rows = train_targets.shape[0]
    separator = '----------------------------'
    for category, members in targets_category.items():
        print('')
        print(separator)
        print('{} ({:,} features)'.format(category, len(members)))
        print(separator)
        for target in members:
            positives = train_targets[target].sum()
            print('{}: {:,} ({:.3f} %) positive.'.format(target, positives, 100 * positives / n_rows))
    
# Categorize the scored target names, then print the positive count and
# percentage of every scored target, grouped by category.
print('SCORED TARGETS')
scored_targets_category = target_category_maker(scored_targets)
show_positive_ratio(train_targets, scored_targets_category)

In [None]:
# Same report as for the scored targets, applied to the non-scored table.
print('NON-SCORED TARGETS')
non_scored_targets_category = target_category_maker(non_scored_targets)
show_positive_ratio(train_targets_non, non_scored_targets_category)

# Correlation among Targets
It would be interesting to know target correlations.

In [None]:
# def plot_target_corr(train_targets, targets):
#     # Compute the correlation matrix
#     corr = train_targets.loc[train_targets[targets].sum(axis=1) > 0, targets].corr()

#     # Set up the matplotlib figure
#     f, ax = plt.subplots(figsize=(11, 9))

#     # Generate a custom diverging colormap
#     cmap = sns.diverging_palette(220, 10, as_cmap=True)

#     # Draw the heatmap with the correct aspect ratio
#     sns.heatmap(corr, cmap=cmap, annot=True, 
#                 square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
#     ax.set_ylim(corr.shape[0], 0)
#     plt.yticks(rotation=0)
    
# print('SCORED TARGETS CORRELATION')
# plot_target_corr(train_targets, scored_targets)

# Target Co-Occurrence Analysis
Let's see whether some targets co-occur. Co-occurrence appears to be rare overall, but there are some notable pairs such as 'nfkb_inhibitor' & 'proteasome_inhibitor' (718 co-occurrences!). There seem to be many inhibitor-inhibitor pairs among the co-occurring targets. Also, a striking number of pairs co-occur exactly 6 times, corresponding to the 6 samples per drug in this experimental setup.

In [None]:
def get_cooccur(train_targets, targets):
    """Tabulate how often each pair of targets is positive on the same row.

    The pairwise counts come from the matrix product of the selected columns
    with themselves, so entry (a, b) is the sum over rows of
    ``column_a * column_b``. Only pairs of two different targets with a count
    greater than zero are reported, and each unordered pair appears once.

    Args:
        train_targets: DataFrame containing the columns named in ``targets``
            (the notebook passes 0/1 indicator columns).
        targets: list of column names to compare pairwise.

    Returns:
        DataFrame with the columns 'target1', 'target2' and 'co_occurrence'.
        'target1' always comes before 'target2' in ``targets``; rows are
        ordered by the position of 'target1', then of 'target2'.
    """
    labels = list(targets)
    indicators = train_targets[targets]
    pair_counts = (indicators.T @ indicators).values

    # Keep only the entries strictly below the diagonal so that every
    # unordered pair is counted once and self-pairs are dropped, then
    # transpose so that by_first[i, j] holds the count for
    # (labels[i], labels[j]) with j > i.
    by_first = np.tril(pair_counts, k=-1).T

    # np.nonzero walks the array row by row, which yields the pairs ordered by
    # first target and then by second target.
    first_idx, second_idx = np.nonzero(by_first > 0)

    return pd.DataFrame({
        'target1': [labels[i] for i in first_idx],
        'target2': [labels[j] for j in second_idx],
        'co_occurrence': by_first[first_idx, second_idx].tolist(),
    })

print('SCORED TARGETS CO-OCCURENCE')
# Build the pair table and rank it from the most to the least frequent pair.
scored_df = get_cooccur(train_targets, scored_targets).sort_values(
    by=['co_occurrence'],
    ascending=False,
)
# Last expression of the cell: render the table with a colour gradient.
scored_df.style.background_gradient(cmap='viridis')

In [None]:
print('NON-SCORED TARGETS CO-OCCURENCE')
# Build the pair table and rank it from the most to the least frequent pair.
non_scored_df = get_cooccur(train_targets_non, non_scored_targets).sort_values(
    by=['co_occurrence'],
    ascending=False,
)
# Last expression of the cell: render the table with a colour gradient.
non_scored_df.style.background_gradient(cmap='viridis')

In [None]:
print('ALL TARGETS CO-OCCURENCE')
# Put the scored and non-scored columns side by side so that pairs spanning
# both tables are counted too. concat(axis=1) aligns on the row index, so this
# assumes both tables list the same samples in the same order - TODO confirm.
all_df = get_cooccur(
    pd.concat([train_targets, train_targets_non], axis=1),
    scored_targets + non_scored_targets,
).sort_values(by=['co_occurrence'], ascending=False)
# Last expression of the cell: render the table with a colour gradient.
all_df.style.background_gradient(cmap='viridis')

TO BE CONTINUED...