# Let's have a look at the correlation of the numerous targets

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
# Load the scored training targets into a data frame.
targets_csv_path = '../input/lish-moa/train_targets_scored.csv'
df_train_scored = pd.read_csv(targets_csv_path)

In [None]:
# (number of rows, number of columns) of the targets table
df_train_scored.shape

In [None]:
# preview the first rows of the targets table
df_train_scored.head()

In [None]:
# Target columns: everything after the leading id column.
# Slicing to the end (instead of a hard-coded stop of 207) keeps the selection
# complete if the file has a different number of target columns. The
# 'multiplicity' helper column, which a later cell adds temporarily, is
# excluded so that re-running this cell cannot pick it up as a target.
targets = df_train_scored.columns[1:].drop('multiplicity', errors='ignore')
n_targets = len(targets)

In [None]:
# report how many target columns were selected
print('Number of targets:', n_targets)

In [None]:
# calc Pearson correlation matrix for all numeric columns
# Non-numeric columns (e.g. the id column that is skipped when selecting the
# targets) are removed explicitly: pandas >= 2.0 no longer drops them silently
# inside DataFrame.corr and raises instead.
cor_targets = df_train_scored.select_dtypes(include='number').corr(method='pearson')
cor_targets

In [None]:
# Heat map of the correlation matrix. The matrix is symmetric, so every
# pair except those on the diagonal shows up twice.
plt.rc('figure', figsize=(12, 12))
plt.matshow(cor_targets)
plt.colorbar()
plt.show()

### OK, only a few target pairs show a notable correlation. Let's check those pairs.

In [None]:
# Pre-allocate the result table: one row per ordered target pair, with
# columns x / y (target names), c (correlation) and sel (selection flag).
n_pairs = n_targets**2
cor_stats = pd.DataFrame(np.zeros((n_pairs, 4)), columns=['x', 'y', 'c', 'sel'])
# the two name columns must hold strings rather than floats
cor_stats = cor_stats.astype({'x': str, 'y': str})

In [None]:
# calc and store all correlations in data frame
#
# Row k of the table describes the ordered pair (i, j) with
# k = i * n_targets + j, i.e. the same row order a nested i/j loop produces.
# All correlations come from a single vectorised DataFrame.corr call instead
# of n_targets**2 separate Series.corr calls with element-wise .loc writes.
target_names = np.asarray(targets, dtype=object)
pair_i, pair_j = np.divmod(np.arange(n_targets ** 2), n_targets)
cor_matrix = df_train_scored[targets].corr(method='pearson').to_numpy()

cor_stats = pd.DataFrame({
    'x': target_names[pair_i],
    'y': target_names[pair_j],
    'c': cor_matrix[pair_i, pair_j],
    # 1 marks the "i > j" half of the matrix; we use this to later remove
    # redundancies (mirrored pairs and the diagonal)
    'sel': (pair_i > pair_j).astype(float),
})

In [None]:
# Keep only the flagged "i > j" rows, discard the helper flag, then order the
# remaining pairs by correlation (largest first) with a fresh 0..n-1 index.
cor_stats = (
    cor_stats.loc[cor_stats['sel'] == 1]
    .drop(columns='sel')
    .sort_values('c', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# show top 20 results (first rows of the table sorted by descending correlation)
cor_stats.head(20)

In [None]:
# show end of table as well (last 10 rows of the descending sort)
cor_stats.tail(10)

In [None]:
# Line plot of every pairwise correlation, in the table's row order.
plt.rc('figure', figsize=(7, 5))
plt.plot(cor_stats['c'])
plt.grid()
plt.title('Correlations - Sorted descending')
plt.show()

In [None]:
# Same plot restricted to the first 500 rows of the table.
plt.rc('figure', figsize=(7, 5))
first_rows = cor_stats['c'].iloc[:500]
plt.plot(first_rows)
plt.grid()
plt.title('Correlations - Sorted descending')
plt.show()

### Check a few examples using cross tables

In [None]:
# Rank 0 pair: proteasome_inhibitor vs. nfkb_inhibitor
# (correlation noted in the original run: 0.921340)
pd.crosstab(
    index=df_train_scored['proteasome_inhibitor'],
    columns=df_train_scored['nfkb_inhibitor'],
)

In [None]:
# Rank 1 pair: pdgfr_inhibitor vs. kit_inhibitor
# (correlation noted in the original run: 0.915603)
pd.crosstab(
    index=df_train_scored['pdgfr_inhibitor'],
    columns=df_train_scored['kit_inhibitor'],
)

In [None]:
# Rank 2 pair: kit_inhibitor vs. flt3_inhibitor
# (correlation noted in the original run: 0.758112)
pd.crosstab(
    index=df_train_scored['kit_inhibitor'],
    columns=df_train_scored['flt3_inhibitor'],
)

In [None]:
# Rank 3 pair: pdgfr_inhibitor vs. flt3_inhibitor
# (correlation noted in the original run: 0.705119)
pd.crosstab(
    index=df_train_scored['pdgfr_inhibitor'],
    columns=df_train_scored['flt3_inhibitor'],
)

In [None]:
# A pair with a comparatively low correlation.
# Rank 17 pair: nrf2_activator vs. bcl_inhibitor
# (correlation noted in the original run: 0.253269)
pd.crosstab(
    index=df_train_scored['nrf2_activator'],
    columns=df_train_scored['bcl_inhibitor'],
)

In [None]:
# Example pair picked to illustrate a correlation close to zero:
# print the coefficient, then show the cross table.
acat = df_train_scored['acat_inhibitor']
ach_agonist = df_train_scored['acetylcholine_receptor_agonist']
print('corr = ', acat.corr(ach_agonist))
pd.crosstab(index=acat, columns=ach_agonist)

In [None]:
# Last example, picked to illustrate a (slightly) negative correlation:
# print the coefficient, then show the cross table.
nfkb = df_train_scored['nfkb_inhibitor']
dopamine_antagonist = df_train_scored['dopamine_receptor_antagonist']
print('corr = ', nfkb.corr(dopamine_antagonist))
pd.crosstab(index=nfkb, columns=dopamine_antagonist)

# Occurrence of multiple positive targets

In [None]:
# Number of positive targets per row.
# The sum runs over the named `targets` columns rather than repeating the
# positional slice 1:207, so it always covers exactly the columns selected as
# targets earlier in the notebook.
df_train_scored['multiplicity'] = df_train_scored[targets].sum(axis=1)
# how many rows have 0, 1, 2, ... positive targets
df_train_scored.multiplicity.value_counts()

In [None]:
# Bar chart of how often each multiplicity value occurs.
plt.rc('figure', figsize=(7, 4))
multiplicity_counts = df_train_scored['multiplicity'].value_counts()
multiplicity_counts.plot(kind='bar')
plt.grid()
plt.show()

In [None]:
# Inspect the rows with 7 simultaneous 1's (6 rows according to the original note).
has_seven_positives = df_train_scored['multiplicity'] == 7
demo = df_train_scored.loc[has_seven_positives]
demo

#### All 6 rows show a positive effect on
* apoptosis_stimulant
* bcl_inhibitor
* ikk_inhibitor
* nfkb_inhibitor
* nitric_oxide_production_inhibitor
* nrf2_activator
* ppar_receptor_agonist


# Finally, let's have a quick look at the target means as well

In [None]:
# The helper column must go before computing per-column statistics.
df_train_scored = df_train_scored.drop('multiplicity', axis=1)


In [None]:
# calc means of all numeric columns
# Non-numeric columns (e.g. the id column) are removed explicitly: pandas >= 2.0
# no longer skips them silently in DataFrame.mean and raises instead.
target_means = df_train_scored.select_dtypes(include='number').mean()

In [None]:
# Horizontal bar chart: one bar per entry of target_means (tall figure so
# that every label fits).
plt.rc('figure', figsize=(8, 36))
sns.barplot(x=target_means.values, y=target_means.index)
plt.grid()
plt.show()