In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

# Import data 

## csv to dataframe 

In [None]:
# Folder containing the input CSV files (path is relative to the notebook's working directory)
data_path = '../input/lish-moa'
# List the folder through the shell; IPython substitutes {data_path} with the Python value
! ls {data_path}

In [None]:
# Names of the three CSV tables that are read from `data_path`:
# features, scored targets and non-scored targets.
features_file, targets_file, no_targets_file = (
    'train_features.csv',
    'train_targets_scored.csv',
    'train_targets_nonscored.csv',
)

In [None]:
# Read the three CSV tables found under `data_path` into dataframes
# (same order as the file-name variables: features, scored targets, non-scored targets).
df_features, df_targets, df_no_targets = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (features_file, targets_file, no_targets_file)
)

In [None]:
# Left-join both target tables onto the features using 'sig_id' as the key.
# validate='one_to_one' makes pandas raise if the key is duplicated on either side.
df_data = (
    df_features
    .merge(df_targets, how='left', on='sig_id', validate='one_to_one')
    .merge(df_no_targets, how='left', on='sig_id', validate='one_to_one')
)
df_data.head()

## Features name lists

In [None]:
# Build the column-name lists used throughout the notebook.
# columns names = 'sig_id' + 'cp_type' + features_quali + features_quanti + scored_targets + no_scored_targets
#
# The lists are built with order-preserving comprehensions instead of set
# arithmetic: iterating a set of strings has no guaranteed order and may change
# from one kernel session to the next (string hash randomization), which made
# slices such as genes[0:20] / cells[0:20] select different columns on each run.
scored_targets = [col for col in df_targets.columns if col != 'sig_id']
no_scored_targets = [col for col in df_no_targets.columns if col != 'sig_id']
features_quali = ['cp_time', 'cp_dose']
# Every column that is NOT a quantitative feature (a set, used only for membership tests).
_non_quanti_columns = (set(scored_targets) | set(no_scored_targets)
                       | set(features_quali) | {'sig_id', 'cp_type'})
# Remaining columns, kept in the order they appear in df_data.
features_quanti = [col for col in df_data.columns if col not in _non_quanti_columns]
print('Scored targets count : {}'.format(len(scored_targets)))
print('No scored targets count : {}'.format(len(no_scored_targets)))
print('Features quali count : {}'.format(len(features_quali)))
print('Features quanti count : {}'.format(len(features_quanti)))

In [None]:
# Split the quantitative features by the marker found in their name:
# names containing 'c-' go to `cells`, names containing 'g-' go to `genes`.
cells = [name for name in features_quanti if 'c-' in name]
genes = [name for name in features_quanti if 'g-' in name]
print('Features genes count : {}'.format(len(genes)),
      'Features cells count : {}'.format(len(cells)),
      sep='\n')

## General check

In [None]:
# shape of the merged dataframe, as (rows, columns)
df_data.shape

In [None]:
# check sig_id is unique, i.e. no identifier appears on more than one row
test = df_data['sig_id'].is_unique
print('sig_id unique : {}'.format(test))

In [None]:
# Sanity check: total number of scored MoA labels set on 'ctl_vehicle' (control) rows.
# Rows and columns are selected in a single .loc call, then summed per column and overall.
moa_count = (
    df_data
    .loc[df_data['cp_type'] == 'ctl_vehicle', scored_targets]
    .sum()
    .sum()
)
print('MoA count (control test) : {}'.format(moa_count))

In [None]:
# True as soon as any single value of the merged dataframe is missing
# (first .any() reduces per column, second .any() reduces across columns).
test = df_data.isna().any().any()
print('Missing data : {}'.format(test))

# Features data analysis

## Separate 'control' and 'compound' data

In [None]:
# Split the samples on 'cp_type': 'trt_cp' rows form the compound set,
# 'ctl_vehicle' rows form the control set.
df_compound = df_data.loc[df_data['cp_type'].eq('trt_cp')]
df_control = df_data.loc[df_data['cp_type'].eq('ctl_vehicle')]
print('Compound shape : {}'.format(df_compound.shape),
      'Control shape : {}'.format(df_control.shape),
      sep='\n')

## 'control' data analysis 

Definition 'control' : various forms of “inert” experiments performed to determine, by contrast, effects that are specifically due to a perturbation.

In [None]:
# Control samples restricted to the feature columns (qualitative first, then quantitative).
df_control_features = df_control.loc[:, [*features_quali, *features_quanti]]
df_control_features.head(3)

### Dose and time exposure distribution 

In [None]:
# plot dose and time exposure distribution for the control samples:
# one group of bars per 'cp_time' value, one bar colour per 'cp_dose' value
sns.countplot(x='cp_time', hue='cp_dose', data=df_control_features)

Doses and exposure times are similarly distributed within the control samples.

### Cell viability 

The viability of the cells after exposure to the compounds is measured simultaneously on 100 different human cell types. A viability score is associated with each cell type.

In [None]:
# Control samples restricted to dose / time and the cell viability columns.
df_control_cell = df_control.loc[:, [*features_quali, *cells]]
df_control_cell.head(3)

In [None]:
# Standard deviation of every cell viability column over the control samples,
# sorted from the largest to the smallest spread (same ranking as the variance).
cell_std = df_control_cell[cells].std().sort_values(ascending=False)
# Single-element indexes naming the max-spread (first) and min-spread (last) columns.
cell_std_max_min = [cell_std.index[[0]], cell_std.index[[-1]]]
cell_std_max_min

In [None]:
# KDE of 'c-98' viability in control samples: one panel per exposure time, one curve per dose.
# 'c-98' is hard-coded as the MIN-spread cell feature — TODO confirm against cell_std_max_min.
g = sns.FacetGrid(data=df_control_cell, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'c-98')
g.add_legend()

In [None]:
# KDE of 'c-18' viability in control samples: one panel per exposure time, one curve per dose.
# 'c-18' is hard-coded as the MAX-spread cell feature — TODO confirm against cell_std_max_min.
g = sns.FacetGrid(data=df_control_cell, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'c-18')
g.add_legend()

**Observations :**
- Some cell types show variability in the presence of the control compound. 
- The longer the exposure time to the control compound, the higher the variance in viability. Since the control compound is inert, this increase in variance over time is presumably natural.
- The dose of control compound does not seem to play a role.


In conclusion, cell viability seems to evolve naturally over time. Viability is observed to be dependent on cell type.

### Genes expressions

Cells are exposed to different compounds. Gene expression (inhibition or activation) following this exposure is evaluated.

In [None]:
# Control samples restricted to dose / time and the gene expression columns.
df_control_gene = df_control.loc[:, [*features_quali, *genes]]
df_control_gene.head(3)

In [None]:
# Standard deviation of every gene expression column over the control samples,
# sorted from the largest to the smallest spread (same ranking as the variance).
gene_std = df_control_gene[genes].std().sort_values(ascending=False)
# Single-element indexes naming the max-spread (first) and min-spread (last) columns.
gene_std_max_min = [gene_std.index[[0]], gene_std.index[[-1]]]
gene_std_max_min

In [None]:
# KDE of 'g-307' expression in control samples: one panel per exposure time, one curve per dose.
# 'g-307' is hard-coded as the MIN-spread gene — TODO confirm against gene_std_max_min.
# NOTE(review): `bw` is the legacy bandwidth keyword; newer seaborn releases deprecate it
# in favour of `bw_method` / `bw_adjust` — confirm against the installed version.
g = sns.FacetGrid(data=df_control_gene, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'g-307', bw=0.1)
g.add_legend()

In [None]:
# KDE of 'g-370' expression in control samples: one panel per exposure time, one curve per dose.
# 'g-370' is hard-coded as the MAX-spread gene — TODO confirm against gene_std_max_min.
# NOTE(review): `bw` is the legacy bandwidth keyword; newer seaborn releases deprecate it
# in favour of `bw_method` / `bw_adjust` — confirm against the installed version.
g = sns.FacetGrid(data=df_control_gene, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'g-370', bw=0.1)
g.add_legend()

**Observations :**
- The dose of the control compound seems to play an important role. For the D1 dose, gene expression is low. For dose D2, gene expression is higher.
- Exposure time does not seem to play an important role. However, a decrease in the variability of gene expression is observed with increasing duration of exposure to the control compound.

In conclusion, gene expression following a control compound varies as a function of dose and to a lesser extent as a function of exposure time.

## 'compound' data analysis

Different compounds are tested.

In [None]:
# Compound samples restricted to the feature columns (qualitative first, then quantitative).
df_compound_features = df_compound.loc[:, [*features_quali, *features_quanti]]
df_compound_features.head(3)

### Dose and time exposure distribution 

In [None]:
# plot dose and time exposure distribution for the compound samples:
# one group of bars per 'cp_time' value, one bar colour per 'cp_dose' value
sns.countplot(x='cp_time', hue='cp_dose', data=df_compound_features)

Doses and exposure times are distributed in equivalent ways within the tested samples.

### Cell viability

In [None]:
# Compound samples restricted to dose / time and the cell viability columns.
df_compound_cell = df_compound.loc[:, [*features_quali, *cells]]
df_compound_cell.head(3)

In [None]:
# Standard deviation of every cell viability column over the compound samples,
# sorted from the largest to the smallest spread (same ranking as the variance).
# This rebinds `cell_std` / `cell_std_max_min`, previously computed on the control samples.
cell_std = df_compound_cell[cells].std().sort_values(ascending=False)
# Single-element indexes naming the max-spread (first) and min-spread (last) columns.
cell_std_max_min = [cell_std.index[[0]], cell_std.index[[-1]]]
cell_std_max_min

In [None]:
# KDE of 'c-74' viability in compound samples: one panel per exposure time, one curve per dose.
# 'c-74' is hard-coded as the MIN-spread cell feature — TODO confirm against cell_std_max_min.
g = sns.FacetGrid(data=df_compound_cell, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'c-74')
g.add_legend()

In [None]:
# KDE of 'c-63' viability in compound samples: one panel per exposure time, one curve per dose.
# 'c-63' is hard-coded as the MAX-spread cell feature — TODO confirm against cell_std_max_min.
g = sns.FacetGrid(data=df_compound_cell, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'c-63')
g.add_legend()

**Observations :**

- There is an effect on cell viability when exposed to an active ingredient: slight 'bump' on the left side of the figures.
- This effect seems to be amplified by the exposure time.
- The dose does not seem to have a significant effect.

In [None]:
# Pairwise correlation of the first 20 cell viability columns (compound samples),
# drawn as a heatmap with the upper triangle (diagonal included) hidden.
corr = df_compound_cell[cells[:20]].corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
f, ax = plt.subplots(figsize=(9, 7))
cmap = sns.color_palette('coolwarm')
# Draw on the axes created just above; colour scale fixed to [-1, 1] and centred on 0.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, cbar_kws={"shrink":.5}, ax=ax)

The different cell types are highly correlated in terms of cell viability : all cells evolve in the same direction following exposure to a compound.

### Genes expressions

In [None]:
# Compound samples restricted to dose / time and the gene expression columns.
df_compound_gene = df_compound.loc[:, [*features_quali, *genes]]
df_compound_gene.head(3)

In [None]:
# Standard deviation of every gene expression column over the compound samples,
# sorted from the largest to the smallest spread.
# This rebinds `gene_std` / `gene_std_max_min`, previously computed on the control samples.
gene_std = df_compound_gene[genes].std().sort_values(ascending=False)
# Single-element indexes naming the max-spread (first) and min-spread (last) columns.
gene_std_max_min = [gene_std.index[[0]], gene_std.index[[-1]]]
gene_std_max_min

In [None]:
# KDE of 'g-219' expression in compound samples: one panel per exposure time, one curve per dose.
# 'g-219' is hard-coded as the MIN-spread gene — TODO confirm against gene_std_max_min.
g = sns.FacetGrid(data=df_compound_gene, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'g-219')
g.add_legend()

In [None]:
# KDE of 'g-50' expression in compound samples: one panel per exposure time, one curve per dose.
# 'g-50' is hard-coded as the MAX-spread gene — TODO confirm against gene_std_max_min.
g = sns.FacetGrid(data=df_compound_gene, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'g-50')
g.add_legend()

**Observations :**

- There is an effect on gene expression when exposed to an active ingredient: slight 'bump' on the left or right side of the figures.
- This effect seems to weaken with increasing exposure time.
- Some genes do not react to the active ingredients (first row of figures).
- The dose does not seem to have a significant effect.

In [None]:
# Pairwise correlation of the first 20 gene expression columns (compound samples),
# drawn as a heatmap with the upper triangle (diagonal included) hidden.
corr = df_compound_gene[genes[:20]].corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
f, ax = plt.subplots(figsize=(9, 7))
cmap = sns.color_palette('coolwarm')
# Draw on the axes created just above; colour scale fixed to [-1, 1] and centred on 0.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, cbar_kws={"shrink":.5}, ax=ax)

The matrix of correlations between the different genes shows that the expressions of certain genes are correlated. Some genes react in the same way when exposed to active ingredients. However, the correlation coefficients observed are quite low.

### Genes expression / Cells viability correlations

In [None]:
# plot correlation matrix between genes expression and cells viability
# The full (genes + cells) correlation matrix is computed once; only the
# genes-by-cells rectangle (genes as rows, cells as columns) is drawn.
corr = df_compound[genes + cells].corr()
# No triangular mask here: the plotted slice is rectangular, not a symmetric
# square matrix, so the previously computed (and never used) mask was dropped.
f, ax = plt.subplots(figsize=(10, 15))
cmap = sns.color_palette('coolwarm')
# Draw explicitly on the axes created above; colour scale fixed to [-1, 1] and centred on 0.
sns.heatmap(corr.loc[genes, cells], cmap=cmap, center=0, vmin=-1,
            vmax=1, cbar_kws={"shrink":.5}, ax=ax)

## Comparison 'compound' / 'control'

Comparison of the observations made after exposure to an active compound and after exposure to a control compound makes it possible, by contrast, to determine the real effect of the active compound.

### Cell viability : cell-63 example

In [None]:
# KDE of cell feature 'c-63' under compound exposure: one panel per exposure time, one curve per dose.
# 'c-63' is hard-coded as the MAX-spread cell feature under compound — TODO confirm against cell_std_max_min.
g = sns.FacetGrid(data=df_compound_cell, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'c-63')
g.add_legend()

In [None]:
# Same cell feature ('c-63'), this time on the control samples, for side-by-side comparison:
# one panel per exposure time, one curve per dose.
g = sns.FacetGrid(data=df_control_cell, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'c-63')
g.add_legend()

### Genes expressions : gene-50 example

In [None]:
# KDE of gene 'g-50' expression under compound exposure: one panel per exposure time, one curve per dose.
# 'g-50' is hard-coded as the MAX-spread gene under compound — TODO confirm against gene_std_max_min.
g = sns.FacetGrid(data=df_compound_gene, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'g-50')
g.add_legend()

In [None]:
# Same gene ('g-50'), this time on the control samples, for side-by-side comparison:
# one panel per exposure time, one curve per dose.
g = sns.FacetGrid(data=df_control_gene, col='cp_time', hue='cp_dose').map(sns.kdeplot, 'g-50')
g.add_legend()

# Targets data (MoA) analysis

In [None]:
# All target columns of the merged dataframe: scored MoA first, then non-scored MoA.
df_moa = df_data.loc[:, [*scored_targets, *no_scored_targets]]
df_moa.head(3)

## MoA frequency

In [None]:
# Number of target columns in each group (one output line per group).
print('MoA scored count : {}'.format(len(scored_targets)),
      'MoA not scored count : {}'.format(len(no_scored_targets)),
      sep='\n')

In [None]:
# Per scored MoA: number of rows where the label is set, most frequent first.
df_moa_count = (
    df_moa.loc[:, scored_targets]
    .sum()
    .sort_values(ascending=False)
)
df_moa_count

In [None]:
# distribution of label occurrence in train set (MoA scored)
# df_moa_count holds one value per scored MoA (its number of occurrences), so each
# histogram bar counts MoA, not compounds: the y label was corrected accordingly.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot / displot replace it) — confirm against the installed version.
ax = sns.distplot(df_moa_count, kde=False)
ax.set_xlabel('MoA occurence in dataset')
ax.set_ylabel('MoA count')

Class frequency in the dataset is unbalanced: 832 occurrences for the most frequent and 1 occurrence for the least frequent!

In [None]:
# Occurrence count of every MoA (scored and non-scored), most frequent first.
df_moa_count = df_moa.sum().sort_values(ascending=False)
# Names of the MoA whose label is never set on any row.
no_moa = df_moa_count.loc[df_moa_count.eq(0)].index.tolist()
print('MoA not labelled in dataset : {}'.format(len(no_moa)))

In [None]:
# Scored MoA that never occur in the dataset (never-set names that are also scored targets).
inter_moa_scored = set(no_moa).intersection(scored_targets)
inter_moa_scored

In [None]:
# Non-scored MoA that never occur in the dataset: how many there are, then their names.
inter_moa_no_scored = set(no_moa).intersection(no_scored_targets)
print('MoA not scored and not labellized : {}'.format(len(inter_moa_no_scored)))
print(list(inter_moa_no_scored))

In the dataset, every scored MoA (MoA to be predicted) is activated at least once by an active ingredient. 71 non-scored MoA are never activated.

## MoA label per compound

In [None]:
# Per row (sample): number of scored MoA labels that are set, largest first,
# followed by summary statistics of that count.
df_label_count = (
    df_moa.loc[:, scored_targets]
    .sum(axis='columns')
    .sort_values(ascending=False)
)
df_label_count.describe()

In [None]:
# Number of rows (samples) that carry no scored MoA label at all.
print('Samples without scored MoA count : {}'.format(
            df_label_count.eq(0).sum()))

In [None]:
# distribution of scored MoA per compound
# (histogram only: kde=False turns the density curve off)
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot / displot replace it) — confirm against the installed version.
ax = sns.distplot(df_label_count, kde=False)
ax.set_xlabel('scored MoA in sample')
ax.set_ylabel('compound count')

The maximum number of MoA per active ingredient is 7 and the minimum number is 0. A very large number of active ingredients do not trigger any scored MoA. More than 12,000 active ingredients lead to exactly 1 MoA.

## MoA correlations 

In [None]:
# Pairwise correlation between the scored MoA label columns,
# drawn as a heatmap with the upper triangle (diagonal included) hidden.
corr = df_moa.loc[:, scored_targets].corr()
mask = np.triu(np.ones(corr.shape, dtype=bool))
f, ax = plt.subplots(figsize=(9, 7))
cmap = sns.color_palette('coolwarm')
# Draw on the axes created just above; colour scale fixed to [-1, 1] and centred on 0.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, cbar_kws={"shrink":.5}, ax=ax)

In general, there are no correlations between MoA. However, some weakly correlated MoA can be observed. 

# Compound clustering with t-SNE

In [None]:
# 2-D t-SNE embedding of the compound samples, using gene and cell features only.
X = df_compound[genes + cells] # no dose and no type
# t-SNE is stochastic: random_state is fixed so that Restart & Run All reproduces
# the same embedding (and therefore the same scatter plot) on every run.
X_embedded = TSNE(n_components=2, init='pca', random_state=42, n_jobs=4).fit_transform(X)
X_embedded.shape

In [None]:
# Scatter plot of the two t-SNE components, one point per compound sample.
# Points are coloured by the 'nfkb_inhibitor' label of the matching df_compound row
# (X_embedded was fitted on columns of df_compound, so rows line up);
# alpha=0.2 keeps overlapping points readable.
# NOTE(review): the title assumes the default colormap draws label 1 in yellow — confirm.
plt.figure(figsize=(15,10))
plt.scatter(X_embedded[:,0], X_embedded[:,1], c=df_compound['nfkb_inhibitor'], alpha=0.2)
plt.title('T-SNE on genes and cells features : nfkb_inhibitor clusters (yellow)')