In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    paths = [os.path.join(root, name) for name in names]
    if paths:  # directories without files print nothing
        print('\n'.join(paths))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pandas
import seaborn as sns

### Exploring the data
I found @gunesevitan's notebook (https://www.kaggle.com/gunesevitan/mechanisms-of-action-moa-prediction-eda) a great starting point for exploring the data. Some of the following plots are inspired by this kernel.

In [None]:
# Read the feature table and both target tables, in that order, then print
# the column/dtype summary of the feature table.
train, train_targets_scored, train_targets_nonscored = map(
    pd.read_csv,
    (
        "/kaggle/input/lish-moa/train_features.csv",
        "/kaggle/input/lish-moa/train_targets_scored.csv",
        "/kaggle/input/lish-moa/train_targets_nonscored.csv",
    ),
)
train.info()

In [None]:
# Preview the first few rows of the training feature table.
train.head()

### Features
- sig_id is the unique sample id
- Features beginning `g-` represent gene expression data
- Features beginning `c-` represent cell viability
- `cp_type` indicates whether the sample has been treated with a compound (`trt_cp`) or is a control sample (`ctl_vehicle`)
- `cp_time` is the treatment time (24, 48 or 72 hours)
- `cp_dose` is the dose of the treatment (low or high)

In [None]:
# Count the samples at each level of the three categorical treatment columns
# and draw one bar chart per column, side by side.
d1 = train.groupby(['cp_type']).size()
d2 = train.groupby(['cp_time']).size()
d3 = train.groupby(['cp_dose']).size()

fig, ax = plt.subplots(1, 3, figsize=(12, 4))
for panel, counts in zip(ax, (d1, d2, d3)):
    sns.barplot(x=counts.index, y=counts, ax=panel)
    # size() returns an unnamed Series, so label the y-axis explicitly and
    # title each panel with the column it summarises.
    panel.set(ylabel='Number of samples', title='Samples per ' + counts.index.name)
fig.tight_layout()

In [None]:
# Print the column/dtype summary of `train` again (same call as the one made
# right after loading the CSV files).
train.info()

In [None]:
# Preview the first few rows of the scored target table.
train_targets_scored.head()

In [None]:
"""
The number of rows in the train_features.csv, and train_targets_scored.csv are the same, and 
they are ordered in the same way.
"""
# Number of row positions at which both tables carry the same sig_id; this
# equals the row count when every id lines up.
train_targets_scored['sig_id'].eq(train['sig_id']).sum()

In [None]:
"""
Each column in train_targets_scored.csv represents a feature that may be present in the compound. As can be 
seen below, there can be more than one feature present in the compound.
"""
# Sum across the target columns (every column after the first) of the scored
# target table to get the number of targets present for each sample.
# The columns of `train` are input features, not targets, so they must not be
# summed here.
train_targets_scored[train_targets_scored.columns[1:]].sum(axis=1).head()

In [None]:
# Preview the first few rows of the non-scored target table.
train_targets_nonscored.head()

Let us merge the training features with the training targets

In [None]:
# Inner-join the scored targets onto the features by sample id, so only ids
# present in both tables are kept.
train = pd.merge(
    train,
    train_targets_scored,
    how='inner',
    left_on='sig_id',
    right_on='sig_id',
)

In [None]:
# Use the sample ids as row labels; the sig_id column itself stays in the frame.
train.index = train['sig_id']

In [None]:
# Names of the scored target columns: every column of the scored target table
# except the first one (presumably sig_id -- confirm against the CSV header).
target_names = train_targets_scored.columns[1:]

# Observations
In this section, we try to make some significant observations about the data. As explained in @artgor's kernel (https://www.kaggle.com/artgor/lish-moa-baseline-approach), the samples with a `cp_type` of `ctl_vehicle` have no target features. Let us confirm this.

In [None]:
# Total number of positive target labels across all control samples:
# select the control rows and target columns, sum per row, then sum the rows.
is_control = train['cp_type'] == 'ctl_vehicle'
d1 = train.loc[is_control, target_names].sum(axis=1).sum()
print("The number of targets present in the control samples: " + str(d1))

Let us look at the number of targets present for other subsets of the data

In [None]:
# Average target rate per treatment time: mean of every target column within
# each cp_time group, then the mean across targets.
# The target columns are selected before aggregating because `train` also
# holds string columns (sig_id, cp_type, cp_dose), and a frame-wide
# groupby().mean() raises TypeError on them in pandas >= 2.0.
d1 = train.groupby('cp_time')[list(target_names)].mean()
d1 = d1.mean(axis=1)
sns.barplot(x=d1.index, y=d1)

In [None]:
# Average target rate per dose level: mean of every target column within each
# cp_dose group, then the mean across targets.
# The target columns are selected before aggregating because `train` also
# holds string columns (sig_id, cp_type), and a frame-wide groupby().mean()
# raises TypeError on them in pandas >= 2.0.
d1 = train.groupby('cp_dose')[list(target_names)].mean()
d1 = d1.mean(axis=1)
sns.barplot(x=d1.index, y=d1)