In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing libraries for plotting and PCA.

In [None]:
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Let's read all the CSV files.

In [None]:
# Full paths of the five input CSVs, in the same order as the names unpacked below.
csv_paths = [
    '/kaggle/input/lish-moa/train_features.csv',
    '/kaggle/input/lish-moa/test_features.csv',
    '/kaggle/input/lish-moa/train_targets_scored.csv',
    '/kaggle/input/lish-moa/train_targets_nonscored.csv',
    '/kaggle/input/lish-moa/sample_submission.csv',
]
(
    train_features,
    test_features,
    train_targets_scored,
    train_targets_nonscored,
    sample_submission,
) = [pd.read_csv(csv_path) for csv_path in csv_paths]

In [None]:
# Report the (rows, columns) shape of each feature table.
for label, frame in (('Train shape:', train_features), ('Test shape:', test_features)):
    print(label, frame.shape)

In [None]:
# Missing-value check for train and test data: count the nulls in every column,
# then tally how many columns share each null count.
for frame in (train_features, test_features):
    nulls_per_column = frame.isnull().sum()
    print(nulls_per_column.value_counts())

Great! No missing values.
Let us take a look at the features of the train and test data.

In [None]:
# Preview the first five rows of the training features.
train_features.head(n=5)

In [None]:
# Preview the first five rows of the test features.
test_features.head(n=5)

In [None]:
# Tag every row with the set it came from, so the two sets can still be told
# apart (e.g. as a plot hue) after they are combined.
# The labels carry no padding: a trailing space in 'test ' would show up in plot
# legends and would not match an exact comparison against 'test'.
train_features['train/test'] = 'train'
test_features['train/test'] = 'test'

In [None]:
# Combine train and test features for the purpose of data exploration.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of both inputs are kept, so the same label appears twice.
train_test_concat = pd.concat([train_features, test_features], ignore_index=True)
train_test_concat

In [None]:
# Helper for drawing count plots of the categorical features.
def plotc(x, title):
    """Draw a count plot of one ``train_test_concat`` column, split by train/test.

    Parameters
    ----------
    x : str
        Name of the ``train_test_concat`` column whose values are counted.
    title : str
        Text appended to ``'Countplot for '`` to form the plot title.
    """
    sns.set_style('darkgrid')
    sns.catplot(
        data=train_test_concat,
        x=x,
        hue='train/test',
        kind='count',
        palette='pastel',
    )
    # plt.title applies to whichever axes is current after the catplot call.
    plt.title('Countplot for ' + title)

In [None]:
# Count plots for cp_type, cp_time and cp_dose: (column, title suffix) pairs.
categorical_plots = [
    ('cp_type', 'treated and control'),
    ('cp_time', 'treatment duration'),
    ('cp_dose', 'dosage'),
]
for column, description in categorical_plots:
    plotc(column, description)

* There are very few control observations, and these control observations have no MoA.
* The distribution of treatment durations is almost equal across all 3 categories (24hr, 48hr, 72hr) in both train and test data.
* The distribution of dosage is equal for D1 and D2 in both train and test data.

In [None]:
# Split the feature columns by name prefix: gene expression (g-) and cell viability (c-).
def _columns_with_prefix(frame, prefix):
    """Return the column names of ``frame`` that start with ``prefix``, in column order."""
    return [name for name in frame.columns if name.startswith(prefix)]


gene_columns = _columns_with_prefix(train_features, 'g-')
cell_columns = _columns_with_prefix(train_features, 'c-')

In [None]:
# Report how many columns fall in each feature group.
n_gene_columns = len(gene_columns)
n_cell_columns = len(cell_columns)
print("Gene expression columns:", n_gene_columns, "\nCell viability columns:", n_cell_columns)

In [None]:
# Helper for plotting the distribution of gene expression and cell viability data.
def plotd(col, label, color):
    """Plot the distribution of one ``train_features`` column with seaborn.

    Parameters
    ----------
    col : str
        Name of the ``train_features`` column to plot.
    label : str
        Label forwarded to seaborn for this series.
    color
        Colour forwarded to seaborn for this series.
    """
    sns.set_style('darkgrid')
    values = train_features[col]
    # NOTE(review): distplot is reportedly deprecated in newer seaborn releases;
    # confirm against the installed version before relying on it.
    sns.distplot(values, kde=True, label=label, color=color)

In [None]:
# Plotting distribution of randomly selected gene expression features.
# The sample is drawn from a dedicated, seeded generator so that the same columns
# come up on every run (and re-run) of this cell and the figure is reproducible.
sample_rng = random.Random(42)
fig = plt.figure(figsize = (12,12))
plt.suptitle('Distribution of random sample of gene expression features')
cols = sample_rng.sample(gene_columns, 8)
colors = plt.rcParams["axes.prop_cycle"]()
# One panel per sampled column (4x2 grid), each drawn in the next colour of the
# default property cycle.
for i, (col, style) in enumerate(zip(cols, colors)):
    plt.subplot(4,2,i+1)
    plotd(col, label = col, color = style["color"])
fig.tight_layout(pad = 3.0)
fig.legend(loc="upper right")
plt.show()

In [None]:
# Plotting distribution of randomly selected cell viability features.
# The sample is drawn from a dedicated, seeded generator so that the same columns
# come up on every run (and re-run) of this cell and the figure is reproducible.
sample_rng = random.Random(42)
fig = plt.figure(figsize = (12,12))
plt.suptitle('Distribution of random sample of cell viability features')
cols = sample_rng.sample(cell_columns, 8)
colors = plt.rcParams["axes.prop_cycle"]()
# One panel per sampled column (4x2 grid), each drawn in the next colour of the
# default property cycle.
for i, (col, style) in enumerate(zip(cols, colors)):
    plt.subplot(4,2,i+1)
    plotd(col, label = col, color = style["color"])
fig.tight_layout(pad = 3.0)
fig.legend(loc="upper right")
plt.show()

In [None]:
# Plotting the correlation matrix for randomly selected features
# (8 gene expression columns followed by 8 cell viability columns).
# A dedicated, seeded generator keeps the selection, and therefore the heatmap,
# identical on every run of this cell.
sample_rng = random.Random(42)
selected_cols = sample_rng.sample(gene_columns, 8) + sample_rng.sample(cell_columns, 8)
corr_selected_cols = train_features[selected_cols].corr()
plt.figure(figsize = (12,12))
sns.heatmap(corr_selected_cols, cmap="YlGnBu", annot = True, fmt = '.1g', square = True)
plt.title('Correlation between random sample of gene expression and cell viability features');

In [None]:
# Preview the first five rows of the scored targets.
train_targets_scored.head(n=5)

In [None]:
# Column-wise sum over all targets: the number of positive responses per target.
# sig_id is dropped first so only the target columns are summed.
positives_per_target = train_targets_scored.drop(columns=['sig_id']).sum()
# Sort ascending and turn the result into a two-column frame.
x = positives_per_target.sort_values().reset_index()
x.columns = ['target','num_pos_responses']
x

In [None]:
# Bar chart of the number of positive responses for every target class.
fig = plt.figure(figsize=(20, 10))
ax = sns.barplot(data=x, x='target', y='num_pos_responses')
ax.set_title('Number of positive responses for each target')
# Rotate the target names so they stay legible along the x axis.
ax.set_xticklabels(x['target'], rotation=90);

In [None]:
# Bar chart of the target classes with the highest number of positive responses.
# x is sorted ascending, so its last 20 rows hold the largest counts.
top_targets = x.tail(20)
fig = plt.figure(figsize=(12, 8))
ax = sns.barplot(data=top_targets, x='target', y='num_pos_responses')
ax.set_title('Target classes with highest number of positive responses')
ax.set_xticklabels(top_targets['target'], rotation=90);

In [None]:
# Bar chart of the target classes with the lowest number of positive responses.
# x is sorted ascending, so its first 20 rows hold the smallest counts.
bottom_targets = x.head(20)
fig = plt.figure(figsize=(12, 8))
ax = sns.barplot(data=bottom_targets, x='target', y='num_pos_responses')
ax.set_title('Target classes with lowest number of positive responses')
ax.set_xticklabels(bottom_targets['target'], rotation=90);

In [None]:
# Row-wise sum over the target columns: the number of activations per sig_id.
# The sig_id identifier column is dropped first, exactly as for the column-wise
# totals, so the sum only ever sees target columns. Leaving it in makes the
# result depend on how the installed pandas treats non-numeric columns in a
# row-wise sum (silently skipped in some versions, an error in others).
y = train_targets_scored.drop('sig_id', axis = 1).sum(axis = 1)
# number of observations for each number of activations
y.value_counts()

* The majority of sig_ids have 1 activation, and a considerable number of observations have no activations.
* The number of observations decreases as the number of activations increases.

In [None]:
# Count plot of the number of activations per observation, with every bar
# annotated by its share of all observations.
# The series is passed as x= rather than positionally: the meaning of seaborn's
# first positional argument has changed between releases, and the keyword pins
# it as the variable being counted.
ax = sns.countplot(x = y, palette = 'pastel')
plt.title('Number of activations');
total = len(y)
for p in ax.patches:
    ht = p.get_height()
    # Bar height is a count; convert it to a percentage of all observations.
    ax.text(p.get_x(), ht, '{:1.2f}%'.format(ht*100/total))

In [None]:
# Build the feature matrix: remove the identifier, the treatment descriptor
# columns and the train/test tag. drop() returns a new frame, so train_features
# itself is left untouched.
excluded_columns = ['sig_id','cp_type','cp_time','cp_dose','train/test']
new_train_features = train_features.drop(columns = excluded_columns)
new_train_features

In [None]:
# Fit a StandardScaler on the training feature matrix and show the fitted scaler.
scaler = StandardScaler().fit(new_train_features)
scaler

In [None]:
# Apply the fitted scaler and show the smallest and largest value of the result.
train_transform = scaler.transform(new_train_features)
(train_transform.min(), train_transform.max())

In [None]:
# Confirm that scaling preserved the shape of the feature matrix.
for name, table in (('new_train_features', new_train_features), ('train_transform', train_transform)):
    print(name, table.shape)

In [None]:
# Fit a PCA with default settings on the scaled training features.
pca = PCA()
pca.fit(train_transform);

In [None]:
# Plot the cumulative explained variance against the number of components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
sns.set_style('darkgrid')
plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

* Around 400 components are needed to explain 90% of the variance.
* Around 800 components are needed to explain almost 100% of the variance.

References: 
* https://www.kaggle.com/headsortails/explorations-of-action-moa-eda
* https://www.kaggle.com/isaienkov/mechanisms-of-action-moa-prediction-eda