In [None]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import plotly
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
plotly.offline.init_notebook_mode (connected = True)

In [None]:
# Load the training feature table; the path is relative to the notebook's
# working directory.
TRAIN_FEATURES_PATH = '../input/lish-moa/train_features.csv'
train_data = pd.read_csv(TRAIN_FEATURES_PATH)

## High Level Exploration

In [None]:
# Preview the first 10 rows of the training features.
train_data.head(10)

In [None]:
# (number of rows, number of columns)
train_data.shape

In [None]:
# Print pandas' concise summary of the frame (index range, dtypes, memory usage).
train_data.info()


- There are a total of 876 columns and 23814 rows.<br>
- 872 float columns, 1 integer and 3 object columns.<br>
- The object columns are: sig_id, cp_type, cp_dose (cp_time is the single integer column).<br>

In [None]:
# Boolean masks over the column index: True where the column name carries the
# gene-expression ('g-') or cell-viability ('c-') prefix.
g = train_data.columns.str.startswith('g-').tolist()
c = train_data.columns.str.startswith('c-').tolist()

# Summing a list of booleans counts its True entries.
print(f'Number of gene expression variables {sum(g)},\nNumber of cell viability variables {sum(c)}')


In [None]:
# Summary statistics for the numeric columns.
train_data.describe()

In [None]:
# Columns with fewer than 10 distinct values are treated as categorical
# (sig_id is not expected to show up here).
distinct_counts = train_data.nunique()
train_data.columns[distinct_counts < 10]


- cp_type, cp_time and cp_dose are categorical in nature.

In [None]:
# Store cp_time with object dtype so it is handled like the other categorical
# columns instead of as a number.
train_data = train_data.astype({'cp_time': object})

## Categorical Features

- cp_time: It is the treatment duration, let's analyze it further.<br>
- cp_type: It indicates if a sample was treated with a compound or a control perturbation<br>
- cp_dose: Whether the dose was high or low

In [None]:
# One count plot per categorical treatment descriptor, side by side.
categorical_cols = ['cp_time', 'cp_type', 'cp_dose']
titles = ['CP TIME COUNTS', 'CP TYPE COUNTS', 'CP DOSE COUNTS']

fig, axes = plt.subplots(figsize = (20,4), nrows = 1, ncols = 3)

for ax, col, title in zip(axes, categorical_cols, titles):
    # Pass the column through the `x` keyword: in recent seaborn releases the
    # first positional parameter of countplot is `data` and x/y are
    # keyword-only, so a positional Series is no longer read as the x variable.
    sns.countplot(x = train_data[col], ax = ax)
    ax.set_title(title)

plt.show()


- cp_time has an almost balanced distribution; 48-hour treatments seem to be the most frequent with ~8000 counts.<br>
- cp_type has an imbalanced distribution; most samples are treated with trt_cp.<br>
- cp_dose also has a similar distribution across its classes; D1 is the more frequent dose.

## Continuous Variables<br>


- Gene Expression: g-0 to g-771    <br>
- Cell Viability: c-0 to c-99    

In [None]:
def plot_density(variable = 'g-', rows = 3, cols = 3):
    """
    Randomly selects some columns for the given variable and plots their density function.

    Reads the module-level `train_data` frame.

    Parameters
    ----------
    variable : str
        Column-name prefix to sample from ('g-' or 'c-'). 'g-' is drawn in
        coral, any other prefix in green.
    rows, cols : int
        Shape of the subplot grid; rows * cols columns are plotted.

    Raises
    ------
    ValueError
        If `train_data` has no column whose name starts with `variable`.
    """
    color = 'coral' if variable == 'g-' else 'green'

    # Take the candidates from the frame itself instead of hard-coding the
    # highest column index: np.random.randint excludes its upper bound, so
    # randint(0, 771) / randint(0, 99) could never select g-771 or c-99.
    candidates = [name for name in train_data.columns if name.startswith(variable)]
    if not candidates:
        raise ValueError(f'no columns in train_data start with {variable!r}')

    n_plots = rows * cols
    # Sample without replacement whenever enough columns exist, so the same
    # column is not drawn in two panels.
    chosen = np.random.choice(candidates, size = n_plots, replace = len(candidates) < n_plots)

    # squeeze=False keeps `axes` two-dimensional even for a single row/column.
    fig, axes = plt.subplots(figsize = (15,12), nrows = rows, ncols = cols, squeeze = False)

    for ax, name in zip(axes.ravel(), chosen):
        # `fill` is the current name of kdeplot's deprecated `shade` argument.
        sns.kdeplot(train_data[name], ax = ax, fill = True, color = color)
        ax.set_title(f'{name} Distribution')
    plt.show()
    

### Gene Expression

In [None]:
# Density plots for a random 3x3 selection of 'g-' columns.
plot_density('g-', rows = 3, cols = 3)


- All the variables are centred around 0 and are close to a normal distribution. <br>
- A slight skew can be detected in some variables: g-61, g-369, g-444 etc. are left skewed, whereas g-289, g-264 etc. are right skewed.<br>

## Cell Viability

In [None]:
# Density plots for a random selection of 'c-' columns (default 3x3 grid).
plot_density('c-')


- Here as well the variables have been centred to 0 and have almost normal distributions.<br>
- Most of the variables are left skewed and have a peak at -10, this could be an anomaly.

## Signal Visualisation<br>

Taking inspiration from this notebook: https://www.kaggle.com/artgor/code-for-live-pair-coding by Andrew Lukyanenko
I have visualised the gene expressions and cell viabilities for single samples. They look like some sort of signals.

In [None]:
def plot_signal(variable = 'g-',  rows = 3, ma1 = 10, ma2 = 30):
    """
    Plots the variable for randomly selected samples with transformations.

    Each selected sample (one row of the module-level `train_data`) gets a row
    of four panels built from its values across the `variable` columns: the
    raw values, the values sorted ascending, and rolling means with window
    sizes `ma1` and `ma2`.

    Parameters
    ----------
    variable : str
        Column-name prefix of the features to plot. 'g-' is titled
        'Gene Expression'; any other prefix is titled 'Cell Viability'.
    rows : int
        Number of samples to draw, one subplot row per sample.
    ma1, ma2 : int
        Window sizes of the two rolling means.

    Raises
    ------
    ValueError
        If `train_data` has no rows or no column starting with `variable`.
    """
    title = 'Gene Expression' if variable == 'g-' else 'Cell Viability'
    colors = ['#9b59b6', '#00a8ff', '#A3CB38', '#B53471']

    # Select the feature columns once instead of re-filtering for every panel.
    signals = train_data.loc[:, train_data.columns.str.startswith(variable)]
    n_samples = len(signals)
    if n_samples == 0 or signals.shape[1] == 0:
        raise ValueError(f'train_data has no rows or no columns starting with {variable!r}')

    # Sample row positions from the whole frame. Previously the upper bound
    # was the feature count (771 / 99), so only the first few hundred rows
    # could ever be shown.
    v = np.random.choice(n_samples, size = rows, replace = n_samples < rows)

    # squeeze=False keeps `axes` two-dimensional even when rows == 1.
    fig, axes = plt.subplots(figsize = (18,12), nrows = rows, ncols = 4, squeeze = False)

    plt.suptitle(title, size = 30)
    for i, sample in enumerate(v):
        signal = signals.iloc[sample]
        panels = [
            (signal, f'Sample {sample} Signals'),
            (signal.sort_values(), f'Sample {sample} Sorted Signals'),
            (signal.rolling(ma1).mean(), f'Sample {sample} Window {ma1} Rolling Mean'),
            (signal.rolling(ma2).mean(), f'Sample {sample} Window {ma2} Rolling Mean'),
        ]
        for ax, color, (series, panel_title) in zip(axes[i], colors, panels):
            series.plot(color = color, ax = ax, title = panel_title)

    plt.show()

In [None]:
# Signal-style plots of the 'g-' features for randomly chosen samples.
plot_signal('g-')


- The first column shows the raw gene signals for various samples. It is noisy, but there definitely seems to be some sort of a pattern. To me it looks like what we get after differencing time series data (or any signal-like data, to make it stationary).<br>
- In the 3rd and 4th column I have done a rolling mean with 10 and 30 period windows respectively and we can definitely see some sort of a pattern emerging in each sample. We can use sequence models to capture this data. <br>
- Another interesting thing that was pointed out by Andrew was that we can sort the gene expressions. On doing so in column 2 we can see that a very clean graph emerges. It looks like a variant of the `tangent function` <br>

Using these insights we can calculate some features which might help us in modelling.

In [None]:
# Signal-style plots of the 'c-' features for randomly chosen samples.
plot_signal('c-')

- In cell viability as well, we can see that some sort of pattern becomes visible after applying a rolling mean.<br>
- The sorted signals are less smooth than the gene expressions however they also are following a similar function.

### ...to be continued