<h1><center>Mechanisms of Action (MoA) Prediction. Interesting findings</center></h1>

<center><img src="https://pharmacyinnovations.net/wp-content/uploads/pillsdrugs.png"></center>

In [None]:
import numpy as np 
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

In [None]:
# Side length passed as both width and height of the scatter plots.
SCATTER_SIZE = 600
# Width and height passed to the layout of the overlaid train/test histograms.
HIST_WIDTH = 700
HIST_HEIGHT = 500

In [None]:
# Build every path from the same absolute base directory so that loading does
# not depend on the current working directory (the original mixed an absolute
# '/kaggle/input/...' path with a relative '../input/...' one).
DATA_DIR = '/kaggle/input/lish-moa'

train = pd.read_csv(DATA_DIR + '/train_features.csv')
train_target = pd.read_csv(DATA_DIR + '/train_targets_scored.csv')
test = pd.read_csv(DATA_DIR + '/test_features.csv')

I think most teams already use the insight that all targets are equal to 0 for cp_type == ctl_vehicle. Let's quickly check it.

In [None]:
# Attach the treatment type to a copy of the targets and keep the control rows.
check = train_target.assign(cp_type=train['cp_type'])
zeros = check.loc[check['cp_type'] == 'ctl_vehicle']

# Sum every column except the row id and the helper column added above.
skipped_columns = ('sig_id', 'cp_type')
total_sum = sum(
    zeros[name].sum()
    for name in zeros.columns
    if name not in skipped_columns
)

print('Total sum over all columns: ', total_sum)

For target columns 'atp-sensitive_potassium_channel_antagonist' and 'erbb2_inhibitor' all rows are 0 where cp_dose == 'D2'.

In [None]:
# Attach the dose to a copy of the targets and keep only the 'D2' rows.
check = train_target.assign(cp_dose=train['cp_dose'])
zeros = check.loc[check['cp_dose'] == 'D2']

# Only the two targets under investigation contribute to the sum.
inspected_targets = ('atp-sensitive_potassium_channel_antagonist', 'erbb2_inhibitor')
total_sum = sum(
    zeros[name].sum()
    for name in zeros.columns
    if name in inspected_targets
)

print('Total sum over all columns: ', total_sum)

For target columns 'atp-sensitive_potassium_channel_antagonist' and 'erbb2_inhibitor' all rows are also 0 where cp_time != 48.

In [None]:
# Attach the treatment time to a copy of the targets and keep rows where it is not 48.
check = train_target.assign(cp_time=train['cp_time'])
zeros = check.loc[check['cp_time'] != 48]

# Only the two targets under investigation contribute to the sum.
inspected_targets = ('atp-sensitive_potassium_channel_antagonist', 'erbb2_inhibitor')
total_sum = sum(
    zeros[name].sum()
    for name in zeros.columns
    if name in inspected_targets
)

print('Total sum over all columns: ', total_sum)

To be honest, both columns have only 1 positive sample, so these two hypotheses should be checked via the Public Leaderboard.

In [None]:
# Print the column sum (count of rows equal to 1 for a 0/1 target) of each inspected target.
for target_name in ('atp-sensitive_potassium_channel_antagonist', 'erbb2_inhibitor'):
    print('Number of samples in ' + target_name + ': ', train_target[target_name].sum())

Let's check column 'proteasome_inhibitor' with columns c-31, c-32, c-78.

In [None]:
# Independent frame with the three features plus the target label.
check = train[['c-31', 'c-32', 'c-78']].copy()
check['proteasome_inhibitor'] = train_target['proteasome_inhibitor']
# Marker size column: 5 for positive rows, 1 for everything else.
check['size'] = np.where(check['proteasome_inhibitor'] == 1, 5, 1)

In [None]:
# 3-D scatter of the three features, colored by the label and sized by the 'size' column.
fig = px.scatter_3d(
    data_frame=check,
    title='Scatter plot for proteasome_inhibitor',
    x='c-78', y='c-32', z='c-31',
    color="proteasome_inhibitor",
    size="size",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
# 2-D scatter: c-32 against c-31, colored by the proteasome_inhibitor label.
fig = px.scatter(
    data_frame=check,
    title='Scatter plot for proteasome_inhibitor',
    x='c-32', y='c-31',
    color="proteasome_inhibitor",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
# 2-D scatter: c-78 against c-31, colored by the proteasome_inhibitor label.
fig = px.scatter(
    data_frame=check,
    title='Scatter plot for proteasome_inhibitor',
    x='c-78', y='c-31',
    color="proteasome_inhibitor",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
# 2-D scatter: c-78 against c-32, colored by the proteasome_inhibitor label.
fig = px.scatter(
    data_frame=check,
    title='Scatter plot for proteasome_inhibitor',
    x='c-78', y='c-32',
    color="proteasome_inhibitor",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

As a result, we see that for all 3 columns (c-31, c-32, c-78) the positive proteasome_inhibitor samples are never greater than -1. I haven't checked this finding for other columns yet; I just took the 3 columns from train that are most correlated with proteasome_inhibitor.

In [None]:
def plot_combined_histograms(plot_list, name):
    """Show train and test histograms of several features on one subplot grid.

    The top row holds the train-set histogram of each feature and the bottom
    row the test-set histogram of the same feature, one grid column per
    feature. Data is read from the module-level ``train`` and ``test`` frames.

    Parameters
    ----------
    plot_list : iterable of str
        Feature column names to plot; each must exist in ``train`` and ``test``.
    name : str
        Target name appended to the figure title.
    """
    columns = list(plot_list)

    # One grid column per feature so that lists of any length keep the train
    # histogram directly above the matching test histogram (the original
    # hard-coded a 2x3 grid, which only lined up for exactly three features).
    fig = make_subplots(
        rows=2,
        cols=max(len(columns), 1)
    )

    for row, (frame, suffix) in enumerate(((train, ' train'), (test, ' test')), start=1):
        for position, col in enumerate(columns, start=1):
            # add_trace(row=..., col=...) replaces the deprecated append_trace.
            fig.add_trace(
                go.Histogram(
                    x=frame[col],
                    nbinsx=100,
                    name=col + suffix
                ),
                row=row,
                col=position
            )

    fig.update_layout(
        title_text='Mostly correlated features with ' + name,
        height=800,
        width=1000
    )
    fig.show()

In [None]:
plot_combined_histograms(
    plot_list=['c-31', 'c-32', 'c-78'], 
    name='proteasome_inhibitor'
)

Let's go deeper for every column.

In [None]:
def plot_histograms(column, bins=20):
    """Overlay percent-normalized train and test histograms of one feature.

    Parameters
    ----------
    column : str
        Feature column read from the module-level ``train`` and ``test`` frames.
    bins : int, optional
        Value forwarded to ``nbinsx`` of both histograms (default 20).
    """
    fig = go.Figure()

    # Train trace first, then test, so legend and draw order match.
    for frame, suffix in ((train, ' train'), (test, ' test')):
        histogram = go.Histogram(
            x=frame[column],
            nbinsx=bins,
            name=column + suffix,
            histnorm='percent',
        )
        fig.add_trace(histogram)

    # Semi-transparent bars keep both overlaid distributions visible.
    fig.update_traces(opacity=0.6)
    fig.update_layout(
        title_text='Normalized ' + column + ' train & test sets',
        barmode='overlay',
        width=HIST_WIDTH,
        height=HIST_HEIGHT,
    )
    fig.show()

In [None]:
plot_histograms('c-31')

In [None]:
stats.ttest_ind(
    train['c-31'], 
    test['c-31']
)

In [None]:
plot_histograms('c-32')

In [None]:
stats.ttest_ind(
    train['c-32'], 
    test['c-32']
)

In [None]:
plot_histograms('c-78')

In [None]:
stats.ttest_ind(
    train['c-78'], 
    test['c-78']
)

In [None]:
# Take an explicit copy so the column assignments below modify an independent
# frame instead of a slice of `train` (avoids SettingWithCopyWarning).
check = train[['g-202', 'g-431', 'g-769']].copy()
check['raf_inhibitor'] = train_target['raf_inhibitor']
# Marker size column: 5 for positive rows, 1 for everything else.
check['size'] = 1
check.loc[check['raf_inhibitor']==1, 'size'] = 5

In [None]:
# 3-D scatter of the three features, colored by the label and sized by the 'size' column.
fig = px.scatter_3d(
    data_frame=check,
    title='Scatter plot for raf_inhibitor',
    x='g-202', y='g-431', z='g-769',
    color='raf_inhibitor',
    size="size",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
# 2-D scatter: g-202 against g-431, colored by the raf_inhibitor label.
fig = px.scatter(
    data_frame=check,
    title='Scatter plot for raf_inhibitor',
    x='g-202', y='g-431',
    color="raf_inhibitor",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
# 2-D scatter: g-202 against g-769, colored by the raf_inhibitor label.
fig = px.scatter(
    data_frame=check,
    title='Scatter plot for raf_inhibitor',
    x='g-202', y='g-769',
    color="raf_inhibitor",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
# 2-D scatter: g-431 against g-769, colored by the raf_inhibitor label.
fig = px.scatter(
    data_frame=check,
    title='Scatter plot for raf_inhibitor',
    x='g-431', y='g-769',
    color="raf_inhibitor",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
plot_combined_histograms(
    plot_list=['g-202', 'g-431', 'g-769'], 
    name='raf_inhibitor'
)

In [None]:
plot_histograms('g-202')

In [None]:
plot_histograms('g-431')

In [None]:
plot_histograms('g-769')

In [None]:
# Take an explicit copy so the column assignments below modify an independent
# frame instead of a slice of `train` (avoids SettingWithCopyWarning).
check = train[['g-235', 'g-635', 'g-745']].copy()
check['egfr_inhibitor'] = train_target['egfr_inhibitor']
# Marker size column: 5 for positive rows, 1 for everything else.
check['size'] = 1
check.loc[check['egfr_inhibitor']==1, 'size'] = 5

In [None]:
# 3-D scatter of the three features, colored by the label and sized by the 'size' column.
fig = px.scatter_3d(
    data_frame=check,
    title='Scatter plot for egfr_inhibitor',
    x='g-235', y='g-635', z='g-745',
    color='egfr_inhibitor',
    size="size",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
# 2-D scatter: g-235 against g-635, colored by the egfr_inhibitor label.
fig = px.scatter(
    data_frame=check,
    title='Scatter plot for egfr_inhibitor',
    x='g-235', y='g-635',
    color="egfr_inhibitor",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
# 2-D scatter: g-235 against g-745, colored by the egfr_inhibitor label.
fig = px.scatter(
    data_frame=check,
    title='Scatter plot for egfr_inhibitor',
    x='g-235', y='g-745',
    color="egfr_inhibitor",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
# 2-D scatter: g-745 against g-635, colored by the egfr_inhibitor label.
fig = px.scatter(
    data_frame=check,
    title='Scatter plot for egfr_inhibitor',
    x='g-745', y='g-635',
    color="egfr_inhibitor",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
plot_combined_histograms(
    plot_list=['g-235', 'g-635', 'g-745'], 
    name='egfr_inhibitor'
)

In [None]:
plot_histograms('g-235')

In [None]:
plot_histograms('g-635')

g-635 looks interesting.

In [None]:
plot_histograms('g-745')

In [None]:
# Take an explicit copy so the column assignments below modify an independent
# frame instead of a slice of `train` (avoids SettingWithCopyWarning).
check = train[['g-599', 'g-165', 'g-699']].copy()
check['mtor_inhibitor'] = train_target['mtor_inhibitor']
# Marker size column: 5 for positive rows, 1 for everything else.
check['size'] = 1
check.loc[check['mtor_inhibitor']==1, 'size'] = 5

In [None]:
# 3-D scatter of the three features, colored by the label and sized by the 'size' column.
fig = px.scatter_3d(
    data_frame=check,
    title='Scatter plot for mtor_inhibitor',
    x='g-599', y='g-165', z='g-699',
    color='mtor_inhibitor',
    size="size",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
plot_combined_histograms(
    plot_list=['g-599', 'g-165', 'g-699'], 
    name='mtor_inhibitor'
)

In [None]:
plot_histograms('g-599')

In [None]:
plot_histograms('g-165')

In [None]:
plot_histograms('g-699')

In [None]:
# Take an explicit copy so the column assignments below modify an independent
# frame instead of a slice of `train` (avoids SettingWithCopyWarning).
check = train[['g-392', 'g-361', 'c-48']].copy()
check['tubulin_inhibitor'] = train_target['tubulin_inhibitor']
# Marker size column: 5 for positive rows, 1 for everything else.
check['size'] = 1
check.loc[check['tubulin_inhibitor']==1, 'size'] = 5

In [None]:
# 3-D scatter of the three features, colored by the label and sized by the 'size' column.
fig = px.scatter_3d(
    data_frame=check,
    title='Scatter plot for tubulin_inhibitor',
    x='g-392', y='g-361', z='c-48',
    color='tubulin_inhibitor',
    size="size",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
plot_combined_histograms(
    plot_list=['g-392', 'g-361', 'c-48'], 
    name='tubulin_inhibitor'
)

In [None]:
plot_histograms('g-392')

In [None]:
plot_histograms('g-361')

In [None]:
plot_histograms('c-48')

In [None]:
# Take an explicit copy so the column assignments below modify an independent
# frame instead of a slice of `train` (avoids SettingWithCopyWarning).
check = train[['g-476', 'g-619', 'g-705']].copy()
check['hdac_inhibitor'] = train_target['hdac_inhibitor']
# Marker size column: 5 for positive rows, 1 for everything else.
check['size'] = 1
check.loc[check['hdac_inhibitor']==1, 'size'] = 5

In [None]:
# 3-D scatter of the three features, colored by the label and sized by the 'size' column.
fig = px.scatter_3d(
    data_frame=check,
    title='Scatter plot for hdac_inhibitor',
    x='g-476', y='g-619', z='g-705',
    color='hdac_inhibitor',
    size="size",
    width=SCATTER_SIZE, height=SCATTER_SIZE,
)
fig.show()

In [None]:
plot_combined_histograms(
    plot_list=['g-476', 'g-619', 'g-705'], 
    name='hdac_inhibitor'
)

In [None]:
plot_histograms('g-476')

In [None]:
plot_histograms('g-619')

In [None]:
plot_histograms('g-705')

In [None]:
plot_combined_histograms(['g-392', 'g-206', 'g-100'], 'cyclooxygenase_inhibitor')

In [None]:
plot_histograms('g-392')

In [None]:
plot_histograms('g-206')

In [None]:
plot_histograms('g-100')

Let's use a t-test for all features and check features with p-value < 0.01.

In [None]:
# Significance level below which a feature's train/test means are reported as different.
P_VALUE_THRESHOLD = 0.01

count = 0
for col in train.columns:
    # Identifier and treatment-description columns are skipped.
    if col in ['sig_id', 'cp_type', 'cp_time', 'cp_dose']:
        continue
    # Run the two-sample t-test once per feature and reuse its p-value for
    # both the comparison and the printout (it was previously computed twice).
    p_value = stats.ttest_ind(train[col], test[col]).pvalue
    if p_value < P_VALUE_THRESHOLD:
        print(col, p_value)
        count += 1

In [None]:
# Print `count`; presumably the tally from the preceding t-test loop cell (run that cell first).
print('Number of features with non acepted 0 hypothesis:', count)