In [4]:
import pandas as pd
import numpy as np

# for visualizing

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# for adding extra statistical stuff

from scipy.stats import skew, norm

## Load Data 

In [5]:
# Directory holding the competition CSV files; change this one constant to run
# the notebook against a different copy of the data.
DATA_DIR = "/kaggle/input/lish-moa"

# Feature tables (one row per sig_id) for the train and test splits.
train_data = pd.read_csv(f"{DATA_DIR}/train_features.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test_features.csv")

# Target tables for the training split: scored and non-scored targets.
train_target_scored_data = pd.read_csv(f"{DATA_DIR}/train_targets_scored.csv")
train_targets_non_scored_data = pd.read_csv(f"{DATA_DIR}/train_targets_nonscored.csv")

## Overview of the Data

In [8]:
# Preview the first rows of the training features. The frame is loaded as
# `train_data`; the previous name `train_feature` was never defined and raised
# NameError on a fresh kernel.
print('Train Feature:')
display(train_data.head(5))

Train Feature:


Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_000644bb2,trt_cp,24,D1,1.062,0.5577,-0.2479,-0.6208,-0.1944,-1.012,...,0.2862,0.2584,0.8076,0.5523,-0.1912,0.6584,-0.3981,0.2139,0.3801,0.4176
1,id_000779bfc,trt_cp,72,D1,0.0743,0.4087,0.2991,0.0604,1.019,0.5207,...,-0.4265,0.7543,0.4708,0.023,0.2957,0.4899,0.1522,0.1241,0.6077,0.7371
2,id_000a6266a,trt_cp,48,D1,0.628,0.5817,1.554,-0.0764,-0.0323,1.239,...,-0.725,-0.6297,0.6103,0.0223,-1.324,-0.3174,-0.6417,-0.2187,-1.408,0.6931
3,id_0015fd391,trt_cp,48,D1,-0.5138,-0.2491,-0.2656,0.5288,4.062,-0.8095,...,-2.099,-0.6441,-5.63,-1.378,-0.8632,-1.288,-1.621,-0.8784,-0.3876,-0.8154
4,id_001626bd3,trt_cp,72,D2,-0.3254,-0.4009,0.97,0.6919,1.418,-0.8244,...,0.0042,0.0048,0.667,1.069,0.5523,-0.3031,0.1094,0.2885,-0.3786,0.7125


In [9]:
# Preview the first rows of the test features. The frame is loaded as
# `test_data`; the previous name `test_feature` was never defined and raised
# NameError on a fresh kernel.
print('Test Feature:')
display(test_data.head(5))

Test Feature:


Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,c-90,c-91,c-92,c-93,c-94,c-95,c-96,c-97,c-98,c-99
0,id_0004d9e33,trt_cp,24,D1,-0.5458,0.1306,-0.5135,0.4408,1.55,-0.1644,...,0.0981,0.7978,-0.143,-0.2067,-0.2303,-0.1193,0.021,-0.0502,0.151,-0.775
1,id_001897cda,trt_cp,72,D1,-0.1829,0.232,1.208,-0.4522,-0.3652,-0.3319,...,-0.119,-0.1852,-1.031,-1.367,-0.369,-0.5382,0.0359,-0.4764,-1.381,-0.73
2,id_002429b5b,ctl_vehicle,24,D1,0.1852,-0.1404,-0.3911,0.131,-1.438,0.2455,...,-0.2261,0.337,-1.384,0.8604,-1.953,-1.014,0.8662,1.016,0.4924,-0.1942
3,id_00276f245,trt_cp,24,D2,0.4828,0.1955,0.3825,0.4244,-0.5855,-1.202,...,0.126,0.157,-0.1784,-1.12,-0.4325,-0.9005,0.8131,-0.1305,0.5645,-0.5809
4,id_0027f1083,trt_cp,48,D1,-0.3979,-1.268,1.913,0.2057,-0.5864,-0.0166,...,0.4965,0.7578,-0.158,1.051,0.5742,1.09,-0.2962,-0.5313,0.9931,1.838


In [10]:
# Preview the first rows of the scored training targets. The frame is loaded as
# `train_target_scored_data`; the previous name `train_target` was never
# defined and raised NameError on a fresh kernel.
print('Train Target:')
display(train_target_scored_data.head(5))

Train Target:


Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_000644bb2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Count the training rows in each category of the three categorical features.
# Each result is a two-column frame: the category value and its row count.
cp_type_data = train_data.groupby(['cp_type'])['sig_id'].count().reset_index()
cp_type_data.columns = ['cp_type', 'count']

cp_time_data = train_data.groupby(['cp_time'])['sig_id'].count().reset_index()
cp_time_data.columns = ['cp_time', 'count']

cp_dose_data = train_data.groupby(['cp_dose'])['sig_id'].count().reset_index()
cp_dose_data.columns = ['cp_dose', 'count']

# One bar chart per feature, side by side.
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("Treatment", "Treatment time", "Treatment dose"))

# (counts frame, category column, x-axis title) for each subplot, left to right.
panels = [
    (cp_type_data, 'cp_type', "cp_type(Compound vs Control)"),
    (cp_time_data, 'cp_time', "cp_time(in Hrs)"),
    (cp_dose_data, 'cp_dose', "cp_dose(High vs Low)"),
]

# Build every panel the same way so bar height always comes from 'count'.
# The cp_time panel previously plotted the cp_time values themselves as bar
# heights, which disagreed with the count labels drawn above the bars.
for panel_col, (counts, category, axis_title) in enumerate(panels, start=1):
    fig.add_trace(
        go.Bar(
            x=counts[category],
            y=counts['count'],
            text=counts['count'],
            textposition='outside',
            name=category,
        ),
        row=1, col=panel_col,
    )
    fig.update_xaxes(title_text=axis_title, row=1, col=panel_col)

# Only the left-most subplot carries the y-axis title.
fig.update_yaxes(title_text="Total Observations in the Dataset", row=1, col=1)
fig.update_layout(title_text="Train dataset Features")

fig.show()