In [None]:
import pandas as pd
import numpy as np

In [None]:
# read data
# Every CSV lives directly under data_path; the files are read in the order
# listed and bound to the names on the left in that same order.
data_path = '../input/lish-moa/'
test, train, drugs, targets_nonscored, targets_scored = (
    pd.read_csv(data_path + csv_name)
    for csv_name in (
        'test_features.csv',
        'train_features.csv',
        "train_drug.csv",
        "train_targets_nonscored.csv",
        "train_targets_scored.csv",
    )
)

# Drugs

In [None]:
# Preview the first rows of the drug table.
drugs.head()

In [None]:
# (rows, columns) of the drug table next to those of the training features.
drugs.shape, train.shape

In [None]:
# How many rows of the drug table carry each drug_id.
drugs["drug_id"].value_counts()

In [None]:
# Join features, scored targets, non-scored targets and drug ids into a single
# frame, matching rows on sig_id.
train_df = (
    train
    .merge(targets_scored, on="sig_id")
    .merge(targets_nonscored, on="sig_id")
    .merge(drugs, on="sig_id")
)
# The first column of each source table is skipped when collecting names
# (presumably the sig_id key -- confirm against the CSV headers).
targets_scored_cols = targets_scored.columns[1:].tolist()
targets_nonscored_cols = targets_nonscored.columns[1:].tolist()
features_cols = train.columns[1:]
# Width check: besides one remaining column, the merged frame must hold
# exactly the features, both target groups and one extra column (drug_id).
assert train_df.shape[1] - 1 == (
    len(features_cols) + len(targets_scored_cols) + len(targets_nonscored_cols) + 1
)
train_df.head(2)

In [None]:
# (rows, columns) of the merged training frame.
train_df.shape

In [None]:
# Number of merged training rows per drug_id. value_counts() sorts by count in
# descending order by default, so the most frequent drugs come first.
drug_num_s = train_df["drug_id"].value_counts()
drug_num_s

In [None]:
# The top 9 drugs are the popular ones; 10 rows are shown so the first drug
# past that cut-off is visible for comparison.
drug_num_s.iloc[:10]

In [None]:
# Summary statistics of the rows-per-drug counts.
drug_num_s.describe()

In [None]:
# The first 9 entries of drug_num_s are taken as the popular drugs; the
# least popular drugs are those with exactly one row.
most_populars = drug_num_s.index[:9].tolist()
least_populars = drug_num_s.loc[drug_num_s == 1].index.tolist()
len(most_populars), len(least_populars)

In [None]:
# Row-wise sum over the scored target columns, stored as target_num.
train_df["target_num"] = train_df.loc[:, targets_scored_cols].sum(axis="columns")
train_df["target_num"].describe()

In [None]:
# Row-wise sum over the non-scored target columns, stored as target_nonscored_num.
train_df["target_nonscored_num"] = train_df.loc[:, targets_nonscored_cols].sum(axis="columns")
train_df["target_nonscored_num"].describe()

In [None]:
# Distribution of target_num over the rows that belong to the most popular drugs.
train_df.query("drug_id in @most_populars")["target_num"].describe()

In [None]:
# Distribution of target_num over the rows of drugs that occur exactly once.
train_df.query("drug_id in @least_populars")["target_num"].describe()

In [None]:
# Encode the three treatment columns as small integer codes.
#
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time on the already-encoded columns would wipe
# them out. Each column is therefore mapped only while it still contains
# values other than its target codes, which makes the cell safe to re-run.
cp_encodings = {
    "cp_type": {"ctl_vehicle": 0, "trt_cp": 1},
    "cp_time": {24: 0, 48: 1, 72: 2},
    "cp_dose": {"D1": 0, "D2": 1},
}
for cp_col, cp_codes in cp_encodings.items():
    already_encoded = train_df[cp_col].isin(list(cp_codes.values())).all()
    if not already_encoded:
        train_df[cp_col] = train_df[cp_col].map(cp_codes)

In [None]:
# Print one summary line per popular drug: row count, mean number of scored
# MoA targets, mean encoded cp_type, row counts per cp_time code (0/1/2 are
# labelled 24/48/72 in the output) and mean encoded cp_dose.
for popular_drug in most_populars:
    # Filter the frame once per drug and reuse the subset for every column,
    # instead of re-running the same query for each statistic.
    drug_rows = train_df.query("drug_id == @popular_drug")
    drug_target_num = drug_rows["target_num"]
    drug_cp_type = drug_rows["cp_type"]
    drug_cp_time = drug_rows["cp_time"]
    drug_cp_dose = drug_rows["cp_dose"]
    print(
        "drug_id: {}, count: {}, mean MoA num: {:.2f} \t Mean cp_type: {:.3f} , Num cp_time 24: {}, 48: {}, 72: {}, mean cp_dose: {:.3f},".format(
            popular_drug,
            drug_target_num.count(),
            drug_target_num.mean(),
            drug_cp_type.mean(),
            (drug_cp_time == 0).sum(),
            (drug_cp_time == 1).sum(),
            (drug_cp_time == 2).sum(),
            drug_cp_dose.mean(),
        )
    )

In [None]:
# Mean of the encoded cp_type per drug. A mean that is neither 0 nor 1 marks a
# drug whose rows contain both cp_type codes.
drug_cp_type_proportion = train_df.groupby(["drug_id"])["cp_type"].mean()
drug_with_two_cp_type = ~(
    (drug_cp_type_proportion == 0) | (drug_cp_type_proportion == 1)
)

In [None]:
# Number of drugs whose rows mix both cp_type values.
# Observation: each drug is used within a single cp_type group only, and all
# ctl_vehicle rows come from drug 'cacb2b860'.
drug_with_two_cp_type.sum()

In [None]:
# Per-drug mean of the scored / non-scored target counts.
drug_target_num_mean = train_df.groupby(["drug_id"])["target_num"].mean()
drug_target_nonscored_num_mean = train_df.groupby(["drug_id"])["target_nonscored_num"].mean()

# Flag drugs whose rows do not all share the same target count.
# Testing whether the mean is a whole number is not sufficient for this:
# counts of 0 and 2 average to exactly 1 and would go unnoticed. Counting the
# distinct per-row values of each drug catches every such case.
drug_with_different_target_num_mean = (
    train_df.groupby(["drug_id"])["target_num"].nunique() > 1
)
drug_with_different_target_nonscored_num_mean = (
    train_df.groupby(["drug_id"])["target_nonscored_num"].nunique() > 1
)

In [None]:
# Number of drugs flagged as having differing scored / non-scored MoA counts.
# A result of (0, 0) supports the reading that each drug has the same number
# of MoAs on all of its rows, regardless of cp_time and cp_dose.
drug_with_different_target_num_mean.sum(), drug_with_different_target_nonscored_num_mean.sum()

In [None]:
# Check whether the targets themselves (not just their count) are the same
# within each drug: keep only the drug id and the scored target columns.
drug_targets_df = train_df.loc[:, ["drug_id", *targets_scored_cols]].copy()
drug_targets_df.head(2)

In [None]:
# Pick the MoA parts: for every row, collect the names of the columns whose
# value equals 1 into a set, and keep the row's drug_id next to it.
drug_targets_hits_df = pd.DataFrame(
    {
        "targets": drug_targets_df.drop(columns="drug_id").apply(
            lambda row: set(row[row == 1].index), axis=1
        ),
        "drug_id": drug_targets_df["drug_id"],
    }
)
drug_targets_hits_df.head()

In [None]:
# Take the union of the per-row target sets of each drug.
drug_targets_hits_s = (
    drug_targets_hits_df
    .groupby("drug_id")["targets"]
    .apply(lambda target_sets: set().union(*target_sets))
)
# The union is as large as the drug's mean per-row target count only when
# every row of that drug carries the identical target set.
drug_targets_hits_same = (
    drug_targets_hits_s.map(len)
    == train_df.groupby("drug_id")["target_num"].mean()
)
# Drugs listed here break that rule; an empty result means each drug has the
# same MoAs on all of its rows regardless of cp_time and cp_dose.
drug_targets_hits_s.loc[~drug_targets_hits_same]