In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

### Load data

In [None]:
# Scored training targets; `sig_id` is kept as a regular column here and becomes the index later.
Y_moa_df = pd.read_csv('/kaggle/input/lish-moa/train_targets_scored.csv')
print(Y_moa_df.shape)
# Training features; `sig_id` is likewise kept as a regular column for now.
X_moa_df = pd.read_csv('/kaggle/input/lish-moa/train_features.csv')
print(X_moa_df.shape)
# Test features, indexed by `sig_id` straight away.
X_test_moa_df = pd.read_csv('/kaggle/input/lish-moa/test_features.csv').set_index('sig_id')
print(X_test_moa_df.shape)

### Exploratory Data Analysis

In [None]:
# One-way ANOVA per 'g-' / 'c-' feature, comparing the control samples
# (cp_type == 'ctl_vehicle') with the treated samples (cp_type == 'trt_cp').
columns = [col for col in X_moa_df.columns if col.startswith(('g-','c-'))]

# The group masks do not depend on the feature, so they are computed once.
is_control = X_moa_df['cp_type'] == 'ctl_vehicle'
is_treated = X_moa_df['cp_type'] == 'trt_cp'

anova_records = []
for col in columns:
    F, p = stats.f_oneway(X_moa_df.loc[is_control, col],
                          X_moa_df.loc[is_treated, col])
    anova_records.append({'feature': col, 'F-statistics': F, 'p-value': p})

# Build the frame in a single step: DataFrame.append was removed in pandas 2.0,
# and growing a frame one row at a time is quadratic. dtype=object and the
# 0..n-1 index match what the row-by-row version produced.
df_anova = pd.DataFrame(anova_records, columns=['feature', 'F-statistics', 'p-value'], dtype=object)

In [None]:
# Features whose ANOVA p-value is >= 0.01: rows failing the condition are
# blanked to NaN by `where` and then removed by `dropna(how='all')`.
df_bad_columns = (df_anova.where(df_anova['p-value'] >= 0.01).dropna(how='all'))
df_bad_columns.shape

In [None]:
# Every column of X_moa_df except the rejected features. Only 'g-'/'c-' features
# can be rejected, so 'sig_id' and the cp_* columns are always kept.
# Index.difference returns the remaining labels sorted (pandas default).
good_columns = list(X_moa_df.columns.difference(df_bad_columns['feature'].values))
len(good_columns)

### Train-test split

In [None]:
#X_train, X_test = train_test_split(X_moa_df[good_columns], test_size=0.25, random_state=10)

#print(X_train.shape)
#print(X_test.shape)

In [None]:
#Y_train = (Y_moa_df.where(Y_moa_df['sig_id'].isin(X_train['sig_id'].values))
#           .dropna(how='all')
#           .set_index('sig_id'))
#Y_train = Y_train.astype(int)

#Y_test = (Y_moa_df.where(Y_moa_df['sig_id'].isin(X_test['sig_id'].values))
#          .dropna(how='all')
#          .set_index('sig_id'))
#Y_test = Y_test.astype(int)

#print(Y_train.shape)
#print(Y_test.shape)

In [None]:
#Y_train.head()

In [None]:
# The whole training set is used (no hold-out split); both frames are indexed by sample id.
Y_train = Y_moa_df.set_index('sig_id').astype(int)
X_train = X_moa_df[good_columns].set_index('sig_id')

for frame in (X_train, Y_train):
    print(frame.shape)

### Define functions

In [None]:
def get_cols_pair(col, template):
    """Split a generated feature name into the two source column names.

    Parameters
    ----------
    col : str
        Name to split, e.g. 'g-1_+_g-2' or 'c-30_g-165'.
    template : str
        Separator between the two column names ('_+_', '_-_' or '_').

    Returns
    -------
    tuple of (str, str)
        Text before and after the separator.

    Raises
    ------
    ValueError
        If `template` does not occur in `col`. Previously the -1 returned by
        `find` was used as a slice bound, silently producing wrong names.
    """
    # Names starting with these prefixes contain '_' themselves, so with the
    # plain '_' separator the split has to happen at the LAST occurrence.
    exceptions = ('cp_time', 'cp_type', 'cp_dose')
    reverse_flg = template == '_' and col.startswith(exceptions)

    i = col.rfind(template) if reverse_flg else col.find(template)
    if i == -1:
        raise ValueError('separator {!r} not found in column name {!r}'.format(template, col))

    return (col[:i], col[i + len(template):])


def pow10(val):
    """Return 10 raised to the power `val`."""
    return 10 ** val


def get_features_df(corr_df, target_list, corr_threshold):
    """List, for every target, the columns whose absolute correlation reaches a threshold.

    Parameters
    ----------
    corr_df : pandas.DataFrame
        Correlation matrix; must contain one column per entry of `target_list`.
    target_list : iterable of str
        Target column names to inspect.
    corr_threshold : float
        Minimum absolute correlation for a column to be kept.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame with a 0..n-1 index and the columns
        'target', 'relevant_feature', 'target_flg' (1 when the relevant
        feature is itself a target, else 0) and 'corr_val' (absolute
        correlation). Empty, but with these columns, when nothing qualifies.
    """
    result_columns = ['target', 'relevant_feature', 'target_flg', 'corr_val']
    target_set = set(target_list)
    frames = []

    for col in target_list:
        cor_target = abs(corr_df[col])
        relevant_features = cor_target[cor_target >= corr_threshold]

        # Remove the target's correlation with itself, if present.
        # errors='ignore' replaces the former bare `except: pass`.
        relevant_features = relevant_features.drop(labels=col, errors='ignore')
        if len(relevant_features) == 0:
            continue

        names = list(relevant_features.index)
        frame = {'target': [col] * len(names),
                 'relevant_feature': names,
                 'target_flg': [1 if name in target_set else 0 for name in names],
                 'corr_val': relevant_features.values}
        frames.append(pd.DataFrame(data=frame, dtype=object))

    if not frames:
        return pd.DataFrame(columns=result_columns, dtype=object)

    # One concat instead of DataFrame.append in the loop (removed in pandas 2.0, quadratic).
    return pd.concat(frames, ignore_index=True)


def get_features_tgt_df(df_features):
    """Keep only the rows whose 'target_flg' equals 1.

    Rows failing the condition are blanked by `where` and then removed by
    `dropna(how='all')`; the original index labels of the kept rows are preserved.
    """
    is_target_pair = df_features['target_flg'] == 1
    masked = df_features.where(is_target_pair)
    return masked.dropna(how='all')
    
    
def get_target_grp_df(df_target):
    """Count, per target, the rows flagged with target_flg == 1.

    Returns a frame with the columns 'target' and 'cnt', ordered by 'cnt'
    descending and then 'target' ascending, with a fresh 0..n-1 index.
    """
    is_target_pair = df_target['target_flg'] == 1
    return (
        df_target[['target', 'relevant_feature']]
        .where(is_target_pair)
        .groupby('target')
        .count()
        .reset_index()
        .rename(columns={'relevant_feature': 'cnt'})
        .sort_values(by=['cnt', 'target'], axis=0, ascending=[False, True])
        .reset_index(drop=True)
    )


def get_inverse_tgt_df(df_target):
    """Collect the mirrored rows of target-target pairs.

    Targets are visited in the order given by `get_target_grp_df`. For a visited
    target A and each of its flagged rows (A, B), the rows (B, A) are collected,
    unless B has itself been visited already. Returns the collected rows, or an
    empty DataFrame when there are none.
    """
    inverse_rows = pd.DataFrame()
    visited_targets = []
    is_target_pair = df_target['target_flg'] == 1

    for _, grp_row in get_target_grp_df(df_target=df_target).iterrows():
        current = grp_row['target']

        children = (df_target.where((df_target['target'] == current) & is_target_pair)
                    .dropna(how='all'))
        visited_targets.append(current)

        # `visited_targets` is not modified inside the inner loop, so both masks
        # are the same for every child of the current target.
        points_back = df_target['relevant_feature'] == current
        not_visited = ~df_target['target'].isin(visited_targets)

        for _, child_row in children.iterrows():
            starts_at_child = df_target['target'] == child_row['relevant_feature']
            mirrored = df_target.where(starts_at_child & points_back & not_visited).dropna(how='all')

            inverse_rows = pd.concat([inverse_rows, mirrored])

    return inverse_rows


def get_unique_feature_pairs(df):
    """Remove mirrored target-target rows from a feature-pair frame.

    The mirrored rows found by `get_inverse_tgt_df` are appended to `df`, and
    `drop_duplicates(keep=False)` then discards every row that occurs more than
    once, i.e. both copies of each mirrored row.
    """
    target_pairs = get_features_tgt_df(df_features=df)
    mirrored = get_inverse_tgt_df(df_target=target_pairs)
    combined = pd.concat([df, mirrored])
    return combined.drop_duplicates(keep=False)


def get_features_list(target_name, df_features, corr_threshold):
    """Return the unique relevant features of one target.

    A row of `df_features` qualifies when its 'target' equals `target_name`
    and its 'corr_val' is at least `corr_threshold`. The result is a list in
    order of first appearance.
    """
    matches_target = df_features['target'] == target_name
    strong_enough = df_features['corr_val'] >= corr_threshold

    selected = df_features['relevant_feature'].where(matches_target & strong_enough)
    return list(selected.dropna(how='all').unique())

### Feature Engineering

In [None]:
# One-hot encode train and test separately (pandas chooses the columns to encode by dtype).
# NOTE(review): the two frames are encoded independently, and the test frame still holds
# every feature column while the train frame is limited to `good_columns`; confirm that
# every column selected downstream exists in both.
X_train_enc = pd.get_dummies(X_train)
#X_test_enc = pd.get_dummies(X_test)
X_test_enc = pd.get_dummies(X_test_moa_df)

#### Generated features templates list

In [None]:
new_features_list = ['c-10_-_c-31', 'c-12_+_g-157', 'c-13_-_c-91', 'c-15_-_g-203', 'c-18_-_c-80', 'c-21_+_g-157', 
                     'c-21_-_g-91', 'c-26_+_g-157', 'c-26_-_c-91', 'c-27_-_g-20', 'c-33_-_c-31', 'c-63_+_g-157', 
                     'c-65_-_c-26', 'c-65_-_c-45', 'c-65_-_c-51', 'c-6_-_g-91', 'c-92_+_g-157', 'c-98_+_g-60', 
                     'c-98_-_c-51', 'c-98_-_c-62', 'c-98_-_c-67', 'c-9_+_g-157', 'g-100_+_g-406', 'g-107_+_g-409', 
                     'g-110_+_g-476', 'g-113_-_g-640', 'g-119_+_g-202', 'g-125_-_c-65', 'g-125_-_c-98', 
                     'g-140_-_g-157', 'g-146_+_g-178', 'g-152_-_c-65', 'g-157_-_c-65', 'g-162_-_g-100', 
                     'g-165_-_g-207', 'g-165_-_g-48', 'g-166_-_g-628', 'g-167_-_g-202', 'g-16_+_g-100', 
                     'g-16_-_g-640', 'g-178_-_c-65', 'g-178_-_c-98', 'g-178_-_g-206', 'g-181_-_g-100', 
                     'g-186_+_g-157', 'g-189_-_g-202', 'g-18_+_g-48', 'g-195_+_g-157', 'g-201_+_g-203', 
                     'g-201_-_g-181', 'g-201_-_g-476', 'g-202_-_g-100', 'g-202_-_g-48', 'g-207_+_c-65', 
                     'g-207_+_g-202', 'g-207_-_g-165', 'g-209_-_g-202', 'g-20_+_g-476', 'g-20_+_g-90', 
                     'g-210_-_g-100', 'g-215_+_g-157', 'g-228_+_g-157', 'g-22_-_c-65', 'g-22_-_g-202', 
                     'g-22_-_g-253', 'g-254_-_g-100', 'g-257_+_g-157', 'g-260_+_g-202', 'g-263_-_g-476', 
                     'g-269_+_g-100', 'g-269_+_g-253', 'g-274_-_g-628', 'g-277_+_g-100', 'g-279_+_g-202', 
                     'g-280_-_g-157', 'g-282_-_c-98', 'g-283_+_g-202', 'g-28_-_c-65', 'g-409_+_g-107', 
                     'g-410_+_g-90', 'g-410_-_g-107', 'g-410_-_g-628', 'g-414_+_g-202', 'g-414_-_g-640', 
                     'g-418_-_g-100', 'g-418_-_g-206', 'g-420_-_g-628', 'g-424_+_c-65', 'g-431_+_c-65', 
                     'g-431_-_g-202', 'g-435_+_g-157', 'g-435_-_c-65', 'g-440_+_g-202', 'g-441_-_g-202', 
                     'g-447_-_g-100', 'g-451_-_g-253', 'g-455_-_g-476', 'g-459_-_c-65', 'g-47_+_g-48', 
                     'g-47_-_g-202', 'g-47_-_g-476', 'g-480_-_g-157', 'g-489_+_g-157', 'g-603_+_c-98', 
                     'g-603_+_g-211', 'g-604_+_g-48', 'g-615_+_g-476', 'g-619_-_g-476', 'g-620_+_c-65', 
                     'g-620_-_g-157', 'g-621_-_c-65', 'g-626_+_g-635', 'g-626_-_c-98', 'g-626_-_g-202', 
                     'g-628_+_g-0', 'g-628_-_g-274', 'g-628_-_g-636', 'g-634_+_c-65', 'g-636_-_g-628', 
                     'g-639_-_c-98', 'g-642_-_c-65', 'g-656_-_g-157', 'g-659_+_g-165', 'g-65_+_g-640', 
                     'g-677_+_g-100', 'g-68_+_g-20', 'g-84_+_g-640', 'g-90_+_g-20', 'g-94_-_g-157', 
                     'g-95_+_c-65', 'g-99_-_c-48', 'max_c-30_g-165', 'max_c-8_g-628', 'max_g-101_g-178', 
                     'max_g-125_g-157', 'max_g-183_g-157', 'max_g-22_g-178', 'max_g-278_g-178', 
                     'max_g-435_g-178', 'max_g-448_g-157', 'min_g-100_g-157', 'min_g-110_g-253', 
                     'min_g-162_g-219', 'min_g-409_g-628', 'min_g-439_g-202', 'min_g-447_g-202', 
                     'min_g-628_g-229', 'min_g-89_g-476', 'min_g-91_g-640']
len(new_features_list)

In [None]:
features_pow_ss_list = ['c-6_-_g-91', 'g-487', 'g-615', 'g-78', 'g-351', 'g-250', 'g-476', 'g-736', 'g-162', 'g-42', 'g-645', 'g-200', 'g-70', 'g-148', 'g-295', 'g-339', 'g-517', 'g-544', 'g-553', 'g-589', 'g-691', 'g-433', 'g-489', 'g-84', 'g-146_+_g-178', 'g-489_+_g-157', 'g-626_-_c-98', 'g-119', 'g-19', 'g-22', 'g-321', 'g-529', 'g-152_-_c-65', 'g-22_-_c-65', 'g-22_-_g-202', 'g-623', 'c-1', 'c-25', 'c-27', 'c-34', 'c-39', 'c-59', 'c-64', 'c-73', 'c-8', 'g-392', 'g-405', 'g-420', 'g-73', 'g-620_+_c-65', 'g-65', 'c-65_-_c-51', 'g-375', 'g-391', 'g-330', 'g-89', 'g-175', 'g-421', 'g-274_-_g-628', 'g-410_-_g-628', 'g-390', 'c-21_+_g-157', 'g-442', 'g-327', 'c-16', 'c-53', 'g-122', 'g-195', 'g-597', 'g-657', 'g-684', 'g-726', 'c-65_-_c-26', 'g-165_-_g-48', 'c-23', 'c-32', 'c-61', 'c-66', 'c-79', 'g-14', 'g-141', 'g-178', 'g-256', 'g-26', 'g-263', 'g-267', 'g-319', 'g-347', 'g-353', 'g-451', 'g-521', 'g-543', 'g-578', 'g-598', 'g-619', 'g-639', 'g-8', 'max_g-101_g-178', 'max_g-22_g-178', 'max_g-278_g-178', 'max_g-435_g-178', 'c-27_-_g-20', 'c-98_-_c-51', 'c-98_-_c-62', 'g-178_-_g-206', 'g-159', 'g-183', 'g-233', 'g-40', 'g-430', 'g-48', 'g-166_-_g-628', 'g-420_-_g-628', 'g-410', 'g-473', 'g-280_-_g-157', 'g-257_+_g-157', 'c-13_-_c-91', 'c-65_-_c-45', 'g-369', 'g-432', 'g-470', 'g-533', 'g-621', 'c-18_-_c-80', 'g-253', 'g-272', 'g-329', 'g-527', 'g-572', 'g-663', 'g-75', 'g-431_+_c-65', 'g-370', 'g-679', 'g-574', 'g-614', 'g-394', 'g-57', 'g-193', 'g-317', 'g-53', 'g-110', 'g-158', 'g-161', 'g-713', 'g-88', 'g-202_-_g-100', 'g-210_-_g-100', 'g-113', 'g-360', 'g-735', 'g-279_+_g-202', 'g-290', 'g-118', 'g-114', 'g-495', 'g-441_-_g-202', 'g-139', 'g-214', 'g-216', 'g-296', 'g-355', 'g-372', 'g-373', 'g-384', 'g-412', 'g-415', 'g-446', 'g-465', 'g-471', 'g-481', 'g-507', 'g-535', 'g-542', 'g-546', 'g-551', 'g-62', 'g-66', 'g-678', 'g-681', 'g-702', 'g-733', 'g-98', 'c-19', 'g-757', 'g-765', 'g-524', 'c-26_+_g-157', 'c-9_+_g-157', 'g-186_+_g-157', 'g-195_+_g-157', 'g-41', 
'g-549', 'g-185', 'g-190', 'g-207', 'g-217', 'g-312', 'g-538', 'g-652', 'g-20_+_g-476', 'g-628_-_g-636', 'g-292', 'g-557']
len(features_pow_ss_list)

In [None]:
columns_all = new_features_list

X_train_all = X_train_enc.copy()
X_test_all = X_test_enc.copy()

# Every engineered column is computed from the encoded frame of a split and
# stored in that split's "_all" copy, identically for train and test.
frame_pairs = [(X_train_enc, X_train_all), (X_test_enc, X_test_all)]

# Prefix templates: 'max_<a>_<b>' / 'min_<a>_<b>' -> row-wise maximum / minimum of two columns.
prefix_ops = {
    'max_': lambda pair: pair.max(axis=1),
    'min_': lambda pair: pair.min(axis=1),
}

# Infix templates: '<a>_+_<b>' / '<a>_-_<b>' -> element-wise sum / difference of two columns.
infix_ops = {
    '_+_': lambda left, right: left + right,
    '_-_': lambda left, right: left - right,
}

# Processing order (max, min, addition, subtraction) fixes the order of the new columns.
for template, reduce_pair in prefix_ops.items():
    for name in columns_all:
        if not name.startswith(template):
            continue
        first, second = get_cols_pair(name[len(template):], '_')
        new_col = template + first + '_' + second
        for source, dest in frame_pairs:
            dest[new_col] = reduce_pair(source[[first, second]])

for template, combine in infix_ops.items():
    for name in columns_all:
        if name.find(template) == -1:
            continue
        first, second = get_cols_pair(name, template)
        new_col = first + template + second
        for source, dest in frame_pairs:
            dest[new_col] = combine(source[first], source[second])

print(X_train_all.shape)
print(X_test_all.shape)

In [None]:
tr = StandardScaler()
_suffix = '_pow_ss'

# 10**x followed by standardisation for the selected columns.
# pow10 is applied to the whole frame at once (pandas evaluates 10 ** frame
# element-wise) instead of per cell through DataFrame.applymap, which is slow
# and deprecated since pandas 2.1.
X_train_all_pow = pow10(X_train_all[features_pow_ss_list])
X_train_all_tr = pd.DataFrame(data=tr.fit_transform(X_train_all_pow), index=X_train_all_pow.index,
                              columns=features_pow_ss_list).add_suffix(_suffix)

# The test split is only transformed: it reuses the scaler fitted on the training split.
X_test_all_pow = pow10(X_test_all[features_pow_ss_list])
X_test_all_tr = pd.DataFrame(data=tr.transform(X_test_all_pow), index=X_test_all_pow.index,
                             columns=features_pow_ss_list).add_suffix(_suffix)

print(X_train_all_tr.shape)
print(X_test_all_tr.shape)

In [None]:
# Columns that were NOT transformed; the transformed ones come back with the '_pow_ss' suffix.
columns_old = list(X_train_all.columns.difference(features_pow_ss_list))

# Index-on-index inner joins: untransformed columns + transformed columns (+ targets for training).
# NOTE(review): X_train_all / X_test_all are rebound here, so this cell is not safe to re-run
# on its own without re-running the cells that built them.
X_train_all = pd.merge(X_train_all[columns_old], X_train_all_tr, how='inner', left_index=True, right_index=True)
moa_train_df_all = pd.merge(X_train_all, Y_train, how='inner', left_index=True, right_index=True)

# The test frame is restricted to the same untransformed columns as the training frame.
X_test_all = pd.merge(X_test_all[columns_old], X_test_all_tr, how='inner', left_index=True, right_index=True)
#moa_test_df_all = pd.merge(X_test_all, Y_test, how='inner', left_index=True, right_index=True)

print(moa_train_df_all.shape)
print(X_test_all.shape)
#print(moa_test_df_all.shape)

In [None]:
# Pairwise correlation matrix over the feature columns and the target columns together
# (DataFrame.corr with its default method).
cor_all = moa_train_df_all.corr()
cor_all.shape

In [None]:
# For every target, list the columns (features or other targets) with |correlation| >= 0.1,
# then remove the mirrored copies of target-target pairs.
features_df = get_features_df(corr_df=cor_all, target_list=Y_train.columns, corr_threshold=0.1)
features_df = get_unique_feature_pairs(features_df)
print(features_df.shape)

In [None]:
targets_chain_list = ['5-alpha_reductase_inhibitor', 'acat_inhibitor', 'acetylcholine_receptor_agonist', 'acetylcholine_receptor_antagonist', 'acetylcholinesterase_inhibitor', 'adenosine_receptor_agonist', 'adenosine_receptor_antagonist', 'adenylyl_cyclase_activator', 'adrenergic_receptor_agonist', 'adrenergic_receptor_antagonist', 'akt_inhibitor', 'aldehyde_dehydrogenase_inhibitor', 'alk_inhibitor', 'ampk_activator', 'analgesic', 'androgen_receptor_antagonist', 'anesthetic_-_local', 'angiotensin_receptor_antagonist', 'anti-inflammatory', 'antiarrhythmic', 'antibiotic', 'anticonvulsant', 'antifungal', 'antihistamine', 'antimalarial', 'antioxidant', 'antiprotozoal', 'antiviral', 'atm_kinase_inhibitor', 'atp-sensitive_potassium_channel_antagonist', 'atp_synthase_inhibitor', 'atr_kinase_inhibitor', 'aurora_kinase_inhibitor', 'autotaxin_inhibitor', 'bacterial_30s_ribosomal_subunit_inhibitor', 'bacterial_50s_ribosomal_subunit_inhibitor', 'bacterial_antifolate', 'bacterial_cell_wall_synthesis_inhibitor', 'bacterial_dna_gyrase_inhibitor', 'bacterial_dna_inhibitor', 'bacterial_membrane_integrity_inhibitor', 'benzodiazepine_receptor_agonist', 'beta_amyloid_inhibitor', 'bromodomain_inhibitor', 'btk_inhibitor', 'calcineurin_inhibitor', 'calcium_channel_blocker', 'cannabinoid_receptor_agonist', 'cannabinoid_receptor_antagonist', 'carbonic_anhydrase_inhibitor', 'casein_kinase_inhibitor', 'catechol_o_methyltransferase_inhibitor', 'cc_chemokine_receptor_antagonist', 'cck_receptor_antagonist', 'cdk_inhibitor', 'chelating_agent', 'chk_inhibitor', 'cholesterol_inhibitor', 'cholinergic_receptor_antagonist', 'coagulation_factor_inhibitor', 'cyclooxygenase_inhibitor', 'cytochrome_p450_inhibitor', 'dihydrofolate_reductase_inhibitor', 'dipeptidyl_peptidase_inhibitor', 'diuretic', 'dna_alkylating_agent', 'dopamine_receptor_agonist', 'egfr_inhibitor', 'elastase_inhibitor', 'erbb2_inhibitor', 'estrogen_receptor_antagonist', 'faah_inhibitor', 'farnesyltransferase_inhibitor', 
'fatty_acid_receptor_agonist', 'focal_adhesion_kinase_inhibitor', 'free_radical_scavenger', 'fungal_squalene_epoxidase_inhibitor', 'gaba_receptor_agonist', 'gaba_receptor_antagonist', 'gamma_secretase_inhibitor', 'glutamate_inhibitor', 'glutamate_receptor_agonist', 'glutamate_receptor_antagonist', 'gonadotropin_receptor_agonist', 'gsk_inhibitor', 'hcv_inhibitor', 'hdac_inhibitor', 'histamine_receptor_agonist', 'histamine_receptor_antagonist', 'histone_lysine_demethylase_inhibitor', 'histone_lysine_methyltransferase_inhibitor', 'hiv_inhibitor', 'hmgcr_inhibitor', 'hsp_inhibitor', 'igf-1_inhibitor', 'imidazoline_receptor_agonist', 'immunosuppressant', 'insulin_secretagogue', 'insulin_sensitizer', 'integrin_inhibitor', 'jak_inhibitor', 'laxative', 'leukotriene_inhibitor', 'leukotriene_receptor_antagonist', 'lipase_inhibitor', 'lxr_agonist', 'mdm_inhibitor', 'mek_inhibitor', 'membrane_integrity_inhibitor', 'mineralocorticoid_receptor_antagonist', 'monoacylglycerol_lipase_inhibitor', 'monoamine_oxidase_inhibitor', 'monopolar_spindle_1_kinase_inhibitor', 'mucolytic_agent', 'neuropeptide_receptor_antagonist', 'nicotinic_receptor_agonist', 'nitric_oxide_production_inhibitor', 'nitric_oxide_synthase_inhibitor', 'opioid_receptor_agonist', 'opioid_receptor_antagonist', 'orexin_receptor_antagonist', 'p-glycoprotein_inhibitor', 'parp_inhibitor', 'pdk_inhibitor', 'phosphodiesterase_inhibitor', 'phospholipase_inhibitor', 'pi3k_inhibitor', 'potassium_channel_activator', 'potassium_channel_antagonist', 'ppar_receptor_antagonist', 'progesterone_receptor_agonist', 'prostaglandin_inhibitor', 'prostanoid_receptor_antagonist', 'proteasome_inhibitor', 'protein_kinase_inhibitor', 'protein_phosphatase_inhibitor', 'protein_synthesis_inhibitor', 'protein_tyrosine_kinase_inhibitor', 'radiopaque_medium', 'raf_inhibitor', 'ras_gtpase_inhibitor', 'retinoid_receptor_agonist', 'retinoid_receptor_antagonist', 'rho_associated_kinase_inhibitor', 'ribonucleoside_reductase_inhibitor', 
'rna_polymerase_inhibitor', 'serotonin_receptor_agonist', 'serotonin_receptor_antagonist', 'serotonin_reuptake_inhibitor', 'sigma_receptor_antagonist', 'smoothened_receptor_antagonist', 'sodium_channel_inhibitor', 'sphingosine_receptor_agonist', 'steroid', 'syk_inhibitor', 'tachykinin_antagonist', 'tgf-beta_receptor_inhibitor', 'thrombin_inhibitor', 'tlr_agonist', 'tlr_antagonist', 'tnf_inhibitor', 'topoisomerase_inhibitor', 'transient_receptor_potential_channel_antagonist', 'tropomyosin_receptor_kinase_inhibitor', 'trpv_antagonist', 'tubulin_inhibitor', 'ubiquitin_specific_protease_inhibitor', 'vitamin_b', 'vitamin_d_receptor_agonist', 'wnt_inhibitor', '11-beta-hsd1_inhibitor', 'angiogenesis_inhibitor', 'aromatase_inhibitor', 'chloride_channel_blocker', 'progesterone_receptor_antagonist', 'fgfr_inhibitor', 'glucocorticoid_receptor_agonist', 'ikk_inhibitor', 'mtor_inhibitor', 'nfkb_inhibitor', 'nitric_oxide_donor', 'p38_mapk_inhibitor', 'vegfr_inhibitor', 'pkc_inhibitor', 'estrogen_receptor_agonist', 'sigma_receptor_agonist', 'src_inhibitor', 'thymidylate_synthase_inhibitor', 'trpv_agonist', 'tyrosine_kinase_inhibitor', 'pdgfr_inhibitor', 'androgen_receptor_agonist', 'bcl_inhibitor', 'caspase_activator', 'dna_inhibitor', 'dopamine_receptor_antagonist', 'kit_inhibitor', 'lipoxygenase_inhibitor', 'atpase_inhibitor', 'norepinephrine_reuptake_inhibitor', 'bcr-abl_inhibitor', 'corticosteroid_agonist', 'flt3_inhibitor', 'ppar_receptor_agonist', 'nrf2_activator', 'apoptosis_stimulant']
print(len(targets_chain_list))

In [None]:
# (C value, targets using it), in lookup order.
_C_GROUPS = (
    (0.06, ('dopamine_receptor_antagonist', 'acetylcholine_receptor_antagonist',
            'adrenergic_receptor_antagonist', 'calcium_channel_blocker', 'cyclooxygenase_inhibitor')),
    (0.07, ('dna_inhibitor',)),
    (0.08, ('potassium_channel_antagonist', 'estrogen_receptor_agonist', 'androgen_receptor_antagonist',
            'bacterial_cell_wall_synthesis_inhibitor', 'cc_chemokine_receptor_antagonist',
            'cytochrome_p450_inhibitor', 'dopamine_receptor_agonist', 'elastase_inhibitor',
            'gaba_receptor_antagonist')),
    (0.09, ('protein_synthesis_inhibitor', 'tyrosine_kinase_inhibitor', 'prostanoid_receptor_antagonist')),
    (0.10, ('vegfr_inhibitor', 'opioid_receptor_antagonist', 'acetylcholinesterase_inhibitor',
            'adenosine_receptor_agonist', 'protein_kinase_inhibitor', 'acetylcholine_receptor_agonist',
            'adenosine_receptor_antagonist', 'anesthetic_-_local', 'bacterial_dna_gyrase_inhibitor',
            'cannabinoid_receptor_antagonist', 'chelating_agent', 'cholinergic_receptor_antagonist',
            'glutamate_receptor_agonist')),
    (0.11, ('immunosuppressant', 'bcl_inhibitor', 'pi3k_inhibitor', 'membrane_integrity_inhibitor',
            'adrenergic_receptor_agonist', 'antioxidant', 'lipoxygenase_inhibitor', 'cholesterol_inhibitor',
            'nitric_oxide_production_inhibitor', 'gaba_receptor_agonist', 'potassium_channel_activator')),
    (0.12, ('atpase_inhibitor', 'anti-inflammatory', 'androgen_receptor_agonist')),
    (0.13, ('ppar_receptor_agonist', 'src_inhibitor', 'thymidylate_synthase_inhibitor', 'antibiotic',
            'dna_alkylating_agent')),
    (0.14, ('pdgfr_inhibitor', 'bromodomain_inhibitor', 'aromatase_inhibitor', 'trpv_antagonist')),
    (0.15, ('cdk_inhibitor', 'aurora_kinase_inhibitor', 'acat_inhibitor', 'antiprotozoal',
            'angiogenesis_inhibitor', 'opioid_receptor_agonist', 'bacterial_30s_ribosomal_subunit_inhibitor',
            'bacterial_50s_ribosomal_subunit_inhibitor', 'bacterial_dna_inhibitor', 'antiviral',
            'benzodiazepine_receptor_agonist', 'cannabinoid_receptor_agonist', 'carbonic_anhydrase_inhibitor',
            'casein_kinase_inhibitor', 'faah_inhibitor')),
    (0.16, ('corticosteroid_agonist', 'phospholipase_inhibitor', 'chloride_channel_blocker')),
    (0.17, ('tubulin_inhibitor', 'jak_inhibitor', 'akt_inhibitor', 'estrogen_receptor_antagonist',
            'ikk_inhibitor', 'rna_polymerase_inhibitor', 'insulin_secretagogue')),
    (0.18, ('egfr_inhibitor', 'topoisomerase_inhibitor', 'hdac_inhibitor', 'bcr-abl_inhibitor',
            'fgfr_inhibitor', 'gsk_inhibitor', 'tnf_inhibitor', 'sigma_receptor_agonist')),
    (0.19, ('p38_mapk_inhibitor', 'pkc_inhibitor')),
    (0.20, ('mek_inhibitor', 'angiotensin_receptor_antagonist', 'nitric_oxide_donor',
            'beta_amyloid_inhibitor', 'antifungal', 'bacterial_antifolate', 'cck_receptor_antagonist',
            'fatty_acid_receptor_agonist', 'gamma_secretase_inhibitor')),
    (0.21, ('mtor_inhibitor', 'ribonucleoside_reductase_inhibitor', 'syk_inhibitor')),
    (0.22, ('11-beta-hsd1_inhibitor',)),
    # NOTE(review): 'bcl_inhibitor' is also listed under 0.11 above; the earlier
    # entry wins, so it never receives 0.23. Confirm which value was intended.
    (0.23, ('bcl_inhibitor', 'alk_inhibitor', 'dihydrofolate_reductase_inhibitor')),
    (0.25, ('antihistamine', 'antimalarial', 'dipeptidyl_peptidase_inhibitor', 'free_radical_scavenger',
            'fungal_squalene_epoxidase_inhibitor')),
    (0.28, ('hmgcr_inhibitor',)),
    (0.29, ('hsp_inhibitor',)),
    (0.30, ('ampk_activator', 'analgesic', 'anticonvulsant', 'bacterial_membrane_integrity_inhibitor',
            'btk_inhibitor')),
    (0.35, ('5-alpha_reductase_inhibitor', 'autotaxin_inhibitor', 'catechol_o_methyltransferase_inhibitor')),
    (0.45, ('antiarrhythmic', 'diuretic', 'glutamate_inhibitor')),
    (0.49, ('glucocorticoid_receptor_agonist',)),
    (0.5, ('atr_kinase_inhibitor', 'coagulation_factor_inhibitor')),
    (0.56, ('mdm_inhibitor',)),
    (0.58, ('raf_inhibitor',)),
    (0.75, ('calcineurin_inhibitor', 'chk_inhibitor')),
    (0.79, ('kit_inhibitor', 'vitamin_d_receptor_agonist')),
    (0.83, ('nfkb_inhibitor',)),
    (1.0, ('focal_adhesion_kinase_inhibitor',)),
    (1.16, ('flt3_inhibitor',)),
    (1.50, ('adenylyl_cyclase_activator', 'aldehyde_dehydrogenase_inhibitor', 'farnesyltransferase_inhibitor')),
    (2.0, ('atm_kinase_inhibitor',)),
    (2.04, ('proteasome_inhibitor',)),
    (5.0, ('atp_synthase_inhibitor',)),
    (29.18, ('apoptosis_stimulant',)),
)

# Flattened lookup; setdefault keeps the FIRST value seen for a name, which is
# how a top-to-bottom if/elif chain resolves a name listed twice.
_C_BY_TARGET = {}
for _c_value, _target_names in _C_GROUPS:
    for _target_name in _target_names:
        _C_BY_TARGET.setdefault(_target_name, _c_value)


def get_param_value(target_name):
    """Return the `C` value passed to LogisticRegression for a target.

    Targets without an entry in the lookup table get 0.20.
    """
    try:
        return _C_BY_TARGET.get(target_name, 0.20)
    except TypeError:
        # Unhashable argument: it cannot match any listed name.
        return 0.20

### Real model fitting

In [None]:
# Targets whose inputs are limited to the columns correlated with them at or above
# the given threshold; every other target is fitted on all columns of X_train_all.
corr_threshold_by_target = {
    'cdk_inhibitor': 0.1,
    'flt3_inhibitor': 0.1,
    'kit_inhibitor': 0.1,
    'glucocorticoid_receptor_agonist': 0.1,
    'apoptosis_stimulant': 0.1,
    'proteasome_inhibitor': 0.4,
    'nfkb_inhibitor': 0.7,
}
model_code = 'LR'

# Both frames start with a 'sig_id' column (taken from the index name); it is dropped after the loop.
df_pred = pd.DataFrame(data=X_test_all.index, index=X_test_all.index)
df_prob = pd.DataFrame(data=X_test_all.index, index=X_test_all.index)

arr = np.zeros((X_test_all.shape[0], 1))

for i, moa_target in enumerate(targets_chain_list):
    threshold = corr_threshold_by_target.get(moa_target)
    if threshold is None:
        input_cols = X_train_all.columns
    else:
        input_cols = get_features_list(moa_target, features_df, threshold)

    if len(input_cols) == 0:
        # Nothing to fit on: store an all-zero prediction in the test frame only.
        X_test_all[moa_target] = arr.astype(int)
        continue

    try:
        c = get_param_value(moa_target)

        # Training uses the true labels in moa_train_df_all.
        moa_lr = LogisticRegression(penalty='l1', C=c, solver='liblinear', max_iter=100)
        moa_lr.fit(moa_train_df_all[input_cols], moa_train_df_all[moa_target])

        test_inputs = X_test_all[input_cols]
        y_pred_lr = moa_lr.predict(test_inputs)
        pp_lr = moa_lr.predict_proba(test_inputs)

        # The hard prediction is also written into the test frame, so it is available
        # as an input column for targets fitted later in the chain.
        predicted_labels = y_pred_lr.astype(int)
        X_test_all[moa_target] = predicted_labels

        df_pred[moa_target] = predicted_labels
        df_prob[moa_target] = pp_lr[:, 1]

        print('{}: {}: {}'.format(model_code, i, moa_target))

    except Exception as e:
        # Best effort: report the failure and carry on; the target is then missing from df_prob.
        print('{}: {}: {}: {}'.format(model_code, i, moa_target, e))

df_pred = df_pred.drop(labels=['sig_id'], axis=1)

df_prob = df_prob.drop(labels=['sig_id'], axis=1)
df_prob.to_csv('/kaggle/working/predicted_proba.csv')

#### Predicted on the engineered data

In [None]:
# Number of positive predictions per test sample. df_pred holds only target columns
# here ('sig_id' was dropped when the predictions were collected), so all of them
# are summed; slicing with columns[1:] silently left out the first target.
data_values = df_pred.sum(axis=1).values

# Number of samples for each "count of predicted MoAs", most frequent first.
s = pd.Series(data=data_values)
s = s.groupby(s.values).count().sort_values(ascending=False)

# Samples with at least one predicted MoA.
print(s[s.index > 0].sum())

# sns.set(font_scale=2)
plt.figure(figsize=(10,6))
ax = sns.barplot(x=np.arange(len(s)), y=s.values)

ax.set_xticklabels(s.index, rotation='horizontal', fontsize=12)

plt.title('The number of samples for each MoA target group', fontsize=20)
plt.ylabel('The number of samples', fontsize=14)
plt.xlabel('MoA target group', fontsize=14)

# Adding the text labels above each bar
rects = ax.patches
for rect, data_value in zip(rects, s.values):
    h = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, h + 5, data_value, ha='center', va='bottom', fontsize=12)

plt.show()
plt.close()

### Submit results

In [None]:
# Re-read the probabilities written by the fitting cell; the former index comes back
# as an ordinary first column ('sig_id') because no index_col is given.
df_prob = pd.read_csv('/kaggle/working/predicted_proba.csv')
df_prob.shape

In [None]:
# Submission layout: 'sig_id' followed by every scored target, in the column order of Y_train.
# Targets missing from df_prob (no model was fitted for them) are created by reindex and
# filled with 0.0. Building the target block in one step replaces inserting the columns
# one at a time with a list-membership test per column.
target_probs = df_prob.reindex(columns=list(Y_train.columns), fill_value=0.0)

# df_prob[['sig_id']] raises KeyError if the id column is missing, as before.
df_submit = pd.concat([df_prob[['sig_id']], target_probs], axis=1)

In [None]:
# Expected shape: one row per row of df_prob, and one 'sig_id' column plus one column per Y_train target.
print(df_submit.shape)
df_submit.head(3)

In [None]:
# Written relative to the current working directory, without the row index.
# NOTE(review): the read-back cell uses '/kaggle/working/submission.csv', so this
# assumes the notebook runs with /kaggle/working as its working directory.
df_submit.to_csv('submission.csv', index=False)

In [None]:
# Read the written file back (presumably as a check that it exists and parses).
# NOTE(review): absolute path here, whereas the file was written with a relative one.
df_submit = pd.read_csv('/kaggle/working/submission.csv')