# Reading Data

Package imports, helper functions, and data reading.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datetime as dt
from typing import Tuple

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline


# Environment switch: True selects the Kaggle competition input paths,
# False selects the local `data/` folder (see get_data_file_path).
in_kaggle = True

# base report output path (not referenced elsewhere in the visible code)
reports_folder = 'reports/'

def get_data_file_path(is_in_kaggle: bool) -> Tuple[str, str, str, str, str]:
    """Return the locations of the five competition CSV files.

    Args:
        is_in_kaggle: When truthy, paths point into the Kaggle competition
            input folder; otherwise into the local ``data/`` folder.

    Returns:
        A 5-tuple of path strings, in this order: training features, scored
        training targets, non-scored training targets, test features,
        sample submission.
    """
    # Only the directory differs between the two environments; the file
    # names are the same in both.
    base_dir = '../input/lish-moa/' if is_in_kaggle else 'data/'

    return (
        base_dir + 'train_features.csv',
        base_dir + 'train_targets_scored.csv',
        base_dir + 'train_targets_nonscored.csv',
        base_dir + 'test_features.csv',
        base_dir + 'sample_submission.csv',
    )

In [None]:
# Record and display the wall-clock time at which this run started.
# Note: start_time is rebound to a time.time() float in the model-setup cell.
start_time = dt.datetime.now()
print("Started at ", start_time)

In [None]:
%%time
# get the training set and labels
# Resolve the five CSV paths for the current environment (Kaggle vs. local).
train_set_path, train_set_targets_path, train_set_targets_nonscored_path, test_set_path, sample_subm_path = get_data_file_path(in_kaggle)

# Feature tables for the training and test sets.
train_features = pd.read_csv(train_set_path)
test_features = pd.read_csv(test_set_path)

# c: non-scored training targets; d: scored training targets.
# `d` supplies the label names and y_train further down; `c` is not
# referenced again in the visible code.
c = pd.read_csv(train_set_targets_nonscored_path)
d = pd.read_csv(train_set_targets_path)

# Sample submission file; `subm` is rebuilt from the model predictions later.
subm = pd.read_csv(sample_subm_path)

Let's do the initial processing steps as follows:
- Reading the data into memory
- Slicing the data into separate dataframes for treated vs. control observations
- Separating the training-set features into c- and g-groups (cell viability and gene expression features, respectively)

In [None]:
%%time
# additional feature engineering routines

# Feature groups are selected by column position in train_features.
# NOTE(review): the slice bounds assume 4 leading id/treatment columns followed
# by 772 g- columns and then 100 c- columns — confirm against the CSV header.
features_g = list(train_features.columns[4:776])
features_c = list(train_features.columns[776:876])

# Both positional ranges combined (g- and c- columns).
numeric_cols = list(train_features.columns[4:876])
# Treatment descriptor columns; used as group-by keys in add_ratio_functions.
categoric_cols = ["cp_type","cp_time","cp_dose"]

# Map cp_time to string labels so that AutoViML does not treat it as continuous
def map_cp_time(train, test):
    """Recode the numeric ``cp_time`` column as string labels, in place.

    The values 24/48/72 become 't_24'/'t_48'/'t_72'. Any value that is not a
    key of the mapping becomes NaN (``Series.map`` semantics), so calling
    this a second time on the same frames blanks the column.

    Args:
        train: DataFrame with a ``cp_time`` column; modified in place.
        test: DataFrame with a ``cp_time`` column; modified in place.

    Returns:
        The same ``(train, test)`` objects, matching the return convention of
        the other feature-engineering helpers in this file.
    """
    cp_time = {24: 't_24', 48: 't_48', 72: 't_72'}
    for df in (train, test):
        df['cp_time'] = df['cp_time'].map(cp_time)
    return train, test

# Row-wise summary statistics over the g-, c- and combined feature groups
def add_stat_features(train, test):
    """Add row-wise sum/mean/std/kurtosis/skew columns to both frames, in place.

    For each of the groups ``features_g`` (prefix ``g``), ``features_c``
    (prefix ``c``) and their concatenation (prefix ``gc``), five columns named
    ``{prefix}_sum``, ``{prefix}_mean``, ``{prefix}_std``, ``{prefix}_kurt``
    and ``{prefix}_skew`` are written to ``train`` and to ``test``.

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    column_groups = (
        ('g', features_g),
        ('c', features_c),
        ('gc', features_g + features_c),
    )
    # (column-name suffix, DataFrame method) pairs, in the order the columns are added
    stat_methods = (
        ('sum', 'sum'),
        ('mean', 'mean'),
        ('std', 'std'),
        ('kurt', 'kurtosis'),
        ('skew', 'skew'),
    )

    for frame in (train, test):
        for prefix, columns in column_groups:
            for suffix, method_name in stat_methods:
                row_stat = getattr(frame[columns], method_name)
                frame[f'{prefix}_{suffix}'] = row_stat(axis=1)

    return train, test

# Squared copies of the c-features
def add_c_squared(train, test):
    """Add a ``{name}_squared`` column for every name in ``features_c``.

    Both frames are modified in place, and the same ``(train, test)`` objects
    are returned.
    """
    for frame in (train, test):
        for c_name in features_c:
            squared_name = f'{c_name}_squared'
            frame[squared_name] = frame[c_name] ** 2

    return train, test

# Squared copies of the g-features
def add_g_squared(train, test):
    """Add a ``{name}_squared`` column for every name in ``features_g``.

    Both frames are modified in place, and the same ``(train, test)`` objects
    are returned.
    """
    for frame in (train, test):
        for g_name in features_g:
            squared_name = f'{g_name}_squared'
            frame[squared_name] = frame[g_name] ** 2

    return train, test

# function to add by-cat ratio features
def add_ratio_functions(train, test):
    """Add ratios of each g-feature to its per-category group statistics.

    For every column ``col`` in ``features_g`` and every grouping column
    ``feat`` in ``categoric_cols``, five columns named
    ``{col}_{stat}_group_{feat}`` are added, with ``stat`` in mean, max, min,
    skew, std. Each holds the row value divided by that statistic of ``col``
    within the row's ``feat`` group. Groups are formed separately inside
    ``train`` and inside ``test``.

    Both frames are modified in place and returned as ``(train, test)``.

    NOTE: the std ratio used to be written to the ``_skew_group_`` column,
    overwriting the skew ratio, so no ``_std_group_`` column existed. Feature
    lists built from earlier runs that name ``*_skew_group_*`` columns were
    really selecting std ratios, and may need renaming to ``*_std_group_*``
    to keep the same inputs.
    """
    group_stats = ('mean', 'max', 'min', 'skew', 'std')
    for df in (train, test):
        for col in features_g:
            for feat in categoric_cols:
                # Build the grouped view once and reuse it for all statistics.
                grouped = df.groupby(feat)[col]
                for stat in group_stats:
                    df[f'{col}_{stat}_group_{feat}'] = df[col] / grouped.transform(stat)
    return train, test

# Pairwise arithmetic interactions between g-features
def add_genetic_features(train, test):
    """Add six interaction columns for every ordered pair of distinct g-features.

    For each pair ``(col1, col2)`` from ``features_g`` with ``col1 != col2``,
    the columns ``{col1}_plus_{col2}``, ``_minus_``, ``_prod_``, ``_div_``,
    ``_qq_`` (sum times difference) and ``_div2_`` (difference over sum) are
    written to both frames in place.

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    epsilon = 0.00001  # small offset added to each denominator
    for frame in (train, test):
        for left in features_g:
            for right in features_g:
                if left == right:
                    continue
                a = frame[left]
                b = frame[right]
                total = a + b
                delta = a - b
                frame[f'{left}_plus_{right}'] = total
                frame[f'{left}_minus_{right}'] = delta
                frame[f'{left}_prod_{right}'] = a * b
                frame[f'{left}_div_{right}'] = a / (b + epsilon)
                frame[f'{left}_qq_{right}'] = total * delta
                frame[f'{left}_div2_{right}'] = delta / (total + epsilon)

    return train, test

# Cell Viability Features: Treated vs. Control Comparison (Training Set)

We are going to run a Sweetviz-based comparison of c-feature statistics and interactions between the Treated and Control sample subsets of the scored training set provided as input data.

In [None]:
%%time
# run additional feature engineering
# Row-wise statistics, squared g-features and per-category ratio features are
# added to both frames; add_c_squared is not applied here.
train_features, test_features = add_stat_features(train_features, test_features)
train_features, test_features = add_g_squared(train_features, test_features)
train_features, test_features = add_ratio_functions(train_features, test_features)
# The pairwise g-feature interactions are left disabled:
# train_features, test_features = add_genetic_features(train_features, test_features)

In [None]:
# Fixed subset of feature columns used to build X_train / X_test below.
# It mixes engineered columns (`*_squared`, `*_{mean,max,min,skew}_group_*`,
# `g_kurt`, `g_skew`, `g_mean`), two raw c-features (`c-37`, `c-22`) and the
# raw categorical column `cp_dose`.
# NOTE(review): how this list was selected is not shown in this file.
imp_features = ['g-721_mean_group_cp_type',
 'g-220_mean_group_cp_type',
 'g-72_mean_group_cp_type',
 'g-729_mean_group_cp_type',
 'g-195_mean_group_cp_type',
 'g-719_mean_group_cp_type',
 'g-353_mean_group_cp_type',
 'g-140_mean_group_cp_type',
 'g-198_mean_group_cp_type',
 'g-270_mean_group_cp_type',
 'g-196_mean_group_cp_type',
 'g-531_mean_group_cp_type',
 'g-194_mean_group_cp_type',
 'g-550_mean_group_cp_type',
 'g-55_mean_group_cp_type',
 'g-266_mean_group_cp_type',
 'g-726_mean_group_cp_type',
 'g-322_squared',
 'g-10_mean_group_cp_type',
 'g-138_min_group_cp_type',
 'g-298_mean_group_cp_type',
 'g-367_max_group_cp_type',
 'g-406_mean_group_cp_type',
 'g-203_mean_group_cp_type',
 'g-147_mean_group_cp_type',
 'g-579_mean_group_cp_type',
 'g-607_mean_group_cp_type',
 'g-100_skew_group_cp_type',
 'g-508_mean_group_cp_type',
 'g-60_mean_group_cp_type',
 'g-729_squared',
 'g-554_squared',
 'g-719_squared',
 'g-470_mean_group_cp_type',
 'g-432_mean_group_cp_type',
 'g-564_squared',
 'g-301_min_group_cp_type',
 'g-569_mean_group_cp_type',
 'g-719_min_group_cp_type',
 'g-633_mean_group_cp_type',
 'g-253_mean_group_cp_type',
 'g-107_mean_group_cp_type',
 'g-96_mean_group_cp_type',
 'g-224_mean_group_cp_type',
 'g-627_mean_group_cp_dose',
 'g-439_mean_group_cp_type',
 'g-217_mean_group_cp_type',
 'g-665_mean_group_cp_type',
 'g-64_mean_group_cp_type',
 'g-168_mean_group_cp_type',
 'g-230_mean_group_cp_type',
 'g-396_mean_group_cp_type',
 'g-656_mean_group_cp_type',
 'g-744_mean_group_cp_type',
 'g-515_mean_group_cp_type',
 'g-563_mean_group_cp_type',
 'g-529_min_group_cp_type',
 'g-374_mean_group_cp_type',
 'g-295_mean_group_cp_type',
 'g-189_mean_group_cp_type',
 'g-473_min_group_cp_type',
 'g-741_mean_group_cp_type',
 'g-487_mean_group_cp_type',
 'g-475_max_group_cp_type',
 'g-70_mean_group_cp_type',
 'g-367_squared',
 'g-50_skew_group_cp_type',
 'g-534_mean_group_cp_type',
 'g-495_mean_group_cp_type',
 'g-525_mean_group_cp_type',
 'g-729_max_group_cp_type',
 'g-764_min_group_cp_type',
 'g-283_max_group_cp_type',
 'g-762_mean_group_cp_type',
 'g-66_mean_group_cp_type',
 'g-39_max_group_cp_type',
 'g-379_squared',
 'g-38_min_group_cp_type',
 'g-57_mean_group_cp_type',
 'g-609_mean_group_cp_type',
 'g-721_max_group_cp_type',
 'g-501_mean_group_cp_time',
 'g-550_max_group_cp_type',
 'g-491_min_group_cp_type',
 'g-222_mean_group_cp_type',
 'g-411_max_group_cp_type',
 'g-410_min_group_cp_type',
 'g-509_min_group_cp_type',
 'g-680_min_group_cp_type',
 'g-663_min_group_cp_type',
 'g-683_max_group_cp_type',
 'g-424_max_group_cp_type',
 'g-689_max_group_cp_type',
 'g-75_skew_group_cp_type',
 'g-152_max_group_cp_type',
 'g-663_squared',
 'g-300_max_group_cp_type',
 'g-145_max_group_cp_type',
 'g-689_squared',
 'g-95_max_group_cp_type',
 'g-201_min_group_cp_type',
 'g-181_min_group_cp_type',
 'g-196_max_group_cp_type',
 'g-392_squared',
 'g-539_min_group_cp_type',
 'g-54_min_group_cp_type',
 'g-249_min_group_cp_type',
 'g-158_skew_group_cp_type',
 'g-229_max_group_cp_type',
 'g-635_min_group_cp_type',
 'g-488_max_group_cp_type',
 'g-568_max_group_cp_type',
 'g-249_mean_group_cp_type',
 'g-113_mean_group_cp_type',
 'g-488_mean_group_cp_type',
 'g-594_max_group_cp_type',
 'g-151_mean_group_cp_type',
 'g-317_max_group_cp_type',
 'g-31_mean_group_cp_type',
 'g-333_max_group_cp_type',
 'g-511_mean_group_cp_type',
 'g-309_min_group_cp_type',
 'g-196_squared',
 'g-742_mean_group_cp_type',
 'g-742_min_group_cp_type',
 'g-754_mean_group_cp_type',
 'g-223_min_group_cp_type',
 'g-423_max_group_cp_type',
 'g-434_max_group_cp_type',
 'g-167_mean_group_cp_type',
 'g-545_min_group_cp_type',
 'g-405_mean_group_cp_type',
 'g-134_min_group_cp_type',
 'g-287_mean_group_cp_type',
 'g-243_max_group_cp_type',
 'g-173_min_group_cp_type',
 'g-437_min_group_cp_type',
 'g-559_min_group_cp_type',
 'g-209_min_group_cp_type',
 'g-578_max_group_cp_type',
 'g-142_mean_group_cp_type',
 'g-98_max_group_cp_type',
 'g-199_mean_group_cp_type',
 'g-417_mean_group_cp_type',
 'g-494_squared',
 'g-685_mean_group_cp_type',
 'g-533_mean_group_cp_type',
 'g-275_min_group_cp_type',
 'g-0_mean_group_cp_type',
 'g-742_squared',
 'g-561_min_group_cp_type',
 'g-0_max_group_cp_type',
 'g-671_squared',
 'g-320_max_group_cp_type',
 'g-272_min_group_cp_type',
 'g-494_max_group_cp_type',
 'g-229_squared',
 'g-151_max_group_cp_type',
 'g-671_max_group_cp_type',
 'g-251_mean_group_cp_type',
 'g-505_min_group_cp_type',
 'g-612_mean_group_cp_type',
 'g-150_mean_group_cp_type',
 'g-234_mean_group_cp_type',
 'g-482_min_group_cp_type',
 'g-259_mean_group_cp_type',
 'g-110_min_group_cp_type',
 'g-542_mean_group_cp_type',
 'g-289_max_group_cp_type',
 'g-480_mean_group_cp_type',
 'g-231_mean_group_cp_type',
 'g-736_mean_group_cp_type',
 'g-613_mean_group_cp_type',
 'g-306_skew_group_cp_type',
 'g-274_mean_group_cp_type',
 'g-211_mean_group_cp_type',
 'g-570_mean_group_cp_type',
 'g-9_mean_group_cp_type',
 'g-373_max_group_cp_type',
 'g-546_min_group_cp_type',
 'g-639_mean_group_cp_type',
 'g-126_mean_group_cp_type',
 'g-661_max_group_cp_type',
 'g-438_max_group_cp_type',
 'g-139_squared',
 'g-36_mean_group_cp_type',
 'g-431_mean_group_cp_type',
 'g-714_mean_group_cp_type',
 'g-179_mean_group_cp_type',
 'g-343_mean_group_cp_type',
 'g-582_mean_group_cp_type',
 'g-187_min_group_cp_type',
 'g-358_mean_group_cp_type',
 'g-92_max_group_cp_type',
 'g-245_max_group_cp_type',
 'g-240_min_group_cp_type',
 'g-576_max_group_cp_type',
 'g-679_mean_group_cp_type',
 'g-234_max_group_cp_type',
 'g-112_min_group_cp_type',
 'g-762_max_group_cp_type',
 'g-697_min_group_cp_type',
 'g-469_mean_group_cp_type',
 'g-591_mean_group_cp_type',
 'g-492_mean_group_cp_type',
 'g-324_mean_group_cp_type',
 'g-247_mean_group_cp_type',
 'g-523_mean_group_cp_type',
 'g-608_mean_group_cp_type',
 'g-252_max_group_cp_type',
 'g-84_max_group_cp_type',
 'g-139_mean_group_cp_type',
 'g-638_mean_group_cp_type',
 'g-218_min_group_cp_type',
 'g-554_max_group_cp_type',
 'g-480_mean_group_cp_dose',
 'g-523_mean_group_cp_dose',
 'g-176_mean_group_cp_type',
 'g-466_mean_group_cp_type',
 'g-79_min_group_cp_type',
 'g-652_min_group_cp_type',
 'g-220_squared',
 'g-357_mean_group_cp_type',
 'g-377_mean_group_cp_type',
 'g-678_max_group_cp_type',
 'g-290_mean_group_cp_type',
 'g-387_mean_group_cp_type',
 'g-133_mean_group_cp_type',
 'g-717_mean_group_cp_type',
 'g-655_max_group_cp_type',
 'g-242_max_group_cp_type',
 'g-25_mean_group_cp_type',
 'g-619_mean_group_cp_type',
 'g-183_min_group_cp_type',
 'g-677_min_group_cp_type',
 'g-513_mean_group_cp_type',
 'g-418_max_group_cp_type',
 'g-194_min_group_cp_type',
 'g-690_mean_group_cp_type',
 'g-705_mean_group_cp_type',
 'g-497_max_group_cp_type',
 'g-1_mean_group_cp_type',
 'g-677_squared',
 'g-192_mean_group_cp_type',
 'g-631_mean_group_cp_type',
 'g-120_mean_group_cp_type',
 'g-527_max_group_cp_type',
 'g-250_max_group_cp_type',
 'g-643_mean_group_cp_type',
 'g-27_max_group_cp_type',
 'g-465_mean_group_cp_type',
 'g-739_max_group_cp_type',
 'g-7_min_group_cp_type',
 'g-61_min_group_cp_type',
 'g-616_max_group_cp_type',
 'g-538_mean_group_cp_type',
 'g-402_mean_group_cp_type',
 'g-54_squared',
 'g-436_mean_group_cp_type',
 'g-502_max_group_cp_type',
 'g-636_mean_group_cp_type',
 'g-249_squared',
 'g-266_max_group_cp_type',
 'g-510_mean_group_cp_type',
 'g-16_mean_group_cp_type',
 'g-251_mean_group_cp_time',
 'g-710_min_group_cp_type',
 'g-62_mean_group_cp_type',
 'g-273_min_group_cp_type',
 'g-103_max_group_cp_type',
 'g-379_min_group_cp_type',
 'g-458_max_group_cp_type',
 'g-484_mean_group_cp_type',
 'g-345_mean_group_cp_type',
 'g-396_min_group_cp_type',
 'g-653_min_group_cp_type',
 'g-225_mean_group_cp_type',
 'g-244_mean_group_cp_type',
 'g-19_mean_group_cp_type',
 'g-24_mean_group_cp_type',
 'g-769_max_group_cp_type',
 'g-148_squared',
 'g-598_min_group_cp_type',
 'g-139_min_group_cp_type',
 'g-557_mean_group_cp_type',
 'g-206_min_group_cp_type',
 'g-96_skew_group_cp_type',
 'g-600_mean_group_cp_type',
 'g-214_max_group_cp_type',
 'g-47_max_group_cp_type',
 'g-315_max_group_cp_type',
 'g-517_min_group_cp_type',
 'g-359_max_group_cp_type',
 'g-724_max_group_cp_type',
 'g-412_max_group_cp_type',
 'g-328_mean_group_cp_time',
 'g-404_min_group_cp_type',
 'g-429_max_group_cp_type',
 'g-48_squared',
 'g-740_mean_group_cp_type',
 'g-490_max_group_cp_type',
 'g-237_min_group_cp_type',
 'g-335_min_group_cp_type',
 'g-605_mean_group_cp_type',
 'g-723_skew_group_cp_type',
 'g-210_max_group_cp_type',
 'g-200_mean_group_cp_type',
 'g-165_mean_group_cp_type',
 'g-543_min_group_cp_type',
 'g-276_min_group_cp_type',
 'g-86_min_group_cp_type',
 'g-330_min_group_cp_type',
 'g-551_max_group_cp_type',
 'g-316_min_group_cp_type',
 'g-53_mean_group_cp_dose',
 'g-601_mean_group_cp_type',
 'g-18_min_group_cp_type',
 'g-353_min_group_cp_type',
 'g-650_mean_group_cp_type',
 'g-220_max_group_cp_type',
 'g-116_min_group_cp_type',
 'g-21_max_group_cp_type',
 'g-29_min_group_cp_type',
 'g-419_mean_group_cp_type',
 'g-350_max_group_cp_type',
 'g-694_max_group_cp_type',
 'g-488_squared',
 'g-521_mean_group_cp_type',
 'g-522_max_group_cp_type',
 'g-105_min_group_cp_type',
 'g-688_mean_group_cp_type',
 'g-401_min_group_cp_type',
 'g-8_min_group_cp_type',
 'g-414_min_group_cp_type',
 'g-478_mean_group_cp_type',
 'g-291_max_group_cp_type',
 'g-297_min_group_cp_type',
 'g-378_max_group_cp_type',
 'g-174_squared',
 'g-198_squared',
 'g-210_squared',
 'g-574_min_group_cp_type',
 'g-17_squared',
 'g-20_max_group_cp_type',
 'g-101_mean_group_cp_dose',
 'g-632_min_group_cp_type',
 'g-585_mean_group_cp_type',
 'g-17_max_group_cp_type',
 'g-544_max_group_cp_type',
 'g-409_max_group_cp_type',
 'g-593_min_group_cp_type',
 'g-621_max_group_cp_type',
 'g-121_mean_group_cp_type',
 'g-702_min_group_cp_type',
 'g-198_min_group_cp_type',
 'g-653_squared',
 'g-174_max_group_cp_type',
 'g-637_min_group_cp_type',
 'g-468_skew_group_cp_type',
 'g-755_mean_group_cp_type',
 'g-111_max_group_cp_type',
 'g-732_squared',
 'g-178_skew_group_cp_type',
 'g-155_max_group_cp_type',
 'g-162_max_group_cp_type',
 'g-298_skew_group_cp_type',
 'g-413_min_group_cp_type',
 'g-503_mean_group_cp_type',
 'g-146_squared',
 'g-597_max_group_cp_type',
 'g-653_mean_group_cp_type',
 'g-543_mean_group_cp_type',
 'g-474_min_group_cp_type',
 'g-483_min_group_cp_type',
 'g-590_min_group_cp_type',
 'g-214_mean_group_cp_time',
 'g-453_max_group_cp_type',
 'g-612_max_group_cp_type',
 'g-585_max_group_cp_type',
 'g-143_min_group_cp_type',
 'g-444_min_group_cp_type',
 'g-491_mean_group_cp_type',
 'g-304_skew_group_cp_type',
 'g-487_mean_group_cp_time',
 'g-495_min_group_cp_type',
 'g-172_min_group_cp_type',
 'g-659_max_group_cp_type',
 'g-432_min_group_cp_type',
 'g-441_min_group_cp_type',
 'g-455_min_group_cp_type',
 'g-398_mean_group_cp_dose',
 'g-301_mean_group_cp_time',
 'g-53_max_group_cp_type',
 'g-439_squared',
 'c-37',
 'g-732_mean_group_cp_type',
 'g-547_mean_group_cp_type',
 'g-530_mean_group_cp_type',
 'g-170_min_group_cp_type',
 'g-301_mean_group_cp_type',
 'g-530_mean_group_cp_dose',
 'g-664_squared',
 'g-447_max_group_cp_type',
 'g-60_mean_group_cp_time',
 'g-302_max_group_cp_type',
 'g-726_min_group_cp_type',
 'g-205_max_group_cp_type',
 'g-753_min_group_cp_type',
 'g-186_squared',
 'g-567_max_group_cp_type',
 'g-101_min_group_cp_type',
 'g-522_squared',
 'g-146_mean_group_cp_type',
 'g-447_mean_group_cp_type',
 'g-620_squared',
 'g-177_min_group_cp_type',
 'g-508_squared',
 'g-526_mean_group_cp_type',
 'g_kurt',
 'g-215_skew_group_cp_type',
 'g-372_mean_group_cp_dose',
 'g-755_min_group_cp_type',
 'g-764_squared',
 'g-695_max_group_cp_type',
 'g-281_min_group_cp_type',
 'g-443_max_group_cp_type',
 'g-11_min_group_cp_type',
 'g-642_min_group_cp_type',
 'g-286_min_group_cp_type',
 'g-149_max_group_cp_type',
 'g-752_max_group_cp_type',
 'g-326_min_group_cp_type',
 'g-569_max_group_cp_time',
 'g-555_max_group_cp_type',
 'g-342_mean_group_cp_time',
 'g-384_min_group_cp_type',
 'g-45_min_group_cp_type',
 'g-56_mean_group_cp_type',
 'g-148_mean_group_cp_type',
 'g-607_min_group_cp_type',
 'g-255_mean_group_cp_type',
 'g-235_max_group_cp_type',
 'g-311_mean_group_cp_type',
 'g-696_max_group_cp_type',
 'g-148_mean_group_cp_time',
 'g-736_squared',
 'g-284_max_group_cp_type',
 'g-758_min_group_cp_type',
 'g-644_mean_group_cp_time',
 'g-725_max_group_cp_type',
 'g-624_mean_group_cp_type',
 'g-335_squared',
 'g-378_mean_group_cp_type',
 'g-28_squared',
 'g-703_min_group_cp_type',
 'g-346_max_group_cp_type',
 'g-416_mean_group_cp_type',
 'g-230_squared',
 'g-666_max_group_cp_type',
 'g-133_squared',
 'g-342_mean_group_cp_dose',
 'g-745_mean_group_cp_type',
 'g-347_min_group_cp_type',
 'g-615_min_group_cp_type',
 'g-451_min_group_cp_type',
 'g-675_min_group_cp_type',
 'g-512_skew_group_cp_type',
 'g-71_min_group_cp_type',
 'g-13_min_group_cp_type',
 'g-656_mean_group_cp_dose',
 'g-449_min_group_cp_type',
 'cp_dose',
 'g-111_squared',
 'g-140_max_group_cp_type',
 'g-295_max_group_cp_type',
 'g-106_min_group_cp_type',
 'g-596_mean_group_cp_type',
 'g-376_min_group_cp_type',
 'g-727_min_group_cp_type',
 'g-640_max_group_cp_type',
 'g-428_mean_group_cp_type',
 'g-592_min_group_cp_type',
 'g-422_max_group_cp_type',
 'g-603_max_group_cp_type',
 'g-389_min_group_cp_type',
 'g-313_min_group_cp_type',
 'g-154_mean_group_cp_type',
 'g-407_min_group_cp_type',
 'g-556_min_group_cp_type',
 'g-588_max_group_cp_type',
 'g-159_mean_group_cp_type',
 'g-745_squared',
 'g-197_max_group_cp_type',
 'g-4_mean_group_cp_time',
 'g-751_mean_group_cp_time',
 'g-92_mean_group_cp_dose',
 'g-256_mean_group_cp_time',
 'g-537_mean_group_cp_dose',
 'g-129_mean_group_cp_time',
 'g-464_squared',
 'g-136_mean_group_cp_dose',
 'g-357_squared',
 'g-232_min_group_cp_type',
 'g-748_max_group_cp_type',
 'g-382_min_group_cp_type',
 'g-572_min_group_cp_type',
 'g-337_max_group_cp_type',
 'g-344_min_group_cp_type',
 'g-4_mean_group_cp_dose',
 'g-462_max_group_cp_type',
 'g-239_max_group_cp_type',
 'g-602_min_group_cp_type',
 'g-129_mean_group_cp_type',
 'g-293_min_group_cp_type',
 'g-2_min_group_cp_type',
 'g-408_mean_group_cp_dose',
 'g_skew',
 'g-19_mean_group_cp_dose',
 'g-128_skew_group_cp_time',
 'g-382_mean_group_cp_time',
 'g-87_min_group_cp_type',
 'g-35_min_group_cp_type',
 'g-341_mean_group_cp_dose',
 'g-532_mean_group_cp_dose',
 'g-402_squared',
 'g-421_mean_group_cp_type',
 'g-558_mean_group_cp_type',
 'g-140_squared',
 'g-285_min_group_cp_type',
 'g-245_squared',
 'g-321_max_group_cp_type',
 'g-308_squared',
 'c-22',
 'g-388_min_group_cp_type',
 'g-692_max_group_cp_type',
 'g-737_min_group_cp_type',
 'g-673_mean_group_cp_time',
 'g-380_min_group_cp_type',
 'g-327_squared',
 'g-746_max_group_cp_type',
 'g-183_mean_group_cp_time',
 'g-311_squared',
 'g-529_squared',
 'g-226_min_group_cp_type',
 'g-479_max_group_cp_type',
 'g-354_min_group_cp_type',
 'g-85_max_group_cp_time',
 'g-125_max_group_cp_dose',
 'g-504_squared',
 'g-399_max_group_cp_type',
 'g-30_squared',
 'g-553_squared',
 'g-43_mean_group_cp_dose',
 'g-14_min_group_cp_type',
 'g-607_squared',
 'g-264_squared',
 'g-520_min_group_cp_type',
 'g-770_squared',
 'g-426_mean_group_cp_time',
 'g-484_squared',
 'g-337_squared',
 'g-329_max_group_cp_type',
 'g-645_squared',
 'g-80_squared',
 'g-68_squared',
 'g-766_squared',
 'g-426_squared',
 'g-99_min_group_cp_type',
 'g-188_mean_group_cp_time',
 'g-665_min_group_cp_type',
 'g-599_mean_group_cp_dose',
 'g-122_mean_group_cp_time',
 'g-6_mean_group_cp_dose',
 'g_mean',
 'g-704_min_group_cp_type',
 'g-580_mean_group_cp_type',
 'g-36_squared',
 'g-362_min_group_cp_type']
# Number of selected features (shown as the cell output).
len(imp_features)

# Use the above features in a multi-label classifier (OneVsRestClassifier wrapping XGBoost)


In [None]:
# Restrict both frames to the selected features. The explicit .copy() makes
# X_train / X_test independent frames, so the in-place column encoding in the
# following cells does not trigger pandas' SettingWithCopyWarning.
X_train = train_features[imp_features].copy()
X_test = test_features[imp_features].copy()
print(X_train.shape, X_test.shape)

In [None]:
# Names of the numeric columns in X_train; the cell displays how many there are.
nums = X_train.select_dtypes(include='number').columns.tolist()
len(nums)

In [None]:
# Names of the object-dtype columns in X_train, shown as the cell output.
cats = X_train.select_dtypes(include='object').columns.tolist()
cats

In [None]:
# Encode the first object-dtype column as integers: 'D1' -> 0, 'D2' -> 1.
# Values other than 'D1'/'D2' become NaN (Series.map semantics).
# NOTE(review): presumably cats[0] is 'cp_dose', the only raw categorical
# name in imp_features — confirm that `cats` has exactly that one entry.
X_train[cats[0]] = X_train[cats[0]].map({'D1':0,'D2':1}).values
X_train.shape

In [None]:
# Apply the same 'D1' -> 0, 'D2' -> 1 encoding to the test frame.
# `cats` was derived from X_train; X_test has the same columns because both
# were selected with imp_features.
X_test[cats[0]] = X_test[cats[0]].map({'D1':0,'D2':1}).values
X_test.shape

In [None]:
# Target names: every column of the scored-targets frame except the first.
# NOTE(review): the first column is presumably the sig_id key — confirm.
labels = d.columns.to_list()[1:]
len(labels)

In [None]:
# Multi-label target matrix: the label columns of the scored-targets frame.
# NOTE(review): rows are assumed to be in the same order as train_features —
# no join on an id column is performed here; confirm.
y_train = d[labels][:]
y_train.shape

In [None]:
######## Import some multi-output models #
# NOTE(review): of the names imported in this cell, only XGBClassifier,
# OneVsRestClassifier and time are used in the visible code that follows.
from sklearn.svm import LinearSVR, LinearSVC
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
from sklearn.multioutput import RegressorChain, ClassifierChain
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import scipy as sp
from scipy.stats import uniform
max_estims = 500  # not referenced in the visible code
import time
# Rebinds start_time (previously a datetime) to an epoch timestamp; the
# training cell reports elapsed seconds relative to this point.
start_time = time.time()
from sklearn.neighbors import KNeighborsRegressor
max_neighbors = 100  # not referenced in the visible code
# Base estimator: XGBoost classifier with 200 trees and a fixed seed.
modelchain4grid = XGBClassifier(n_estimators=200, random_state=1)
# One-vs-rest wrapper around the base estimator; it is fitted on the
# multi-column y_train in the next cell.
wrapper4grid = OneVsRestClassifier(estimator=modelchain4grid)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# Despite the variable names, no grid or randomized search is run here: the
# wrapper is fitted once with its fixed hyper-parameters. The value returned
# by fit() is kept as `grid_result` and used for prediction below.
grid_result = wrapper4grid.fit(X_train, y_train)
# Elapsed time is measured from the start_time set in the model-setup cell.
print('Time taken in secs = %0.0f' %(time.time()-start_time))

In [None]:
# Predict per-label probabilities rather than hard 0/1 labels: for multilabel
# targets OneVsRestClassifier.predict_proba returns one column of marginal
# probabilities per label, with the same (n_samples, n_labels) shape that
# predict() produced.
# NOTE(review): this assumes the submission is scored on predicted
# probabilities (log loss) — confirm against the competition rules.
predictions = grid_result.predict_proba(X_test)
predictions.shape

In [None]:
# Assemble the submission frame: sig_id first, then one column per label.
subm = pd.DataFrame(predictions, index=test_features.index, columns=labels)
# Insert the id column directly at position 0 so the column order is
# ['sig_id'] + labels.
subm.insert(0, 'sig_id', test_features['sig_id'].values)
print(subm.shape)
subm.head()

In [None]:
# Write the submission to the working directory; the DataFrame index is
# omitted because sig_id is already a regular column.
subm.to_csv('submission.csv', index=False)