**This notebook applies a t-test to select genes and cells for each target class. After feature selection, logistic regression is used as the prediction model.**

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

# Data preprocessing

In [None]:
def preprocess(df):
    """Return a copy of ``df`` with 0/1 indicator columns appended.

    Reads the 'cp_type', 'cp_dose' and 'cp_time' columns and adds, in this
    order: 'cp_type_trt', 'cp_type_ctl', 'cp_dose_D1', 'cp_dose_D2',
    'cp_time_24', 'cp_time_48', 'cp_time_72'. The input frame is not modified.

    Note that 'cp_type_ctl' and 'cp_dose_D2' are defined as the complement of
    'cp_type_trt' and 'cp_dose_D1' respectively, so any value other than
    'trt_cp' / 'D1' is flagged with 1 in those columns.
    """
    out = df.copy()

    # Each comparison is evaluated once and reused for the flag and its complement.
    treated = out['cp_type'].values == 'trt_cp'
    dose_d1 = out['cp_dose'].values == 'D1'
    hours = out['cp_time'].values

    indicators = {
        'cp_type_trt': np.where(treated, 1, 0),
        'cp_type_ctl': np.where(treated, 0, 1),
        'cp_dose_D1': np.where(dose_d1, 1, 0),
        'cp_dose_D2': np.where(dose_d1, 0, 1),
    }
    for hour in (24, 48, 72):
        indicators[f'cp_time_{hour}'] = np.where(hours == hour, 1, 0)

    for column, flags in indicators.items():
        out[column] = flags
    return out

def make_X(dt, dense_cols, cat_feats):
    X = {"dense": dt[dense_cols].to_numpy()}
    for i, v in enumerate(cat_feats):
        X[v] = dt[[v]].to_numpy()
    return X


def get_data(ROOT = '../input/lish-moa'):
    """Load the feature/target CSVs under ``ROOT`` and assemble train/test frames.

    Steps:
      * read train_features.csv and test_features.csv;
      * rescale every 'g-' and 'c-' column with (x + 10) / 20 (the column
        lists are discovered on train and reused for test);
      * append the indicator columns produced by ``preprocess``;
      * index all frames by 'sig_id' and append the columns of
        train_targets_scored.csv to train and of sample_submission.csv to test;
      * add a 'total' column to train that is 0 when a row has at least one
        positive target and 1 otherwise.

    Returns:
        (train, test, FE, cat_feat, targets) where ``FE`` is the list of train
        columns after preprocessing minus 'sig_id', 'cp_type', its two
        indicators and the raw categorical columns; ``cat_feat`` is
        ['cp_dose', 'cp_time']; ``targets`` is the list of target column names.
    """
    cat_feat = ['cp_dose', 'cp_time']

    train = pd.read_csv(f"{ROOT}/train_features.csv")
    test = pd.read_csv(f"{ROOT}/test_features.csv")

    GENES = [name for name in train.columns if name.startswith('g-')]
    CELLS = [name for name in train.columns if name.startswith('c-')]

    # Same affine rescaling for both frames and both column groups.
    for frame in (train, test):
        for group in (GENES, CELLS):
            frame[group] = (frame[group].values + 10) / 20

    label = pd.read_csv(f"{ROOT}/train_targets_scored.csv")
    label_test = pd.read_csv(f"{ROOT}/sample_submission.csv")

    train, test = preprocess(train), preprocess(test)

    # Feature list: every column except identifiers, cp_type (raw and
    # indicator form) and the raw categorical columns.
    FE = list(train)
    for dropped in ['sig_id', 'cp_type_ctl', 'cp_type_trt', 'cp_type', *cat_feat]:
        FE.remove(dropped)

    train, test, label, label_test = (
        frame.set_index('sig_id') for frame in (train, test, label, label_test)
    )

    # Align targets to the feature rows by sig_id before joining them on.
    label = label.loc[train.index]
    label_test = label_test.loc[test.index]
    train = pd.concat([train, label], axis=1)
    test = pd.concat([test, label_test], axis=1)

    target_cols = list(label)
    has_positive = np.sum(train[target_cols].values, axis=1) > 0
    train['total'] = np.where(has_positive, 0, 1)

    return train, test, FE, cat_feat, target_cols

# t-test based feature selection

In [None]:
# Example run restricted to the first 4 targets, the first 5 genes and the
# first 5 cells; widen the slices below to scan every target/feature.

ROOT = '../input/lish-moa'
train = pd.read_csv(f"{ROOT}/train_features.csv")
label = pd.read_csv(f"{ROOT}/train_targets_scored.csv")

# Align targets to features on sig_id rather than on row position, so the
# result does not depend on both CSVs sharing the same row order. Using
# sig_id as the index also avoids ending up with two 'sig_id' columns after
# the concat.
train = train.set_index('sig_id')
label = label.set_index('sig_id').loc[train.index]
train = pd.concat([train, label], axis=1)

GENES = [col for col in train.columns if col.startswith('g-')]
CELLS = [col for col in train.columns if col.startswith('c-')]

# A feature is kept when its two-sample t-test p-value is at or below this.
P_VALUE_CUTOFF = 0.10


def _ttest_select(frame, target, columns, cutoff=P_VALUE_CUTOFF):
    """Return the ``columns`` whose values differ between target classes.

    For each column, an independent two-sample t-test (scipy defaults) is run
    between the rows where ``frame[target] == 0`` and the rows where
    ``frame[target] == 1``; the column is kept when the p-value is
    ``<= cutoff``. A NaN p-value (e.g. an empty class) never passes.
    """
    # The class masks only depend on the target, so build them once.
    is_negative = (frame[target] == 0).values
    is_positive = (frame[target] == 1).values

    kept = []
    for col in tqdm(columns):
        values = frame[col].values
        p_value = stats.ttest_ind(values[is_negative], values[is_positive]).pvalue
        if p_value <= cutoff:
            kept.append(col)
    return kept


selected_genes = {}
selected_cells = {}

# sig_id is the index now, so list(label) contains target names only.
for lab in tqdm(list(label)[:4]):
    selected_genes[lab] = _ttest_select(train, lab, GENES[:5])
    print(lab, selected_genes[lab])

    selected_cells[lab] = _ttest_select(train, lab, CELLS[:5])
    print(lab, selected_cells[lab])

# Logistic regression model for each target class

In [None]:
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
import pickle
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load these pickles from a dataset you trust.
with open('../input/t-test-based-selected-variables/selected_genes.pkl', 'rb') as handle:
    selected_genes = pickle.load(handle)

with open('../input/t-test-based-selected-variables/selected_cells.pkl', 'rb') as handle:
    selected_cells = pickle.load(handle)

train, test, FE, cat_feat, labels = get_data()
# Positional RangeIndex, so the fold indices produced by the splitter can be
# used directly to index the NumPy arrays extracted below.
train = train.reset_index(drop=True)

exog_vars = ['cp_type_trt', 'cp_type_ctl', 'cp_dose_D1',
              'cp_dose_D2', 'cp_time_24', 'cp_time_48', 'cp_time_72']

# Single source of truth for the fold count: used both by the splitter and
# when averaging the per-fold test predictions.
N_SPLITS = 5
# An integer random_state makes every split() call reproducible, so one
# splitter can be shared across all targets.
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=54, shuffle=True)

sub = pd.DataFrame()
sub['sig_id'] = test.index

scores = []
for lab in labels:
    y = train[lab].values

    if np.sum(y) <= 1:
        # At most one positive example: no model is fitted, the submission is
        # all zeros and a loss of 0 is recorded for this target (so it pulls
        # the reported CV average down).
        scores.append(0)
        sub[lab] = np.zeros(len(test))
        continue

    # Per-target feature set: t-test selected genes and cells plus the
    # experiment indicator columns. Built once instead of once per use.
    features = selected_genes[lab] + selected_cells[lab] + exog_vars
    X = train[features].values
    X_test = test[features].values

    preds = []
    y_true = []
    pred_test = 0
    for train_index, test_index in skf.split(train.index, y):
        logit_model = LogisticRegression(random_state=0).fit(X[train_index], y[train_index])

        # Out-of-fold probabilities of the positive class, for the CV loss.
        preds += list(logit_model.predict_proba(X[test_index])[:, 1])
        y_true += list(y[test_index])

        # Accumulate test predictions; averaged over the folds below.
        pred_test = pred_test + logit_model.predict_proba(X_test)[:, 1]

    score = log_loss(y_true, preds, labels=[0,1])
    print(lab, ' loss:', score)
    scores.append(score)

    sub[lab] = pred_test / N_SPLITS

print('#'*150)
print('CV average:', np.mean(scores))
print('CV std:', np.std(scores))
print('#'*150)

sub = sub.set_index('sig_id')
# Rows flagged as control (cp_type_ctl == 1) get 0 for every target.
sub.loc[test[test['cp_type_ctl']==1].index, labels]=0
sub.to_csv('submission.csv')