In [None]:
%%time
import pandas as pd
import numpy as np
import gc
import os
import random
import pickle
from sklearn.model_selection import StratifiedKFold,KFold
from scipy.sparse import hstack,vstack,csr_matrix,save_npz,load_npz
from sklearn.decomposition import TruncatedSVD
import tensorflow as tf
import tensorflow_addons as tfa
import json

############################################################################
#----- work folder -----
############################################################################
# Read the project folder layout from settings.json in the working directory.
# A context manager is used so the file handle is closed as soon as the JSON
# has been parsed (the bare `json.load(open(...))` form left it open until
# garbage collection).
with open('./settings.json') as settings_file:
    settings = json.load(settings_file)

# Each value is used as a string prefix that file names are appended to with
# `+` in later cells, so the configured values are expected to end with a path
# separator -- NOTE(review): confirm against settings.json.
input_path = settings['input_path']
features_path = settings['features_path']
model_path = settings['model_path']
sub_path = settings['sub_path']

In [None]:

def zscore(x):
    """Standardize every row of a 2-D array to zero mean and unit variance.

    Each row is centered on its own mean and divided by its own population
    standard deviation (``ddof=0``, the NumPy default).

    Parameters
    ----------
    x : array-like of shape (n_rows, n_cols)
        Values to standardize. Integer input is promoted to float64;
        floating-point input keeps its dtype.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_cols)
        Row-wise z-scores. A row whose values are all identical has zero
        standard deviation; it is returned as all zeros instead of NaN so a
        degenerate row cannot poison a downstream average.
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(np.float64)

    row_mean = x.mean(axis=1, keepdims=True)
    row_std = x.std(axis=1, keepdims=True)
    # Avoid 0/0: the numerator of a constant row is already exactly zero, so
    # dividing by 1 yields the all-zero row we want.
    row_std[row_std == 0] = 1
    return (x - row_mean) / row_std

# NOTE(review): `ensemble_path` and `jl` are not defined/imported in any cell
# visible here, so on a fresh kernel this cell raises NameError unless they are
# provided elsewhere. `jl` is presumably `joblib` (it loads a `.joblib` file)
# and `ensemble_path` presumably comes from settings.json -- confirm and add
# both to the import/config cell.

# senkin cite
test_cite_preds = np.load(ensemble_path+'senkin_cite_ensemble.npy')
# tmp cite
# The loaded object exposes `.values`; its raw values are reshaped to rows of
# 140 columns before blending.
test_cite_preds2 = jl.load(ensemble_path+'tmp_cite_ensemble.joblib').values
test_cite_preds2 = test_cite_preds2.reshape(-1,140)

# Guard the blend: without this check, arrays of different but
# broadcast-compatible shapes would be combined silently.
if test_cite_preds.shape != test_cite_preds2.shape:
    raise ValueError(
        'cite ensemble shape mismatch: '
        f'{test_cite_preds.shape} vs {test_cite_preds2.shape}'
    )

# Put both prediction sets on a per-row standardized scale, then average them
# with equal weight.
test_cite_preds = zscore(test_cite_preds)
test_cite_preds2 = zscore(test_cite_preds2)
test_cite_ensemble = test_cite_preds*0.5 + test_cite_preds2*0.5

# senkin multi
test_multi_ensemble = np.load(ensemble_path+'senkin_multi_ensemble.npy')

In [None]:
# Per-cell metadata, reduced to the two columns kept for the join below.
metadata = pd.read_csv(input_path + 'metadata.csv.zip')
metadata = metadata[['cell_id', 'technology']]

# Rows to be scored; a left join keeps every evaluation row and attaches the
# technology of its cell.
evaluation_ids = pd.merge(
    pd.read_csv(input_path + 'evaluation_ids.csv.zip'),
    metadata,
    how='left',
    on=['cell_id'],
)

# cite
# The training targets are read only for their column labels, which name the
# columns of the prediction matrix; the frame itself is released right away.
train_cite_targets = pd.read_hdf(input_path+'train_cite_targets.h5')
cite_targets = train_cite_targets.columns.values.tolist()

del train_cite_targets
gc.collect()

test_preds_cite = pd.DataFrame(test_cite_ensemble, columns=cite_targets)

# Fixed: this previously used `feature_path`, which is never defined; the
# config cell binds the folder to `features_path`.
test_cite_inputs_id = pd.read_feather(features_path+'test_cite_inputs_id.feather')
# Assign positionally (`.values`) so a row-count mismatch between predictions
# and ids raises instead of being silently index-aligned.
# NOTE(review): assumes prediction rows are in the same order as this id file.
test_preds_cite['cell_id'] = test_cite_inputs_id['cell_id'].values
# Keep only cells that are actually scored, then reshape wide -> long with one
# row per (cell_id, gene_id) pair.
test_preds_cite = test_preds_cite[test_preds_cite['cell_id'].isin(evaluation_ids['cell_id'])]
test_preds_cite = pd.melt(
    test_preds_cite,
    id_vars='cell_id',
    var_name='gene_id',
    value_name='target',
)

del test_cite_inputs_id
gc.collect()

# multi
# The training targets are read only for their column labels, which name the
# columns of the prediction matrix; the frame itself is released right away.
train_multi_targets = pd.read_hdf(input_path+'train_multi_targets.h5')
multi_targets = train_multi_targets.columns.values.tolist()

del train_multi_targets
gc.collect()

test_preds_multi = pd.DataFrame(test_multi_ensemble, columns=multi_targets)

# Fixed: this previously used `feature_path`, which is never defined; the
# config cell binds the folder to `features_path`.
test_multi_inputs_id = pd.read_feather(features_path+'test_multi_inputs_id.feather')
# Assign positionally (`.values`) so a row-count mismatch between predictions
# and ids raises instead of being silently index-aligned.
# NOTE(review): assumes prediction rows are in the same order as this id file.
test_preds_multi['cell_id'] = test_multi_inputs_id['cell_id'].values
# Keep only cells that are actually scored, then reshape wide -> long with one
# row per (cell_id, gene_id) pair.
test_preds_multi = test_preds_multi[test_preds_multi['cell_id'].isin(evaluation_ids['cell_id'])]
test_preds_multi = pd.melt(
    test_preds_multi,
    id_vars='cell_id',
    var_name='gene_id',
    value_name='target',
)

del test_multi_inputs_id
gc.collect()

# merge
# Stack the long-format predictions of both technologies into one lookup table.
test_preds = pd.concat([test_preds_cite, test_preds_multi])

# Left join so every evaluation row is kept; rows without a matching
# (cell_id, gene_id) prediction end up with a missing target.
evaluation_ids = pd.merge(
    evaluation_ids,
    test_preds,
    how='left',
    on=['cell_id', 'gene_id'],
)

# Only the row identifier and its predicted value go into the submission file.
submission_columns = ['row_id', 'target']
evaluation_ids[submission_columns].to_csv(sub_path + 'submission.csv', index=False)