# Load features and targets

In [None]:
%%time
import pandas as pd
import numpy as np
import gc
import os
import random
import pickle
from sklearn.model_selection import StratifiedKFold,KFold
from scipy.sparse import hstack,vstack,csr_matrix,save_npz,load_npz
from sklearn.decomposition import TruncatedSVD
import tensorflow as tf
import tensorflow_addons as tfa
import json

############################################################################
#----- work folder -----
############################################################################
# Directory layout is read from settings.json in the current working directory.
# A context manager closes the file handle instead of leaving it to the GC.
with open('./settings.json') as settings_file:
    settings = json.load(settings_file)

input_path = settings['input_path']
features_path = settings['features_path']
model_path = settings['model_path']
sub_path = settings['sub_path']

# Keep both spellings available: this notebook has referred to the features
# directory as `feature_path` as well as `features_path`.
feature_path = features_path

# `ensemble_path` is used when saving/loading blended predictions.
# NOTE(review): the settings key name and the fallback to `sub_path` are
# assumptions -- confirm against the real settings.json.
ensemble_path = settings.get('ensemble_path', sub_path)

In [None]:
%%time

def zscore(x):
    """Standardise every row of a 2-D array independently.

    Args:
        x: array-like of shape (n_rows, n_cols).

    Returns:
        np.ndarray of the same shape in which each row has had its own mean
        subtracted and has been divided by its own population standard
        deviation (ddof=0). A row with zero standard deviation is returned
        as all zeros instead of NaN/inf.
    """
    x = np.asarray(x)
    row_mean = x.mean(axis=1, keepdims=True)
    row_std = x.std(axis=1, keepdims=True)
    # A constant row is already all zeros after centring; dividing by 1
    # keeps it at zero instead of producing 0/0.
    row_std = np.where(row_std == 0, 1.0, row_std)
    return (x - row_mean) / row_std

# All files are read from `features_path` (the name assigned from
# settings.json); paths are built by plain string concatenation.

print('target')
# Per-cell id/metadata tables for train and test, plus the training targets,
# which are standardised row by row.
train_df = pd.read_feather(features_path+'train_multi_inputs_id.feather')
test_df = pd.read_feather(features_path+'test_multi_inputs_id.feather')
train_multi_y = np.load(features_path+'train_multi_targets.npy')
train_multi_y = zscore(train_multi_y)

print('clr svd')
train_df_raw = pd.read_feather(features_path+'train_multi_inputs_id_raw.feather')

# The SVD matrix stacks train rows first (as many as `train_df_raw`) and the
# test rows after them.
multi_inputs_clr_svd = np.load(features_path+'multi_inputs_svd_clr_100.npy')
train_multi_clr_svd = multi_inputs_clr_svd[:len(train_df_raw)]
train_multi_clr_svd = zscore(train_multi_clr_svd)

# Attach cell ids so the features can be re-aligned to the rows of `train_df`.
train_multi_clr_svd = pd.DataFrame(train_multi_clr_svd)
train_multi_clr_svd['cell_id'] = train_df_raw['cell_id']

# Left merge keeps the row order of `train_df`; cells that have no match in
# the raw table get all-zero features.
train_multi_clr_svd = train_df.merge(train_multi_clr_svd, on=['cell_id'], how='left')
train_multi_clr_svd = train_multi_clr_svd.fillna(0)
train_multi_clr_svd = train_multi_clr_svd.drop(['cell_id','day','donor','cell_type'],axis=1).values

test_multi_clr_svd = multi_inputs_clr_svd[len(train_df_raw):]
test_multi_clr_svd = zscore(test_multi_clr_svd)

print('lgb1')
# NOTE(review): this matrix is split at len(train_df), not len(train_df_raw)
# -- presumably it was built on the rows of `train_df`; confirm.
multi_lgb1_svd = np.load(features_path+'multi_lgb_svd_100.npy')
train_multi_lgb1_svd = multi_lgb1_svd[:len(train_df)]
train_multi_lgb1_svd = zscore(train_multi_lgb1_svd)

test_multi_lgb1_svd = multi_lgb1_svd[len(train_df):]
test_multi_lgb1_svd = zscore(test_multi_lgb1_svd)

print('concatenate')
# Final model inputs: both feature groups side by side (column-wise).
train_multi_X = np.concatenate([train_multi_clr_svd,
                                train_multi_lgb1_svd,
                                ],axis=1)

test_multi_X = np.concatenate([test_multi_clr_svd,
                                test_multi_lgb1_svd,
                                ],axis=1)

# Utils

In [None]:

def correlation_score(y_true, y_pred):
    """Return the mean row-wise Pearson correlation between two matrices.

    It is assumed that no row of either input is constant (a constant row
    makes its correlation coefficient NaN).

    Args:
        y_true: 2-D array or DataFrame of ground-truth values, one sample per row.
        y_pred: 2-D array or DataFrame of predictions with the same number of rows.

    Returns:
        The average over rows of the Pearson correlation between
        ``y_true[i]`` and ``y_pred[i]``.

    Raises:
        ValueError: if the inputs have a different number of rows.
        ZeroDivisionError: if the inputs have no rows.
    """
    # isinstance (rather than an exact type comparison) also unwraps
    # DataFrame subclasses, which would otherwise be indexed by column.
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values
    if len(y_true) != len(y_pred):
        raise ValueError(
            f'Row count mismatch: y_true has {len(y_true)}, y_pred has {len(y_pred)}'
        )
    corrsum = sum(
        np.corrcoef(true_row, pred_row)[1, 0]
        for true_row, pred_row in zip(y_true, y_pred)
    )
    return corrsum / len(y_true)

def cosine_similarity_loss(y_true, y_pred):
    """Keras loss built from the cosine similarity of row-centred tensors.

    Each row of `y_true` and `y_pred` has its own mean (over axis 1)
    subtracted and is L2-normalised before both are handed to
    `tf.keras.losses.CosineSimilarity(axis=1)`, whose result is returned.
    Because the rows are centred first, the cosine of each pair equals the
    Pearson correlation of the original rows.
    """
    centered_true = y_true - tf.reduce_mean(y_true, axis=1, keepdims=True)
    centered_pred = y_pred - tf.reduce_mean(y_pred, axis=1, keepdims=True)

    unit_true = tf.math.l2_normalize(centered_true, axis=1)
    unit_pred = tf.math.l2_normalize(centered_pred, axis=1)

    loss_fn = tf.keras.losses.CosineSimilarity(axis=1)
    return loss_fn(unit_true, unit_pred)

class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that serves mini-batches from in-memory arrays.

    Args:
        train_X: feature array, indexed along axis 0 by the values in `list_IDs`.
        train_y: target array indexed the same way; only read when `labels` is true.
        list_IDs: row ids (positions into `train_X` / `train_y`) this generator may serve.
        shuffle: if true, the row order is re-drawn at the end of every epoch.
        batch_size: number of rows per batch.
        labels: if true batches are `(X, y)` tuples, otherwise just `X`.
    """
    def __init__(self, train_X, train_y, list_IDs, shuffle, batch_size, labels, ):
        super().__init__()
        self.train_X = train_X
        self.train_y = train_y
        self.list_IDs = list_IDs
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.labels = labels
        # Builds the initial (possibly shuffled) row order.
        self.on_epoch_end()

    def __len__(self):
        'Number of full batches per epoch; a trailing partial batch is not served.'
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, idx):
        'Return batch number `idx` as (X, y), or X alone when labels is false.'
        start = idx * self.batch_size
        # Slice the per-epoch order, not list_IDs itself, so that shuffle=True
        # really changes which rows share a batch.
        batch_positions = self.indexes[start:start + self.batch_size]

        # Map positions in the epoch order to the actual row ids.
        list_IDs_temp = [self.list_IDs[k] for k in batch_positions]

        if self.labels:
            return self.__data_generation(list_IDs_temp)
        # Targets are never touched on the unlabeled path.
        return self.train_X[list_IDs_temp]

    def on_epoch_end(self):
        'Reset the row order, shuffling it when shuffle is enabled.'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Gather the feature and target rows for the given ids.'
        X = self.train_X[list_IDs_temp]
        y = self.train_y[list_IDs_temp]
        return X, y
    
def nn_kfold(train_df, train_multi_X, train_multi_y, test_df, test_multi_X, network, folds, model_name):
    """Train `network` with K-fold cross-validation.

    Reads the module-level settings BATCH_SIZE, EPOCHS and LR_FACTOR.

    Args:
        train_df: DataFrame with one row per training sample; used for the
            fold split and for sizing the out-of-fold array.
        train_multi_X: training feature array, rows aligned with `train_df`.
        train_multi_y: training target array, rows aligned with `train_df`.
        test_df: DataFrame with one row per test sample (sizes the test output).
        test_multi_X: test feature array.
        network: callable taking the number of input features and returning a
            compiled Keras model.
        folds: splitter exposing `split(...)` and `n_splits`.
        model_name: prefix of the per-fold weight files
            (`<model_name>_<fold>.h5`, written relative to the working directory).

    Returns:
        (oof_preds, sub_preds): out-of-fold predictions for every training
        row, and test predictions averaged over the folds.
    """
    # Output width follows the targets instead of a hard-coded column count.
    n_targets = train_multi_y.shape[1]
    oof_preds = np.zeros((train_df.shape[0], n_targets))
    sub_preds = np.zeros((test_df.shape[0], n_targets))
    cv_corr = []
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df,)):
        print (n_fold)
        train_x = train_multi_X[train_idx]
        valid_x = train_multi_X[valid_idx]
        train_y = train_multi_y[train_idx]
        valid_y = train_multi_y[valid_idx]

        # The fold arrays are already subsetted, so the generators only need
        # positions 0..n-1 within each fold.
        train_x_index = np.arange(len(train_idx))
        valid_x_index = np.arange(len(valid_idx))

        model = network(train_multi_X.shape[1])
        filepath = model_name+'_'+str(n_fold)+'.h5'
        es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=8, mode='min', verbose=1)
        checkpoint = tf.keras.callbacks.ModelCheckpoint(monitor='val_loss', filepath=filepath, save_best_only=True,save_weights_only=True,mode='min')
        reduce_lr_loss = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=LR_FACTOR, patience=5, verbose=1)

        train_dataset = DataGenerator(
            train_x,
            train_y,
            list_IDs=train_x_index,
            shuffle=True,
            batch_size=BATCH_SIZE,
            labels=True,
        )

        valid_dataset = DataGenerator(
            valid_x,
            valid_y,
            list_IDs=valid_x_index,
            shuffle=False,
            batch_size=BATCH_SIZE,
            labels=True,
        )

        hist = model.fit(train_dataset,
                        validation_data=valid_dataset,
                        epochs=EPOCHS,
                        callbacks=[checkpoint,es,reduce_lr_loss],
                        workers=4,
                        verbose=1)

        # Predict with the best checkpoint, not the weights of the last epoch.
        model.load_weights(filepath)

        oof_preds[valid_idx] = model.predict(valid_x,
                                batch_size=BATCH_SIZE,
                                verbose=1)

        oof_corr = correlation_score(valid_y,  oof_preds[valid_idx])
        cv_corr.append(oof_corr)
        print (cv_corr)

        # Each fold contributes an equal share of the averaged test prediction.
        sub_preds += model.predict(test_multi_X,
                                batch_size=BATCH_SIZE,
                                verbose=1) / folds.n_splits

        # Release the model and its graph before the next fold is built.
        del model
        gc.collect()
        tf.keras.backend.clear_session()
    cv = correlation_score(train_multi_y,  oof_preds)
    print ('Overall:',cv)
    return oof_preds,sub_preds

# Model1 - cosine similarity loss

In [None]:
%%time

def multi_cos_sim_model(len_num, n_targets=23418):
    """Build and compile the MLP trained with `cosine_similarity_loss`.

    Args:
        len_num: number of input features.
        n_targets: width of the linear output layer (defaults to 23418).

    Returns:
        A compiled `tf.keras` model: three Dense(600, swish) layers, each
        followed by GaussianDropout(0.3), then a linear output layer.
        The model summary is printed as a side effect.
    """
    #######################  svd  #######################
    # `shape` must be a tuple; `(len_num)` without the comma is a bare int.
    input_num = tf.keras.Input(shape=(len_num,))

    x = input_num
    for _ in range(3):
        x = tf.keras.layers.Dense(600, activation='swish')(x)
        x = tf.keras.layers.GaussianDropout(0.3)(x)

    output = tf.keras.layers.Dense(n_targets, activation='linear')(x)

    model = tf.keras.models.Model(input_num, output)
    lr = 0.001
    # Pass the backend epsilon explicitly: `epsilon=None` is only resolved to
    # that value by the legacy optimizer implementation.
    adam = tf.keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999,
                                    epsilon=tf.keras.backend.epsilon())
    model.compile(loss=cosine_similarity_loss, optimizer=adam)
    model.summary()
    return model


# Training configuration for the cosine-loss run. BATCH_SIZE, EPOCHS and
# LR_FACTOR are read as module-level globals inside nn_kfold.
SEED = 666
N_FOLD = 5
BATCH_SIZE = 150
EPOCHS = 100
LR_FACTOR = 0.1

folds = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

# Out-of-fold and fold-averaged test predictions of the cosine-loss model.
oof_preds_cos, sub_preds_cos = nn_kfold(
    train_df, train_multi_X, train_multi_y,
    test_df, test_multi_X,
    multi_cos_sim_model, folds, 'multi_cos_model',
)

# Model2 - huber loss

In [None]:
%%time

def multi_huber_model(len_num, n_targets=23418):
    """Build and compile the MLP trained with Huber loss (delta=4.0).

    Args:
        len_num: number of input features.
        n_targets: width of the linear output layer (defaults to 23418).

    Returns:
        A compiled `tf.keras` model: three Dense(500, swish) layers, each
        followed by GaussianDropout(0.3), then a linear output layer.
        The model summary is printed as a side effect.
    """
    #######################  svd  #######################
    # `shape` must be a tuple; `(len_num)` without the comma is a bare int.
    input_num = tf.keras.Input(shape=(len_num,))

    x = input_num
    for _ in range(3):
        x = tf.keras.layers.Dense(500, activation='swish')(x)
        x = tf.keras.layers.GaussianDropout(0.3)(x)

    output = tf.keras.layers.Dense(n_targets, activation='linear')(x)

    model = tf.keras.models.Model(input_num, output)

    lr = 0.001
    decay = lr / 10
    # Pass the backend epsilon explicitly: `epsilon=None` is only resolved to
    # that value by the legacy optimizer implementation.
    # NOTE(review): `decay` is a legacy-optimizer argument; newer Keras
    # optimizers may reject it -- confirm against the installed TF version.
    adam = tf.keras.optimizers.Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999,
                                    epsilon=tf.keras.backend.epsilon(), decay=decay)

    model.compile(loss=tf.keras.losses.Huber(delta=4.0), optimizer=adam)
    model.summary()
    return model

# Training configuration for the Huber-loss run. BATCH_SIZE, EPOCHS and
# LR_FACTOR are read as module-level globals inside nn_kfold.
SEED = 666
BATCH_SIZE = 150
EPOCHS = 100
LR_FACTOR = 0.1

folds = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Out-of-fold and fold-averaged test predictions of the Huber-loss model.
oof_preds_huber, sub_preds_huber = nn_kfold(
    train_df, train_multi_X, train_multi_y,
    test_df, test_multi_X,
    multi_huber_model, folds, 'multi_huber_model',
)

# Weighted Average

In [None]:
%%time
# Put both models' predictions on the same per-row scale before averaging
# them with equal weights.
oof_preds_cos = zscore(oof_preds_cos)
oof_preds_huber = zscore(oof_preds_huber)
oof_preds = oof_preds_cos*0.5 + oof_preds_huber*0.5
# Score the blend against the targets used for training in this notebook
# (`train_multi_y`); `train_cite_y` is never defined here.
cv = correlation_score(train_multi_y,  oof_preds)
print ('Blend:',cv)

# Same standardise-then-average blend for the test predictions.
sub_preds_cos = zscore(sub_preds_cos)
sub_preds_huber = zscore(sub_preds_huber)
sub_preds = sub_preds_cos*0.5 + sub_preds_huber*0.5


# Save for submission

In [None]:
# Persist the blended test predictions (`sub_preds`) as a .npy file.
# The target is built by plain string concatenation, so `ensemble_path`
# must already be defined and is expected to end with a path separator.
np.save(ensemble_path+'senkin_multi_ensemble.npy', sub_preds)

# Merge CITE and Multi submissions

In [None]:
# Rows requested for evaluation, each identified by (cell_id, gene_id) and
# annotated with the technology of its cell.
metadata = pd.read_csv(input_path+'metadata.csv.zip')[['cell_id','technology']]
evaluation_ids = pd.read_csv(input_path+'evaluation_ids.csv.zip')
evaluation_ids = evaluation_ids.merge(metadata, on=['cell_id'], how='left')

# cite
# Only the column names of the CITE targets are needed, so the frame is
# dropped again right away.
train_cite_targets = pd.read_hdf(input_path+'train_cite_targets.h5')
cite_targets = train_cite_targets.columns.values.tolist()

del train_cite_targets
gc.collect()

test_preds_cite = np.load(ensemble_path+'cite_sub_preds.npy')
test_preds_cite = pd.DataFrame(test_preds_cite, columns=cite_targets)

# Feature files live under `features_path` (the name assigned from settings.json).
test_cite_inputs_id = pd.read_feather(features_path+'test_cite_inputs_id.feather')
test_preds_cite['cell_id'] = test_cite_inputs_id['cell_id']
# Keep only evaluated cells, then reshape wide -> long: one row per (cell, target).
test_preds_cite = test_preds_cite[test_preds_cite['cell_id'].isin(evaluation_ids['cell_id'])]
test_preds_cite = pd.melt(test_preds_cite,id_vars='cell_id')
test_preds_cite.columns = ['cell_id','gene_id','target']

del test_cite_inputs_id
gc.collect()

# multi
# Same procedure for the multiome predictions produced in this notebook.
train_multi_targets = pd.read_hdf(input_path+'train_multi_targets.h5')
multi_targets = train_multi_targets.columns.values.tolist()

del train_multi_targets
gc.collect()

test_preds_multi = pd.DataFrame(sub_preds, columns=multi_targets)

test_multi_inputs_id = pd.read_feather(features_path+'test_multi_inputs_id.feather')
test_preds_multi['cell_id'] = test_multi_inputs_id['cell_id']
test_preds_multi = test_preds_multi[test_preds_multi['cell_id'].isin(evaluation_ids['cell_id'])]
test_preds_multi = pd.melt(test_preds_multi,id_vars='cell_id')
test_preds_multi.columns = ['cell_id','gene_id','target']

del test_multi_inputs_id
gc.collect()

# merge
# The left merge keeps every evaluation row in its original order; rows
# without a matching prediction end up with a NaN target.
test_preds = pd.concat([test_preds_cite,test_preds_multi])
evaluation_ids = evaluation_ids.merge(test_preds, on=['cell_id','gene_id'], how='left')
evaluation_ids[['row_id','target']].to_csv(sub_path+'submission.csv',index=False)