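# do
* Use the first 40 of the 512 SVD components of the input data (`N_COMPONENTS = 512`, `SLICE_FEAT = 40`)
* Up to 5000 epochs, with early stopping (`EPOCH = 5000`)
* batch=512
* lr=0.001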

## experiment

In [1]:
# Experiment configuration: the tunables for this run.
# Run tag; used to build the OOF and submission file names.
MODEL_NAME = 'keras60'
# Dimensionality-reduction tag; part of the preprocessed-data file names.
DREDUCTION = 'svd'
# Dataset-variant tag; part of the preprocessed-data file names.
VERSION = 'original'
# Component count; part of the preprocessed-data file names.
N_COMPONENTS = 512
# Number of leading reduced input features kept for the model.
SLICE_FEAT = 40
# Upper bound on training epochs passed to model.fit (early stopping is also used).
EPOCH = 5000
# Mini-batch size passed to model.fit.
BATCH_SIZE = 512
# Initial learning rate of the Adam optimizer.
LEARNING_RATE = 0.001

## drive mount

In [2]:
# Mount Google Drive so that the dataset and output paths under
# /content/drive resolve. NOTE(review): google.colab is presumably only
# importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## install libraries

In [3]:
! pip install tables colorama tensorflow-determinism

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## import libraries

In [4]:
import os, gc, pickle, datetime, scipy.sparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from cycler import cycler
from IPython.display import display

import scipy.sparse
import math

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout, BatchNormalization,Activation
import random
import warnings
# Silence every warning for the rest of the session. This also hides
# deprecation and numerical warnings, so disable it when debugging.
warnings.filterwarnings("ignore")

## global variables

In [5]:
# Input locations on the mounted Drive.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset"
FP_CELL_METADATA = os.path.join(DATA_DIR,"metadata.csv")
# The CITE, sample-submission and evaluation-id paths below are not referenced
# by any other cell visible in this notebook.
FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR,"train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR,"train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR,"test_cite_inputs.h5")
FP_SUBMISSION = os.path.join(DATA_DIR,"sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR,"evaluation_ids.csv")
# Output locations, keyed by MODEL_NAME. The "pri-" variants are written by the
# day-grouped run; the plain variants by the donor-grouped run.
OOF_NAME = f"/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/multi_oof/{MODEL_NAME}.pkl"
OOF_NAME_PRIVATE = f"/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/multi_oof/pri-{MODEL_NAME}.pkl"
SUBMISSION_NAME = f"/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/multi_sub/{MODEL_NAME}.csv"
SUBMISSION_NAME_PRIVATE = f"/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/multi_sub/pri-{MODEL_NAME}.csv"
# Verbosity passed to model.fit and to the Keras callbacks.
VERBOSE = 0
# Number of GroupKFold splits; also the number of saved fold models that are
# averaged when predicting the test set.
N_SPLIT = 3

## set seed

In [6]:
def set_seed(seed=200):
    """Seed the random sources used by this notebook.

    Sets the PYTHONHASHSEED and TF_DETERMINISTIC_OPS environment variables and
    seeds Python's ``random``, NumPy's global generator and TensorFlow.

    Args:
        seed: integer seed applied to every generator (default 200).
    """
    # Environment switches.
    os.environ.update({
        "PYTHONHASHSEED": str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })
    # The three generators are independent of one another, so the seeding
    # order does not matter.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)


set_seed(0)

## calculate score

In [7]:
def correlation_score(y_true, y_pred):
    """Return the mean row-wise Pearson correlation of predictions and targets.

    Scores the predictions according to the competition rules: one Pearson
    correlation coefficient per sample (row), averaged over all samples.
    It is assumed that the predictions are not constant; a constant row makes
    its coefficient NaN, and therefore the returned score NaN.

    Args:
        y_true: 2-D array-like of targets, one row per sample. DataFrames,
            ndarrays and ``np.matrix`` objects are accepted.
        y_pred: 2-D array-like of predictions with the same shape as ``y_true``.

    Returns:
        The average of each sample's Pearson correlation coefficient.

    Raises:
        ValueError: if the two inputs differ in shape or contain no rows.
    """
    # np.asarray unwraps DataFrames (including subclasses) and turns np.matrix
    # into a plain ndarray, so each row below is a genuine 1-D vector.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # A row-count mismatch used to be silently truncated or surface as an
    # IndexError deep in the loop; fail early with a clear message instead.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}")
    n_rows = len(y_true)
    if n_rows == 0:
        raise ValueError("correlation_score needs at least one row")

    # Row-by-row on purpose: a fully vectorised version would materialise
    # centred copies of both (possibly very wide) matrices.
    corrsum = 0
    for true_row, pred_row in zip(y_true, y_pred):
        corrsum += np.corrcoef(true_row, pred_row)[1, 0]
    return corrsum / n_rows

## loading data & pca test

In [8]:
# Load the multiome metadata, the pre-reduced training data, the fitted
# reducers and the raw targets, then project the test inputs with the reducer
# that was fitted on the training inputs.
# Every file lives in DATA_DIR, so paths are built from it instead of
# repeating the absolute Drive path.

def _load_pickle(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only use it on files
    # produced by this project.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Cell metadata restricted to the multiome technology, re-ordered to follow
# the row order of the training targets.
metadata_df = pd.read_csv(FP_CELL_METADATA, index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology=="multiome"]
cell_index = np.load(os.path.join(DATA_DIR, "train_multi_targets_idxcol.npz"),
                     allow_pickle=True)["index"]
meta = metadata_df.reindex(cell_index)
# Same rows with a positional 0..n-1 index, for position-based selection.
meta_new = meta.reset_index(drop=True)

# Reduced training inputs and the reducer that produced them.
X = _load_pickle(os.path.join(
    DATA_DIR, f'{DREDUCTION}_{VERSION}_Varianced_train_multi_inputs{N_COMPONENTS}.pkl'))
pca_train = _load_pickle(os.path.join(
    DATA_DIR, f'{DREDUCTION}_{DREDUCTION}_{VERSION}_Varianced_train_multi_{N_COMPONENTS}.pkl'))
# Reducer fitted on the targets (its components_ map predictions back to the
# full target space) and the reduced targets themselves.
pca_target = _load_pickle(os.path.join(
    DATA_DIR, f'{DREDUCTION}_{DREDUCTION}_original_Varianced_target_multi_{N_COMPONENTS}.pkl'))
Y = _load_pickle(os.path.join(
    DATA_DIR, f'{DREDUCTION}_original_Varianced_Y_multi_inputs{N_COMPONENTS}.pkl'))

# Full-resolution sparse targets, used only for scoring.
origin_Y = scipy.sparse.load_npz(os.path.join(DATA_DIR, 'train_multi_targets_values.sparse.npz'))

# Test inputs, projected into the same reduced space as X.
multi_test_x = scipy.sparse.load_npz(
    os.path.join(DATA_DIR, f"Varianced_{VERSION}_test_multi_inputs_values.sparse.npz"))
multi_test_x = pca_train.transform(multi_test_x)

## select features

In [9]:
# Keep only the first SLICE_FEAT reduced components of the train and test inputs.
X = X[:,:SLICE_FEAT]
multi_test_x = multi_test_x[:,:SLICE_FEAT]
# Show the resulting shapes as a sanity check (this is the cell's output).
Y.shape, X.shape ,multi_test_x.shape

((105942, 512), (105942, 40), (55935, 40))

## pre-processing

In [10]:
def std(x):
    """Standardize ``x`` using the mean and standard deviation of all its elements.

    Args:
        x: array-like of numbers.

    Returns:
        ``(x - mean) / std``. There is no guard against a zero standard
        deviation.
    """
    centre = np.mean(x)
    spread = np.std(x)
    return (x - centre) / spread

In [11]:
# Standardize each feature column using statistics computed over the train
# and test rows together, then split the stacked matrix back apart.
# NOTE: X is overwritten with its scaled version, so re-running this cell
# without re-running the loading/slicing cells would scale X a second time.
n_train = X.shape[0]  # split point; replaces the hard-coded row count

st = StandardScaler()
both = st.fit_transform(np.vstack([X, multi_test_x]))
# Disabled alternative: per-row standardization via
# np.apply_along_axis(std, 1, both).

X = both[:n_train]
Xt = both[n_train:]
assert Xt.shape[0] == multi_test_x.shape[0], "train/test split point is off"

## create model

In [12]:
def create_model():
    """Build the (uncompiled) dense regression network.

    The network takes ``X.shape[1]`` input features and passes them through
    four SELU Dense layers (256, 512, 512 and ``Y.shape[1]`` units), each
    followed by Dropout. The four post-dropout outputs are concatenated and
    fed to a linear Dense head with ``Y.shape[1]`` units.

    Reads the module-level arrays ``X`` and ``Y`` for the layer widths.

    Returns:
        A ``tf.keras`` ``Model``; the caller is responsible for compiling it.
    """
    # L2 penalties: one shared by the hidden stages, a weaker one for the head.
    hidden_l2 = tf.keras.regularizers.l2(9.613e-06)
    head_l2 = tf.keras.regularizers.l2(1e-07)
    drop_rate = 0.1
    act = 'selu'

    inputs = Input(shape=(X.shape[1],))

    # Stack the Dense -> Dropout stages, remembering every stage output so
    # the head can see all of them (skip connections via concatenation).
    stage_outputs = []
    tensor = inputs
    for width in (256, 512, 512, Y.shape[1]):
        tensor = Dense(width,
                       kernel_regularizer=hidden_l2,
                       activation=act)(tensor)
        tensor = Dropout(drop_rate)(tensor)
        stage_outputs.append(tensor)

    merged = Concatenate()(stage_outputs)
    outputs = Dense(Y.shape[1],
                    kernel_regularizer=head_l2,
                    activation='linear')(merged)

    return Model(inputs, outputs)

## private cv

In [13]:
# "Private" cross-validation: folds are formed by donor, but every fold is
# trained on its donors' cells from days other than 7 and validated on ALL
# day-7 cells. The validation set is therefore identical in every fold; only
# the training donors change.
kf = GroupKFold(n_splits = N_SPLIT)
score_list = []

# Positional day labels (meta_new has a 0..n-1 index, so positions index X/Y).
day = meta_new.day.to_numpy()
# Loop-invariant: the day-7 rows used for validation in every fold.
va_day_idx = np.flatnonzero(day == 7)

for fold,(idx_tr, _) in enumerate(kf.split(X,groups=meta.donor)):

    # This fold's training rows with day 7 removed. The mask is built on the
    # fold's own rows, not by letting pandas reindex a full-length boolean mask.
    tr_day_idx = idx_tr[day[idx_tr] != 7]

    X_tr, y_tr = X[tr_day_idx], Y[tr_day_idx]
    X_va, y_va = X[va_day_idx], Y[va_day_idx]

    model = create_model()

    # Shrink the learning rate by 10% after 4 epochs without val_loss improvement.
    lr = ReduceLROnPlateau(
                monitor = "val_loss",
                factor = 0.9,
                patience = 4,
                verbose = VERBOSE)

    # Stop after 30 stagnant epochs and roll back to the best weights.
    es = EarlyStopping(
                monitor = "val_loss",
                patience = 30,
                verbose = VERBOSE,
                mode = "min",
                restore_best_weights = True)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                  loss = 'mse')
    model.fit(X_tr,
              y_tr,
              validation_data=(X_va,y_va),
              epochs =EPOCH,
              verbose = VERBOSE,
              batch_size=BATCH_SIZE,
              callbacks = [es,lr]
             )
    pred = model.predict(X_va)

    print(f'\n --------- FOLD {fold} -----------')
    # Format spec instead of np.round: rounding a float32 and printing it
    # shows binary noise such as 15.40999984741211.
    print(f'Mean squared error = {mean_squared_error(y_va,pred):.2f}')
    # Score in the original target space: project the reduced predictions
    # back through the target reducer's components.
    corrscore = correlation_score(origin_Y[va_day_idx].todense(), pred@pca_target.components_)
    print(f"Fold {fold}: {es.stopped_epoch:3} epochs, private corr =  {corrscore:.5f}")
    score_list.append(corrscore)
    del X_tr,X_va,y_tr,y_va
    gc.collect()
print(f"{Fore.GREEN}{Style.BRIGHT}Average private corr = {np.array(score_list).mean():.5f}{Style.RESET_ALL}")


 --------- FOLD 0 -----------
Mean squared error = 15.40999984741211
Fold 0: 104 epochs, private corr =  0.60422

 --------- FOLD 1 -----------
Mean squared error = 15.210000038146973
Fold 1:  59 epochs, private corr =  0.60513

 --------- FOLD 2 -----------
Mean squared error = 15.149999618530273
Fold 2:  68 epochs, private corr =  0.60516
[32m[1mAverage private corr = 0.60484[0m


In [14]:
# Day-grouped run: GroupKFold over meta.day, so each fold is validated on
# days it was not trained on. Produces out-of-fold predictions in the original
# target space and saves one model per fold for the submission cell.
# NOTE: the models are saved as "model_{fold}" in the working directory; the
# donor-grouped cell further down saves under the same names, so the
# submission cell that follows must run before that cell.
kf = GroupKFold(n_splits = N_SPLIT)
# Width of the original target space, taken from the target reducer rather
# than hard-coded.
n_targets = pca_target.components_.shape[1]
train_pred = np.zeros((X.shape[0], n_targets), dtype='float16')
score_list = []
for fold,(idx_tr, idx_va) in enumerate(kf.split(X,groups=meta.day)):

    X_tr, y_tr = X[idx_tr], Y[idx_tr]
    X_va, y_va = X[idx_va], Y[idx_va]

    model = create_model()

    # Shrink the learning rate by 10% after 4 epochs without val_loss improvement.
    lr = ReduceLROnPlateau(
                monitor = "val_loss",
                factor = 0.9,
                patience = 4,
                verbose = VERBOSE)

    # Stop after 30 stagnant epochs and roll back to the best weights.
    es = EarlyStopping(
                monitor = "val_loss",
                patience = 30,
                verbose = VERBOSE,
                mode = "min",
                restore_best_weights = True)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                  loss = 'mse')
    model.fit(X_tr,
              y_tr,
              validation_data=(X_va,y_va),
              epochs =EPOCH,
              verbose = VERBOSE,
              batch_size=BATCH_SIZE,
              callbacks = [es,lr]
             )

    # Project the reduced predictions back to the original target space ONCE
    # and reuse the result for both the OOF matrix and the score.
    pred_full = model.predict(X_va) @ pca_target.components_
    train_pred[idx_va] = pred_full  # stored as float16 to save memory

    filename = f"model_{fold}"
    model.save(filename)

    # Scores are collected but intentionally not printed in this run.
    corrscore = correlation_score(origin_Y[idx_va].todense(), pred_full)
    score_list.append(corrscore)
    del pred_full
    gc.collect()

pred_df = pd.DataFrame(data=train_pred)
pred_df.to_pickle(OOF_NAME_PRIVATE)



## create submission

In [15]:
%%time
test_pred = np.zeros((multi_test_x.shape[0], 23418), dtype='float16')
for fold in range(N_SPLIT):
    print(f'fold {fold} prediction')
    model = tf.keras.models.load_model(f"model_{fold}")
    test_pred += (model.predict(Xt)@pca_target.components_)/N_SPLIT
    gc.collect()
# Read the table of rows and columns required for submission
eval_ids = pd.read_parquet("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/evaluation.parquet")
# Convert the string columns to more efficient categorical types
#eval_ids.cell_id = eval_ids.cell_id.apply(lambda s: int(s, base=16))
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())
submission = pd.Series(name='target',
                       index=pd.MultiIndex.from_frame(eval_ids), 
                       dtype=np.float32)
y_columns = np.load("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_multi_targets_idxcol.npz",
                   allow_pickle=True)["columns"]

test_index = np.load("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/test_multi_inputs_idxcol.npz",
                    allow_pickle=True)["index"]
cell_dict = dict((k,v) for v,k in enumerate(test_index)) 
assert len(cell_dict)  == len(test_index)
gene_dict = dict((k,v) for v,k in enumerate(y_columns))
assert len(gene_dict) == len(y_columns)
eval_ids_cell_num = eval_ids.cell_id.apply(lambda x:cell_dict.get(x, -1))
eval_ids_gene_num = eval_ids.gene_id.apply(lambda x:gene_dict.get(x, -1))
valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)
submission.iloc[valid_multi_rows] = test_pred[eval_ids_cell_num[valid_multi_rows].to_numpy(),
eval_ids_gene_num[valid_multi_rows].to_numpy()]
del eval_ids_cell_num, eval_ids_gene_num, valid_multi_rows, eval_ids, test_index, y_columns
gc.collect()
submission.reset_index(drop=True, inplace=True)
submission.index.name = 'row_id'
submission.to_csv(SUBMISSION_NAME_PRIVATE)

fold 0 prediction
fold 1 prediction
fold 2 prediction
CPU times: user 4min 30s, sys: 12.8 s, total: 4min 43s
Wall time: 4min 2s


## public cv

In [16]:
# "Public" cross-validation: GroupKFold over donors, so each fold is validated
# on donors it was not trained on. Produces out-of-fold predictions in the
# original target space and saves one model per fold for the submission cell.
# NOTE: the models are saved as "model_{fold}" in the working directory and
# overwrite any models of the same name saved by an earlier cell.
kf = GroupKFold(n_splits = N_SPLIT)
# Width of the original target space, taken from the target reducer rather
# than hard-coded.
n_targets = pca_target.components_.shape[1]
train_pred = np.zeros((X.shape[0], n_targets), dtype='float16')
score_list = []
for fold,(idx_tr, idx_va) in enumerate(kf.split(X,groups=meta.donor)):

    X_tr, y_tr = X[idx_tr], Y[idx_tr]
    X_va, y_va = X[idx_va], Y[idx_va]

    model = create_model()

    # Shrink the learning rate by 10% after 4 epochs without val_loss improvement.
    lr = ReduceLROnPlateau(
                monitor = "val_loss",
                factor = 0.9,
                patience = 4,
                verbose = VERBOSE)

    # Stop after 30 stagnant epochs and roll back to the best weights.
    es = EarlyStopping(
                monitor = "val_loss",
                patience = 30,
                verbose = VERBOSE,
                mode = "min",
                restore_best_weights = True)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                  loss = 'mse')
    model.fit(X_tr,
              y_tr,
              validation_data=(X_va,y_va),
              epochs =EPOCH,
              verbose = VERBOSE,
              batch_size=BATCH_SIZE,
              callbacks = [es,lr]
             )
    pred = model.predict(X_va)

    # Project the reduced predictions back to the original target space ONCE
    # and reuse the result for both the OOF matrix and the score.
    pred_full = pred @ pca_target.components_
    train_pred[idx_va] = pred_full  # stored as float16 to save memory

    print(f'\n --------- FOLD {fold} -----------')
    # Format spec instead of np.round: rounding a float32 and printing it
    # shows binary noise such as 10.210000038146973.
    print(f'Mean squared error = {mean_squared_error(y_va,pred):.2f}')
    filename = f"model_{fold}"
    model.save(filename)
    print('model saved :',filename)
    corrscore = correlation_score(origin_Y[idx_va].todense(), pred_full)
    print(f"Fold {fold}: {es.stopped_epoch:3} epochs, public corr =  {corrscore:.5f}")
    score_list.append(corrscore)
    del X_tr,X_va,y_tr,y_va,pred_full
    gc.collect()
print(f"{Fore.GREEN}{Style.BRIGHT}Average public corr = {np.array(score_list).mean():.5f}{Style.RESET_ALL}")
pred_df = pd.DataFrame(data=train_pred)
pred_df.to_pickle(OOF_NAME)


 --------- FOLD 0 -----------
Mean squared error = 10.210000038146973
model saved : model_0
Fold 0: 103 epochs, public corr =  0.66505

 --------- FOLD 1 -----------
Mean squared error = 9.65999984741211
model saved : model_1
Fold 1: 117 epochs, public corr =  0.67130

 --------- FOLD 2 -----------
Mean squared error = 9.670000076293945
model saved : model_2
Fold 2: 116 epochs, public corr =  0.66846
[32m[1mAverage public corr = 0.66827[0m


## create submission

In [17]:
%%time
test_pred = np.zeros((multi_test_x.shape[0], 23418), dtype='float16')
for fold in range(N_SPLIT):
    print(f'fold {fold} prediction')
    model = tf.keras.models.load_model(f"model_{fold}")
    test_pred += (model.predict(Xt)@pca_target.components_)/N_SPLIT
    gc.collect()
# Read the table of rows and columns required for submission
eval_ids = pd.read_parquet("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/evaluation.parquet")
# Convert the string columns to more efficient categorical types
#eval_ids.cell_id = eval_ids.cell_id.apply(lambda s: int(s, base=16))
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())
submission = pd.Series(name='target',
                       index=pd.MultiIndex.from_frame(eval_ids), 
                       dtype=np.float32)
y_columns = np.load("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_multi_targets_idxcol.npz",
                   allow_pickle=True)["columns"]

test_index = np.load("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/test_multi_inputs_idxcol.npz",
                    allow_pickle=True)["index"]
cell_dict = dict((k,v) for v,k in enumerate(test_index)) 
assert len(cell_dict)  == len(test_index)
gene_dict = dict((k,v) for v,k in enumerate(y_columns))
assert len(gene_dict) == len(y_columns)
eval_ids_cell_num = eval_ids.cell_id.apply(lambda x:cell_dict.get(x, -1))
eval_ids_gene_num = eval_ids.gene_id.apply(lambda x:gene_dict.get(x, -1))
valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)
submission.iloc[valid_multi_rows] = test_pred[eval_ids_cell_num[valid_multi_rows].to_numpy(),
eval_ids_gene_num[valid_multi_rows].to_numpy()]
del eval_ids_cell_num, eval_ids_gene_num, valid_multi_rows, eval_ids, test_index, y_columns
gc.collect()
submission.reset_index(drop=True, inplace=True)
submission.index.name = 'row_id'
submission.to_csv(SUBMISSION_NAME)

fold 0 prediction
fold 1 prediction
fold 2 prediction
CPU times: user 4min 32s, sys: 15.6 s, total: 4min 48s
Wall time: 3min 44s
