# do
* 64次元にpcaしたデータのうち32次元使用
* 500エポック
* batch=256
* lr=0.001

## experiment

In [1]:
# Experiment configuration: every tunable value for this run is collected here.
MODEL_NAME = "keras21"    # tag embedded in the submission file name
N_COMPONENTS = 64         # selects which precomputed *_{N_COMPONENTS}.pkl artifacts are loaded
SLICE_FEAT = 32           # number of leading input feature columns kept for training
EPOCH = 500               # upper bound on epochs passed to model.fit
BATCH_SIZE = 256
LEARNING_RATE = 1e-3      # initial Adam learning rate

## drive mount

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The dataset and output paths used by this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## install libraries

In [3]:
! pip install tables colorama tensorflow-determinism

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting tensorflow-determinism
  Downloading tensorflow-determinism-0.3.0.tar.gz (12 kB)
Building wheels for collected packages: tensorflow-determinism
  Building wheel for tensorflow-determinism (setup.py) ... [?25l[?25hdone
  Created wheel for tensorflow-determinism: filename=tensorflow_determinism-0.3.0-py3-none-any.whl size=9158 sha256=a046ff8df77c2b92d883f46eb54e381fc5b0935a698c5e0bc7e7314e2d3219c7
  Stored in directory: /root/.cache/pip/wheels/d2/be/33/2b27e81e5d40b4bfb7c103ac6c6c5e81fdbcf40d2af5078529
Successfully built tensorflow-determinism
Installing collected packages: tensorflow-determinism, colorama
Successfully installed colorama-0.4.6 tensorflow-determinism-0.3.0


## import libraries

In [4]:
import os, gc, pickle, datetime, scipy.sparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from cycler import cycler
from IPython.display import display

import scipy.sparse
import math

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout, BatchNormalization
import random
import warnings
warnings.filterwarnings("ignore")

## global variables

In [5]:
# Root folder (on the mounted Drive) that holds the competition files.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset"
# File paths built from DATA_DIR.
FP_CELL_METADATA = os.path.join(DATA_DIR,"metadata.csv")
FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR,"train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR,"train_cite_targets.h5")
FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR,"test_cite_inputs.h5")
FP_SUBMISSION = os.path.join(DATA_DIR,"sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR,"evaluation_ids.csv")
# Destination CSV for this run's predictions; the file name carries MODEL_NAME.
SUBMISSION_NAME = f"/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/multi_sub/{MODEL_NAME}.csv"
# Verbosity level handed to Keras fit() and to the training callbacks.
VERBOSE = 0
# Number of GroupKFold splits, and therefore of saved fold models.
N_SPLIT = 3

## set seed

In [6]:
def set_seed(seed=200):
    """Seed every random number source used by this notebook.

    Seeds TensorFlow, NumPy's legacy global generator and the built-in
    ``random`` module with ``seed``, then exports ``PYTHONHASHSEED`` and
    ``TF_DETERMINISTIC_OPS=1`` through ``os.environ``.

    Args:
        seed: Integer seed applied to all generators.
    """
    for seeder in (tf.random.set_seed, np.random.seed, random.seed):
        seeder(seed)
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # setting it here presumably only influences processes spawned later.
    os.environ.update({
        "PYTHONHASHSEED": str(seed),
        "TF_DETERMINISTIC_OPS": "1",
    })


set_seed(0)

## calculate score

In [7]:
def correlation_score(y_true, y_pred):
    """Score predictions according to the competition rules.

    Computes the Pearson correlation coefficient between each row of
    ``y_true`` and the matching row of ``y_pred`` and returns the mean over
    all rows. Rows are processed one at a time so no extra full-size copy of
    the inputs is created.

    Args:
        y_true: 2-D array-like (ndarray, ``np.matrix`` or DataFrame) of targets.
        y_pred: 2-D array-like of predictions with the same shape as ``y_true``.

    Returns:
        The average of the per-row Pearson correlation coefficients. The
        predictions are assumed not to be constant within a row; a constant
        row makes ``np.corrcoef`` return NaN, which propagates to the result.

    Raises:
        ValueError: If the inputs have different shapes or contain no rows.
    """
    if isinstance(y_true, pd.DataFrame): y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame): y_pred = y_pred.values
    # np.asarray turns np.matrix (e.g. the result of sparse .todense()) into a
    # plain ndarray without copying, so each row below is one-dimensional.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if len(y_true) == 0:
        raise ValueError("correlation_score requires at least one row")
    corrsum = 0
    for true_row, pred_row in zip(y_true, y_pred):
        corrsum += np.corrcoef(true_row, pred_row)[1, 0]
    return corrsum / len(y_true)

## load data & apply the fitted PCA to the test inputs

In [8]:
# Cell metadata, restricted to multiome cells and reordered to follow the row
# order of the training targets. `meta_new` is the same table with a 0..n-1
# index so that index labels coincide with row positions.
metadata_df = pd.read_csv(FP_CELL_METADATA, index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology=="multiome"]
cell_index = np.load(os.path.join(DATA_DIR, "train_multi_targets_idxcol.npz"),
                     allow_pickle=True)["index"]
meta = metadata_df.reindex(cell_index)
meta_new = meta.reset_index(drop=True)

# Precomputed artifacts keyed by N_COMPONENTS.
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# artifacts that were produced by a trusted source.
with open(os.path.join(DATA_DIR, f"train_{N_COMPONENTS}.pkl"), 'rb') as f:
    X = pickle.load(f)
with open(os.path.join(DATA_DIR, f"pca_train_{N_COMPONENTS}.pkl"), 'rb') as f:
    pca_train = pickle.load(f)
with open(os.path.join(DATA_DIR, f"pca_target_{N_COMPONENTS}.pkl"), 'rb') as f:
    pca_target = pickle.load(f)
with open(os.path.join(DATA_DIR, f"Y_{N_COMPONENTS}.pkl"), 'rb') as f:
    Y = pickle.load(f)

# Original (untransformed) sparse training targets, used to score predictions
# after they are mapped back through pca_target.components_.
origin_Y = scipy.sparse.load_npz(os.path.join(DATA_DIR, "train_multi_targets_values.sparse.npz"))

# Features, targets and metadata are indexed with the same row positions
# further down, so they must all have the same number of rows.
assert len(meta) == X.shape[0] == Y.shape[0] == origin_Y.shape[0], "row count mismatch between inputs, targets and metadata"

# Test inputs are projected with the already-fitted transformer.
multi_test_x = scipy.sparse.load_npz(os.path.join(DATA_DIR, "test_multi_inputs_values.sparse.npz"))
multi_test_x = pca_train.transform(multi_test_x)

## select features

In [9]:
# Keep only the first SLICE_FEAT columns of the training and test features.
X = X[:,:SLICE_FEAT]
multi_test_x = multi_test_x[:,:SLICE_FEAT]

In [10]:
# Display the shapes of the targets, the training features and the test features.
Y.shape, X.shape ,multi_test_x.shape

((105942, 64), (105942, 32), (55935, 32))

## create model

In [11]:
def create_model(n_inputs=None, n_outputs=None, hidden_units=(256, 512, 512),
                 reg1=9.613e-06, reg2=1e-07, dropout=0.1, activation='selu'):
    """Build the (uncompiled) dense network with concatenated hidden outputs.

    The network is a stack of Dense + Dropout blocks whose widths are
    ``hidden_units`` followed by one extra block of width ``n_outputs``. The
    post-dropout outputs of all blocks are concatenated and fed to a final
    linear Dense layer of width ``n_outputs``.

    Called without arguments it builds exactly the architecture this notebook
    was run with, taking the input and output widths from the module-level
    ``X`` and ``Y`` arrays.

    Args:
        n_inputs: Number of input features. Defaults to ``X.shape[1]``.
        n_outputs: Number of regression targets. Defaults to ``Y.shape[1]``.
        hidden_units: Widths of the hidden Dense layers that precede the
            ``n_outputs``-wide hidden block.
        reg1: L2 factor applied to the kernels of all hidden Dense layers.
        reg2: L2 factor applied to the kernel of the final linear layer.
        dropout: Dropout rate applied after every hidden Dense layer.
        activation: Activation used by every hidden Dense layer.

    Returns:
        An uncompiled ``tf.keras`` ``Model``.
    """
    # Resolve the defaults lazily so the function can be defined before the
    # data is loaded and reused with explicit dimensions.
    if n_inputs is None:
        n_inputs = X.shape[1]
    if n_outputs is None:
        n_outputs = Y.shape[1]

    hidden_reg = tf.keras.regularizers.l2(reg1)
    output_reg = tf.keras.regularizers.l2(reg2)

    inputs = Input(shape=(n_inputs,))

    # Each block consumes the previous block's post-dropout output; every
    # block's output is also kept for the concatenation below.
    block_outputs = []
    x = inputs
    for units in (*hidden_units, n_outputs):
        x = Dense(units,
                  kernel_regularizer=hidden_reg,
                  activation=activation,
                  )(x)
        x = Dropout(dropout)(x)
        block_outputs.append(x)

    merged = Concatenate()(block_outputs)

    outputs = Dense(n_outputs,
                    kernel_regularizer=output_reg,
                    activation='linear',
                    )(merged)

    return Model(inputs, outputs)

## private cv

In [12]:
# "Private" cross-validation: train on the non-day-7 cells of each fold's
# training donors and validate on day-7 cells.
kf = GroupKFold(n_splits = N_SPLIT)
score_list = []

# meta_new has a 0..n-1 index, so positions in this array are row positions in
# X, Y and origin_Y.
cell_day = meta_new.day.to_numpy()
# NOTE(review): the validation set is every day-7 cell regardless of donor, so
# it is identical in all folds (idx_va is not used); computed once here.
va_day_idx = np.flatnonzero(cell_day == 7)

for fold,(idx_tr, idx_va) in enumerate(kf.split(X,groups=meta.donor)):

    # Positional mask on the fold's training rows (avoids indexing an .iloc
    # subset with a boolean Series defined on the full index).
    tr_day_idx = idx_tr[cell_day[idx_tr] != 7]

    X_tr = X[tr_day_idx]
    y_tr = Y[tr_day_idx]
    X_va = X[va_day_idx]
    y_va = Y[va_day_idx]

    model = create_model()

    # Multiply the learning rate by 0.9 after 4 epochs without val_loss improvement.
    lr = ReduceLROnPlateau(
                monitor = "val_loss",
                factor = 0.9,
                patience = 4,
                verbose = VERBOSE)

    # Stop after 30 epochs without val_loss improvement and restore the best weights.
    es = EarlyStopping(
                monitor = "val_loss",
                patience = 30,
                verbose = VERBOSE,
                mode = "min",
                restore_best_weights = True)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                  loss = 'mse',
                  metrics=None)
    model.fit(X_tr,
              y_tr,
              validation_data=(X_va,y_va),
              epochs =EPOCH,
              verbose = VERBOSE,
              batch_size=BATCH_SIZE,
              callbacks = [es,lr]
             )
    pred = model.predict(X_va)

    print(f'\n --------- FOLD {fold} -----------')
    # Format with an explicit spec: np.round() on a float32 value still prints
    # its full float64 expansion (e.g. 72.43000030517578) inside an f-string.
    print(f'Mean squared error = {mean_squared_error(y_va,pred):.2f}')
    # Map the predictions back to the original target space before scoring;
    # .toarray() returns a plain ndarray rather than np.matrix.
    corrscore = correlation_score(origin_Y[va_day_idx].toarray(), pred@pca_target.components_)
    print(f"Fold {fold}: {es.stopped_epoch:3} epochs, private corr =  {corrscore:.5f}")
    score_list.append(corrscore)
    del X_tr,X_va,y_tr,y_va
    gc.collect()
print(f"{Fore.GREEN}{Style.BRIGHT}Average private corr = {np.array(score_list).mean():.5f}{Style.RESET_ALL}")


 --------- FOLD 0 -----------
Mean squared error = 72.43000030517578
Fold 0:  73 epochs, private corr =  0.60357

 --------- FOLD 1 -----------
Mean squared error = 70.93000030517578
Fold 1:  71 epochs, private corr =  0.60464

 --------- FOLD 2 -----------
Mean squared error = 69.23999786376953
Fold 2:  69 epochs, private corr =  0.60561
[32m[1mAverage private corr = 0.60461[0m


## public cv

In [13]:
# "Public" cross-validation: donor-grouped folds; each fold's model is saved
# as model_{fold} in the working directory.
kf = GroupKFold(n_splits = N_SPLIT)
score_list = []
for fold,(idx_tr, idx_va) in enumerate(kf.split(X,groups=meta.donor)):

    X_tr = X[idx_tr]
    y_tr = Y[idx_tr]
    X_va = X[idx_va]
    y_va = Y[idx_va]

    model = create_model()

    # Multiply the learning rate by 0.9 after 4 epochs without val_loss improvement.
    lr = ReduceLROnPlateau(
                monitor = "val_loss",
                factor = 0.9,
                patience = 4,
                verbose = VERBOSE)

    # Stop after 30 epochs without val_loss improvement and restore the best weights.
    es = EarlyStopping(
                monitor = "val_loss",
                patience = 30,
                verbose = VERBOSE,
                mode = "min",
                restore_best_weights = True)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                  loss = 'mse',
                  metrics=None)
    model.fit(X_tr,
              y_tr,
              validation_data=(X_va,y_va),
              epochs =EPOCH,
              verbose = VERBOSE,
              batch_size=BATCH_SIZE,
              callbacks = [es,lr]
             )
    pred = model.predict(X_va)

    print(f'\n --------- FOLD {fold} -----------')
    # Format with an explicit spec: np.round() on a float32 value still prints
    # its full float64 expansion (e.g. 31.049999237060547) inside an f-string.
    print(f'Mean squared error = {mean_squared_error(y_va,pred):.2f}')
    filename = f"model_{fold}"
    model.save(filename)
    print('model saved :',filename)
    # Map the predictions back to the original target space before scoring;
    # .toarray() returns a plain ndarray rather than np.matrix.
    corrscore = correlation_score(origin_Y[idx_va].toarray(), pred@pca_target.components_)
    print(f"Fold {fold}: {es.stopped_epoch:3} epochs, public corr =  {corrscore:.5f}")
    score_list.append(corrscore)
    del X_tr,X_va,y_tr,y_va
    gc.collect()
print(f"{Fore.GREEN}{Style.BRIGHT}Average public corr = {np.array(score_list).mean():.5f}{Style.RESET_ALL}")


 --------- FOLD 0 -----------
Mean squared error = 31.049999237060547
model saved : model_0
Fold 0:  76 epochs, public corr =  0.66459

 --------- FOLD 1 -----------
Mean squared error = 27.760000228881836
model saved : model_1
Fold 1:  71 epochs, public corr =  0.67071

 --------- FOLD 2 -----------
Mean squared error = 27.649999618530273
model saved : model_2
Fold 2:  78 epochs, public corr =  0.66805
[32m[1mAverage public corr = 0.66778[0m


## create submission

In [14]:
%%time
test_pred = np.zeros((multi_test_x.shape[0], 23418), dtype='float16')
for fold in range(N_SPLIT):
    print(f'fold {fold} prediction')
    model = tf.keras.models.load_model(f"model_{fold}")
    test_pred += (model.predict(multi_test_x)@pca_target.components_)/N_SPLIT
    gc.collect()
# Read the table of rows and columns required for submission
eval_ids = pd.read_parquet("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/evaluation.parquet")
# Convert the string columns to more efficient categorical types
#eval_ids.cell_id = eval_ids.cell_id.apply(lambda s: int(s, base=16))
eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())
submission = pd.Series(name='target',
                       index=pd.MultiIndex.from_frame(eval_ids), 
                       dtype=np.float32)
y_columns = np.load("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_multi_targets_idxcol.npz",
                   allow_pickle=True)["columns"]

test_index = np.load("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/test_multi_inputs_idxcol.npz",
                    allow_pickle=True)["index"]
cell_dict = dict((k,v) for v,k in enumerate(test_index)) 
assert len(cell_dict)  == len(test_index)
gene_dict = dict((k,v) for v,k in enumerate(y_columns))
assert len(gene_dict) == len(y_columns)
eval_ids_cell_num = eval_ids.cell_id.apply(lambda x:cell_dict.get(x, -1))
eval_ids_gene_num = eval_ids.gene_id.apply(lambda x:gene_dict.get(x, -1))
valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)
submission.iloc[valid_multi_rows] = test_pred[eval_ids_cell_num[valid_multi_rows].to_numpy(),
eval_ids_gene_num[valid_multi_rows].to_numpy()]
del eval_ids_cell_num, eval_ids_gene_num, valid_multi_rows, eval_ids, test_index, y_columns
gc.collect()
submission.reset_index(drop=True, inplace=True)
submission.index.name = 'row_id'
submission.to_csv(SUBMISSION_NAME)

fold 0 prediction
fold 1 prediction
fold 2 prediction
CPU times: user 3min 36s, sys: 11.7 s, total: 3min 48s
Wall time: 3min 29s
