In [1]:
# Mount Google Drive at /content/drive; every data and output path used in
# this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Tag used to build the name of the submission CSV written at the end.
MODEL_NAME = 'keras2'
# Suffix of the pickled artefacts (train_*, pca_train_*, pca_target_*, Y_*)
# that are loaded from the dataset folder.
N_COMPONENTS = 512

In [3]:
! pip install tables colorama

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6


In [4]:
import os, gc, pickle, datetime, scipy.sparse
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from colorama import Fore, Back, Style

from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD,PCA
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from cycler import cycler
from IPython.display import display

import scipy.sparse
import math

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, LearningRateScheduler, EarlyStopping
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout, BatchNormalization

# Folder on the mounted Drive that holds the competition files.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset"

# All of these files sit directly inside DATA_DIR, so the full paths are
# built in a single pass; names and file names are matched by position.
(
    FP_CELL_METADATA,
    FP_CITE_TRAIN_INPUTS,
    FP_CITE_TRAIN_TARGETS,
    FP_CITE_TEST_INPUTS,
    FP_SUBMISSION,
    FP_EVALUATION_IDS,
) = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "metadata.csv",
        "train_cite_inputs.h5",
        "train_cite_targets.h5",
        "test_cite_inputs.h5",
        "sample_submission.csv",
        "evaluation_ids.csv",
    )
)

# Where this model's predictions are written; the file is named after MODEL_NAME.
SUBMISSION_NAME = f"/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/multi_sub/{MODEL_NAME}.csv"

# Verbosity passed to the Keras callbacks and to model.fit().
VERBOSE = 0

In [5]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules.

    For every row, the Pearson correlation coefficient between that row of
    ``y_true`` and the same row of ``y_pred`` is computed; the function
    returns the average of those per-row coefficients.

    Parameters
    ----------
    y_true, y_pred : array-like or pandas.DataFrame
        Two-dimensional data of identical shape (one row per sample).
        DataFrames, including DataFrame subclasses, are converted to arrays
        so that indexing is always by row position.

    Returns
    -------
    float
        Mean of the per-row Pearson correlation coefficients.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.

    Notes
    -----
    It is assumed that the predictions are not constant: a constant row has
    an undefined correlation and turns the result into NaN.
    """
    # np.asarray handles DataFrames and their subclasses alike; an exact
    # ``type(...) == pd.DataFrame`` check would let a subclass through
    # unconverted, and ``frame[i]`` would then select a column, not a row.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Without this check, extra rows in y_pred would be silently ignored.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"Shape mismatch: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )

    corrsum = 0
    for true_row, pred_row in zip(y_true, y_pred):
        corrsum += np.corrcoef(true_row, pred_row)[1, 0]
    return corrsum / len(y_true)

In [6]:
# Load the per-cell metadata indexed by cell_id and keep only the rows whose
# technology is "multiome".
metadata_df = (
    pd.read_csv(FP_CELL_METADATA, index_col='cell_id')
      .loc[lambda cells: cells["technology"] == "multiome"]
)
metadata_df.shape

(161877, 4)

In [7]:
# Cell ids stored under the "index" key of the training-target index file.
# The path is derived from DATA_DIR instead of repeating the absolute folder,
# and the archive is opened in a `with` block so its file handle is closed.
with np.load(os.path.join(DATA_DIR, "train_multi_targets_idxcol.npz"),
             allow_pickle=True) as train_idxcol:
    cell_index = train_idxcol["index"]

# Metadata rows re-ordered to follow cell_index
# (presumably the row order of X and Y -- TODO confirm).
meta = metadata_df.reindex(cell_index)

In [8]:
def _load_pickle(file_name):
    """Unpickle ``file_name`` from DATA_DIR and return the stored object.

    NOTE(review): pickle.load can execute arbitrary code while loading, so
    only use this on files you produced yourself.
    """
    with open(os.path.join(DATA_DIR, file_name), 'rb') as f:
        return pickle.load(f)


# Precomputed artefacts, all keyed by N_COMPONENTS and stored in DATA_DIR.
X = _load_pickle(f'train_{N_COMPONENTS}.pkl')
pca_train = _load_pickle(f'pca_train_{N_COMPONENTS}.pkl')
pca_target = _load_pickle(f'pca_target_{N_COMPONENTS}.pkl')
Y = _load_pickle(f'Y_{N_COMPONENTS}.pkl')

In [9]:
# Keep only the first 40 columns of X as model inputs; the projected test
# matrix is cut to the same width before prediction.
X = X[:,:40]
X.shape

(105942, 40)

In [10]:
# Show the target and input shapes side by side.
Y.shape, X.shape

((105942, 512), (105942, 40))

In [11]:
# Same rows as `meta`, but with a 0..n-1 index: the index labels picked in the
# training loop are then used directly as row positions into X and Y.
meta_new=meta.reset_index(drop=True)

In [12]:
# NOTE(review): LR_START and BATCH_SIZE are not read by the training cell,
# which passes learning_rate=1e-3 and batch_size=256 explicitly.
LR_START = 0.01
BATCH_SIZE = 512

def create_model(n_inputs=None, n_outputs=None, hidden_units=(256, 512, 512),
                 reg1=9.613e-06, reg2=1e-07, drop=0.1, activation='selu'):
    """Build the (uncompiled) Keras regression network.

    Architecture: a chain of Dense layers, each followed by Dropout. The
    chain consists of the ``hidden_units`` layers plus one more layer of
    width ``n_outputs``. The Dropout outputs of all layers in the chain are
    concatenated and fed to a final linear Dense layer of width ``n_outputs``.

    Calling ``create_model()`` with no arguments builds exactly the network
    this notebook was written with.

    Parameters
    ----------
    n_inputs : int, optional
        Number of input features. Defaults to ``X.shape[1]``.
    n_outputs : int, optional
        Number of regression targets. Defaults to ``Y.shape[1]``.
    hidden_units : sequence of int
        Widths of the hidden Dense layers, in order (at least one).
    reg1 : float
        L2 kernel-regularisation factor for every layer in the chain.
    reg2 : float
        L2 kernel-regularisation factor for the final linear layer.
    drop : float
        Dropout rate applied after every layer in the chain.
    activation : str
        Activation used by every layer in the chain.

    Returns
    -------
    tensorflow.keras.Model
        The uncompiled model mapping ``(n_inputs,)`` to ``(n_outputs,)``.
    """
    # Fall back to the notebook-level training arrays when sizes are not given.
    if n_inputs is None:
        n_inputs = X.shape[1]
    if n_outputs is None:
        n_outputs = Y.shape[1]

    chain_reg = tf.keras.regularizers.l2(reg1)
    head_reg = tf.keras.regularizers.l2(reg2)

    inputs = Input(shape=(n_inputs,))

    # Layers are created in the same order as the hand-written version
    # (Dense, Dropout, Dense, Dropout, ...).
    x = inputs
    branch_outputs = []
    for units in (*hidden_units, n_outputs):
        x = Dense(units,
                  kernel_regularizer=chain_reg,
                  activation=activation,
                  )(x)
        x = Dropout(drop)(x)
        branch_outputs.append(x)

    # The head sees the output of every layer in the chain, not just the last.
    merged = Concatenate()(branch_outputs)

    outputs = Dense(n_outputs,
                    kernel_regularizer=head_reg,
                    activation='linear',
                    )(merged)

    return Model(inputs, outputs)

In [13]:
import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Fix the NumPy and TensorFlow global seeds before any model is built.
np.random.seed(1)
tf.random.set_seed(1)

N_SPLIT = 3
kf = GroupKFold(n_splits=N_SPLIT)
score_list = []

# The validation rows (all cells with day == 7) do not depend on the fold,
# so they are selected once instead of once per iteration.
va_day_idx = meta_new.index[(meta_new.day == 7).to_numpy()]

# NOTE(review): idx_va from GroupKFold is never used. Every fold validates on
# the same day-7 rows (all donors), while training uses the fold's training
# donors with day 7 removed. Confirm that this is the intended scheme.
for fold, (idx_tr, idx_va) in enumerate(kf.split(X, groups=meta.donor)):

    # Build the day filter on the fold's own rows. Applying a mask computed on
    # the full frame to a subset only works through silent index re-alignment.
    fold_meta = meta_new.iloc[idx_tr]
    tr_day_idx = fold_meta.index[(fold_meta.day != 7).to_numpy()]

    X_tr = X[tr_day_idx]
    y_tr = Y[tr_day_idx]

    X_va = X[va_day_idx]
    y_va = Y[va_day_idx]

    model = create_model()

    # Multiply the learning rate by 0.9 after 4 epochs without val_loss improvement.
    lr = ReduceLROnPlateau(
                monitor = "val_loss",
                factor = 0.9,
                patience = 4,
                verbose = VERBOSE)

    # Stop after 30 epochs without improvement and roll back to the best weights.
    es = EarlyStopping(
                monitor = "val_loss",
                patience = 30,
                verbose = VERBOSE,
                mode = "min",
                restore_best_weights = True)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss = 'mse',
                  metrics=None)
    model.fit(X_tr,
              y_tr,
              validation_data=(X_va,y_va),
              epochs =1000,
              verbose = VERBOSE,
              batch_size=256,
              callbacks = [es,lr]
             )
    y_va_pred = model.predict(X_va, verbose = VERBOSE)

    print(f'\n --------- FOLD {fold} -----------')
    # Format in the f-string: np.round on a float32 scalar still prints the
    # full binary expansion (e.g. 15.539999961853027).
    print(f'Mean squared error = {mean_squared_error(y_va,y_va_pred):.2f}')

    # Saved relative to the current working directory; the prediction cell
    # reloads the models by the same names.
    filename = f"model_{fold}"
    model.save(filename)
    print('model saved :',filename)
    corrscore = correlation_score(y_va, y_va_pred)

    del X_tr,X_va,y_tr,y_va
    gc.collect()
    # es.stopped_epoch stays 0 when training ran for all epochs without stopping early.
    print(f"Fold {fold}: {es.stopped_epoch:3} epochs, corr =  {corrscore:.5f}")
    score_list.append(corrscore)
print(f"{Fore.GREEN}{Style.BRIGHT}Average  corr = {np.array(score_list).mean():.5f}{Style.RESET_ALL}")


 --------- FOLD 0 -----------
Mean squared error = 15.539999961853027
model saved : model_0
Fold 0:  48 epochs, corr =  0.91975

 --------- FOLD 1 -----------
Mean squared error = 15.09000015258789
model saved : model_1
Fold 1:  69 epochs, corr =  0.92163

 --------- FOLD 2 -----------
Mean squared error = 14.84000015258789
model saved : model_2
Fold 2:  59 epochs, corr =  0.92213
[32m[1mAverage  corr = 0.92117[0m


In [14]:
# Load the sparse test inputs (path derived from DATA_DIR), project them with
# the same fitted transformer that is loaded as pca_train, and keep exactly as
# many leading columns as the training matrix X has, so the two cannot drift.
multi_test_x = scipy.sparse.load_npz(os.path.join(DATA_DIR, "test_multi_inputs_values.sparse.npz"))
multi_test_x = pca_train.transform(multi_test_x)
multi_test_x = multi_test_x[:, :X.shape[1]]
multi_test_x.shape

(55935, 40)

In [15]:
# The output width is taken from the fitted target decomposition instead of a
# hard-coded column count.
n_target_columns = pca_target.components_.shape[1]

# NOTE(review): the accumulator is float16 (presumably to limit memory), so
# each fold's contribution is rounded to half precision when added.
test_pred = np.zeros((multi_test_x.shape[0], n_target_columns), dtype='float16')

# Average the fold models: each one predicts in the reduced target space, and
# the prediction is mapped back to the original columns via components_.
# NOTE(review): if pca_target is a centred PCA (not TruncatedSVD), its mean_
# is not added back here -- confirm against how pca_target was fitted.
for fold in range(N_SPLIT):
    print(f'fold {fold} prediction')
    model = tf.keras.models.load_model(f"model_{fold}")
    test_pred += (model.predict(multi_test_x)@pca_target.components_)/N_SPLIT

    gc.collect()

fold 0 prediction
fold 1 prediction
fold 2 prediction


In [16]:
%%time
# Read the table of rows and columns required for submission
eval_ids = pd.read_parquet("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/evaluation.parquet")

# Convert the string columns to more efficient categorical types
#eval_ids.cell_id = eval_ids.cell_id.apply(lambda s: int(s, base=16))

eval_ids.cell_id = eval_ids.cell_id.astype(pd.CategoricalDtype())
eval_ids.gene_id = eval_ids.gene_id.astype(pd.CategoricalDtype())

CPU times: user 17.9 s, sys: 2.05 s, total: 19.9 s
Wall time: 21 s


In [17]:
# Prepare an empty series which will be filled with predictions.
# No data is passed, so every entry starts as NaN; the index is a MultiIndex
# built from all columns of eval_ids.
submission = pd.Series(name='target',
                       index=pd.MultiIndex.from_frame(eval_ids), 
                       dtype=np.float32)
submission

row_id    cell_id       gene_id        
0         c2150f55becb  CD86              NaN
1         c2150f55becb  CD274             NaN
2         c2150f55becb  CD270             NaN
3         c2150f55becb  CD155             NaN
4         c2150f55becb  CD112             NaN
                                           ..
65744175  2c53aa67933d  ENSG00000134419   NaN
65744176  2c53aa67933d  ENSG00000186862   NaN
65744177  2c53aa67933d  ENSG00000170959   NaN
65744178  2c53aa67933d  ENSG00000107874   NaN
65744179  2c53aa67933d  ENSG00000166012   NaN
Name: target, Length: 65744180, dtype: float32

In [18]:
%%time
y_columns = np.load("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/train_multi_targets_idxcol.npz",
                   allow_pickle=True)["columns"]

test_index = np.load("/content/drive/MyDrive/Colab Notebooks/kaggle/MSCI/dataset/test_multi_inputs_idxcol.npz",
                    allow_pickle=True)["index"]

CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 1.13 s


In [19]:
# Lookup tables from a label to its position in the source array.
# The length assertions fail if any label occurs more than once.
cell_dict = {cell_label: position for position, cell_label in enumerate(test_index)}
assert len(cell_dict)  == len(test_index)

gene_dict = {gene_label: position for position, gene_label in enumerate(y_columns)}
assert len(gene_dict) == len(y_columns)

In [20]:
# Translate the ids of every evaluation row into positions inside test_pred:
# cell_id -> row number via cell_dict, gene_id -> column number via gene_dict.
# Ids that are missing from a lookup table are mapped to -1.
eval_ids_cell_num = eval_ids.cell_id.apply(lambda x:cell_dict.get(x, -1))
eval_ids_gene_num = eval_ids.gene_id.apply(lambda x:gene_dict.get(x, -1))

# True only for rows whose cell AND gene were both found in the lookup tables.
valid_multi_rows = (eval_ids_gene_num !=-1) & (eval_ids_cell_num!=-1)

In [21]:
# Copy the predictions into the matched rows of the submission.
# iloc is given a plain boolean NumPy array: positional indexing documents
# boolean arrays only, and a boolean Series carries an index that iloc does
# not align (it is rejected outright when reading).
submission.iloc[valid_multi_rows.to_numpy()] = test_pred[
    eval_ids_cell_num[valid_multi_rows].to_numpy(),
    eval_ids_gene_num[valid_multi_rows].to_numpy(),
]

In [22]:
# Drop the large lookup/index helpers that are no longer needed and ask the
# garbage collector to reclaim them.
del eval_ids_cell_num, eval_ids_gene_num, valid_multi_rows, eval_ids, test_index, y_columns
gc.collect()

139

In [23]:
# Quick look at the first rows; rows that were not matched above remain NaN.
submission.head()

row_id  cell_id       gene_id
0       c2150f55becb  CD86      NaN
1       c2150f55becb  CD274     NaN
2       c2150f55becb  CD270     NaN
3       c2150f55becb  CD155     NaN
4       c2150f55becb  CD112     NaN
Name: target, dtype: float32

In [24]:
# Replace the (row_id, cell_id, gene_id) MultiIndex with a fresh 0..n-1
# positional index named 'row_id'.
submission.index = pd.RangeIndex(len(submission), name='row_id')

In [25]:
# Create the destination folder first: to_csv fails on a missing directory,
# which would otherwise lose the result after all the training has run.
os.makedirs(os.path.dirname(SUBMISSION_NAME), exist_ok=True)

# Rows that were never filled are still NaN and are written as empty fields.
submission.to_csv(SUBMISSION_NAME)