# LightGBM (CITEseq)

In [1]:
import os, gc
import numpy as np
import pandas as pd

import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold
from sklearn.multioutput import MultiOutputRegressor

In [None]:
#pip install --quiet tables

## Import Raw Data

In [None]:
# Root folder that holds every competition file referenced below.
DATA_DIR = "../data/open-problems-multimodal"


def _data_file(filename):
    """Return the path of *filename* inside DATA_DIR."""
    return os.path.join(DATA_DIR, filename)


# Per-cell metadata table.
FP_CELL_METADATA = _data_file("metadata.csv")

# CITEseq: training inputs (gene expressions), training targets
# (protein levels) and the test inputs.
FP_CITE_TRAIN_INPUTS = _data_file("train_cite_inputs.h5")
FP_CITE_TRAIN_TARGETS = _data_file("train_cite_targets.h5")
FP_CITE_TEST_INPUTS = _data_file("test_cite_inputs.h5")

# Multiome: training inputs (chromatin accessibility), training targets
# (gene expression) and the test inputs.
FP_MULTIOME_TRAIN_INPUTS = _data_file("train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = _data_file("train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = _data_file("test_multi_inputs.h5")

# Sample submission file and the evaluation id table.
FP_SUBMISSION = _data_file("sample_submission.csv")
FP_EVALUATION_IDS = _data_file("evaluation_ids.csv")

## Evaluation Metric

In [None]:
def correlation_score(y_true, y_pred):
    """Return the mean row-wise Pearson correlation of targets and predictions.

    Both inputs are converted with ``np.asarray``, so NumPy arrays,
    DataFrames (including DataFrame subclasses) and nested sequences are all
    indexed row by row.

    Args:
        y_true: 2-D array-like of reference values, one row per sample.
        y_pred: 2-D array-like of predictions with the same shape as
            ``y_true``.

    Returns:
        The average, over all rows, of the Pearson correlation coefficient
        between the matching rows of ``y_true`` and ``y_pred``.

    Raises:
        ValueError: If the two inputs differ in shape or contain no rows.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # A silent mismatch would otherwise score only a prefix of the rows.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"Shapes are different: {y_true.shape} vs {y_pred.shape}"
        )
    if len(y_true) == 0:
        raise ValueError("Cannot compute a correlation score without rows")

    # np.corrcoef returns a 2x2 matrix; [1, 0] is the cross-correlation entry.
    corrsum = sum(
        np.corrcoef(row_true, row_pred)[1, 0]
        for row_true, row_pred in zip(y_true, y_pred)
    )
    return corrsum / len(y_true)

## Data Preprocessing

In [None]:
# Load the raw metadata.csv file and keep only the CITEseq cells.
# .copy() makes the filtered frame independent of the full table, so adding
# the 'comb' column below does not raise pandas' SettingWithCopyWarning.
metadata_df = pd.read_csv(FP_CELL_METADATA, index_col='cell_id')
metadata_df = metadata_df[metadata_df.technology == "citeseq"].copy()

# Generate donor-time labels for the cells: one boolean mask per
# (donor, day) pair, donor-major, so the labels run 1..16 in the order
# (27678, day 2), (27678, day 3), ..., (32606, day 7).
DONORS = (27678, 13176, 31800, 32606)
DAYS = (2, 3, 4, 7)
conditions = [
    metadata_df['donor'].eq(donor) & metadata_df['day'].eq(day)
    for donor in DONORS
    for day in DAYS
]

# Label assigned to each condition, in the same order as `conditions`.
values = list(range(1, len(conditions) + 1))

# Rows that match none of the conditions receive np.select's default of 0.
metadata_df['comb'] = np.select(conditions, values)

# Reindex the metadata to the row order of the training inputs. The inputs
# are loaded only for their index and released immediately to free memory.
X = pd.read_hdf(FP_CITE_TRAIN_INPUTS)
cell_index = X.index
meta = metadata_df.reindex(cell_index)
del X
gc.collect()

# Import the feature-processed training set:
# 512 truncated SVD dimensions + 364 important columns.
cite_train_x = pd.read_csv('../result/fe/X_876.csv').values
cite_train_y = pd.read_hdf(FP_CITE_TRAIN_TARGETS).values
print(cite_train_x.shape)
print(cite_train_y.shape)

# Import the feature-processed test set.
cite_test_x = pd.read_csv('../result/fe/Xt_876.csv').values

## Modelling and Evaluation

In [None]:
# Keyword arguments handed to every lgb.LGBMRegressor built in the
# training loop.
# NOTE(review): per the LightGBM docs, `subsample` appears to take effect
# only when `subsample_freq` > 0, which is not set here -- confirm whether
# row subsampling is actually intended.
params = dict(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=10,
    num_leaves=200,
    min_child_samples=250,
    colsample_bytree=0.8,
    subsample=0.6,
    seed=1,
)

In [None]:
# Running sum of the test-set predictions of all fold models. It starts as
# the scalar 0 and becomes an array after the first addition. Note that the
# fold predictions are summed, not averaged.
test_pred = 0

# One fold per distinct donor-time label found in the training metadata.
N_SPLITS_ANN = meta['comb'].nunique()

# GroupKFold keeps all cells of a donor-time group in the same fold; with as
# many folds as groups, each validation fold should hold a single group.
kf = GroupKFold(n_splits=N_SPLITS_ANN)

for fold, (idx_tr, idx_va) in enumerate(kf.split(cite_train_x, groups=meta.comb)):

    # Drop the previous fold's model before fitting a new one so its memory
    # can be reclaimed.
    model = None
    gc.collect()

    # Training / validation partitions for this fold.
    X_train, y_train = cite_train_x[idx_tr], cite_train_y[idx_tr]
    X_val, y_val = cite_train_x[idx_va], cite_train_y[idx_va]

    # MultiOutputRegressor wraps LightGBM so that multiple target columns
    # can be fitted.
    model = MultiOutputRegressor(lgb.LGBMRegressor(**params))
    model.fit(X_train, y_train)

    # Score the held-out group with MSE and the correlation metric.
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    corrscore = correlation_score(y_val, y_pred)
    print(mse, corrscore)

    # Add this fold's test-set predictions to the running sum.
    test_pred = model.predict(cite_test_x) + test_pred

In [None]:
# Fill the leading rows of the sample submission with the flattened CITEseq
# predictions and write the result to disk.
submission = pd.read_csv(FP_SUBMISSION)

# Row-major flattening of the prediction matrix.
flat_pred = test_pred.reshape(-1)

# .loc slices are label-inclusive, so the last label is size - 1. Deriving
# the row count from the predictions replaces the hard-coded 48663*140 and
# keeps the slice and the assigned values the same length.
# Assumes the submission keeps read_csv's default 0..n-1 row labels and
# that the CITEseq rows come first -- TODO confirm against the id mapping.
submission.loc[:flat_pred.size - 1, 'target'] = flat_pred
submission.to_csv('../result/cite/LGBM_submission.csv', index=False)