In [1]:
import pickle
# Restore the dictionary of stored model results. It is indexed further down as
# [str(donor combination)][model name]["train_predictions" | "test_predictions"].
# Only unpickle files you trust: pickle.load can execute code embedded in the file.
with open('/kaggle/input/result-models-smlg-pickle/result_dictionnary.pickle', mode='rb') as pickle_file:
    trained_models_with_results = pickle.load(pickle_file)

In [2]:
import gc
# Run a full garbage-collection pass; the value displayed by the cell is the
# number of unreachable objects the collector found.
gc.collect()

21

In [3]:
#Import packages
import numpy as np
import time
import pandas as pd

In [4]:
def mse(y_true, y_pred):
    """Return the mean squared error over every element of two 2-D tables.

    Parameters
    ----------
    y_true, y_pred : pandas.DataFrame or array-like, shape (n_rows, n_cols)
        Reference values and predictions. Rows are compared positionally;
        DataFrame labels are ignored.

    Returns
    -------
    float
        Sum of squared differences divided by ``n_rows * n_cols``.

    Raises
    ------
    ValueError
        If the inputs are not 2-D, are empty, or do not have the same shape.
    """
    # isinstance (rather than ``type(x) == pd.DataFrame``) also unwraps DataFrame
    # subclasses; otherwise ``y[i]`` below would select a *column* by label.
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.ndim != 2 or y_pred.ndim != 2:
        raise ValueError("mse expects 2-D inputs (rows x columns)")
    # Without this check a longer y_pred would be silently truncated to
    # len(y_true) rows and a shorter one would fail with an IndexError.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "shape mismatch: y_true %s vs y_pred %s" % (y_true.shape, y_pred.shape)
        )
    if y_true.size == 0:
        raise ValueError("mse is undefined for empty inputs")

    # Accumulate one row at a time so each temporary is only one row long.
    sse = 0
    for true_row, pred_row in zip(y_true, y_pred):
        sse += np.sum(np.square(np.subtract(true_row, pred_row)))
    return sse / y_true.size

In [5]:
def correlation_score(y_true, y_pred):
    """Score the predictions according to the competition rules.

    The score is the average, over all rows (samples), of the Pearson
    correlation coefficient between a row of ``y_true`` and the matching row
    of ``y_pred``.

    It is assumed that no row is constant: a constant row has zero variance,
    so its coefficient is undefined and the returned average becomes NaN.

    Parameters
    ----------
    y_true, y_pred : pandas.DataFrame or array-like, shape (n_rows, n_cols)
        Reference values and predictions. Rows are compared positionally;
        DataFrame labels are ignored.

    Returns
    -------
    float
        Mean of the per-row Pearson correlation coefficients.

    Raises
    ------
    ValueError
        If the inputs are not 2-D, are empty, or do not have the same shape.
    """
    # isinstance (rather than ``type(x) == pd.DataFrame``) also unwraps DataFrame
    # subclasses; otherwise ``y[i]`` below would select a *column* by label.
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.ndim != 2 or y_pred.ndim != 2:
        raise ValueError("correlation_score expects 2-D inputs (rows x columns)")
    # Without this check a longer y_pred would be silently truncated to
    # len(y_true) rows and a shorter one would fail with an IndexError.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "shape mismatch: y_true %s vs y_pred %s" % (y_true.shape, y_pred.shape)
        )
    if y_true.size == 0:
        raise ValueError("correlation_score is undefined for empty inputs")

    corrsum = 0
    for true_row, pred_row in zip(y_true, y_pred):
        # np.corrcoef returns the 2x2 correlation matrix of the two rows;
        # the off-diagonal entry is their Pearson coefficient.
        corrsum += np.corrcoef(true_row, pred_row)[1, 0]
    return corrsum / len(y_true)

In [6]:
# Label archives that accompany the sparse train matrices. Their "index" entry is
# used below as the row labels (cell ids) of the input/target DataFrames.
# NOTE(review): allow_pickle=True is presumably required because the labels are
# stored as object arrays; it is only safe for files from a trusted source.
train_multi_inputs_idxcol = np.load("/kaggle/input/multimodal-single-cell-as-sparse-matrix/train_multi_inputs_idxcol.npz", allow_pickle=True)
train_multi_targets_idxcol = np.load("/kaggle/input/multimodal-single-cell-as-sparse-matrix/train_multi_targets_idxcol.npz", allow_pickle=True)

In [7]:
# Number of leading columns (components) kept from the stored input and target
# arrays when they are loaded in the next cell.
input_to_keep = 40
targets_to_keep = 40

In [8]:
# Load the stored arrays and keep only their first `input_to_keep` /
# `targets_to_keep` columns.
# NOTE(review): judging by the file names these are 500-component SVD
# projections of the train inputs and targets - confirm against the dataset.
input_values = np.load("/kaggle/input/multiome-svd-500-components/train_multi_input_500_svd.npz")["arr_0"][:,:input_to_keep]
targets_values = np.load("/kaggle/input/multiome-svd-500-components/train_multi_targets_500_svd.npz")["arr_0"][:,:targets_to_keep]

In [9]:
# Wrap the reduced arrays in DataFrames: rows are labelled with the "index"
# entries of the label archives, columns with "<n>_input" / "<n>_targets".
input_df = pd.DataFrame(
    input_values,
    index=train_multi_inputs_idxcol["index"],
    columns=[str(component) + "_input" for component in range(input_to_keep)],
)
targets_df = pd.DataFrame(
    targets_values,
    index=train_multi_targets_idxcol["index"],
    columns=[str(component) + "_targets" for component in range(targets_to_keep)],
)

# The raw arrays now live inside the frames. targets_to_keep is deliberately not
# deleted here: it is read again when the SVD basis is loaded.
del input_values, targets_values, input_to_keep
gc.collect()

53

In [10]:
# Load the per-cell metadata, keep only the cells listed in the training-input
# index, and drop the "technology" column.
metadata = (
    pd.read_csv("/kaggle/input/open-problems-multimodal/metadata.csv")
    .loc[lambda frame: frame["cell_id"].isin(train_multi_inputs_idxcol["index"])]
    .drop(columns=["technology"])
)
# One-hot encode the cell types in a separate frame, then remove the raw column.
df_cell_type_dummies = pd.get_dummies(metadata["cell_type"])
metadata = metadata.drop(columns=["cell_type"])
# "<day>_<donor>" label identifying the (day, donor) group each cell belongs to.
metadata["group"] = metadata["day"].astype(str) + "_" + metadata["donor"].astype(str)

In [11]:
# Assemble one row per cell: metadata, the one-hot cell types (aligned on the
# row index), then the reduced inputs and targets (looked up through "cell_id").
df_database = (
    metadata
    .join(df_cell_type_dummies)
    .join(input_df, on="cell_id")
    .join(targets_df, on="cell_id")
)
# Sorted distinct "<day>_<donor>" labels.
groups = np.unique(metadata["group"])

# Everything needed is now inside df_database. train_multi_targets_idxcol is
# kept because it is used again in section 2.
del df_cell_type_dummies, input_df, targets_df, train_multi_inputs_idxcol, metadata
gc.collect()

# Training pool: every cell that was not measured on day 7.
df_train = df_database.loc[df_database["day"] != 7]

In [12]:
# Keep the first `targets_to_keep` rows of the stored array. Predictions made in
# the reduced target space are multiplied by this matrix (np.dot) further down
# before being compared with the real targets.
# NOTE(review): presumably the V^T factor of the targets' truncated SVD, judging
# by the file name - confirm against the dataset.
train_multi_targets_500_svd_VT = np.load("/kaggle/input/multiome-svd-500-components/train_multi_targets_500_svd_VT.npz")["arr_0"][:targets_to_keep,:]
del targets_to_keep
gc.collect()

29

In [13]:
# Donor combinations to evaluate: every non-empty subset of the three donor ids.
# For each one, the non-day-7 cells of those donors form the train split.
# str(combination) is used as a key into trained_models_with_results, so the
# list formatting (order, spacing) must match the keys stored in the pickle.
combs_donors = [[13176], [31800], [32606], [13176, 31800], [31800, 32606], [13176, 32606], [13176, 31800, 32606]]

In [14]:
# Names of the models to evaluate. Each name is used verbatim as a key into
# trained_models_with_results[str(donor combination)].
models = [
    'Linear Regression',
    'ElasticNet',
    'Ridge Regression',
    'Lasso',
    'BayesianRidge',
    'ARDRegression'
    ]

# 1. Testing our models' results on the data

In [15]:
def _score_in_target_space(cell_ids, svd_predictions):
    """Score stored predictions against the measured targets of ``cell_ids``.

    Parameters
    ----------
    cell_ids : pandas.Series
        Cell ids selecting (and ordering) the ground-truth rows.
    svd_predictions : numpy.ndarray
        Stored predictions; they are transposed and multiplied by
        ``train_multi_targets_500_svd_VT`` before scoring.
        NOTE(review): assumes the predictions are stored in the same cell
        order as ``cell_ids`` - confirm against the training notebook.

    Returns
    -------
    tuple
        ``(mse, pearson)`` as returned by ``mse`` and ``correlation_score``.
    """
    # The full target table is read and only the requested rows are kept; it is
    # re-read for every call so that it is never held alongside the predictions
    # of another split.
    real_targets = pd.read_hdf("/kaggle/input/open-problems-multimodal/train_multi_targets.h5").loc[cell_ids]
    gc.collect()
    predictions = np.dot(svd_predictions.T, train_multi_targets_500_svd_VT)
    scores = (mse(real_targets, predictions), correlation_score(real_targets, predictions))
    # Release the two large tables before returning. gc.collect() is synchronous,
    # so no pause is needed around it.
    del real_targets, predictions
    gc.collect()
    return scores


for comb_donor in combs_donors:
    # The train/test split depends only on the donor combination, so it is
    # computed once here rather than once per model.
    train_cell_ids = df_train.loc[df_train["donor"].isin(comb_donor), "cell_id"]
    # Test split: every cell of df_database that is not in the train split.
    test_cell_ids = df_database.loc[~df_database["cell_id"].isin(train_cell_ids), "cell_id"]

    for model in models:
        stored_results = trained_models_with_results[str(comb_donor)][model]

        print("Test on", comb_donor, "from model:", model)
        print("Testing on train data")
        mse_train, pearson_train = _score_in_target_space(train_cell_ids, stored_results["train_predictions"])
        print("MSE Train:", mse_train)
        print("Pearson Train:", pearson_train)

        print("Testing on train data over - Testing on test data")
        mse_test, pearson_test = _score_in_target_space(test_cell_ids, stored_results["test_predictions"])
        print("MSE Test:", mse_test)
        print("Pearson Test:", pearson_test)

Test on [13176] from model: Linear Regression
Testing on train data
MSE Train: 2.0079933571407973
Pearson Train: 0.6774791943165258
Testing on train data over - Testing on test data
MSE Test: 2.0634971667293285
Pearson Test: 0.6534106182190976
Test on [13176] from model: ElasticNet
Testing on train data
MSE Train: 2.033119657800535
Pearson Train: 0.6733163363116472
Testing on train data over - Testing on test data
MSE Test: 2.1063749909146843
Pearson Test: 0.6464517590547172
Test on [13176] from model: Ridge Regression
Testing on train data
MSE Train: 2.0080587203087044
Pearson Train: 0.6774593556030049
Testing on train data over - Testing on test data
MSE Test: 2.0640543744778315
Pearson Test: 0.6532929651066602
Test on [13176] from model: Lasso
Testing on train data
MSE Train: 2.009180220261093
Pearson Train: 0.677230377906204
Testing on train data over - Testing on test data
MSE Test: 2.0669969601464575
Pearson Test: 0.652735770763913
Test on [13176] from model: BayesianRidge
Testin

In [16]:
# Run a full garbage-collection pass before the next section; the value
# displayed by the cell is the number of unreachable objects the collector found.
gc.collect()

21

# 2. Finding the maximum score we can achieve by comparing the TruncatedSVD-reconstructed targets to the real data

In [17]:
# Reduced targets of every cell, indexed by cell id. They are the trailing
# columns of df_database; their count is read from the basis matrix instead of
# being hard-coded, so it always matches the np.dot below.
n_target_components = train_multi_targets_500_svd_VT.shape[0]
svd_targets_by_cell = df_database.set_index("cell_id").iloc[:, -n_target_components:]


def _svd_ceiling_scores(cell_ids):
    """Score the reconstruction of the true reduced targets for ``cell_ids``.

    The reduced targets of the requested cells are multiplied by
    ``train_multi_targets_500_svd_VT`` and compared with the measured targets.
    This is the best score a model predicting in the reduced space can reach.

    Parameters
    ----------
    cell_ids : pandas.Series
        Cell ids selecting (and ordering) both the ground-truth rows and the
        reduced-target rows.

    Returns
    -------
    tuple
        ``(mse, pearson)`` as returned by ``mse`` and ``correlation_score``.
    """
    real_targets = pd.read_hdf("/kaggle/input/open-problems-multimodal/train_multi_targets.h5").loc[cell_ids]
    gc.collect()
    # Rows are selected by cell id *before* the product, so only the requested
    # cells are reconstructed (not the whole data set) and every row is matched
    # to its cell by label rather than by position.
    max_predictions = np.dot(svd_targets_by_cell.loc[cell_ids].to_numpy(), train_multi_targets_500_svd_VT)
    scores = (mse(real_targets, max_predictions), correlation_score(real_targets, max_predictions))
    del real_targets, max_predictions
    gc.collect()
    return scores


for comb_donor in combs_donors:
    print("Test on", str(comb_donor), "for max value")
    print("Testing on train data")
    # Train split: non-day-7 cells of the donors in this combination.
    train_cell_ids = df_train.loc[df_train["donor"].isin(comb_donor), "cell_id"]
    mse_train, pearson_train = _svd_ceiling_scores(train_cell_ids)
    print("MSE Train:", mse_train)
    print("Pearson Train:", pearson_train)

    print("Testing on train data over - Testing on test data")
    # Test split: every cell of df_database that is not in the train split.
    test_cell_ids = df_database.loc[~df_database["cell_id"].isin(train_cell_ids), "cell_id"]
    mse_test, pearson_test = _svd_ceiling_scores(test_cell_ids)
    print("MSE Test:", mse_test)
    print("Pearson Test:", pearson_test)

Test on [13176] for max value
Testing on train data
MSE Train: 1.9196821889867746
Pearson Train: 0.6890853888563644
Testing on train data over - Testing on test data
MSE Test: 1.9267642844175619
Pearson Test: 0.6741239522465106
Test on [31800] for max value
Testing on train data
MSE Train: 1.9329019310444036
Pearson Train: 0.6866911067524338
Testing on train data over - Testing on test data
MSE Test: 1.9218530145939359
Pearson Test: 0.6747890761066963
Test on [32606] for max value
Testing on train data
MSE Train: 1.9206671035776006
Pearson Train: 0.6898810868739786
Testing on train data over - Testing on test data
MSE Test: 1.926325423638778
Pearson Test: 0.6740799046642877
Test on [13176, 31800] for max value
Testing on train data
MSE Train: 1.9264377528257797
Pearson Train: 0.6878618607775832
Testing on train data over - Testing on test data
MSE Test: 1.9230948710797728
Pearson Test: 0.6666106353083232
Test on [31800, 32606] for max value
Testing on train data
MSE Train: 1.9270522703