# 1. Installation of packages

In [1]:
#Installation of AutoGluon
!pip3 install -U pip
!pip3 install -U setuptools wheel
!pip3 install torch==1.12+cpu torchvision==0.13.0+cpu torchtext==0.13.0 -f https://download.pytorch.org/whl/cpu/torch_stable.html
!pip3 install autogluon

Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 22.1.2
    Uninstalling pip-22.1.2:
      Successfully uninstalled pip-22.1.2
Successfully installed pip-22.3.1
Collecting setuptools
  Downloading setuptools-65.6.3-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting wheel
  Downloading wheel-0.38.4-py3-none-any.whl (36 kB)
Installing collected packages: wheel, setuptools
  Attempting uninstall: wheel
    Found existing installation: wheel 0.37.1
    Uninstalling wheel-0.37.1:
      Successfully uninstalled wheel-0.37.1
  Attempting uninstall: setuptools
    Found existing installation: setuptools 59.8.0
    Uninstalling setupto

In [2]:
#Restating the kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# 2. Importation of packages and files

In [3]:
#Import packages
import numpy as np
import pandas as pd
import seaborn as sns
import gc
from tqdm.notebook import tqdm_notebook
import autogluon
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

#Import files from Kaggle
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/open-problems-multimodal/sample_submission.csv
/kaggle/input/open-problems-multimodal/train_cite_targets.h5
/kaggle/input/open-problems-multimodal/metadata_cite_day_2_donor_27678.csv
/kaggle/input/open-problems-multimodal/test_multi_inputs.h5
/kaggle/input/open-problems-multimodal/evaluation_ids.csv
/kaggle/input/open-problems-multimodal/train_cite_inputs.h5
/kaggle/input/open-problems-multimodal/train_multi_targets.h5
/kaggle/input/open-problems-multimodal/train_multi_inputs.h5
/kaggle/input/open-problems-multimodal/metadata.csv
/kaggle/input/open-problems-multimodal/test_cite_inputs_day_2_donor_27678.h5
/kaggle/input/open-problems-multimodal/test_cite_inputs.h5


# 3. Opening files

In [4]:
df_features = pd.read_hdf("/kaggle/input/open-problems-multimodal/train_cite_inputs.h5")#features_input

In [5]:
df_target = pd.read_hdf("/kaggle/input/open-problems-multimodal/train_cite_targets.h5")#target

# 4. Data Pre-processing

## 4.1. Removing the columns with only 0

In [6]:
df_features.loc['Total']= df_features.sum()
unwanted = [column for column in df_features.columns 
            if df_features[column]["Total"]==0]
df_features.drop(unwanted, axis=1, inplace=True)
df_features.drop("Total", inplace=True)

## 4.2. Finding the genes directly linked to the target protein levels

We use naming conventions to find the genes directly linked to our targeted protein levels by finding in each gene if the name of the protein is contained in it.

In [7]:
features_list = []
for protein in df_target.columns:
    for gene in df_features.columns:
        if protein in gene:
            features_list.append(gene)
features_list = np.unique(features_list)

## 4.3. Performing PCA on the other features

In [8]:
#Spliting the training set into two: the genes directly linked to our proteins and the others
df_to_pca = df_features.drop(features_list, axis=1)
df_features = df_features[features_list]

In [9]:
# Separating out the features
x = df_to_pca.values

In [10]:
#Performing PCA and keeping the 500 biggest components
pca = PCA(n_components=500)
principalComponents = pca.fit_transform(x)

In [11]:
#Creating a new features dataframe using PCA and our directly-linked genes
df_features = df_features.join(pd.DataFrame(data=principalComponents, index=df_features.index), on="cell_id")

In [12]:
#Optimizing the RAM by removing useless variables
del pca, df_to_pca, x, principalComponents
gc.collect()

126

## 4.4. Creating the train and test datasets

In [13]:
metadata = pd.read_csv("/kaggle/input/open-problems-multimodal/metadata.csv")#reading the metadata file 
metadata = metadata[metadata["cell_id"].isin(df_features.index)].drop(["technology"], axis=1)#keeping the metadata of our citeSeq data and droping useless columns

In [14]:
#Creating the train and test dataset based on the dates. Splting on a 80/20 split and keeping the days 2/3 and a part of day 4 on the train set and tesing on the rest.
df_autogluon = metadata.join(df_features, on="cell_id").join(df_target, on="cell_id")
df_autogluon_train_1 = df_autogluon[df_autogluon["day"]<4]
df_autogluon_day_4 = df_autogluon[df_autogluon["day"]==4]
df_autogluon_train_2 = df_autogluon_day_4.sample(n = int(df_autogluon.shape[0]*0.80 - df_autogluon_train_1.shape[0]), random_state=4)
df_autogluon_test = df_autogluon_day_4.drop(df_autogluon_train_2.index)

del df_autogluon, df_autogluon_day_4
gc.collect()

df_autogluon_train = pd.concat([df_autogluon_train_1, df_autogluon_train_2])

del df_autogluon_train_1, df_autogluon_train_2
gc.collect()

0

# 5. Creation of the models using AutoGluon

## 5.1. The scoring function

The competition had a special metric: for every row, it computes the Pearson correlation between y_true and y_pred, and then all these correlation coefficients are averaged.

In order to compare our results with other competitors, we use the same metric.

In [15]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules. 
    
    It is assumed that the predictions are not constant.
    
    Returns the average of each sample's Pearson correlation coefficient"""
    if type(y_true) == pd.DataFrame: y_true = y_true.values
    if type(y_pred) == pd.DataFrame: y_pred = y_pred.values
    corrsum = 0
    for i in range(len(y_true)):
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / len(y_true)

## 5.2. Model training and testing per donor

In [16]:
#list of all the donors
donors = np.unique(df_autogluon_train["donor"])

In [17]:
predicator_list = []

#Creating model with LightGBM and XGBoost
hyperparameters = {
    'GBM': [
        {}
    ]
}

train_va = []
test_va = []

train_preds = []
test_preds = []

starting_gene = 105
genes_to_predict = 140

for i in tqdm_notebook(range(starting_gene, genes_to_predict), desc="AutoGluon Progress"):
    for donor in donors:
        df_train = df_autogluon_train.loc[df_autogluon_train.index[np.where(df_autogluon_train[["donor"]]==donor)[0]]]
        df_test = df_autogluon_test.loc[df_autogluon_test.index[np.where(df_autogluon_test[["donor"]]==donor)[0]]]
        predicator = TabularPredictor(label=df_target.columns[i], problem_type="regression", eval_metric="pearsonr")
        df_train = df_train[df_features.columns.tolist()+["cell_type"]+[df_target.columns[i]]]
        df_test = df_test[df_features.columns.tolist()+["cell_type"]+[df_target.columns[i]]]
        predicator.fit(df_train, presets='best_quality', hyperparameters=hyperparameters, num_bag_folds=4)
        train_va = np.concatenate((train_va, df_train[df_target.columns[i]].to_numpy()), axis=None)
        test_va = np.concatenate((test_va, df_test[df_target.columns[i]].to_numpy()), axis=None)
        train_preds = np.concatenate((train_preds, predicator.predict(df_train)), axis=None)
        test_preds = np.concatenate((test_preds, predicator.predict(df_test)), axis=None)
        predicator_list.append(predicator)
        
train_va = np.reshape(train_va, (genes_to_predict-starting_gene, -1))
test_va = np.reshape(test_va, (genes_to_predict-starting_gene, -1))

train_preds = np.reshape(train_preds, (genes_to_predict-starting_gene, -1))
test_preds = np.reshape(test_preds, (genes_to_predict-starting_gene, -1))

AutoGluon Progress:   0%|          | 0/35 [00:00<?, ?it/s]

No path specified. Models will be saved in: "AutogluonModels/ag-20221217_173923/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=4, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20221217_173923/"
AutoGluon Version:  0.6.1
Python Version:     3.7.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Sat Dec 10 10:12:26 UTC 2022
Train Data Rows:    17919
Train Data Columns: 614
Label Column: CD127
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    31724.22 MB
	Train Data (Original)  Memory Usage: 45.02 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		

No path specified. Models will be saved in: "AutogluonModels/ag-20221217_174149/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=4, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20221217_174149/"
AutoGluon Version:  0.6.1
Python Version:     3.7.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Sat Dec 10 10:12:26 UTC 2022
Train Data Rows:    19648
Train Data Columns: 614
Label Column: CD127
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    31306.51 MB
	Train Data (Original)  Memory Usage: 49.36 MB (0.2% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		

In [18]:
np.save("train_va.npy", train_va)
np.save("test_va.npy", test_va)

np.save("train_preds.npy", train_preds)
np.save("test_preds.npy", test_preds)

In [19]:
mse_train = mean_squared_error(train_va, train_preds)
corrscore_train = correlation_score(train_va, train_preds)

mse_test = mean_squared_error(test_va, test_preds)
corrscore_test = correlation_score(test_va, test_preds)

## 5.3. Model results

In [20]:
print("The MSE train is:", mse_train)
print("The correlation score train is:", corrscore_train)
print("The MSE test is:", mse_test)
print("The correlation score test is:", corrscore_test)

The MSE train is: 2.520250059004459
The correlation score train is: 0.4850869862418375
The MSE test is: 3.351813003865147
The correlation score test is: 0.39516252988488515
