# Feature Engineering (Multiome)

In [1]:
import os, gc
import numpy as np
import pandas as pd

import scipy
import scipy.sparse

import sklearn
import sklearn.decomposition
import sklearn.preprocessing

## Import the Raw Data

In [None]:
# Root folder holding the raw competition files; every path below is built from it.
DATA_DIR = "../data/open-problems-multimodal"

# Cell metadata table.
FP_CELL_METADATA = os.path.join(DATA_DIR, "metadata.csv")

# CITE-seq files, in order:
#   training inputs (gene expressions), training targets (protein levels), test inputs.
FP_CITE_TRAIN_INPUTS, FP_CITE_TRAIN_TARGETS, FP_CITE_TEST_INPUTS = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "train_cite_inputs.h5",
        "train_cite_targets.h5",
        "test_cite_inputs.h5",
    )
)

# Multiome files, in order:
#   training inputs (chromatin accessibility), training targets (gene expression), test inputs.
FP_MULTIOME_TRAIN_INPUTS, FP_MULTIOME_TRAIN_TARGETS, FP_MULTIOME_TEST_INPUTS = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        "train_multi_inputs.h5",
        "train_multi_targets.h5",
        "test_multi_inputs.h5",
    )
)

# Sample submission file and the evaluation-id table.
FP_SUBMISSION, FP_EVALUATION_IDS = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("sample_submission.csv", "evaluation_ids.csv")
)

## Strategy 1

1. dimension reduction with truncated SVD
2. standardization
3. selected the top 64 most informative features

In [None]:
# Strategy 1: truncated SVD -> per-row standardization -> keep the first 64 components.

#import the train and test sets as sparse matrices and combine them
X = scipy.sparse.load_npz("../data/multimodal-single-cell-as-sparse-matrix/train_multi_inputs_values.sparse.npz")
Xt = scipy.sparse.load_npz("../data/multimodal-single-cell-as-sparse-matrix/test_multi_inputs_values.sparse.npz")
#remember where the training rows end so the split below follows the data
#instead of relying on a hard-coded row count
n_train = X.shape[0]
both = scipy.sparse.vstack([X, Xt])

#dimension reduction with truncated SVD (fitted on train and test rows together)
pca = sklearn.decomposition.TruncatedSVD(n_components=512, random_state=64)
both = pca.fit_transform(both)

#standardization: axis=1 means every row is centered and scaled across its
#512 components (per-row, not per-column); this happens before the column cut
both -= both.mean(axis=1).reshape(-1,1)
both /= both.std(axis=1, ddof=1).reshape(-1,1)

#keep the first 64 SVD components
both = both[:,:64]

#split back into the training and test rows
X = both[:n_train]
Xt = both[n_train:]
del both
gc.collect()

#store the processed features (create the output folder if it is missing)
os.makedirs('../result/fe', exist_ok=True)
pd.DataFrame(X).to_csv('../result/fe/X_64.csv', index=False)
pd.DataFrame(Xt).to_csv('../result/fe/Xt_64.csv', index=False)

#read the stored features back as plain arrays; X0 / Xt0 hold the 64-feature set
X0 = pd.read_csv('../result/fe/X_64.csv').values
Xt0 = pd.read_csv('../result/fe/Xt_64.csv').values

## Strategy 2: 

1. standardization
2. applied L2 normalization and log1p transformation
3. dimension reduction with truncated SVD
4. standardization
5. selected the top 100 most informative features

## TF-IDF Transformation

In [2]:
class tfidfTransformer():
    """TF-IDF-style reweighting for dense arrays or scipy sparse matrices.

    ``fit`` stores one weight per column, ``n_rows / column_sum``.
    ``transform`` divides every row by its row sum and then multiplies each
    column by the stored weight. No logarithm is applied to the column weight.

    Rows or columns whose sum is zero get a factor of 0 instead of ``inf``,
    so all-zero rows/columns stay zero rather than turning into ``nan``.
    """

    def __init__(self):
        # Per-column weights; a 1-D float array once fit() has been called.
        self.idf = None
        self.fitted = False

    @staticmethod
    def _safe_divide(numerator, denominator):
        """Return ``numerator / denominator`` with 0 wherever the denominator is 0."""
        denominator = np.asarray(denominator, dtype=float)
        out_shape = np.broadcast(numerator, denominator).shape
        result = np.zeros(out_shape, dtype=float)
        # `where` leaves the pre-filled zeros untouched for zero denominators,
        # which also avoids divide-by-zero warnings.
        np.divide(numerator, denominator, out=result, where=denominator != 0)
        return result

    def fit(self, X):
        """Compute the per-column weights from ``X`` and return ``self``.

        X: 2-D dense array or scipy sparse matrix.
        """
        # np.asarray + ravel gives the same 1-D layout for dense and sparse input
        # (sparse matrices return a 2-D np.matrix from .sum()).
        column_sums = np.asarray(X.sum(axis=0), dtype=float).ravel()
        self.idf = self._safe_divide(X.shape[0], column_sums)
        self.fitted = True
        return self

    def transform(self, X):
        """Apply row normalization and the fitted column weights to ``X``.

        Returns a sparse matrix for sparse input and a dense array otherwise.

        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        if not self.fitted:
            raise RuntimeError('Transformer was not fitted on any data')
        row_sums = np.asarray(X.sum(axis=1), dtype=float).reshape(-1, 1)
        idf_row = np.asarray(self.idf, dtype=float).reshape(1, -1)
        if scipy.sparse.issparse(X):
            # Multiply by the reciprocal so the matrix stays sparse.
            tf = X.multiply(self._safe_divide(1.0, row_sums))
            return tf.multiply(idf_row)
        tf = self._safe_divide(np.asarray(X), row_sums)
        return tf * idf_row

    def fit_transform(self, X):
        """Fit on ``X`` and return the transformed ``X``."""
        self.fit(X)
        return self.transform(X)

In [None]:
# Strategy 2: L2 normalization -> TF-IDF -> L2 normalization -> log1p ->
# truncated SVD -> per-row standardization -> keep the first 100 components,
# then append them to the 64 features produced by Strategy 1 (X0 / Xt0).

#drop the Strategy 1 arrays before reloading the raw matrices
del X, Xt
gc.collect()

#import the train and test sets as sparse matrices and combine them
X = scipy.sparse.load_npz("../data/multimodal-single-cell-as-sparse-matrix/train_multi_inputs_values.sparse.npz")
Xt = scipy.sparse.load_npz("../data/multimodal-single-cell-as-sparse-matrix/test_multi_inputs_values.sparse.npz")
#remember where the training rows end so the split below follows the data
#instead of relying on a hard-coded row count
n_train = X.shape[0]
both = scipy.sparse.vstack([X, Xt])

#l2 normalization (each row scaled to unit length)
normalizer = sklearn.preprocessing.Normalizer(norm="l2")
both = normalizer.fit_transform(both)

#fit and TF-IDF transform the features
TfidfTransformer = tfidfTransformer()
both = TfidfTransformer.fit_transform(both)

#l2 normalization again, on the TF-IDF output
normalizer = sklearn.preprocessing.Normalizer(norm="l2")
both = normalizer.fit_transform(both)

#log1p transform of the values scaled by 1e4
both = np.log1p(both * 1e4)

#dimension reduction with truncated SVD (fitted on train and test rows together)
pca = sklearn.decomposition.TruncatedSVD(n_components=512, random_state=64)
both = pca.fit_transform(both)

#standardization: axis=1 means every row is centered and scaled across its
#512 components (per-row, not per-column); this happens before the column cut
both -= both.mean(axis=1).reshape(-1,1)
both /= both.std(axis=1, ddof=1).reshape(-1,1)

#keep the first 100 SVD components
both = both[:,:100]

#split the training and test set
X = both[:n_train]
Xt = both[n_train:]
del both
gc.collect()

#combine these 100 features with the previously obtained 64 features
#(X0 / Xt0 come from the Strategy 1 cell); result has 164 columns
X = np.hstack([X0, X])
Xt = np.hstack([Xt0, Xt])

#store the generated features as csv files (create the output folder if it is missing)
os.makedirs('../result/fe', exist_ok=True)
pd.DataFrame(X).to_csv('../result/fe/X_164_l2.csv', index=False)
pd.DataFrame(Xt).to_csv('../result/fe/Xt_164_l2.csv', index=False)