Importing Libraries

In [None]:
from fastai.tabular.all import *
from sklearn.decomposition import PCA

# Root folder of the competition data.
path = Path('/kaggle/input/lish-moa/')
path.ls()

# Read the four competition tables in one pass; the names on the left line up
# one-to-one with the file names listed below.
train_features, train_targets_scored, train_targets_nonscored, test_features = (
    pd.read_csv(path/csv_name)
    for csv_name in (
        'train_features.csv',
        'train_targets_scored.csv',
        'train_targets_nonscored.csv',
        'test_features.csv',
    )
)
# Fold assignments live in a separate dataset; the first CSV column becomes the index.
folds = pd.read_csv('/kaggle/input/pytorch-starter/folds.csv', index_col=0)

Looking at the categorical variables

In [None]:
# Number of training rows per perturbation type.
train_features['cp_type'].value_counts()

In [None]:
# Number of training rows per treatment duration.
train_features['cp_time'].value_counts()

In [None]:
# Number of training rows per dose level.
train_features['cp_dose'].value_counts()

**Observation**: 
* **cp_type**:
    * Has two values, `trt_cp` and `ctl_vehicle`. As per the data description, `trt_cp` samples are treated with a compound, while `ctl_vehicle` samples are treated with a control perturbation.
    * The `ctl_vehicle` has a significantly lower count when compared to `trt_cp`. 
    * The description states that rows marked `ctl_vehicle` have no MoAs. We can use this for post-processing, and we will verify it later.
* **cp_time**
    * Has three values `24`, `48` and `72`. This is the treatment duration in hours. 
    * Should be treated as an ordered category
* **cp_dose**:
    * Has two values `D1` and `D2` for low and high doses. 

In [None]:
# Split the feature columns into the gene ('g-') and cell ('c-') groups.
# A prefix match is used instead of a substring test so that a column that
# merely contains 'g-' or 'c-' somewhere in its name is not picked up.
gene_cols = [col for col in train_features.columns if col.startswith('g-')]
cell_cols = [col for col in train_features.columns if col.startswith('c-')]

In [None]:
# Treatment durations in ascending order; the order defines the category ordering.
cp_time = [24,48,72]

# Cast straight to an ordered categorical dtype. The previous
# `.cat.set_categories(..., inplace=True)` call relied on the `inplace`
# argument, which was deprecated and later removed from pandas, and the cast is
# also safe to re-run on an already-converted column.
train_features['cp_time'] = train_features['cp_time'].astype(
    pd.CategoricalDtype(categories=cp_time, ordered=True)
)

In [None]:
# Preprocessing steps handed to TabularPandas when the per-fold datasets are built.
procs = [Categorify, Normalize]

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Flag gene/cell columns whose variance exceeds 0.9. Only the boolean support
# mask is needed downstream, so the selector is fitted without materialising
# the transformed matrix (the old `fit_transform` result was thrown away).
# NOTE: the mask is computed on the raw `train_features` values, i.e. before
# the quantile transformation applied in a later cell.
selector = VarianceThreshold(threshold=0.9)
selector.fit(train_features[gene_cols+cell_cols])
col_mask = selector.get_support()
# The mask follows the column order gene_cols + cell_cols, so split it at len(gene_cols).
gene_mask = col_mask[:len(gene_cols)]
cell_mask = col_mask[len(gene_cols):]

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Transformed training features go into a copy; `train_features` keeps the raw values.
train_feat = train_features.copy()

# NOTE(review): `test_features` is overwritten in place below, so running this
# cell twice would transform the test columns twice.
for col in gene_cols + cell_cols:
    # One transformer per column, fitted on the training values only and then
    # applied to both the training and the test column.
    qt = QuantileTransformer(n_quantiles=500, output_distribution='normal', random_state=0)
    train_column = train_features[col].values.reshape(-1, 1)
    test_column = test_features[col].values.reshape(-1, 1)
    qt.fit(train_column)
    train_feat[col] = qt.transform(train_column).ravel()
    test_features[col] = qt.transform(test_column).ravel()

In [None]:
# Attach the fold assignment and both target tables to the transformed
# features, left-joining each one on `sig_id`.
df = (
    train_feat
    .merge(folds, on='sig_id', how='left')
    .merge(train_targets_scored, on='sig_id', how='left')
    .merge(train_targets_nonscored, on='sig_id', how='left')
)

In [None]:
# Split the feature columns into continuous and categorical names, using a
# cardinality cut-off of 3.
cont, cat = cont_cat_split(train_feat, max_card=3)
# `sig_id` is a row identifier, not a model input.
cat.remove('sig_id')

## Using the unscored columns for training

In [None]:
# Target column names are every column except the `sig_id` key.
scored_targets = train_targets_scored.columns.drop('sig_id').tolist()
non_scored_targets = train_targets_nonscored.columns.drop('sig_id').tolist()
# Scored targets come first so they can be sliced off the front of the model output.
y_names = scored_targets + non_scored_targets

In [None]:
# Per-fold registries keyed by fold number: trained learners and fitted PCA models.
learners = dict()
pca_models_cells = dict()
pca_models_genes = dict()

In [None]:
def valid_loss_scored(inputs, targ):
    """Binary cross-entropy (with logits) restricted to the scored targets.

    Only the first ``len(scored_targets)`` columns of both tensors are used,
    which matches the layout of ``y_names`` (scored targets first, non-scored
    targets after).

    Args:
        inputs: model outputs, indexed as ``[row, target]``.
        targ: ground-truth labels with the same column layout as ``inputs``.

    Returns:
        The value produced by ``BCEWithLogitsLossFlat`` on the scored slice.
    """
    scored_width = len(scored_targets)
    loss_fn = BCEWithLogitsLossFlat()
    return loss_fn(inputs[:, :scored_width], targ[:, :scored_width])

In [None]:
# Number of folds looped over when fitting PCA, training and predicting.
# Presumably matches the fold ids stored in the `folds` column — TODO confirm.
N_FOLDS = 5
# Number of PCA components computed for the cell ('c-') columns.
N_COMPONENTS_C = 80
# Number of PCA components computed for the gene ('g-') columns.
N_COMPONENTS_G = 600

In [None]:
# Column names for the PCA projections, numbered from 1 (c-pca-1, c-pca-2, ...).
cell_pca_cols = [f"c-pca-{i+1}" for i in range(N_COMPONENTS_C)]
gene_pca_cols = [f"g-pca-{i+1}" for i in range(N_COMPONENTS_G)]

Creating PCA Features

In [None]:
# Build one feature frame per fold. For each fold the PCA models are fitted on
# the rows outside that fold (the training split) and then applied to every row.
pca_df = {}
for fold in range(N_FOLDS):
    # Boolean row mask of the training split; avoids mixing positional indices
    # with label-based `.loc` lookups.
    is_train = (df['folds'] != fold).to_numpy()

    # Cell features: fit on the training rows, project all rows.
    # random_state is fixed so the projection is reproducible, matching the
    # seeded QuantileTransformer used earlier in the pipeline.
    pca = PCA(n_components=N_COMPONENTS_C, random_state=0)
    pca.fit(df.loc[is_train, cell_cols].loc[:, cell_mask])
    pca_models_cells[fold] = pca
    df[cell_pca_cols] = pca.transform(df.loc[:, cell_cols].loc[:, cell_mask])

    # Gene features: fitted on `df` (the quantile-transformed values), the same
    # frame the projection is applied to. Fitting on the raw `train_features`
    # would learn components for a different feature distribution.
    pca = PCA(n_components=N_COMPONENTS_G, random_state=0)
    pca.fit(df.loc[is_train, gene_cols].loc[:, gene_mask])
    pca_models_genes[fold] = pca
    df[gene_pca_cols] = pca.transform(df.loc[:, gene_cols].loc[:, gene_mask])

    # Store a snapshot. Storing `df` itself would make every entry point at the
    # same object, leaving all folds with the last fold's PCA columns.
    pca_df[fold] = df.copy()

In [None]:
# Preview the first rows of fold 0's frame rather than rendering the whole table.
pca_df[0].head()

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
def fit_model(n_comp_c, n_comp_g):
    """Train one tabular learner per fold on the leading PCA components.

    The continuous inputs are the first ``n_comp_c`` cell PCA columns followed
    by the first ``n_comp_g`` gene PCA columns of ``pca_df[fold]``; the
    categorical inputs are the module-level ``cat`` list. Each trained learner
    is stored in the module-level ``learners`` dict under its fold number.

    Args:
        n_comp_c: number of ``c-pca-*`` columns to feed the model.
        n_comp_g: number of ``g-pca-*`` columns to feed the model.

    Returns:
        The DataLoaders built for the last fold processed.
    """
    hidden_sizes = [100, 500, 200]
    cont = [f"c-pca-{i+1}" for i in range(n_comp_c)]
    cont += [f"g-pca-{i+1}" for i in range(n_comp_g)]

    for fold in range(N_FOLDS):
        # Rows assigned to this fold validate; every other row trains.
        in_valid = df['folds'] == fold
        splits = (list(np.where(~in_valid)[0]), list(np.where(in_valid)[0]))

        to = TabularPandas(
            pca_df[fold], procs, cat, cont,
            y_names=y_names,
            y_block=MultiCategoryBlock(encoded=True, vocab=y_names),
            splits=splits,
        )
        dls = to.dataloaders(1024)

        learn = tabular_learner(dls, layers=hidden_sizes, metrics=[valid_loss_scored])
        learn.fit_one_cycle(20, lr_max=1e-2, pct_start=0.3)
        learners[fold] = learn
    return dls

In [None]:
# Train all folds using the first 70 cell and the first 100 gene PCA components.
dls = fit_model(n_comp_c=70, n_comp_g=100)

In [None]:
# Predict the test set once per fold, using that fold's PCA models and learner.
predictions = []
for fold in range(N_FOLDS):
    fold_learner = learners[fold]
    # Project the test features with the PCA models fitted for this fold.
    test_features[cell_pca_cols] = pca_models_cells[fold].transform(test_features[cell_cols].loc[:, cell_mask])
    test_features[gene_pca_cols] = pca_models_genes[fold].transform(test_features[gene_cols].loc[:, gene_mask])
    # Build the test DataLoader from this learner's own DataLoaders so the
    # preprocessing state fitted on this fold's training split is applied.
    # The shared `dls` returned by `fit_model` belongs to the last fold only.
    dl_test = fold_learner.dls.test_dl(test_features)
    predictions.append(fold_learner.get_preds(dl=dl_test)[0])

In [None]:
# Average the per-fold predictions. Taking the mean over the stacked tensors
# divides by the number of predictions actually collected instead of a literal
# 5, so the result stays correct if the number of folds changes.
submission = torch.stack(predictions).mean(dim=0)
subs= pd.read_csv('/kaggle/input/lish-moa/sample_submission.csv')
# Only the leading scored-target columns are submitted; the model also outputs
# the non-scored targets after them.
subs.iloc[:,1:] = submission[:,:len(scored_targets)].detach().numpy()
# Post-processing: control-vehicle rows get all-zero predictions.
subs.loc[test_features['cp_type'] == 'ctl_vehicle',subs.columns[1:]] = 0
subs.to_csv('submission.csv',index=False)

In [None]:
# Export every fold's learner to a file named after its fold number.
for fold in range(N_FOLDS):
    fold_learner = learners[fold]
    fold_learner.export(f'{fold}.pth')