In [None]:
import numpy as np
import scipy.stats as ss
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import tsfresh as tsf

from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import mutual_info_classif, f_classif

import os
# Show the contents of the input data directory as this cell's output.
os.listdir('/kaggle/input/lish-moa/')

In [None]:
# Print the line(s) of `lscpu` output that start with "CPU(s)".
!lscpu | grep -P '^CPU\(s\)'

In [None]:
# Read the feature table and the scored-target table from the input directory.
train_features, train_targets_scored = map(
    pd.read_csv,
    (
        '../input/lish-moa/train_features.csv',
        '../input/lish-moa/train_targets_scored.csv',
    ),
)

In [None]:
# Split the feature column names by prefix: 'g-' columns -> GENES, 'c-' columns -> CELLS.
GENES = list(filter(lambda name: name.startswith('g-'), train_features.columns))
CELLS = list(filter(lambda name: name.startswith('c-'), train_features.columns))

In [None]:
def to_str(row):
    """Join every entry of `row` except the first into a single string.

    The entries after the first one must already be strings; the first
    entry is skipped.
    """
    tail = row[1:]
    return "".join(tail.tolist())

def encode_multilabel(y: pd.DataFrame):
    """Map each row of `y` to one integer code.

    Every value is cast to `str`, the values of each row (minus the first
    column) are concatenated with `to_str`, and the resulting strings are
    turned into pandas category codes. Rows with the same combination of
    values therefore share the same code.
    """
    as_text = y.astype('str')
    row_keys = as_text.apply(to_str, axis=1)
    return row_keys.astype('category').cat.codes

# Collapse every row of the scored-target table into a single integer code.
y_enc = encode_multilabel(train_targets_scored)

In [None]:
def q_transform(X, q, col_idx=None):
    """Quantile-transform selected columns of `X` to a normal distribution.

    Each selected column is fitted with its own `QuantileTransformer`
    (`n_quantiles=q`, `random_state=0`, `output_distribution="normal"`).
    Columns that are not selected are copied through unchanged, and `X`
    itself is never modified.

    Parameters
    ----------
    X : numpy.ndarray
        2-D array; columns are transformed independently.
    q : int
        Number of quantiles passed to every transformer.
    col_idx : array-like of bool, optional
        One flag per column of `X`; only columns flagged True are
        transformed. `None` or an empty sequence means "all columns".

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `X`. Floating inputs keep their dtype;
        any other dtype is promoted to float so the transformed values are
        not truncated when written back.
    """
    # The transformer emits floats; writing them into an integer copy would
    # silently truncate them, so promote non-floating input first.
    if np.issubdtype(X.dtype, np.floating):
        X_transformed = X.copy()
    else:
        X_transformed = X.astype(float)

    if col_idx is None or len(col_idx) == 0:
        idx = range(X.shape[1])
    else:
        idx = np.where(col_idx)[0]

    for i in idx:
        # A fresh transformer per column keeps the columns independent.
        transformer = QuantileTransformer(n_quantiles=q, random_state=0, output_distribution="normal")
        X_transformed[:, i:(i + 1)] = transformer.fit_transform(X[:, i:(i + 1)])
    return X_transformed

def get_mutual_info_classif(X, y, **kwarg):
    """Return the per-feature mutual information between `X` and `y`.

    Thin wrapper around `mutual_info_classif` that forwards `**kwarg` and
    replaces any negative estimate with 0.

    The estimator is stochastic unless a `random_state` is supplied, which
    makes two calls (e.g. before and after a transform) incomparable. A
    fixed `random_state=0` is therefore used by default; callers can still
    override it through `**kwarg`.
    """
    kwarg.setdefault('random_state', 0)
    info = mutual_info_classif(X, y, **kwarg)
    # Alternative score that was tried here: f_classif(X, y)[1]  (p-values)
    info[info < 0] = 0
    return info

def summarize_loss_difference(x):
    """Summarise a 1-D array of per-feature information-loss values.

    Usage:
        x = np.array([-1,-2,3,1,2])
        summarize_loss_difference(x)

    Returns a dict with:
        il_mean                 mean of `x`
        il_median               median of `x`
        il_skew                 skewness of `x`, rounded to 4 decimals
        il_percentage_features  fraction of entries that are > 0
        n_features              number of entries
        il_n_features           number of entries that are > 0
        il_quantiles_features   quantiles of `x` at 0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9
        il_hist                 `np.histogram(x, 20)` (counts, bin edges)
    """
    x = np.asarray(x)
    positive = x > 0
    return {
        'il_mean': x.mean(),
        'il_median': np.median(x),
        'il_skew': np.round(ss.skew(x), 4),
        'il_percentage_features': positive.mean(),
        'n_features': len(x),
        'il_n_features': positive.sum(),
        # Quantiles are taken over the loss values themselves; taking them
        # over the boolean mask `x > 0` only ever yields 0/1 (or fails on
        # boolean input) and carries no information.
        'il_quantiles_features': np.quantile(x, [0.1, 0.2, 0.4, 0.5, 0.6, 0.8, 0.9]),
        'il_hist': np.histogram(x, 20),
    }

def test_Qs(X, y, qs, skew_thresh=0.8):
    org_info = get_mutual_info_classif(X, y)
    skewness = np.abs(ss.skew(X)) > skew_thresh
    org_info = org_info[skewness]
    
    info_losses = []
    for q in qs:
        X_transformed = q_transform(X, q, skewness)
        new_info = get_mutual_info_classif(X_transformed[:,skewness], y)
        info_loss = org_info - new_info
        log = {
                'q':q,
                'org_info':org_info.mean(),
                'new_info':new_info.mean(),
                'info_loss':info_loss.mean(),
        }
        print(pd.Series(log))
        log.update(summarize_loss_difference(info_loss))
        info_losses.append(log)
    return info_losses


In [None]:
# Candidate `n_quantiles` values: 15 integers log-spaced between 10**0.5 and 10**4.2.
# The builtin `int` is used as dtype because the `np.int` alias was removed in NumPy 1.24.
# Alternative linear grid: qs = np.linspace(2200, 4000, 15, dtype=int)
qs = np.logspace(0.5, 4.2, 15, dtype=int)
qs

In [None]:
# Feature matrix built from the first 150 gene columns.
X = train_features.loc[:, GENES[:150]].values

# Run the quantile sweep on those columns.
info_losses = test_Qs(X, y_enc, qs=qs)


In [None]:
# One row per tested q; the table is also written to a pickle file.
gene_info_loss = pd.DataFrame(data=info_losses)
gene_info_loss.to_pickle('gene_info_loss.pkl')
# Display at most the first 60 rows and the first 11 columns.
gene_info_loss.head(60).iloc[:, :11]

In [None]:
def plot_loss(df):
    """Plot three information-loss summaries against `q` on a log-scaled x axis.

    `df` must provide the columns `q`, `il_mean`, `il_skew` and
    `il_percentage_features`. Each metric is drawn in its own row of a
    shared-x figure, which is then shown with `plt.show()`.
    """
    metrics = ('il_mean', 'il_skew', 'il_percentage_features')
    _, axes = plt.subplots(len(metrics), 1, figsize=(12, 6), sharex=True)
    for axis, metric in zip(axes, metrics):
        axis.plot(df['q'], df[metric], 'o--', label=metric)
        axis.set_xscale('log')
        axis.legend(loc="upper left")
    # The x axis is shared, so labelling the bottom panel is enough.
    axes[-1].set_xlabel('q')
    plt.tight_layout()
    plt.show()

# Plot the per-q summaries computed for the gene columns.
plot_loss(gene_info_loss)

In [None]:
# Feature matrix built from every cell column.
X = train_features.loc[:, CELLS[:]].values

# Run the same quantile sweep on the cell columns.
info_losses = test_Qs(X, y_enc, qs=qs)


In [None]:
# One row per tested q; the table is also written to a pickle file.
cell_info_loss = pd.DataFrame(data=info_losses)
cell_info_loss.to_pickle('cell_info_loss.pkl')
# Display at most the first 60 rows and the first 11 columns.
cell_info_loss.head(60).iloc[:, :11]

In [None]:
# Plot the per-q summaries computed for the cell columns.
plot_loss(cell_info_loss)

# Toy Example

In [None]:
# NOTE: every line of this cell is commented out, so nothing here runs.
# It sketches a toy check on random data: quantile-transform each column and
# compare the mutual information with random labels before and after.
# dataset_size = 12000
# nfeatures = 13
# nclasses = 3

# np.random.seed(1291)
# X = np.random.random((dataset_size,nfeatures))
# y = np.random.randint(0,nclasses,(dataset_size,))

# # f_classif(X, y)
# # mutual_info_classif(X, y)

# X_transformed = np.zeros_like(X)
# for i in range(X.shape[1]):
#     transformer = QuantileTransformer(n_quantiles=120,random_state=0, output_distribution="normal")
#     X_transformed[:,i:(i+1)] = transformer.fit_transform(X[:,i:(i+1)])

# # def get_mutual_info_classif(X,y,**kwarg):
# #     info = mutual_info_classif(X,y,**kwarg)
# #     info[info<0] = 0
# #     return info

# org_info = get_mutual_info_classif(X, y)
# new_info = get_mutual_info_classif(X_transformed, y)

# info_loss = org_info - new_info
# org_info, new_info, info_loss

# Check sparsity

In [None]:
# Single-column feature matrix: the first gene column only.
X = train_features.loc[:, GENES[:1]].values

# Quantile-transform that column with a deliberately small number of quantiles.
i = 0
transformer = QuantileTransformer(
    n_quantiles=20,
    random_state=0,
    output_distribution="normal",
)
X_transformed = np.zeros_like(X)
X_transformed[:, i:i + 1] = transformer.fit_transform(X[:, i:i + 1])

# Overlay the raw values and the transformed values, sample by sample.
plt.plot(X[:, :1], '.', alpha=0.5)
plt.plot(X_transformed[:, :1], '.', alpha=0.1)

X_transformed[:, :1]

In [None]:
# Round the transformed column once, then reuse it for the histogram and the unique count.
precision = 3
rounded = X_transformed[:, 0].round(precision)
plt.hist(rounded, bins=330)
# sns.distplot(X_transformed[:,0], bins=2)
np.unique(rounded).shape, rounded