In [None]:
import sys
sys.path.append("../input/moa-scripts")
from moa import load
from metrics import logloss

import numpy as np
import pandas as pd

import warnings
import joblib
import gc
from tqdm.auto import tqdm
from collections import Counter
from datetime import datetime 

import matplotlib.pyplot as plt 
import seaborn as sns

from scipy import stats 
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, KMeans
from sklearn.mixture import GaussianMixture
from sklearn.svm import OneClassSVM, SVC
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
import umap

from tensorflow.keras import models

# Notebook-wide plotting configuration.
sns.set_style("white")
%matplotlib inline
# Silences every warning for the rest of the session (deprecation and pandas
# chained-assignment warnings included), so problems will not surface here.
warnings.filterwarnings("ignore")

In [None]:
# Pull the prepared arrays from the project helper, then log when the run started.
(X, y, genes, cells, classnames,
 features, X_test, test_control, submission) = load()
print(datetime.now())

## Low dim (viz)

In [None]:
# bottleneck autoencoder: transform the features, then run them through the saved encoder
encoder = models.load_model("../input/moa-dae/encoder.h5")
enc_qt = joblib.load("../input/moa-dae/transformer.pkl")
# the first 4 columns are skipped before the transformer is applied
# (presumably non-expression metadata -- TODO confirm against load())
Xenc = enc_qt.transform(X[:, 4:])
# the encoder takes two inputs, split at column 772
# (presumably gene vs. cell features -- TODO confirm)
Xenc = encoder.predict((Xenc[:, :772], Xenc[:, 772:]))

# pca (a float n_components keeps as many components as needed for that variance share)
pca = PCA(0.8, random_state=2020)
Xpca = pca.fit_transform(X)
# umap -- seeded so the 2-D layout is reproducible between runs
Xumap = umap.UMAP(n_neighbors=10, min_dist=0.5, metric='euclidean', verbose=0,
                  random_state=2020).fit_transform(X)
# pca(emb)
pca_emb = PCA(0.8, random_state=2020)
Xpca_emb = pca_emb.fit_transform(Xenc)
# umap(emb) -- same seed as above
Xumap_emb = umap.UMAP(n_neighbors=10, min_dist=0.5, metric='euclidean', verbose=0,
                      random_state=2020).fit_transform(Xenc)

In [None]:
def eval_clusters(stat, cluster, n_targets=206):
    """Print how strongly a two-group split separates the target columns.

    For each of the first ``n_targets`` columns of ``stat`` a Welch t-test
    (unequal variances) is computed from the per-group mean/std/count, and the
    mean absolute t statistic over all targets is printed together with the
    "support" (sum of the cluster column).

    Parameters
    ----------
    stat : pandas.DataFrame
        Frame whose first ``n_targets`` columns are the targets and which also
        contains the column named ``cluster``.
    cluster : str
        Name of the grouping column. Integer (0/1) and boolean labels are both
        accepted; the first two groups in sorted order are compared.
    n_targets : int, default 206
        Number of leading target columns to evaluate.

    Raises
    ------
    ValueError
        If the cluster column defines fewer than two groups.
    """
    target_cols = stat.columns[:n_targets]
    grouped = (
        pd.concat((stat.iloc[:, :n_targets], stat[cluster]), axis=1)
        .groupby(cluster)
        .agg(['mean', 'std', 'count'])
    )
    if len(grouped) < 2:
        raise ValueError(f"column {cluster!r} must define at least two groups")

    statistics = np.zeros(len(target_cols), dtype=float)
    for pos, col in enumerate(target_cols):
        per_group = grouped[col]
        # Positional access: a boolean group index cannot be looked up with
        # the integer labels 0/1 on recent pandas versions.
        group_args = per_group.iloc[0].tolist() + per_group.iloc[1].tolist()
        t_stat, _pval = stats.ttest_ind_from_stats(*group_args, equal_var=False)
        statistics[pos] = np.abs(t_stat)
    print(f'ttest statistic {statistics.mean():.2f}')
    print(f'support {stat[cluster].sum()}')
    
def plot_clusters(clusters=None, n=2, s=20, palette='colorblind'):
    """Scatter the four 2-D training projections side by side, coloured by cluster.

    Reads the notebook-level arrays ``Xpca``, ``Xumap``, ``Xpca_emb`` and
    ``Xumap_emb`` and plots their first two columns.

    Parameters
    ----------
    clusters : array-like or None
        Per-sample labels used as hue; ``None`` draws uncoloured points.
    n : int
        Number of colours requested from the palette.
    s : int
        Marker size.
    palette : str
        Name of the seaborn palette.
    """
    panels = (
        ("PCA", Xpca),
        ("UMAP", Xumap),
        ("PCAemb", Xpca_emb),
        ("UMAPemb", Xumap_emb),
    )
    fig, axes = plt.subplots(1, len(panels), figsize=(25, 6))
    # A palette without a hue variable is meaningless, so only build one when needed.
    colors = None if clusters is None else sns.color_palette(palette, n)
    for axis, (title, coords) in zip(axes.flatten(), panels):
        # x/y are passed by keyword: newer seaborn releases reject positional data.
        sns.scatterplot(x=coords[:, 0], y=coords[:, 1], ax=axis, s=s,
                        hue=clusters, palette=colors, legend=False)
        axis.set_title(title)
    plt.show();
    
def eval_multiple_clusters(stat, cluster, n_targets=206):
    """Score a multi-group clustering against the target columns.

    For each of the first ``n_targets`` columns of ``stat`` a Kruskal-Wallis H
    statistic is computed across the groups defined by ``cluster``. The mean
    statistic is printed normalised by ``sqrt`` and by ``log`` of the number of
    clusters (the printed labels say "ttest" but the test used is Kruskal-Wallis).

    Parameters
    ----------
    stat : pandas.DataFrame
        Frame whose first ``n_targets`` columns are the targets and which also
        contains the column named ``cluster``.
    cluster : str
        Name of the grouping column.
    n_targets : int, default 206
        Number of leading target columns to evaluate.

    Returns
    -------
    float
        Mean H statistic divided by ``sqrt(number of clusters)``.
    """
    target_cols = stat.columns[:n_targets]
    statistics = np.zeros(len(target_cols), dtype=float)
    for pos, col in enumerate(target_cols):
        samples = [grp[col].values for _, grp in stat[[cluster, col]].groupby(cluster)]
        # element 0 of the result is the H statistic
        statistics[pos] = np.abs(stats.kruskal(*samples)[0])
    support = stat[cluster].value_counts().mean()
    nclu = len(set(stat[cluster]))
    print(f'clusters {nclu}')
    print(f'support {support}')
    print(f'ttest statistic [sqrt] {statistics.mean()/np.sqrt(nclu):.2f}')
    print(f'ttest statistic [log] {statistics.mean()/np.log(nclu):.2f}')
    return statistics.mean()/np.sqrt(nclu)

In [None]:
# Baseline view: the four projections with no cluster colouring.
plot_clusters()

In [None]:
# Targets as a DataFrame; the cells below append one column per clustering method.
stat = pd.DataFrame(y)

## DBSCAN -> SVM

In [None]:
# dbscan
db = DBSCAN(20, min_samples=800, leaf_size=50)
db.fit(Xpca)
y_db = db.labels_.copy()
# Relabel noise (-1) as class 1 so the SVM learns "noise vs. cluster 0".
# NOTE(review): this collides with a genuine cluster labelled 1 if DBSCAN
# ever finds more than one cluster with these settings.
y_db[y_db==-1] = 1

# -> SVM (DBSCAN cannot label unseen points, the SVM can)
svm = SVC()
svm.fit(Xpca, y_db)
# NOTE(review): 'dbscan' is 1 for clustered points while 'dbscan_svm' is 1 for
# noise, i.e. the two columns use opposite encodings.
stat['dbscan'] = (db.labels_>-1).astype(int)
stat['dbscan_svm'] = svm.predict(Xpca)

cluster = 'dbscan'
plot_clusters(stat[cluster])
eval_clusters(stat, cluster)
# Random split with the same positive rate; seeded so the comparison is reproducible.
stat["random"] = np.random.RandomState(2020).random_sample(len(y)) < stat[cluster].mean()
print('vs random')
eval_clusters(stat, 'random')

joblib.dump(svm, 'dbscan_svm.pkl')

## Envelope

In [None]:
# Elliptic envelope on the PCA projection; predict() == -1 marks the outliers.
ee = EllipticEnvelope(contamination=0.05, random_state=2020)
ee.fit(Xpca)
stat['envelope'] = ee.predict(Xpca) == -1

cluster = 'envelope'
plot_clusters(stat[cluster])
eval_clusters(stat, cluster)
# Random split with the same positive rate; seeded so the comparison is reproducible.
stat["random"] = np.random.RandomState(2020).random_sample(len(y)) < stat[cluster].mean()
print('vs random')
eval_clusters(stat, 'random')

joblib.dump(ee, 'envelope.pkl')

## 1class SVM

In [None]:
%%time 
svm1 = OneClassSVM(nu=0.1, gamma=1e-3)
svm1.fit(Xpca)

stat['oneclasssvm'] = svm1.predict(Xpca) == -1
cluster = 'oneclasssvm'
plot_clusters(stat[cluster])
eval_clusters(stat, cluster)
stat["random"] = np.random.random(len(y)) < stat[cluster].mean()
print('vs random')
eval_clusters(stat, 'random')

joblib.dump(svm1, 'svm1.pkl')

## Kmeans

In [None]:
# outlier detection (2 classes)
kmeans = KMeans(2, n_init=5, max_iter=100, random_state=2020)
kmeans.fit(Xpca)

stat['kmeans'] = kmeans.predict(Xpca)
cluster = 'kmeans'
plot_clusters(stat[cluster])
eval_clusters(stat, cluster)
# Random split with the same positive rate; seeded so the comparison is reproducible.
stat["random"] = np.random.RandomState(2020).random_sample(len(y)) < stat[cluster].mean()
print('vs random')
eval_clusters(stat, 'random')

joblib.dump(kmeans, 'kmeans.pkl')

In [None]:
# history = []
# for c in [14, 18]:
#     print('='*100)
#     kmeans = KMeans(c, n_init=5, max_iter=100, random_state=2020)
#     kmeans.fit(Xpca)
#     stat['kmeans2'] = kmeans.predict(Xpca)
#     stat['random'] = stat['kmeans2'].copy(deep=True)
#     stat['random'] = stat['random'].sample(frac=1.0).values
#     history.append(eval_multiple_clusters(stat, 'kmeans2'))
#     eval_multiple_clusters(stat, 'random')
#     # plot_clusters(stat['kmeans2'], len(set(stat['kmeans2'])))
# sns.lineplot(x=np.arange(len(history))*2, y=history);

In [None]:
# 18-cluster KMeans on the PCA projection; labels are stored as 'kmeans2'.
kmeans = KMeans(n_clusters=18, n_init=5, max_iter=100, random_state=2020)
stat['kmeans2'] = kmeans.fit(Xpca).predict(Xpca)
plot_clusters(stat['kmeans2'], n=len(set(stat['kmeans2'])), s=15)

# NOTE(review): the model has 18 clusters but is saved as 'kmeans14.pkl';
# the filename is kept because other notebooks may load it by this name.
joblib.dump(kmeans, 'kmeans14.pkl')

## GMM

In [None]:
# outlier det (2 comps)
gmm = GaussianMixture(2, n_init=1, covariance_type='tied', random_state=2020)
gmm.fit(Xpca)
cluster = 'gmm'
stat[cluster] = gmm.predict(Xpca)
plot_clusters(stat[cluster])
eval_clusters(stat, cluster)
# Random split with the same positive rate; seeded so the comparison is reproducible.
stat["random"] = np.random.RandomState(2020).random_sample(len(y)) < stat[cluster].mean()
print('vs random')
eval_clusters(stat, 'random')

joblib.dump(gmm, 'gmm.pkl')

In [None]:
# history = []
# cluster = 'gmm2'
# x = np.arange(22, 27, 1)
# for c in x:
#     print('='*100)
#     print(str(datetime.now()))
#     gmm = GaussianMixture(c, n_init=1, covariance_type='tied', random_state=2020, max_iter=30)
#     gmm.fit(Xpca)
#     stat[cluster] = gmm.predict(Xpca)
#     history.append(eval_multiple_clusters(stat, cluster))

In [None]:
# 11-component tied-covariance mixture on the PCA projection, stored as 'gmm2'.
cluster = 'gmm2'
gmm = GaussianMixture(n_components=11, covariance_type='tied', n_init=1,
                      max_iter=30, random_state=2020)
stat[cluster] = gmm.fit(Xpca).predict(Xpca)
plot_clusters(stat[cluster], n=len(set(stat[cluster])), s=10, palette='Paired')
eval_multiple_clusters(stat, cluster)

In [None]:
# Keep only the cluster columns (everything after the 206 targets), minus the
# throw-away 'random' baseline, and persist them.
output = stat.iloc[:, 206:].drop(columns='random')
joblib.dump(output, 'output.pkl')

## TEST check

In [None]:
# Project the test set with the transformers fitted on the training data.
X_test_pca = pca.transform(X_test)
X_test_emb = enc_qt.transform(X_test[:, 4:])
X_test_emb = encoder.predict((X_test_emb[:, :772], X_test_emb[:, 772:]))
# UMAP is re-fitted on the test set alone, so its axes are not comparable with
# the training UMAP; seeded so the layout is reproducible between runs.
X_test_umap = umap.UMAP(n_neighbors=10, min_dist=0.5, metric='euclidean', verbose=0,
                        random_state=2020).fit_transform(X_test)

In [None]:
def plot_test_clusters(clusters=None):
    """Scatter the three 2-D test-set projections side by side, coloured by cluster.

    Reads the notebook-level arrays ``X_test_pca``, ``X_test_umap`` and
    ``X_test_emb`` and plots their first two columns.

    Parameters
    ----------
    clusters : array-like or None
        Per-sample labels used as hue; ``None`` draws uncoloured points.
    """
    # NOTE(review): the third panel is titled "PCAemb" but plots the first two
    # raw encoder dimensions, not pca_emb.transform(X_test_emb) -- confirm intent.
    panels = (("PCA", X_test_pca), ("UMAP", X_test_umap), ("PCAemb", X_test_emb))
    fig, axes = plt.subplots(1, len(panels), figsize=(19, 6))
    for axis, (title, coords) in zip(axes.flatten(), panels):
        # x/y are passed by keyword: newer seaborn releases reject positional data.
        sns.scatterplot(x=coords[:, 0], y=coords[:, 1], ax=axis, s=20, hue=clusters)
        axis.set_title(title)
    plt.show();

plot_test_clusters()

In [None]:
# Test set coloured by the SVM that was trained on the DBSCAN labels.
plot_test_clusters(svm.predict(X_test_pca))

In [None]:
# Test set coloured by the one-class SVM outlier flag (True where predict() == -1).
plot_test_clusters(svm1.predict(X_test_pca)==-1)

In [None]:
# Test set coloured by the elliptic-envelope outlier flag (True where predict() == -1).
plot_test_clusters(ee.predict(X_test_pca)==-1)

## Pairs + triplets

In [None]:
# pairs = []
# for r in trg[df.targets==2].iterrows():
#     pairs.append(tuple(r[-1][r[-1]==1].index))
# pairs = Counter(pairs)
# triplets = []
# for r in trg[df.targets==3].iterrows():
#     triplets.append(tuple(r[-1][r[-1]==1].index))
# triplets = Counter(triplets)
# trg_corr=trg.corr()
# np.fill_diagonal(trg_corr.values, 0)
# top_corr = {}
# for r in trg_corr.iterrows():
#     d1 = r[0]
#     d2 = list(r[1][r[1]<-0.3])
#     d3 = list(r[1][r[1]>0.4])
#     if len(d2) or len(d3):
#         top_corr[d1] = (d2, d3)

## t.test

In [None]:
trg = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
df = pd.read_csv('../input/lish-moa/train_features.csv')
nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')
ts = pd.read_csv('../input/lish-moa/test_features.csv')

# Drop control-vehicle rows. The mask is computed once (before df itself is
# filtered) and each result is copied so the column assignments below write to
# independent frames rather than to slices of the raw data.
treated = df['cp_type'] != 'ctl_vehicle'
trg = trg[treated].copy()
nonscored = nonscored[treated].copy()
df = df[treated].copy()
ts = ts[ts['cp_type'] != 'ctl_vehicle'].copy()
trg.set_index('sig_id', inplace=True)
df.set_index('sig_id', inplace=True)
nonscored.set_index('sig_id', inplace=True)
# number of positive scored targets per sample (aligned on sig_id)
df['targets'] = trg.sum(axis=1)

# cap the count at 3, then derive a binary "has any target" flag
df.loc[df['targets']>=3, 'targets'] = 3
df['targetsb'] = df['targets'].copy()
df.loc[df['targets']>=1, 'targetsb'] = 1
# per-feature mean/std/count for each value of the binary flag; after stack()
# every column holds (mean, std, count) of group 0 followed by group 1
means = df.loc[:, 'g-0':'targetsb'].groupby('targetsb').agg(['mean', 'std', 'count']).stack()
# Welch t-test p-value per feature; the last column ('targets') is skipped
pvalue = {f: stats.ttest_ind_from_stats(*means.loc[:, f].values, equal_var=False)[-1] for f in means.columns[:-1]}
pvalue = pd.Series(pvalue)

joblib.dump(pvalue, "ttest_pvalue.pkl")
(pvalue >= 0.01).sum()

## Interactions & numeric transformation

In [None]:
# Absolute Welch t statistic per feature, from the per-group summary rows in `means`.
features_ttest = {}
for feat in means.columns[:-1]:
    welch = stats.ttest_ind_from_stats(*means.loc[:, feat].values, equal_var=False)
    features_ttest[feat] = np.abs(welch[0])

In [None]:
# Accumulator for (operator, feature1, feature2, p-value) records.
interactions = []
# Candidate features: names containing 'g-' whose own p-value is above 1e-5.
flist = []
for name in features_ttest:
    if pvalue[name] > 1e-5 and 'g-' in name:
        flist.append(name)
# Scratch frame: the binary target flag plus a reusable feature column 'f'.
df_inter = df.loc[:, ['targetsb']].copy()

In [None]:
### Multiply
for i1 in tqdm(range(len(flist))):
    f1 = flist[i1]
    for i2 in range(i1+1, len(flist)):
        f2 = flist[i2]
        df_inter['f'] = df[f1] * df[f2]
        ttest, pval = stats.ttest_ind_from_stats(*df_inter.groupby('targetsb').agg(['mean', 'std', 'count']).stack().values, False)
        if pval < min(pvalue[f1], pvalue[f2]) and pval < 1e-10:
            r = ('*',f1,f2,pval)
            interactions.append(r)
            print(r)

In [None]:
### Sum
for i1 in tqdm(range(len(flist))):
    f1 = flist[i1]
    for i2 in range(i1+1, len(flist)):
        f2 = flist[i2]
        df_inter['f'] = df[f1] + df[f2]
        ttest, pval = stats.ttest_ind_from_stats(*df_inter.groupby('targetsb').agg(['mean', 'std', 'count']).stack().values, False)
        if pval < min(pvalue[f1], pvalue[f2]) and pval < 1e-10:
            r = ('+',f1,f2,pval)
            interactions.append(r)
            #print(r)

In [None]:
### Square
for i1 in tqdm(range(len(flist))):
    f1 = flist[i1]
    df_inter['f'] = df[f1] ** 2
    ttest, pval = stats.ttest_ind_from_stats(*df_inter.groupby('targetsb').agg(['mean', 'std', 'count']).stack().values, False)
    if pval < 1e-10:
        r = ('**',f1,None,pval)
        interactions.append(r)

In [None]:
joblib.dump(interactions, 'interactions.pkl');
# Total number of interactions, then how many fall below each p-value threshold.
# Counted explicitly instead of collecting IPython's `_` (last-output) variable,
# which does not exist outside an interactive session.
print(
    len(interactions)
    ,sum(1 for i in interactions if i[-1]<1e-15)
    ,sum(1 for i in interactions if i[-1]<1e-30)
    ,sum(1 for i in interactions if i[-1]<1e-50)
)

### KMeans 100: splits

In [None]:
# PCA over features and targets stacked column-wise.
# NOTE(review): this overwrites the notebook-level `pca` and `Xpca`, so
# plot_clusters() and any earlier cell re-run after this point will see the
# target-augmented projection instead of the feature-only one.
pca = PCA(n_components=0.8, random_state=2020)
Xpca = pca.fit_transform(np.hstack([X, y]))
print(Xpca.shape)

In [None]:
# 100-cluster KMeans on the current Xpca; the labels are saved as 'split'.
kmeans = KMeans(n_clusters=100, n_init=5, max_iter=100, random_state=2020)
stat['split'] = kmeans.fit(Xpca).predict(Xpca)
plot_clusters(stat['split'], n=len(set(stat['split'])), s=12)
eval_multiple_clusters(stat, "split")
joblib.dump(stat['split'].values, 'split_kmeans_100.pkl')