In [1]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.mixture import GaussianMixture
from sklearn.metrics import roc_auc_score
from tqdm import tqdm_notebook as tqdm
import numpy as np, pandas as pd, os
from sklearn.cluster import KMeans
from sklearn.covariance import OAS

In [2]:
## Number of clusters per class, as in the `n_clusters_per_class`
## parameter of make_classification
clusters_per_class = 3

## Seed for the global NumPy random generator
seed = 42
np.random.seed(seed)

## Load the train and test tables
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [3]:
def get_precision(X, y, clusters_per_class, random_state=None):
    """Estimate one initial precision matrix per (class, cluster) pair.

    For each target value (0 first, then 1) the rows of ``X`` carrying that
    label are split into ``clusters_per_class`` groups with KMeans, and an
    OAS shrinkage covariance estimator is fitted on every group.

    Parameters
    ----------
    X : array-like, indexable with a boolean row mask
        Feature matrix (assumed 2-D: samples x features).
    y : array-like
        Labels aligned with the rows of ``X``; only rows labelled 0 or 1
        are used.
    clusters_per_class : int
        Number of KMeans clusters to form inside each class.
    random_state : int, RandomState instance or None, default=None
        Forwarded to KMeans. ``None`` keeps the previous behaviour of
        drawing from the global NumPy random generator.

    Returns
    -------
    numpy.ndarray
        The stacked ``precision_`` matrices of the fitted OAS estimators,
        ``2 * clusters_per_class`` of them along the first axis, ordered
        as all class-0 clusters followed by all class-1 clusters.
    """
    precisions = []
    for label in (0, 1):
        X_label = X[y == label]
        # n_init is pinned to 10 (the long-standing default) because newer
        # scikit-learn releases changed the default to 'auto', which would
        # silently alter the clustering and emits a FutureWarning in between.
        kmeans = KMeans(n_clusters=clusters_per_class, n_init=10,
                        random_state=random_state)
        cluster_ids = kmeans.fit_predict(X_label)
        # A distinct loop name avoids shadowing the class label above.
        for cluster in range(clusters_per_class):
            members = X_label[cluster_ids == cluster]
            precisions.append(OAS().fit(members).precision_)
    return np.stack(precisions)

In [4]:
# Every column except the row id and the label is a candidate feature. This
# includes 'wheezy-copper-turtle-magic' itself: it is constant inside each
# subset below, so the variance filter removes it.
cols = [c for c in train.columns if c not in ['id', 'target']]
magic_col = 'wheezy-copper-turtle-magic'

# One mixture component per (class, cluster) pair. Deriving the count from
# clusters_per_class keeps it equal to the number of precision matrices that
# get_precision returns; a hard-coded 6 is only valid for clusters_per_class == 3
# and makes GaussianMixture reject precisions_init for any other setting.
n_components = 2 * clusters_per_class

# KMeans and GaussianMixture draw from the global NumPy generator when no
# random_state is given. Re-seeding here makes this cell give the same result
# when it is re-run on its own; a fresh top-to-bottom run is unaffected.
np.random.seed(seed)

train_preds = pd.Series(np.zeros(len(train)), index=train.index)
test_preds = pd.Series(np.zeros(len(test)), index=test.index)

# Build one model per value of wheezy-copper-turtle-magic
# (assumes the values are the integers 0..511 -- TODO confirm against the data)
for i in tqdm(range(512)):
    train_idx = train[magic_col] == i
    test_idx = test[magic_col] == i

    # Feature selection: keep only the columns whose variance within the
    # training subset exceeds the threshold
    var_thr = VarianceThreshold(threshold=2)
    X_train = var_thr.fit_transform(train.loc[train_idx, cols])
    X_test = var_thr.transform(test.loc[test_idx, cols])
    y = train.loc[train_idx, 'target'].values
    # The mixture is fitted on train and test rows together; it does not use labels
    X = np.vstack((X_train, X_test))

    # Initial precision matrices, class-0 clusters first, then class-1 clusters
    prec_init = get_precision(X_train, y, clusters_per_class)

    # GMM
    gmm = GaussianMixture(n_components=n_components, init_params='random',
                          covariance_type='full', tol=1e-5, max_iter=500,
                          precisions_init=prec_init)
    gmm.fit(X)

    # Components from index clusters_per_class onwards were initialised from
    # the target == 1 clusters, so their summed responsibility is the score.
    train_preds.loc[train_idx] = gmm.predict_proba(X_train)[:, clusters_per_class:].sum(axis=1)
    test_preds.loc[test_idx] = gmm.predict_proba(X_test)[:, clusters_per_class:].sum(axis=1)

print('Train AUC - {:1.5f}'.format(roc_auc_score(train['target'], train_preds)))

HBox(children=(IntProgress(value=0, max=512), HTML(value='')))


Train AUC - 0.97538


In [5]:
# Fill the sample submission's target column with the test predictions
# (aligned on the row index) and write the result without the index column.
sub = pd.read_csv('../input/sample_submission.csv').assign(target=test_preds)
sub.to_csv(path_or_buf='submission.csv', index=False)