### Loading Libraries

In [None]:
from tqdm import tqdm
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import numpy as np 
import pandas as pd 
from sklearn.metrics import roc_auc_score
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import VarianceThreshold

### Loading Data

In [None]:
%%time

train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

## Identifying Clusters and Predicting Classes

In what follows, I will assume that there are 2 clusters per class in the data set. To identify these clusters, let's run `GMM` on the positive and negative instances separately. In each case, our goal is to label instances that belong to two different clusters. We will store the means and covariance matrices of all four clusters for future use.

In [None]:
%%time

NFOLDS=11
RS=42

oof=np.zeros(len(train))
preds=np.zeros(len(test))

magic_max=train['wheezy-copper-turtle-magic'].max()
magic_min=train['wheezy-copper-turtle-magic'].min()

auc_all=np.array([])

print(f"Computing centroids and covariances for the four clusters (two per class).")

# BUILD 512 SEPARATE NON-LINEAR MODELS
#for i in tqdm(range(10)): 
for i in tqdm(range(magic_min, magic_max+1)):  
    # EXTRACT SUBSET OF DATASET WHERE WHEEZY-MAGIC EQUALS i     
    X = train[train['wheezy-copper-turtle-magic']==i].copy()
    Y = X.pop('target').values
    X_test = test[test['wheezy-copper-turtle-magic']==i].copy()
    idx_train = X.index 
    idx_test = X_test.index
    X.reset_index(drop=True,inplace=True)

    cols = [c for c in X.columns if c not in ['id', 'wheezy-copper-turtle-magic']]

    X = X[cols].values             # numpy.ndarray
    X_test = X_test[cols].values   # numpy.ndarray

    # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES)
    vt = VarianceThreshold(threshold=1.5).fit(X)
    X = vt.transform(X)            # numpy.ndarray
    X_test = vt.transform(X_test)  # numpy.ndarray   

    # STRATIFIED K FOLD
    auc_folds=np.array([])
    
    folds = StratifiedKFold(n_splits=NFOLDS, random_state=RS)

    for fold_num, (train_index, val_index) in enumerate(folds.split(X, Y), 1):

        X_train, Y_train = X[train_index, :], Y[train_index]
        X_val, Y_val = X[val_index, :], Y[val_index]

        X_train_0 = X_train[Y_train==0]
        Y_train_0 = Y_train[Y_train==0].reshape(-1, 1)

        X_train_1 = X_train[Y_train==1]
        Y_train_1 = Y_train[Y_train==1].reshape(-1, 1)

        params={'n_components' : 2, 
                'init_params': 'random', 
                'covariance_type': 'full', 
                'tol':0.001, 
                'reg_covar': 0.001,#0.001, 
                'max_iter': 100, 
                'n_init': 10,
               }

        clf_0 = GaussianMixture(**params)

        clf_0.fit(X_train_0)
        #labels_0 = clf_0.predict(X_train_0)
        means_0 = clf_0.means_
        covs_0 = clf_0.covariances_
        ps_0 = [np.linalg.inv(m) for m in covs_0]
        
        clf_1 = GaussianMixture(**params)
        
        clf_1.fit(X_train_1)
        #labels_1 = clf_1.predict(X_train_1)
        means_1 = clf_1.means_
        covs_1 = clf_1.covariances_
        ps_1 = [np.linalg.inv(m) for m in covs_1]
        
        #MEANS AND COVARIANCES FOR THE CLUSTERS       
        ms = np.stack((means_0[0], means_0[1], means_1[0], means_1[1]))
        ps = np.stack((ps_0[0], ps_0[1], ps_1[0], ps_1[1]))
        
        #PARAMETERS FOR THE MAIN CLASSIFIER
        params={'n_components' : 4, 
                'init_params': 'random', 
                'covariance_type': 'full', 
                'tol':0.001, 
                'reg_covar': 0.001, 
                'max_iter': 100, 
                'n_init': 10, 
                'means_init': ms, 
                'precisions_init': ps,
               }
        
        #INSTANTIATING THE MAIN CLASSIFIER
        clf = GaussianMixture(**params)   
        
        clf.fit(np.concatenate([X_train, X_test], axis = 0))
        
        oof[idx_train[val_index]] = np.sum(clf.predict_proba(X_val)[:, 2:], axis=1)
        preds[idx_test] += np.sum(clf.predict_proba(X_test)[:,2: ], axis=1)/NFOLDS
        
#         oof[idx_train[val_index]] = clf.predict_proba(X_val)[:,1]
#         preds[idx_test] += clf.predict_proba(X_test)[:,1]/NFOLDS

        auc = roc_auc_score(Y_val, oof[idx_train[val_index]])
        auc_folds = np.append(auc_folds, auc)

    auc_all = np.append(auc_all, np.mean(auc_folds))

# PRINT CROSS-VALIDATION AUC FOR THE CLASSFIER
auc_combo = roc_auc_score(train['target'].values, oof)
auc_folds_average = np.mean(auc_all)
std = np.std(auc_all)/(np.sqrt(NFOLDS)*np.sqrt(magic_max+1))

print(f'The combined AUC CV score is {round(auc_combo,5)}.')    
print(f'The folds average AUC CV score is {round(auc_folds_average,5)}.')
print(f'The standard deviation is {round(std, 5)}.')

### Creating the submission file

In [None]:
# Load the sample submission, set its `target` column to the test predictions
# held in `preds`, and write the result without the index column.
sub = pd.read_csv('../input/sample_submission.csv').assign(target=preds)
sub.to_csv('submission.csv', index=False)