In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score
from sklearn.mixture import GaussianMixture as GM
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm_notebook as tqdm

In [None]:
# Read the training and test tables from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

# Show the first rows of the training table as the cell output.
train.head()

In [None]:
# When the loaded test table has fewer than 150000 rows, write the unmodified
# sample submission and stop the notebook so that none of the later cells run.
# NOTE(review): presumably this skips the expensive modelling when only a
# reduced test set is available - confirm against the competition setup.
if test.shape[0] < 150000:
    sub = pd.read_csv('../input/sample_submission.csv')
    sub.to_csv('submission.csv',index=False)
    # Abort explicitly with a readable message. The previous version relied on
    # the TypeError raised by evaluating `6 + 'j'`, which hid the intent.
    raise RuntimeError(
        'test set has fewer than 150000 rows: '
        'wrote the sample submission and stopped before modelling'
    )

# Steps 1 and 2 - Build the first model (a Gaussian mixture per `wheezy-copper-turtle-magic` value) and predict test

In [None]:
# SET UP THE FEATURE LIST AND THE OUTPUT BUFFERS
# Feature columns: every training column except the row id, the label and the
# 'wheezy-copper-turtle-magic' column used to partition the data.
cols = [name for name in train.columns if name not in ('id', 'target')]
cols.remove('wheezy-copper-turtle-magic')
# One prediction slot per training row (oof) and per test row (preds).
oof, preds = np.zeros(len(train)), np.zeros(len(test))
# Collects the number of selected features for each partition.
shapes = []
# BUILD 512 SEPARATE MODELS
# For every value of 'wheezy-copper-turtle-magic' a Gaussian mixture is fitted
# on the stacked train+test rows of that partition. Each mixture component is
# then labelled by the mean target of the training rows assigned to it, and a
# row's score is the summed posterior probability of the "positive" components.

# GMM hyper-parameters do not depend on the loop variable, so define them once.
n_components = 6
n_init = 10
# reg_covar to use, keyed by the number of features kept by the variance filter.
best_reg_covar = {
    33: 1.5,
    34: 2.,
    35: 2.5,
    36: 3.5,
    37: 4.,
    38: 4.5,
    39: 5.,
    40: 5.5,
    41: 6.,
    42: 6.5,
    43: 7.,
    44: 7.5,
    45: 8.,
    46: 8.5,
    47: 9.
}
for i in tqdm(range(512)):
    # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS I
    train2 = train[train['wheezy-copper-turtle-magic']==i]
    test2 = test[test['wheezy-copper-turtle-magic']==i]
    # Keep the original row labels so results can be written back to oof/preds.
    idx1 = train2.index; idx2 = test2.index
    # Rebind instead of mutating the filtered slice in place.
    train2 = train2.reset_index(drop=True)

    # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES)
    # The variance filter is fitted on the training rows of this partition only.
    sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
    train3 = sel.transform(train2[cols])
    test3 = sel.transform(test2[cols])
    shapes.append(train3.shape[1])

    # GMM
    # Raises KeyError when the number of selected features is not a key of
    # best_reg_covar (i.e. outside 33..47).
    reg_covar = best_reg_covar[train3.shape[1]]
    # The mixture is fitted on train and test rows together; no labels are used.
    X = np.vstack([train3, test3])
    gm = GM(n_components=n_components, n_init=n_init,
            random_state=0, init_params='random', reg_covar=reg_covar)
    gm.fit(X)

    # Mean target of the training rows assigned to each cluster.
    res = pd.DataFrame(data=gm.predict(train3), columns=['cluster'])
    res['target'] = train2['target'].values
    res = res.groupby('cluster')['target'].mean()
    # Pick the clusters whose training rows are mostly positive, using the
    # cluster labels that are actually present in res. A component that got no
    # training rows is missing from the groupby result, so looping over
    # range(len(res)) and looking up res[j] could hit a missing label or skip
    # the highest labels.
    ones_indices = res.index[res > 0.5].tolist()
    # Score = total posterior probability of the "positive" clusters.
    oof[idx1] = gm.predict_proba(train3)[:, ones_indices].sum(axis=1)
    preds[idx2] = gm.predict_proba(test3)[:, ones_indices].sum(axis=1)

   
# PRINT CV AUC
# ROC AUC of the training-row scores stored in oof against the true targets.
# NOTE(review): the scores in oof come from clusters that were labelled with
# the targets of the same training rows, so this figure may be optimistic
# compared with a true held-out estimate - confirm before relying on it.
auc = roc_auc_score(train['target'], oof)
# The model scored here is a Gaussian mixture, so label the output accordingly
# (the previous label said "QDA").
print('GMM scores CV =', round(auc, 5))

# Submit Predictions

In [None]:
# Fill the sample submission with the test predictions and write it out.
sub = pd.read_csv('../input/sample_submission.csv')
# The array is assigned by position.
# NOTE(review): assumes sample_submission.csv lists its rows in the same order
# as test.csv - confirm.
sub['target'] = preds
sub.to_csv('submission.csv',index=False)

# Histogram of the submitted predictions as a quick sanity check.
# matplotlib is imported as plt in the notebook's import cell.
fig, ax = plt.subplots()
ax.hist(preds, bins=100)
ax.set_title('Final Test.csv predictions')
ax.set_xlabel('Predicted target')
ax.set_ylabel('Number of test rows')
plt.show()