In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook
import warnings
import multiprocessing
from scipy.optimize import minimize  
import time
from sklearn.model_selection import GridSearchCV, train_test_split

# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# STEP 2
# Read the competition data and derive the list of model feature columns:
# every column of train except the row id, the label and the partitioning column.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
cols = [
    name
    for name in train.columns
    if name not in ('id', 'target', 'wheezy-copper-turtle-magic')
]
print(train.shape, test.shape)
# STEP 3
# First pass: fit one QDA model per 'wheezy-copper-turtle-magic' value (0..511).
# Inside every outer CV fold an inner grid search picks reg_param.
oof = np.zeros(len(train))      # out-of-fold score for every train row
preds = np.zeros(len(test))     # fold-averaged score for every test row
params = [{'reg_param': [0.1, 0.2, 0.3, 0.4, 0.5]}]

# 512 models
reg_params = np.zeros(512)      # reg_param kept per magic value; reused in STEP 4
for i in tqdm_notebook(range(512)):
    train2 = train[train['wheezy-copper-turtle-magic']==i]
    test2 = test[test['wheezy-copper-turtle-magic']==i]
    # Keep the original row labels: oof / preds are written back through them.
    idx1 = train2.index; idx2 = test2.index
    # Non-inplace reset: train2 is a slice of train, so an inplace reset would
    # operate on a possible view and trigger SettingWithCopy behaviour.
    train2 = train2.reset_index(drop=True)

    # Variance filter and scaler are fitted on the train and test rows together.
    data = pd.concat([train2[cols], test2[cols]])
    pipe = Pipeline([('vt', VarianceThreshold(threshold=2)), ('scaler', StandardScaler())])
    data2 = pipe.fit_transform(data[cols])
    train3 = data2[:train2.shape[0]]; test3 = data2[train2.shape[0]:]

    # shuffle=True is required for random_state to take effect; recent
    # scikit-learn raises ValueError when a seed is given without shuffling.
    skf = StratifiedKFold(n_splits=11, shuffle=True, random_state=42)
    for train_index, test_index in skf.split(train2, train2['target']):

        qda = QuadraticDiscriminantAnalysis()
        clf = GridSearchCV(qda, params, cv=4)
        clf.fit(train3[train_index,:], train2.loc[train_index, 'target'])
        # Overwritten on every fold, so the value that survives the loop is the
        # choice made on the last fold.
        reg_params[i] = clf.best_params_['reg_param']
        # Column 1 of predict_proba is the second class (presumably target == 1).
        oof[idx1[test_index]] = clf.predict_proba(train3[test_index,:])[:,1]
        preds[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits

auc = roc_auc_score(train['target'], oof)
print(f'AUC: {auc:.5}')

# STEP 4
# Pseudo-labelling: on every iteration, confidently predicted test rows are added
# to the training set (and confidently predicted train rows are re-labelled), then
# the per-magic QDA models are refitted with the reg_param values found in STEP 3.
for itr in range(10):
    test['target'] = preds
    test.loc[test['target'] > 0.955, 'target'] = 1 # initial 94
    test.loc[test['target'] < 0.045, 'target'] = 0 # initial 06
    usefull_test = test[(test['target'] == 1) | (test['target'] == 0)]
    new_train = pd.concat([train, usefull_test]).reset_index(drop=True)
    print(usefull_test.shape[0], "Test Records added for iteration : ", itr)

    # oof has one entry per original train row, and those rows are the first
    # len(train) rows of new_train. Pad the masks to len(new_train) so the boolean
    # indexer matches the frame length (a shorter mask is rejected by current pandas).
    # NOTE(review): oof is the STEP 3 result and is not refreshed from oof2 between
    # iterations -- confirm that is intended.
    sure_one = np.zeros(len(new_train), dtype=bool)
    sure_zero = np.zeros(len(new_train), dtype=bool)
    sure_one[:len(train)] = oof > 0.995 # initial 98
    sure_zero[:len(train)] = oof < 0.005 # initial 02
    new_train.loc[sure_one, 'target'] = 1
    new_train.loc[sure_zero, 'target'] = 0

    oof2 = np.zeros(len(train))
    preds = np.zeros(len(test))
    for i in tqdm_notebook(range(512)):

        # Non-inplace reset: the filtered frame is a slice of new_train.
        train2 = new_train[new_train['wheezy-copper-turtle-magic']==i].reset_index(drop=True)
        test2 = test[test['wheezy-copper-turtle-magic']==i]
        idx1 = train[train['wheezy-copper-turtle-magic']==i].index
        idx2 = test2.index

        data = pd.concat([train2[cols], test2[cols]])
        pipe = Pipeline([('vt', VarianceThreshold(threshold=2)), ('scaler', StandardScaler())])
        data2 = pipe.fit_transform(data[cols])
        train3 = data2[:train2.shape[0]]
        test3 = data2[train2.shape[0]:]

        # The original passed the function object `time.time` as the seed and left
        # shuffling off. Use a real integer seed that still changes the folds from
        # one pseudo-labelling iteration to the next, and keeps runs reproducible.
        skf = StratifiedKFold(n_splits=11, shuffle=True, random_state=42 + itr)
        for train_index, test_index in skf.split(train2, train2['target']):
            # Only the first len(idx1) rows of train2 are original train rows;
            # pseudo-labelled test rows must not be written into oof2.
            oof_test_index = test_index[test_index < len(idx1)]

            # reg_param must be passed by keyword: the first positional parameter
            # of QuadraticDiscriminantAnalysis is `priors`.
            clf = QuadraticDiscriminantAnalysis(reg_param=reg_params[i])
            clf.fit(train3[train_index,:], train2.loc[train_index, 'target'])
            if len(oof_test_index) > 0:
                oof2[idx1[oof_test_index]] = clf.predict_proba(train3[oof_test_index,:])[:,1]
            preds[idx2] += clf.predict_proba(test3)[:,1] / skf.n_splits
    auc = roc_auc_score(train['target'], oof2)
    print(f'AUC: {auc:.5}')
    
# STEP 5
# sub = pd.read_csv('../input/sample_submission.csv')
# sub['target'] = preds
# sub.to_csv('submission.csv',index=False)

In [None]:
# Build the pseudo-labelled training set used by the mixture-model stage:
# test rows whose prediction is beyond the 0.9 / 0.1 cut-offs are snapped to a hard
# label and appended to train.
test['target'] = preds
test.loc[test['target'] > 0.9, 'target'] = 1
test.loc[test['target'] < 0.1, 'target'] = 0
usefull_test = test[(test['target'] == 1) | (test['target'] == 0)]
new_train = pd.concat([train, usefull_test]).reset_index(drop=True)

# oof has one entry per original train row, and those rows are the first len(train)
# rows of new_train. Pad the masks to len(new_train) so the boolean indexer matches
# the frame length (a shorter mask is rejected by current pandas).
sure_one = np.zeros(len(new_train), dtype=bool)
sure_zero = np.zeros(len(new_train), dtype=bool)
sure_one[:len(train)] = oof > 0.999
sure_zero[:len(train)] = oof < 0.001
new_train.loc[sure_one, 'target'] = 1
new_train.loc[sure_zero, 'target'] = 0

In [None]:
import numpy as np, pandas as pd, os
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from sklearn.covariance import EmpiricalCovariance
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import sympy 


In [None]:
# Re-read both competition files from disk. This rebinds `train` and `test`, so any
# column assigned to the earlier in-memory frames is no longer present unless the
# CSV itself contains it.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

# train.head()

In [None]:
from sklearn.covariance import GraphicalLasso, OAS

def get_mean_cov(x,y):
    """Estimate a mean vector and a precision matrix for each of the two classes.

    An OAS covariance estimator is fitted to the rows of ``x`` whose label in
    ``y`` equals 1, and then to the rows whose label equals 0.

    Parameters
    ----------
    x : array indexable by a boolean mask along its first axis.
    y : array of labels, compared element-wise against 1 and 0.

    Returns
    -------
    ms : stacked ``location_`` vectors; row 0 is the label-1 fit, row 1 the label-0 fit.
    ps : stacked ``precision_`` matrices in the same order.
    """
    estimator = OAS()
    locations = []
    precisions = []
    # Label 1 is processed first so that it ends up at position 0 of both stacks.
    for label in (1, 0):
        members = (y == label).astype(bool)
        estimator.fit(x[members])
        precisions.append(estimator.precision_)
        locations.append(estimator.location_)
    return np.stack(locations), np.stack(precisions)


In [None]:
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.cluster import Birch
from sklearn.cluster import DBSCAN

# INITIALIZE VARIABLES
cols = [c for c in train.columns if c not in ['id', 'target']]
cols.remove('wheezy-copper-turtle-magic')
# Both arrays hold the score for target == 0 (mass of the class-0 mixture components),
# which is why the CV AUC at the end is computed against 1 - target.
oof2 = np.zeros(len(train))
preds2 = np.zeros(len(test))


def _cluster_means_precisions(x, clusterer):
    """Cluster the rows of x and return per-cluster OAS means and precisions as two lists."""
    labels = clusterer.fit_predict(x)
    means, precisions = [], []
    for l in np.unique(labels):
        model = OAS().fit(x[labels == l])
        means.append(model.location_)
        precisions.append(model.precision_)
    return means, precisions


# BUILD 512 SEPARATE MODELS
for i in tqdm(range(512)):
    # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS I
    # Non-inplace reset: the filtered frame is a slice of new_train.
    train2 = new_train[new_train['wheezy-copper-turtle-magic']==i].reset_index(drop=True)
    test2 = test[test['wheezy-copper-turtle-magic']==i]
    idx1 = train[train['wheezy-copper-turtle-magic']==i].index
    idx2 = test2.index

    # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES)
    sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
    train3 = sel.transform(train2[cols])
    test3 = sel.transform(test2[cols])
    y_all = train2['target'].values

    # STRATIFIED K-FOLD
    skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
    for train_index, test_index in skf.split(train3, y_all):
        # Only the first len(idx1) rows of train2 are original train rows;
        # pseudo-labelled test rows must not be written into oof2.
        oof_test_index2 = test_index[test_index < len(idx1)]
        x_train, y_train = train3[train_index,:], y_all[train_index]

        # Split each class into up to 3 Birch clusters and describe every cluster
        # by an OAS mean / precision; these seed the mixture components below.
        brc = Birch(branching_factor=50, n_clusters=3, threshold=0.6, compute_labels=True)
        zero_mean, zero_cov = _cluster_means_precisions(x_train[y_train==0], brc)
        one_mean, one_cov = _cluster_means_precisions(x_train[y_train==1], brc)

        # Class-0 components come first, so columns [0, n_zero) of predict_proba
        # belong to class 0.
        ms = np.stack(zero_mean + one_mean)
        ps = np.stack(zero_cov +  one_cov)
        n_zero = len(zero_mean)

        # Size the mixture from the clusters actually found: Birch may return fewer
        # than n_clusters labels, and a fixed n_components=6 would then no longer
        # match the shape of means_init / precisions_init.
        gm = GaussianMixture(n_components=ms.shape[0], init_params='kmeans',
                             covariance_type='full', tol=0.001,reg_covar=0.001, max_iter=100, n_init=1,
                             means_init=ms, precisions_init=ps, random_state=666)
        gm.fit(np.concatenate([train3[train_index,:],test3],axis = 0))
        # Summing the class-0 responsibilities gives P(class 0) on the same scale
        # for every fold, independent of how many clusters were found.
        if len(oof_test_index2) > 0:
            oof2[idx1[oof_test_index2]] = gm.predict_proba(train3[oof_test_index2,:])[:, :n_zero].sum(axis=1)
        preds2[idx2] += gm.predict_proba(test3)[:, :n_zero].sum(axis=1) / skf.n_splits
#     print('AUC ', i, roc_auc_score(1- train2['target'], oof2[idx1]))


# PRINT CV AUC
auc = roc_auc_score(1 - train['target'],oof2)
print('QDA scores CV =',round(auc,5))

In [None]:
# Report the out-of-fold AUC again. The labels are flipped (1 - target) before
# scoring oof2 against them.
flipped_target = 1 - train['target']
auc = roc_auc_score(flipped_target, oof2)
print('QDA scores CV =', round(auc, 5))

# Submit Predictions

In [None]:
sub = pd.read_csv('../input/sample_submission.csv')
# preds2 is built from the class-0 mixture components (its CV AUC is measured against
# 1 - target), so it ranks rows by how likely target == 0 is. Flip it so that the
# submitted column increases with the likelihood of target == 1.
# Assumes the rows of sample_submission.csv are in the same order as test.csv.
sub['target'] = 1 - preds2
sub.to_csv('submission.csv',index=False)

import matplotlib.pyplot as plt
# Plot the values that were actually written to the submission file.
plt.hist(sub['target'],bins=100)
plt.title('Final Test.csv predictions')
plt.show()