### Pseudo Labelling + Lasso + Gaussian Mixture + PCA + QDA

This notebook is a merge of [Pseudo labelling with PCA-QDA](https://www.kaggle.com/rdekou/pseudo-labelling-with-pca-qda) and [GraphicalLasso + GaussianMixture](https://www.kaggle.com/christofhenkel/graphicallasso-gaussianmixture). Thanks to the original authors for their work. The first QDA model is replaced with GraphicalLasso + Gaussian Mixture, which slightly boosts the score on the LB.


## Load Data

In [None]:
import numpy as np, pandas as pd, os
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from tqdm import tqdm
from sklearn.covariance import EmpiricalCovariance
from sklearn.covariance import GraphicalLasso
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
# from sklearn.pipeline import Pipeline

from sklearn.svm import NuSVC
from sklearn import svm, neighbors, linear_model, neural_network
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm_notebook

# Read the competition train and test tables from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

# Keep the preview as the last expression so the notebook renders it.
train.head()

In [None]:
def get_mean_cov(x,y):
    """Fit one GraphicalLasso per class and return the stacked estimates.

    Parameters
    ----------
    x : array indexed by a boolean row mask (the caller passes a 2-D numpy array)
    y : array of labels compared against 1 and 0

    Returns
    -------
    ms : np.ndarray
        ``location_`` of each fit stacked on a new first axis.
    ps : np.ndarray
        ``precision_`` of each fit stacked on a new first axis.

    In both outputs index 0 belongs to the rows with ``y == 1`` and index 1
    to the rows with ``y == 0``.
    """
    model = GraphicalLasso()
    means, precisions = [], []
    # Class 1 is fitted first, then class 0; the output order follows this.
    for label in (1, 0):
        class_rows = x[(y == label).astype(bool)]
        model.fit(class_rows)
        means.append(model.location_)
        precisions.append(model.precision_)
    return np.stack(means), np.stack(precisions)


In [None]:
# INITIALIZE VARIABLES
# Feature columns: everything except the row id, the label and the subset key.
cols = [c for c in train.columns if c not in ['id', 'target']]
cols.remove('wheezy-copper-turtle-magic')
oof = np.zeros(len(train))    # out-of-fold score for every train row
preds = np.zeros(len(test))   # fold-averaged score for every test row

# The splitter only depends on fixed arguments, so it is built once.
skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)

# BUILD 512 SEPARATE MODELS
for i in tqdm(range(512)):
    # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS I
    train2 = train[train['wheezy-copper-turtle-magic']==i]
    test2 = test[test['wheezy-copper-turtle-magic']==i]
    # Keep the original row labels; they are used to write into oof / preds.
    idx1 = train2.index
    idx2 = test2.index
    # Renumber rows 0..n-1 (new frame instead of mutating the filtered slice)
    # so the positional fold indices can be used with .loc below.
    train2 = train2.reset_index(drop=True)

    # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES)
    sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
    train3 = sel.transform(train2[cols])
    test3 = sel.transform(test2[cols])

    # The mixture is fitted on train + test features of this subset; the
    # stacked matrix does not change between folds, so build it once.
    all3 = np.concatenate([train3,test3],axis = 0)

    # STRATIFIED K-FOLD
    for train_index, test_index in skf.split(train3, train2['target']):

        # MODEL AND PREDICT WITH GRAPHICAL LASSO + GAUSSIAN MIXTURE
        # Per-class means / precisions from the training fold seed the mixture.
        ms, ps = get_mean_cov(train3[train_index,:],train2.loc[train_index, 'target'].values)

        gm = GaussianMixture(n_components=2, init_params='random', covariance_type='full', tol=0.001,reg_covar=0.001, max_iter=100, n_init=1,means_init=ms, precisions_init=ps)
        gm.fit(all3)
        # Component 0 is initialised from the target==1 rows (get_mean_cov
        # stacks class 1 first), so column 0 is used as the positive score.
        oof[idx1[test_index]] = gm.predict_proba(train3[test_index,:])[:,0]
        preds[idx2] += gm.predict_proba(test3)[:,0] / skf.n_splits


# PRINT CV AUC
auc = roc_auc_score(train['target'],oof)
print('GraphicalLasso + GMM scores CV =',round(auc,5))


In [None]:
# Number of features kept by the variance threshold, keyed by subset id.
cat_dict = dict()

# INITIALIZE VARIABLES
cols = [c for c in train.columns if c not in ['id', 'target']]
cols.remove('wheezy-copper-turtle-magic')

for i in range(512):
    # ONLY USE TRAIN ROWS WHERE WHEEZY EQUALS I
    train2 = train[train['wheezy-copper-turtle-magic']==i]

    # Only the count of surviving columns is needed here, so the test-side
    # transform and index bookkeeping of the modelling cells are left out.
    sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
    cat_dict[i] = sel.transform(train2[cols]).shape[1]


In [None]:
# How many subsets keep each number of features (bars follow value_counts order).
feature_count_freq = pd.Series(cat_dict).value_counts()
ax = feature_count_freq.plot.barh()
ax.set(
    title='Features passing VarianceThreshold(1.5) per subset',
    xlabel='number of subsets',
    ylabel='features kept',
)
plt.show()

# Add pseudo label data from the GraphicalLasso + GaussianMixture model and run new PCA-based models

In [None]:

# INITIALIZE VARIABLES
# First-stage test scores; they decide which test rows become pseudo labels.
test['target'] = preds
oof_qda = np.zeros(len(train))
preds_qda = np.zeros(len(test))
oof_knn = np.zeros(len(train))
preds_knn = np.zeros(len(test))
oof_svnu = np.zeros(len(train))
preds_svnu = np.zeros(len(test))
oof_svc = np.zeros(len(train))
preds_svc = np.zeros(len(test))
oof_rf = np.zeros(len(train))
preds_rf = np.zeros(len(test))
oof_mlp = np.zeros(len(train))
preds_mlp = np.zeros(len(test))

# One entry per classifier: (factory, out-of-fold array, test-prediction array).
# Each factory returns a fresh unfitted estimator; the arrays are updated in
# place, so the oof_* / preds_* names above end up holding the results.
pca_stage_models = [
    (lambda: QuadraticDiscriminantAnalysis(reg_param=0.5), oof_qda, preds_qda),
    (lambda: neighbors.KNeighborsClassifier(n_neighbors=17, p=2.9), oof_knn, preds_knn),
    (lambda: NuSVC(probability=True, kernel='poly', degree=4, gamma='auto', random_state=4, nu=0.59, coef0=0.053), oof_svnu, preds_svnu),
    (lambda: svm.SVC(probability=True, kernel='poly', degree=4, gamma='auto', random_state=42), oof_svc, preds_svc),
    (lambda: RandomForestClassifier(n_estimators=100,random_state=1), oof_rf, preds_rf),
    (lambda: neural_network.MLPClassifier(random_state=3,  activation='relu', solver='lbfgs', tol=1e-06, hidden_layer_sizes=(250, )), oof_mlp, preds_mlp),
]

# BUILD 512 SEPARATE MODELS
for k in range(512):
    # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS K
    train2 = train[train['wheezy-copper-turtle-magic']==k]
    idx1 = train2.index
    test2 = test[test['wheezy-copper-turtle-magic']==k]

    # ADD PSEUDO LABELED DATA
    # Keep only test rows with an extreme first-stage score, harden the score
    # to 0/1 and append those rows after the real training rows.
    test2p = test2[ (test2['target']<=0.01) | (test2['target']>=0.99) ].copy()
    test2p.loc[ test2p['target']>=0.5, 'target' ] = 1
    test2p.loc[ test2p['target']<0.5, 'target' ] = 0
    train2p = pd.concat([train2,test2p],axis=0).reset_index(drop=True)

    # DIMENSIONALITY REDUCTION (PCA WITH cat_dict[k] COMPONENTS)
    # PCA is fitted on train + pseudo rows and applied to all three views.
    pca = PCA(n_components=cat_dict[k], random_state= 1234)
    pca.fit(train2p[cols])
    train3p = pca.transform(train2p[cols])
    train3 = pca.transform(train2[cols])
    test3 = pca.transform(test2[cols])

    # STRATIFIED K FOLD
    skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
    for train_index, test_index in skf.split(train3p, train2p['target']):
        # Real training rows come first in train2p, so positions below
        # len(train3) are the real rows: ignore pseudo rows in oof.
        test_index3 = test_index[ test_index<len(train3) ]
        fold_X = train3p[train_index,:]
        fold_y = train2p.loc[train_index, 'target']

        for make_clf, oof_arr, preds_arr in pca_stage_models:
            clf = make_clf()
            clf.fit(fold_X, fold_y)
            # A fold holding only pseudo rows has nothing to score.
            if len(test_index3):
                oof_arr[idx1[test_index3]] = clf.predict_proba(train3[test_index3,:])[:,1]
            preds_arr[test2.index] += clf.predict_proba(test3)[:,1] / skf.n_splits

    if k%32==0: print(k)

# PRINT CV AUC
# Scores noted inline by the author on earlier runs: QDA 0.97035 and 0.96917
# (difference unexplained), SVNU 0.9578, SVC 0.96004, RF 0.87847.
for label, oof_arr in [('QDA', oof_qda), ('KNN', oof_knn), ('SVNU', oof_svnu), ('SVC', oof_svc), ('RF', oof_rf), ('MLP', oof_mlp)]:
    auc = roc_auc_score(train['target'],oof_arr)
    print('Pseudo Labeled ' + label + ' scores CV =',round(auc,5))

# Step 3 & 4 - Add pseudo label data and build second model

In [None]:
# INITIALIZE VARIABLES
# First-stage test scores; they decide which test rows become pseudo labels.
test['target'] = preds
oof_qda2 = np.zeros(len(train))
preds_qda2 = np.zeros(len(test))
oof_knn2 = np.zeros(len(train))
preds_knn2 = np.zeros(len(test))
oof_svnu2 = np.zeros(len(train))
preds_svnu2 = np.zeros(len(test))
oof_svc2 = np.zeros(len(train))
preds_svc2 = np.zeros(len(test))
oof_rf2 = np.zeros(len(train))
preds_rf2 = np.zeros(len(test))
oof_mlp2 = np.zeros(len(train))
preds_mlp2 = np.zeros(len(test))

# One entry per classifier: (factory, out-of-fold array, test-prediction array).
# Each factory returns a fresh unfitted estimator; the arrays are updated in
# place, so the oof_*2 / preds_*2 names above end up holding the results.
var_stage_models = [
    (lambda: QuadraticDiscriminantAnalysis(reg_param=0.5), oof_qda2, preds_qda2),
    (lambda: neighbors.KNeighborsClassifier(n_neighbors=17, p=2.9), oof_knn2, preds_knn2),
    (lambda: NuSVC(probability=True, kernel='poly', degree=4, gamma='auto', random_state=4, nu=0.59, coef0=0.053), oof_svnu2, preds_svnu2),
    (lambda: svm.SVC(probability=True, kernel='poly', degree=4, gamma='auto', random_state=42), oof_svc2, preds_svc2),
    (lambda: RandomForestClassifier(n_estimators=100,random_state=1), oof_rf2, preds_rf2),
    (lambda: neural_network.MLPClassifier(random_state=3,  activation='relu', solver='lbfgs', tol=1e-06, hidden_layer_sizes=(250, )), oof_mlp2, preds_mlp2),
]

# BUILD 512 SEPARATE MODELS
for k in range(512):
    # ONLY TRAIN WITH DATA WHERE WHEEZY EQUALS K
    train2 = train[train['wheezy-copper-turtle-magic']==k]
    idx1 = train2.index
    test2 = test[test['wheezy-copper-turtle-magic']==k]

    # ADD PSEUDO LABELED DATA
    # Keep only test rows with an extreme first-stage score, harden the score
    # to 0/1 and append those rows after the real training rows.
    test2p = test2[ (test2['target']<=0.01) | (test2['target']>=0.99) ].copy()
    test2p.loc[ test2p['target']>=0.5, 'target' ] = 1
    test2p.loc[ test2p['target']<0.5, 'target' ] = 0
    train2p = pd.concat([train2,test2p],axis=0).reset_index(drop=True)

    # FEATURE SELECTION (USE APPROX 40 OF 255 FEATURES)
    # The threshold is fitted on train + pseudo rows and applied to all views.
    sel = VarianceThreshold(threshold=1.5).fit(train2p[cols])
    train3p = sel.transform(train2p[cols])
    train3 = sel.transform(train2[cols])
    test3 = sel.transform(test2[cols])

    # STRATIFIED K FOLD
    skf = StratifiedKFold(n_splits=11, random_state=42, shuffle=True)
    for train_index, test_index in skf.split(train3p, train2p['target']):
        # Real training rows come first in train2p, so positions below
        # len(train3) are the real rows: ignore pseudo rows in oof.
        test_index3 = test_index[ test_index<len(train3) ]
        fold_X = train3p[train_index,:]
        fold_y = train2p.loc[train_index, 'target']

        for make_clf, oof_arr, preds_arr in var_stage_models:
            clf = make_clf()
            clf.fit(fold_X, fold_y)
            # A fold holding only pseudo rows has nothing to score.
            if len(test_index3):
                oof_arr[idx1[test_index3]] = clf.predict_proba(train3[test_index3,:])[:,1]
            preds_arr[test2.index] += clf.predict_proba(test3)[:,1] / skf.n_splits

    if k%32==0: print(k)

# PRINT CV AUC
# Scores noted inline by the author on an earlier run: QDA 0.97033,
# svc 0.9457979278710315, svnu 0.9607969615628373, rf 0.8681422696974617.
auc = roc_auc_score(train['target'],oof_qda2)
print('Pseudo Labeled QDA scores CV =',round(auc,5))
print('----------------')
for label, oof_arr in [('knn', oof_knn2), ('svc', oof_svc2), ('svnu', oof_svnu2), ('rf', oof_rf2), ('mlp', oof_mlp2)]:
    print(label, roc_auc_score(train['target'], oof_arr))

# Final model CV AUC

In [None]:
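# NOTE: oof_var / oof_var2 are not defined in the visible notebook (possibly
# earlier names of oof_qda / oof_qda2 - unverified); adapt the names before
# re-enabling these snippets.
# auc = roc_auc_score(train['target'],0.5*(oof_var+ oof_var2) )
# print('Pseudo Labeled QDA scores CV =',round(auc,5)) #0.96959

# auc = roc_auc_score(train['target'],0.5*(oof_svnu+ oof_svnu2) )
# print('Pseudo Labeled SVNU scores CV =',round(auc,5)) #0.96067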

In [None]:
# Weighted mix of the second-stage out-of-fold scores (QDA, NuSVC, SVC, RF).
blend_oof = oof_qda2*0.6 + oof_svnu2*0.25 + oof_svc2*0.05 + oof_rf2*0.1
auc = roc_auc_score(train['target'], blend_oof)
print('Pseudo Labeled BLEND scores CV =',round(auc,5)) #0.96858

In [None]:
# Weighted mix of all six second-stage out-of-fold scores.
blend2_oof = (
    oof_qda2*0.5
    + oof_svnu2*0.3
    + oof_svc2*0.05
    + oof_knn2*0.025
    + oof_rf2*0.1
    + oof_mlp2*0.025
)
auc = roc_auc_score(train['target'], blend2_oof)
print('Pseudo Labeled BLEND2 scores CV =',round(auc,5))

# Submit Predictions

In [None]:
# Write the second-stage QDA test predictions as the submission.
sub = pd.read_csv('../input/sample_submission.csv')
# sub['target'] = 0.5* preds_var + 0.5*preds_var2
sub['target'] = preds_qda2
sub.to_csv('submission.csv',index=False)

# Plot the values that were actually written to submission.csv; the original
# plotted `preds`, i.e. the first-stage scores, under this title.
# (plt is already imported in the imports cell.)
plt.hist(sub['target'],bins=100)
plt.title('Final Test.csv predictions')
plt.show()

In [None]:
# Write the 4-model blend (same weights as the BLEND CV cell) as a submission.
sub = pd.read_csv('../input/sample_submission.csv')
# sub['target'] = 0.6*+preds_var2 + 0.4*preds_svnu2
sub['target'] = preds_qda2*0.6+preds_svnu2*0.25 + preds_svc2*0.05 +preds_rf2*0.1
sub.to_csv('submission_blend.csv',index=False)

# Plot the blended values that were written to submission_blend.csv; the
# original plotted `preds`, i.e. the first-stage scores, under this title.
# (plt is already imported in the imports cell.)
plt.hist(sub['target'],bins=100)
plt.title('Blend Test.csv predictions')
plt.show()

In [None]:
# Write the 6-model blend (same weights as the BLEND2 CV cell) as a submission.
sub = pd.read_csv('../input/sample_submission.csv')
# oof_qda2*0.5+oof_svnu2*0.3 + oof_svc2*0.05 +  oof_knn2*0.025 + oof_rf2*0.1 +  oof_mlp2*0.025
sub['target'] = preds_qda2*0.5+preds_svnu2*0.3 + preds_svc2*0.05 +  preds_knn2*0.025 + preds_rf2*0.1 +  preds_mlp2*0.025
sub.to_csv('submission_blend2.csv',index=False)

# Plot the blended values that were written to submission_blend2.csv; the
# original plotted `preds`, i.e. the first-stage scores, under this title.
# (plt is already imported in the imports cell.)
plt.hist(sub['target'],bins=100)
plt.title('Blend2 Test.csv predictions')
plt.show()