In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from sklearn import feature_selection
from numpy import dot, zeros
from numpy.linalg import matrix_rank, norm
import seaborn as sns
from sklearn.feature_selection import RFECV
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from time import time
from sklearn import cross_validation
from sklearn.cross_validation import ShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Critere de performance
def compute_pred_score(y_true, y_pred):
    """Return the average cost of the predictions ``y_pred`` against ``y_true``.

    The two label vectors are multiplied element-wise. Every product equal
    to -1 costs 10, every product equal to 0 costs 1, and any other product
    costs nothing. The total cost is divided by the number of entries along
    the first axis.

    Parameters
    ----------
    y_true, y_pred : array-like
        Label vectors of matching (broadcastable) shape. Lists, tuples,
        NumPy arrays and pandas Series are accepted; both are converted with
        ``np.asarray`` so the product is always positional (no pandas index
        alignment).

    Returns
    -------
    float
        Mean cost per sample (lower is better).

    Raises
    ------
    ZeroDivisionError
        If the inputs are empty.
    """
    # Coerce first: plain Python lists do not support element-wise '*', and
    # two pandas Series would be aligned on their index instead of position.
    y_comp = np.asarray(y_true) * np.asarray(y_pred)
    n_minus_one = np.count_nonzero(y_comp == -1)
    n_zero = np.count_nonzero(y_comp == 0)
    score = float(10 * n_minus_one + n_zero)
    score /= y_comp.shape[0]
    return score

In [None]:
# Input files, resolved relative to the notebook's working directory.
X_train_fname = 'training_templates.csv'
y_train_fname = 'training_labels.txt'
X_test_fname  = 'testing_templates.csv'
# header=None: the first row is read as data, so columns are labelled 0..n-1.
X_train = pd.read_csv(X_train_fname, sep=',', header=None)
# The test frame is rebuilt from the raw values, as in the original cell.
X_test  = pd.DataFrame(pd.read_csv(X_test_fname,  sep=',', header=None).values)
# np.int was only an alias of the builtin int and no longer exists in recent
# NumPy releases; the builtin yields the same dtype.
y_train = np.loadtxt(y_train_fname, dtype=int)

Tout d'abord, regardons si les colonnes sont *corrélées* entre elles, auquel cas on pourra enlever celles qui le sont trop.

In [None]:
# Heat-map of the pairwise column correlations.
# NOTE(review): the correlation is computed on X_test while the rest of the
# exploration uses X_train - confirm this is intended.
sns.set(context="paper", font="monospace")
f, ax = plt.subplots(figsize=(12, 9))
corrmat = X_test.corr()
sns.heatmap(
    corrmat,
    vmax=.8,
    square=True,
    xticklabels=False,
    yticklabels=False,
    ax=ax,  # draw on the axes created above (the current axes)
);

Visiblement, aucune corrélation évidente n'apparait.

Etudions la distribution des variables

In [None]:
# One histogram per column of the training frame; the trailing ';' keeps the
# returned axes array from being echoed as cell output.
X_train.hist(figsize=(50,50));

Les variables suivent toutes des lois normales, il n'y a pas de problèmes évidents sur ces densités (distribution anormale, valeurs manquantes ...)

On va se donner une référence avec un modèle de base.

In [None]:
# Baseline model: logistic regression with the library defaults, fitted on
# every column of the training frame.
clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# Predict on the very data the model was fitted on: this is a training
# score, not a held-out estimate.
y_pred_train =  clf.predict(X_train)

# Cost of these predictions according to compute_pred_score (lower is better)
score = compute_pred_score(y_train, y_pred_train)
print('Score sur le train : %s' % score)

On va maintenant étudier les différentes *features*

In [None]:
# Recursive feature elimination with 5-fold cross-validation, removing one
# feature per step, driven by a default logistic regression.
estimator = linear_model.LogisticRegression()
selector = RFECV(estimator, step=1, cv=5).fit(X_train, y_train)

In [None]:
# Boolean mask of the columns kept by the selector, then the rank given to
# each column (per scikit-learn's RFECV documentation, rank 1 = selected).
print(selector.support_) 
print(selector.ranking_)

In [None]:
# Positions of the columns the selector rejected (support_ is a boolean mask,
# so negating it is the same as comparing it to False).
print(np.where(~selector.support_))

D'après cette sélection, on voit que les colonnes #0, #10, #17, #18, #28, #72, #80, #104 et #117 ne sont pas significatives pour la régression logistique.

In [None]:
# Columns reported as not selected by the feature elimination step.
# The drop is done in place, so running this cell twice raises a KeyError.
rejected_columns = [0, 10, 17, 18, 28, 72, 80, 104, 117]
for frame in (X_test, X_train):
    frame.drop(rejected_columns, axis=1, inplace=True)

In [None]:
# Sanity check: both frames must have the same number of columns after the drop.
print(X_test.shape, X_train.shape)

Est-ce que le retrait de ces colonnes améliore le score ?

In [None]:
# Refit the same logistic regression on the reduced set of columns.
clf.fit(X_train, y_train)
# Predict on the training data itself (training score, not a held-out estimate)
y_pred_train =  clf.predict(X_train)

# Cost of these predictions according to compute_pred_score (lower is better)
score = compute_pred_score(y_train, y_pred_train)
print('Score sur le train : %s' % score)

Il y a une légère amélioration du score.
Prenons ce classifieur comme notre base de référence.
Sur le leaderboard, le score n'est pas intéressant.
La classification concerne le traitement d'image ; nous savons que, sur ce type de problématique, les réseaux de neurones sont performants. Nous allons dorénavant exploiter la classe MLP de sklearn et faire notre sélection de variables en ce sens. Nous allons reprendre le dataset d'origine, puisque la feature selection a été faite pour une régression logistique et que cette méthode n'est pas applicable à un réseau de neurones.

In [None]:
# Reload the full data set (the column drop above was done in place) and fit
# a multi-layer perceptron with the library defaults.
# NOTE(review): no random_state is given, so the fitted weights - and every
# score derived from them - presumably change from run to run.
X_train = pd.DataFrame(pd.read_csv(X_train_fname, sep=',', header=None))
X_test  = pd.DataFrame(pd.read_csv(X_test_fname,  sep=',', header=None).values)
clf2 = MLPClassifier()
clf2.fit(X_train, y_train)

In [None]:
# Predict on the training data itself (training score, not a held-out estimate)
y_pred_train =  clf2.predict(X_train)

# Cost of these predictions according to compute_pred_score (lower is better)
score = compute_pred_score(y_train, y_pred_train)
print('Score sur le train : %s' % score)

Le score est déjà plus intéressant

In [None]:
# Predict the test set and write one integer label per line to the
# submission file (overwritten by every later submission cell).
y_pred = clf2.predict(X_test)
np.savetxt('y_pred.txt', y_pred, fmt='%d')

Sur le leaderboard, nous obtenons un score de **0.35**.
Est-ce que la standardisation améliore le score ?

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling on the training set only, then apply the same transform
# to both sets. Both names are rebound to the transformed arrays.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# clf2 was last fitted on the raw features. It must be re-trained on the
# standardized ones before scoring, otherwise the comparison measures a
# train/predict mismatch rather than the effect of standardization.
clf2.fit(X_train, y_train)

# Predict on the (standardized) training data - training score only
y_pred_train = clf2.predict(X_train)

# Cost of these predictions according to compute_pred_score (lower is better)
score = compute_pred_score(y_train, y_pred_train)
print('Score sur le train : %s' % score)

Sur les données d'entrainement le score se dégrade.

In [None]:
# Predict the standardized test set and overwrite the submission file.
# NOTE(review): this is only meaningful if clf2 was fitted on standardized
# features - verify the model was re-trained after the scaling step.
y_pred = clf2.predict(X_test)
np.savetxt('y_pred.txt', y_pred, fmt='%d')

La même dégradation se constate sur le leaderboard avec un score de **0.40**

Maintenant nous allons rechercher les hyper paramètres optimaux via un grid search

In [None]:
# Reload the data to get back the raw, non-standardized features
X_train = pd.DataFrame(pd.read_csv(X_train_fname, sep=',', header=None))
X_test  = pd.DataFrame(pd.read_csv(X_test_fname,  sep=',', header=None).values)

In [None]:
# MLP configured with hand-picked hyper-parameters. The grid search over
# max_iter below is commented out, so only the estimator is (re)defined here;
# it is not fitted in this cell.
clf2 = MLPClassifier(max_iter=100, solver='adam', hidden_layer_sizes=14, activation='tanh', alpha = 0.0002)
#param_grid = { 'max_iter' : [100, 300, 500, 1000]}
#grid_search = GridSearchCV(clf2, param_grid=param_grid)
#start = time()
#grid_search.fit(X_train, y_train)

#print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
#      % (time() - start, len(grid_search.cv_results_['params'])))

def report(results, n_top=3):
    """Print the best-ranked candidates of a grid search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params', each indexable by candidate position
        (the layout of ``GridSearchCV.cv_results_``).
    n_top : int, default 3
        Ranks 1..n_top are reported; candidates sharing a rank are all printed.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate holding this rank (ties included).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            params = results['params'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  mean_score, std_score))
            print("Parameters: {0}".format(params))
            print("")
#report(grid_search.cv_results_)

Maintenant que l'on a optimisé les hyper-paramètres, on va prédire sur les données de test pour voir si la performance s'est améliorée.
Le score obtenu est moins bon que le précédent : **0.3438**

In [None]:
# Same MLP configuration as before but with max_iter raised to 300.
clf2 = MLPClassifier(solver='adam', hidden_layer_sizes=14, activation='tanh', alpha = 0.0002, max_iter=300)
# Fit, then predict on the training data itself (training score only)
clf2.fit(X_train, y_train)
y_pred_train =  clf2.predict(X_train)
# Cost of these predictions according to compute_pred_score (lower is better)
score = compute_pred_score(y_train, y_pred_train)
print('Score sur le train : %s' % score)
# Predict the test set and overwrite the submission file
y_pred = clf2.predict(X_test)
np.savetxt('y_pred.txt', y_pred, fmt='%d')

Revenons au MLP, nous allons chercher les meilleurs hyper-paramètres et affiner notre décision sur le critère de la probabilité de prédiction.

In [None]:
# Hold out half of the training data: the grid search only sees x_learn,
# x_val / y_val are kept for the validation score computed afterwards.
# NOTE(review): the split is not seeded, so it differs on every run.
x_learn, x_val, y_learn, y_val =  train_test_split(X_train, y_train, train_size = 0.5, test_size=0.5)
clf3 = MLPClassifier(random_state=79,shuffle=True)

alphas = np.logspace(-5, -1 )
# Search history (results recorded from earlier runs):
# 1st grid: param_grid = { 'hidden_layer_sizes' : range(100,210,10), 'alpha':np.logspace(-5, -1 )}
#   -> hidden_layer_sizes = 170, alpha = 0.01264855117962958
# 2nd grid: param_grid = { 'hidden_layer_sizes' : range(160,180,10), 'alpha':np.linspace(0.01, 0.02, 20)}
#   -> {'alpha': 0.014444444444444444, 'hidden_layer_sizes': 168}
param_grid = { 'hidden_layer_sizes' : range(165,175,1), 'alpha':np.linspace(0.01, 0.02, 10)}
grid_search = GridSearchCV(clf3, param_grid=param_grid, verbose=10, n_jobs=-1)
start = time()
grid_search.fit(x_learn, y_learn)
# The timer was started but never read; report the cost of the search.
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
print(grid_search.best_params_)


In [None]:
# Two hidden layers (168 and 2 units) with the alpha kept from the grid search.
mlp_settings = dict(hidden_layer_sizes=[168,2], alpha=0.014444,
                    random_state=33, shuffle=True)
clf3 = MLPClassifier(**mlp_settings)
clf3.fit(x_learn, y_learn)
# Despite its name, y_pred_train holds predictions for the validation split.
y_pred_train = clf3.predict(x_val)
# compute_pred_score multiplies its two arguments element-wise, so their
# order does not affect the result.
score = compute_pred_score(y_val, y_pred_train)
print('Score sur la validation : %s' % score) #0.1861742


In [None]:
# Hard label predictions for the test set; later cells edit this array in place.
y_pred = clf3.predict(X_test)

In [None]:
# First column of predict_proba (treated by this notebook as the negative
# class), shown as a 10-bin histogram.
proba_class_neg = clf3.predict_proba(X_test)[:, 0]
n, bins, patches = plt.hist(proba_class_neg, bins=10, facecolor='red')


In [None]:
# Second column of predict_proba (treated by this notebook as the positive
# class), shown as a 10-bin histogram.
proba_class_pos = clf3.predict_proba(X_test)[:, 1]
npos, binspos, patchespos = plt.hist(proba_class_pos, bins=10, facecolor='blue')

In [None]:
# Positions (as a column vector of indices) of the test samples whose
# first-class probability lies strictly between 0.1 and 0.90.
undefined = np.argwhere((proba_class_neg > 0.1) & (proba_class_neg < 0.90))

In [None]:
# Replace the uncertain predictions by 0, then print how many samples ended
# up as 0, -1 and 1 respectively.
y_pred[undefined] = 0
label_counts = (
    len(undefined),
    np.count_nonzero(y_pred == -1),
    np.count_nonzero(y_pred == 1),
)
for count in label_counts:
    print(count)

In [None]:
# Overwrite the submission file with the thresholded predictions.
np.savetxt('y_pred.txt', y_pred, fmt='%d')#  0.156544256121 on the leaderboard

Pour améliorer le score on va essayer de classifier les prédictions à 0 avec un autre classifieur. 
Pour ce faire nous allons trouver les données d'entrainement qui correspondent le plus aux données de test non classifiées.

In [None]:
from sklearn.metrics import pairwise_distances_argmin
# Boolean mask of the test samples currently predicted as 0.
indexes = y_pred == 0
# For each of those test samples, position of its nearest training row.
dist_argmin = pairwise_distances_argmin(X_test[indexes], X_train)
# Keep at most 10 % of the training-set size. The slice keeps the first
# matches in test order (not the closest ones) and a training row may appear
# several times if it is the nearest neighbour of several test samples.
X_train_reclass = X_train.iloc[dist_argmin][:int(0.1 * len(X_train))]
y_train_reclass = y_train[dist_argmin][:int(0.1 * len(y_train))]

In [None]:
# Grid-search an RBF SVC on 80 % of the training rows that most resemble the
# test samples left unclassified; the rest is kept in x_val / y_val.
x_learn, x_val, y_learn, y_val =  train_test_split(X_train_reclass, y_train_reclass, train_size = 0.8)
svc = SVC()
param_grid = { 'C' : np.linspace(1.5,2.5,20), 'gamma': np.linspace(0.7,2,100), 'kernel' : ['rbf']}
grid_search = GridSearchCV(svc, param_grid=param_grid, verbose=10, n_jobs=-1)
# Best parameters recorded from earlier runs:
# 1st grid search {'gamma': 0.77426368268112777, 'C': 2.1544346900318834, 'kernel': 'rbf'}
# 2nd {'gamma': 0.88775510204081631, 'kernel': 'rbf', 'C': 2.333333333333333}
# 3rd {'gamma': 0.88979591836734695, 'kernel': 'rbf', 'C': 2.3333333333333335}
# 4th {'gamma': 0.88888888888888895, 'kernel': 'rbf', 'C': 2.4777777777777779}
# Leaderboard: 0.23
# More trust is then given to the neural network by keeping only 707 unclassified points
# 1st grid search {'gamma': 0.89000000000000001, 'kernel': 'rbf', 'C': 2.5}
# 2nd {'gamma': 1.1666666666666665, 'kernel': 'rbf', 'C': 2.0}
# 3rd {'gamma': 0.89696969696969697, 'kernel': 'rbf', 'C': 2.236842105263158}
start = time()
grid_search.fit(x_learn, y_learn)
# The timer was started but never read; report the cost of the search.
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
print(grid_search.best_params_)

In [None]:
# SVC with the hyper-parameters kept from the grid search, scored on the
# held-out part of the re-classification set.
svc = SVC(C=2.2368, kernel='rbf', gamma=0.89)
svc.fit(x_learn, y_learn)
# Despite its name, y_pred_train holds predictions for the validation split.
y_pred_train = svc.predict(x_val)
# compute_pred_score multiplies its two arguments element-wise, so their
# order does not affect the result.
score = compute_pred_score(y_val, y_pred_train)
print('Score sur la validation : %s' % score) #0.1861742 et 0.77

In [None]:
# Refit on the whole re-classification set, predict only the test samples
# selected by the `indexes` mask and write those labels back into y_pred.
svc.fit(X_train_reclass, y_train_reclass)
y_pred_reclass = svc.predict(X_test[indexes])
y_pred[indexes] = y_pred_reclass
np.savetxt('y_pred.txt', y_pred, fmt='%d')#  0.24 then 0.27

Refaisons la démarche avec un autre réseau de neurones

In [None]:
# Grid-search an MLP on 80 % of the training rows that most resemble the
# test samples left unclassified; the rest is kept in x_val / y_val.
x_learn, x_val, y_learn, y_val =  train_test_split(X_train_reclass, y_train_reclass, train_size = 0.8)
mlpReclass = MLPClassifier()
# Candidate architectures: a first hidden layer of varying width followed by
# a second hidden layer of 2 units.
layers =[(100,2),(110,2),(120,2),(130,2),(140,2),(150,2),(160,2),(170,2),(200,2)]
param_grid =  {'hidden_layer_sizes' : layers, 'alpha':np.logspace(-5, -1, 20)}
grid_search = GridSearchCV(mlpReclass, param_grid=param_grid, verbose=10, n_jobs=-1)
# Best parameters recorded from earlier runs (with other grids):
# 1st {'hidden_layer_sizes': 130, 'alpha': 0.038421052631578953}
# 2nd {'alpha': 1.6237767391887208e-05, 'hidden_layer_sizes': (160, 3)}
start = time()
grid_search.fit(x_learn, y_learn)
# The timer was started but never read; report the cost of the search.
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
print(grid_search.best_params_)

In [None]:
# MLP with the hyper-parameters kept from the grid search, scored on the
# held-out part of the re-classification set.
mlpReclass = MLPClassifier(alpha=0.038421052631578953, hidden_layer_sizes=130)
mlpReclass.fit(x_learn, y_learn)
# Despite its name, y_pred_train holds predictions for the validation split.
y_pred_train = mlpReclass.predict(x_val)
# compute_pred_score multiplies its two arguments element-wise, so their
# order does not affect the result.
score = compute_pred_score(y_val, y_pred_train)
print('Score sur la validation : %s' % score) #0.845

In [None]:
# Class probabilities on the validation split, computed once; both columns
# are drawn as 10-bin histograms on the same axes.
proba_val = mlpReclass.predict_proba(x_val)
proba_class_neg = proba_val[:, 0]
proba_class_pos = proba_val[:, 1]
n, bins, patches = plt.hist(proba_class_neg, bins=10, facecolor='red')
n, bins, patches = plt.hist(proba_class_pos, bins=10, facecolor='blue')


In [None]:
# y_pred holds TEST-set predictions, so the uncertainty mask must be built
# from probabilities of the test samples. The probabilities left over from
# the previous cell belong to the validation split (x_val) and their
# positions do not correspond to entries of y_pred.
proba_test_neg = mlpReclass.predict_proba(X_test)[:, 0]
# Uncertain = first-class probability strictly inside (0.05, 0.95).
# A lower bound of 0. would also flag every confident opposite-class
# prediction; 0.05 matches the band recorded in the notes below.
undefined_reclass = np.transpose(np.where(np.logical_and(0.05 < proba_test_neg, proba_test_neg < 0.95)))
# Leaderboard notes from earlier runs: band 0.1 / 0.9 -> 0.1833, band 0.05 / 0.95 -> 0.184792843691
y_pred[undefined_reclass] = 0
# Number of samples left as 0, then predicted -1, then predicted 1
print(len(y_pred[undefined_reclass]))
print(len(y_pred[y_pred == -1]))
print(len(y_pred[y_pred == 1]))
np.savetxt('y_pred.txt', y_pred, fmt='%d')

In [None]:
# Grid-search a k-nearest-neighbours classifier on 10 % of the training data.
x_learn, x_val, y_learn, y_val =  train_test_split(X_train, y_train, train_size = 0.1)
ks = range(3,8,1)          # candidate numbers of neighbours
leaves = range(25,45, 5)   # candidate ball-tree leaf sizes
ps= range(1,3, 1)          # Minkowski power: 1 and 2
weights = ['uniform','distance']
knn = KNeighborsClassifier(n_jobs=-1, algorithm='ball_tree')
param_grid = { 'leaf_size' : leaves, 'n_neighbors': ks,
             'p':ps,'weights' : weights}
grid_search = GridSearchCV(knn, param_grid=param_grid, verbose=10, n_jobs=-1)
start = time()
grid_search.fit(x_learn, y_learn)
# The timer was started but never read; report the cost of the search.
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
print(grid_search.best_params_) # {'n_neighbors': 6, 'weights': 'distance', 'p': 2, 'leaf_size': 25}

Généralisons l'approche en repartant du dataset d'entrainement initial

In [None]:
# For every test sample, position of its nearest training row.
dist_argmin = pairwise_distances_argmin(X_test, X_train)
# Keep at most 10 % of the training-set size. The previous expression
# len(X_train * 0.1) scaled the values of the frame, not its length, so it
# was simply len(X_train) and nothing was truncated to 10 %.
n_reclass = int(0.1 * len(X_train))
X_train_reclass = X_train.iloc[dist_argmin][:n_reclass]
y_train_reclass = y_train[dist_argmin][:n_reclass]

In [None]:
# Grid-search the Adam parameters (beta_1, beta_2) and alpha of a
# 170-unit MLP on 20 % of the nearest-neighbour training set.
x_learn, x_val, y_learn, y_val =  train_test_split(X_train_reclass, y_train_reclass, train_size = 0.2)
mlpPairwise = MLPClassifier(hidden_layer_sizes=170, activation='relu', solver = 'adam')
param_grid =  {'beta_1': np.linspace(0.25,0.5,5), 'beta_2': np.linspace(0.996,0.999,5),
               'alpha': np.linspace(0.0002,0.0004, 5)}
grid_search = GridSearchCV(mlpPairwise, param_grid=param_grid, verbose=10, n_jobs=-1)
# Results recorded from earlier runs:
# 1 :{'beta_2': 0.99770006382255327, 'alpha': 0.00031622776601683794, 'beta_1': 0.31586390484234717}
# 2 : adam
# 3 beta1, beta2 : nothing
# 4 {'beta_2': 0.99750000000000005, 'alpha': 0.00020000000000000001, 'beta_1': 0.375}
start = time()
grid_search.fit(x_learn, y_learn)
# The timer was started but never read; report the cost of the search.
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
print(grid_search.best_params_)

In [None]:
# New 80 / 20 split of the nearest-neighbour training set, then a validation
# score for the MLP configured with the values kept from the grid search.
x_learn, x_val, y_learn, y_val = train_test_split(
    X_train_reclass, y_train_reclass, train_size=0.8)
adam_settings = dict(solver='adam', beta_1=0.375, beta_2=0.9975)
mlpPairwise = MLPClassifier(hidden_layer_sizes=170, activation='relu',
                            alpha=0.0002, **adam_settings)
mlpPairwise.fit(x_learn, y_learn)
# Despite its name, y_pred_train holds predictions for the validation split.
y_pred_train = mlpPairwise.predict(x_val)
# compute_pred_score multiplies its two arguments element-wise, so their
# order does not affect the result.
score = compute_pred_score(y_val, y_pred_train)
print('Score sur la validation : %s' % score)

In [None]:
# Refit on the whole nearest-neighbour training set and predict the test set.
mlpPairwise.fit(X_train_reclass, y_train_reclass)
y_pred = mlpPairwise.predict(X_test)
# Class probabilities of THIS model on the test set, computed once.
proba_test = mlpPairwise.predict_proba(X_test)
proba_class_neg = proba_test[:, 0]
proba_class_pos = proba_test[:, 1]
# The uncertain samples must be derived from the probabilities just computed.
# The index set produced by an earlier cell came from another model and other
# samples, so it does not describe these predictions.
# NOTE(review): the 0.05 / 0.95 band follows the thresholds noted earlier in
# the notebook - confirm the intended values.
undefined_pairwise = np.flatnonzero((proba_class_neg > 0.05) & (proba_class_neg < 0.95))
y_pred[undefined_pairwise] = 0
np.savetxt('y_pred.txt', y_pred, fmt='%d')# 0.26 was recorded before the uncertainty mask was recomputed

In [None]:
# Grid-search C and gamma of an RBF SVC on 30 % of the nearest-neighbour
# training set.
x_learn, x_val, y_learn, y_val =  train_test_split(X_train_reclass, y_train_reclass, train_size = 0.3)
svc = SVC(kernel = 'rbf')
param_grid = { 'C' : np.linspace(2.5,2.8,20), 'gamma': np.linspace(0.6,0.7,10)}
grid_search = GridSearchCV(svc, param_grid=param_grid, verbose=10, n_jobs=-1)
# Best parameters recorded from earlier runs:
# 1 {'gamma': 0.66666666666666663, 'kernel': 'rbf', 'C': 2.6366508987303581}
# 2 {'gamma': 0.65555555555555556, 'C': 2.5}
start = time()
grid_search.fit(x_learn, y_learn)
# The timer was started but never read; report the cost of the search.
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
print(grid_search.best_params_)

In [None]:
x_learn, x_val, y_learn, y_val =  train_test_split(X_train_reclass, y_train_reclass, train_size = 0.88)

svc = SVC(kernel = 'rbf', gamma=0.656, C = 2.5)
# Validation: fit on the learning part only. Fitting on the full set first
# would score the model on rows it has already seen (x_val is a subset of
# X_train_reclass) and give an optimistic score.
svc.fit(x_learn, y_learn)
# Despite its name, y_pred_train holds predictions for the validation split.
y_pred_train =  svc.predict(x_val)
score = compute_pred_score(y_val, y_pred_train)
print('Score sur la validation : %s' % score)
# Submission: refit on every available row before predicting the test set.
svc.fit(X_train_reclass, y_train_reclass)
y_pred = svc.predict(X_test)
np.savetxt('y_pred.txt', y_pred, fmt='%d')#0.1977

In [None]:
# Grid-search a distance-weighted k-NN on half of the nearest-neighbour
# training set.
x_learn, x_val, y_learn, y_val =  train_test_split(X_train_reclass, y_train_reclass, train_size = 0.5)

knn = KNeighborsClassifier(n_jobs=-1, algorithm='ball_tree', weights='distance')
param_grid = { 'leaf_size' : range(1,10,1), 'n_neighbors': range(3,6,1),'p': [1,2]}
grid_search = GridSearchCV(knn, param_grid=param_grid, verbose=10, n_jobs=-1)
start = time()
grid_search.fit(x_learn, y_learn)
# The timer was started but never read; report the cost of the search.
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))
# The result formerly noted here ({'n_neighbors': 6, ..., 'leaf_size': 25})
# cannot come from this grid (n_neighbors in 3..5, leaf_size in 1..9); it
# belongs to the earlier k-NN search on the full training set.
print(grid_search.best_params_)

In [None]:
# NOTE(review): n_neighbors=6 and leaf_size=25 lie outside the grid searched
# on this data set - confirm which search these values come from.
knn =KNeighborsClassifier(n_neighbors=6, p=2, leaf_size=25)
# Validation: fit on the learning part only. Fitting on the full set first
# would score the model on rows it has already seen (x_val is a subset of
# X_train_reclass) and give an optimistic score.
knn.fit(x_learn, y_learn)
# Despite its name, y_pred_train holds predictions for the validation split.
y_pred_train =  knn.predict(x_val)
score = compute_pred_score(y_val, y_pred_train)
print('Score sur la validation : %s' % score)
# Submission: refit on every available row before predicting the test set.
knn.fit(X_train_reclass, y_train_reclass)
y_pred = knn.predict(X_test)
np.savetxt('y_pred.txt', y_pred, fmt='%d')#0.1977