In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import tree
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer
import warnings
warnings.filterwarnings("ignore")

## Load Preprocessed features and labels

In [2]:
# Preprocessed feature table for train and test together.
# NOTE(review): the slicing cell below assumes the first len(labels) rows are
# the training rows -- confirm against the preprocessing notebook.
df_all = pd.read_csv('./input/TrainTest_Preprocess.csv')
# Target labels for the training rows (a single column).
labels = pd.read_csv('./input/label.csv')

In [4]:
# Sanity check: report the (rows, columns) of the feature table and the labels.
print('Train+Test - Shape', df_all.shape)
print('Label - Shape', labels.shape)

Train+Test - Shape (275547, 18)
Label - Shape (213451, 1)


In [6]:
# Cut the stacked feature matrix into its two parts: the first len(labels)
# rows are used as the labelled training set, everything after as the test set.
vals = df_all.values
Numtrain = len(labels)
X_train, X_test = vals[:Numtrain], vals[Numtrain:]
print('X train shape',X_train.shape)
print('X test shape', X_test.shape)

X train shape (213451, 18)
X test shape (62096, 18)


## Evaluation

In [7]:
# Reference Kaggle
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain of one ranking, truncated at rank k.

    Parameters
    ----------
    y_true : array-like
        Relevance of every candidate item.
    y_score : array-like
        Predicted score of every candidate item (same order as `y_true`).
    k : int, default 5
        Number of top-scored items that contribute to the result.

    Returns
    -------
    float
        Sum over the k best-scored items of ``(2**rel - 1) / log2(pos + 2)``
        with ``pos`` the 0-based position in the ranking.
    """
    # Indices of the k highest scores, best first.
    ranked = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, ranked)

    positions = np.arange(len(relevance))
    return np.sum((2 ** relevance - 1) / np.log2(positions + 2))


def ndcg_score(te_labels, predict, k=5):
    """Mean NDCG@k for single-label, multi-class score predictions.

    Every sample has exactly one relevant class, so its ideal DCG is 1 and its
    NDCG reduces to ``1 / log2(rank + 2)``, where ``rank`` is the 0-based
    position of the true class among the ``k`` highest-scored classes. A sample
    whose true class is not among those ``k`` classes scores 0.

    Parameters
    ----------
    te_labels : array-like, shape (n_samples,) or (n_samples, 1)
        True class of every sample, given as the column index of that class
        in `predict`.
    predict : array-like, shape (n_samples, n_classes)
        Predicted score (e.g. probability) of every class for every sample.
    k : int, default 5
        Number of top-ranked classes taken into account.

    Returns
    -------
    float
        Average NDCG@k over all samples.

    Raises
    ------
    ValueError
        If `predict` is not 2-D or its row count differs from the number of
        labels.
    """
    scores = np.asarray(predict)
    truth = np.asarray(te_labels).ravel()

    if scores.ndim != 2:
        raise ValueError("predict must be 2-D (n_samples, n_classes)")
    if truth.shape[0] != scores.shape[0]:
        raise ValueError("te_labels and predict must describe the same number of samples")

    # Rank classes per sample, best first, and keep the top k. Comparing the
    # labels against these indices avoids one-hot encoding them: a binarizer
    # fitted on range(len(predict) + 1) allocates an
    # (n_samples, n_samples + 1) matrix, which is quadratic in the fold size.
    top_k = np.argsort(scores, axis=1)[:, ::-1][:, :k]
    hits = top_k == truth[:, np.newaxis]

    discounts = np.log2(np.arange(top_k.shape[1]) + 2)
    per_sample = np.sum(hits / discounts, axis=1)
    return np.mean(per_sample)


# scikit-learn scorer built on ndcg_score: it is handed predicted class
# probabilities (needs_proba=True) and always evaluates the top 5 classes.
# NOTE(review): newer scikit-learn releases replace `needs_proba` with
# `response_method="predict_proba"` -- confirm against the installed version.
ndcg_scorer = make_scorer(
    ndcg_score,
    needs_proba=True,
    k=5,
)

In [8]:
def folds_to_split(data, targets, train, test):
    """Slice features and targets into the train/test parts of one CV fold.

    Parameters
    ----------
    data : array-like
        Feature matrix; wrapped in a DataFrame before slicing.
    targets : array-like
        Target values aligned with `data`; wrapped in a DataFrame too.
    train, test : sequence of int
        Positional row indices of the training and the held-out rows.

    Returns
    -------
    list of pandas.DataFrame
        ``[data_tr, data_te, labels_tr, labels_te]``.
    """
    features = pd.DataFrame(data)
    target_frame = pd.DataFrame(targets)
    return [
        features.iloc[train],
        features.iloc[test],
        target_frame.iloc[train],
        target_frame.iloc[test],
    ]

## Model: Decision Tree

## Different criterion

In [9]:
from sklearn.model_selection import KFold

# 5-fold cross-validated nDCG@5 of an unconstrained decision tree (entropy).
criterion='entropy'

foldnum = 0
fold_results = pd.DataFrame()
# Contiguous, unshuffled folds. random_state is left out on purpose: it has no
# effect without shuffle=True and newer scikit-learn rejects the combination.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    DT =  tree.DecisionTreeClassifier(criterion=criterion)
    DT.fit(tr_data, tr_labels.values.ravel())
    prob_arr_DT = DT.predict_proba(te_data)
    # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
    score_DT = ndcg_score(te_labels.values, prob_arr_DT, k=5)

    foldnum+=1
    print(foldnum, "-fold")
    fold_results.loc[foldnum, 'nDCG_DT'] = score_DT
    print(score_DT)

print("5-fold avg nDCG:",fold_results.mean())

1 -fold
0.6203388142234675
2 -fold
0.6513505338053214
3 -fold
0.6822330726439281
4 -fold
0.6859020187168331
5 -fold
0.6743336612874675
5-fold avg nDCG: nDCG_DT    0.662832
dtype: float64


In [10]:
# 5-fold cross-validated nDCG@5 of an unconstrained decision tree (gini).
criterion='gini'

foldnum = 0
fold_results = pd.DataFrame()
# Contiguous, unshuffled folds. random_state is left out on purpose: it has no
# effect without shuffle=True and newer scikit-learn rejects the combination.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    DT =  tree.DecisionTreeClassifier(criterion=criterion)
    DT.fit(tr_data, tr_labels.values.ravel())
    prob_arr_DT = DT.predict_proba(te_data)
    # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
    score_DT = ndcg_score(te_labels.values, prob_arr_DT, k=5)

    foldnum+=1
    print(foldnum, "-fold")
    fold_results.loc[foldnum, 'nDCG_DT'] = score_DT
    print(score_DT)

print("5-fold avg nDCG:",fold_results.mean())

1 -fold
0.6225225832372667
2 -fold
0.6562259599869823
3 -fold
0.6844825593547923
4 -fold
0.6774899793418753
5 -fold
0.6782749821657894
5-fold avg nDCG: nDCG_DT    0.663799
dtype: float64


## Different max_depth

In [11]:
# Compare max_depth in {3, 4, 5} for a gini decision tree with 5-fold nDCG@5.
criterion='gini'

foldnum = 0
fold_results = pd.DataFrame()
# Contiguous, unshuffled folds. random_state is left out on purpose: it has no
# effect without shuffle=True and newer scikit-learn rejects the combination.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    for depth in range(3,6):
        DT =  tree.DecisionTreeClassifier(criterion=criterion,max_depth = depth)
        DT.fit(tr_data, tr_labels.values.ravel())
        prob_arr_DT = DT.predict_proba(te_data)
        # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
        score_DT = ndcg_score(te_labels.values, prob_arr_DT, k=5)

        # One column per candidate depth, one row per fold.
        fold_results.loc[foldnum, 'nDCG_DT-'+str(depth) ] = score_DT
        print("Max depth %d ; Score %f" %  (depth, score_DT))

    print("-------------------------------")

print("5-fold avg nDCG:",fold_results.mean())


1 -fold
Max depth 3 ; Score 0.771381
Max depth 4 ; Score 0.782531
Max depth 5 ; Score 0.782172
-------------------------------
2 -fold
Max depth 3 ; Score 0.810472
Max depth 4 ; Score 0.812333
Max depth 5 ; Score 0.812603
-------------------------------
3 -fold
Max depth 3 ; Score 0.832128
Max depth 4 ; Score 0.833548
Max depth 5 ; Score 0.833871
-------------------------------
4 -fold
Max depth 3 ; Score 0.843445
Max depth 4 ; Score 0.845496
Max depth 5 ; Score 0.845626
-------------------------------
5 -fold
Max depth 3 ; Score 0.832072
Max depth 4 ; Score 0.834231
Max depth 5 ; Score 0.834570
-------------------------------
5-fold avg nDCG: nDCG_DT-3    0.817900
nDCG_DT-4    0.821628
nDCG_DT-5    0.821768
dtype: float64


### =>The deeper the max depth, the better the nDCG score. 

## Different min_samples_leaf

In [12]:
# Compare min_samples_leaf values for a depth-5 gini tree with 5-fold nDCG@5.
criterion='gini'
max_depth=5

foldnum = 0
fold_results = pd.DataFrame()
# Contiguous, unshuffled folds. random_state is left out on purpose: it has no
# effect without shuffle=True and newer scikit-learn rejects the combination.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    for min_leaf in [5,10,20,40]:
        DT =  tree.DecisionTreeClassifier(criterion=criterion,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_leaf)

        DT.fit(tr_data, tr_labels.values.ravel())
        prob_arr_DT = DT.predict_proba(te_data)
        # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
        score_DT = ndcg_score(te_labels.values, prob_arr_DT, k=5)

        # One column per candidate value, one row per fold.
        fold_results.loc[foldnum, 'nDCG_DT-'+str(min_leaf) ] = score_DT
        print("Min samples leaf %d ; Score %f" %  (min_leaf, score_DT))

    print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
Min samples leaf 5 ; Score 0.782172
Min samples leaf 10 ; Score 0.782172
Min samples leaf 20 ; Score 0.782172
Min samples leaf 40 ; Score 0.782078
-------------------------------
2 -fold
Min samples leaf 5 ; Score 0.812603
Min samples leaf 10 ; Score 0.812603
Min samples leaf 20 ; Score 0.812603
Min samples leaf 40 ; Score 0.812598
-------------------------------
3 -fold
Min samples leaf 5 ; Score 0.833871
Min samples leaf 10 ; Score 0.833871
Min samples leaf 20 ; Score 0.833871
Min samples leaf 40 ; Score 0.833871
-------------------------------
4 -fold
Min samples leaf 5 ; Score 0.845626
Min samples leaf 10 ; Score 0.845626
Min samples leaf 20 ; Score 0.845626
Min samples leaf 40 ; Score 0.845565
-------------------------------
5 -fold
Min samples leaf 5 ; Score 0.834570
Min samples leaf 10 ; Score 0.834580
Min samples leaf 20 ; Score 0.834580
Min samples leaf 40 ; Score 0.834580
-------------------------------
5-fold avg nDCG:
 nDCG_DT-5     0.821768
nDCG_DT-10    0.821770
n

### => min_samples_leaf has very little effect on the score.

## Different max_leaf_nodes

In [14]:
# Compare max_leaf_nodes values for a depth-5 gini tree with 5-fold nDCG@5.
criterion='gini'
max_depth=5
min_samples_leaf=10

foldnum = 0
fold_results = pd.DataFrame()
# Contiguous, unshuffled folds. random_state is left out on purpose: it has no
# effect without shuffle=True and newer scikit-learn rejects the combination.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    for max_leaf in [5,10,15]:
        DT =  tree.DecisionTreeClassifier(criterion=criterion,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          max_leaf_nodes=max_leaf)

        DT.fit(tr_data, tr_labels.values.ravel())
        prob_arr_DT = DT.predict_proba(te_data)
        # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
        score_DT = ndcg_score(te_labels.values, prob_arr_DT, k=5)

        # One column per candidate value, one row per fold.
        fold_results.loc[foldnum, 'nDCG_DT-'+str(max_leaf) ] = score_DT
        print("Max samples nodes %d ; Score %f" %  (max_leaf, score_DT))

    print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
Max samples nodes 5 ; Score 0.771725
Max samples nodes 10 ; Score 0.771531
Max samples nodes 15 ; Score 0.782658
-------------------------------
2 -fold
Max samples nodes 5 ; Score 0.808900
Max samples nodes 10 ; Score 0.812545
Max samples nodes 15 ; Score 0.812525
-------------------------------
3 -fold
Max samples nodes 5 ; Score 0.832440
Max samples nodes 10 ; Score 0.833537
Max samples nodes 15 ; Score 0.833547
-------------------------------
4 -fold
Max samples nodes 5 ; Score 0.845168
Max samples nodes 10 ; Score 0.845848
Max samples nodes 15 ; Score 0.846013
-------------------------------
5 -fold
Max samples nodes 5 ; Score 0.833968
Max samples nodes 10 ; Score 0.834345
Max samples nodes 15 ; Score 0.834460
-------------------------------
5-fold avg nDCG:
 nDCG_DT-5     0.818440
nDCG_DT-10    0.819561
nDCG_DT-15    0.821841
dtype: float64


### => The larger max_leaf_nodes is, the slightly better the score.

## parameters choosing

In [15]:
# 5-fold nDCG@5 of the candidate tree: depth 5, >=10 samples per leaf, <=15 leaves.
foldnum = 0
fold_results = pd.DataFrame()
# Contiguous, unshuffled folds. random_state is left out on purpose: it has no
# effect without shuffle=True and newer scikit-learn rejects the combination.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    DT =  tree.DecisionTreeClassifier(criterion='gini',
                                      max_depth=5,
                                      min_samples_leaf=10,
                                      max_leaf_nodes=15)

    DT.fit(tr_data, tr_labels.values.ravel())
    prob_arr_DT = DT.predict_proba(te_data)
    # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
    score_DT = ndcg_score(te_labels.values, prob_arr_DT, k=5)

    fold_results.loc[foldnum, 'nDCG_DT' ] = score_DT
    print("Score %f" %  score_DT)

    print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
Score 0.782658
-------------------------------
2 -fold
Score 0.812525
-------------------------------
3 -fold
Score 0.833547
-------------------------------
4 -fold
Score 0.846013
-------------------------------
5 -fold
Score 0.834460
-------------------------------
5-fold avg nDCG:
 nDCG_DT    0.821841
dtype: float64


In [16]:
# 5-fold nDCG@5 of a slightly larger tree: depth 6, >=10 samples per leaf, <=20 leaves.
foldnum = 0
fold_results = pd.DataFrame()
# Contiguous, unshuffled folds. random_state is left out on purpose: it has no
# effect without shuffle=True and newer scikit-learn rejects the combination.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    DT =  tree.DecisionTreeClassifier(criterion='gini',
                                      max_depth=6,
                                      min_samples_leaf=10,
                                      max_leaf_nodes=20)

    DT.fit(tr_data, tr_labels.values.ravel())
    prob_arr_DT = DT.predict_proba(te_data)
    # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
    score_DT = ndcg_score(te_labels.values, prob_arr_DT, k=5)

    fold_results.loc[foldnum, 'nDCG_DT' ] = score_DT
    print("Score %f" %  score_DT)

    print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
Score 0.782275
-------------------------------
2 -fold
Score 0.811863
-------------------------------
3 -fold
Score 0.833844
-------------------------------
4 -fold
Score 0.845922
-------------------------------
5 -fold
Score 0.834405
-------------------------------
5-fold avg nDCG:
 nDCG_DT    0.821662
dtype: float64


### The best nDCG score reached with a decision tree: 0.8218

# Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

## Different max depth

In [18]:
# Compare max_depth in 3..8 for a 600-tree random forest with 5-fold nDCG@5.
foldnum = 0
fold_results = pd.DataFrame()
# Contiguous, unshuffled folds. random_state is left out on purpose: it has no
# effect without shuffle=True and newer scikit-learn rejects the combination.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):
    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    for max_depth in range(3, 9):
        # Fixed seed so the forest, and therefore the scores, are repeatable.
        RF = RandomForestClassifier(n_estimators=600,
                                    criterion='gini',
                                    max_depth=max_depth,
                                    random_state=1)

        RF.fit(tr_data, tr_labels.values.ravel())
        prob_arr_RF = RF.predict_proba(te_data)
        # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
        score_RF = ndcg_score(te_labels.values, prob_arr_RF, k=5)

        # One column per candidate depth, one row per fold.
        fold_results.loc[foldnum, 'nDCG_RF-'+str(max_depth) ] = score_RF
        print("Max depth %d ; Score %f" %  (max_depth, score_RF))

    print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
Max depth 3 ; Score 0.766815
Max depth 4 ; Score 0.768526
Max depth 5 ; Score 0.774529
Max depth 6 ; Score 0.775507
Max depth 7 ; Score 0.775842
Max depth 8 ; Score 0.777150
-------------------------------
2 -fold
Max depth 3 ; Score 0.798369
Max depth 4 ; Score 0.799777
Max depth 5 ; Score 0.806243
Max depth 6 ; Score 0.807341
Max depth 7 ; Score 0.808139
Max depth 8 ; Score 0.809334
-------------------------------
3 -fold
Max depth 3 ; Score 0.818011
Max depth 4 ; Score 0.819361
Max depth 5 ; Score 0.825170
Max depth 6 ; Score 0.828318
Max depth 7 ; Score 0.829053
Max depth 8 ; Score 0.829696
-------------------------------
4 -fold
Max depth 3 ; Score 0.828725
Max depth 4 ; Score 0.829728
Max depth 5 ; Score 0.836238
Max depth 6 ; Score 0.840118
Max depth 7 ; Score 0.841675
Max depth 8 ; Score 0.842445
-------------------------------
5 -fold
Max depth 3 ; Score 0.821882
Max depth 4 ; Score 0.823740
Max depth 5 ; Score 0.828461
Max depth 6 ; Score 0.830068
Max depth 7 ; Score 

### =>max_depth = 7 or 8 is much better.

## Different min samples leaf

In [20]:
# Compare min_samples_leaf values for a 600-tree random forest with 5-fold nDCG@5.
foldnum = 0
fold_results = pd.DataFrame()
# Build the splitter here so the cell does not rely on a `kf` left over from an
# earlier cell. Folds are contiguous and unshuffled, so no random_state is set.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):

    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    for min_leaf in [5,15,30, 50]:
        # Fixed seed so the forest, and therefore the scores, are repeatable.
        RF = RandomForestClassifier(n_estimators=600,
                                    criterion='gini',
                                    min_samples_leaf=min_leaf,
                                    random_state=1)

        RF.fit(tr_data, tr_labels.values.ravel())
        prob_arr_RF = RF.predict_proba(te_data)
        # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
        score_RF = ndcg_score(te_labels.values, prob_arr_RF, k=5)

        # One column per candidate value, one row per fold.
        fold_results.loc[foldnum, 'nDCG_RF-'+str(min_leaf) ] = score_RF
        print("Min leaf %d ; Score %f" %  (min_leaf, score_RF))

    print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())


1 -fold
Min leaf 5 ; Score 0.780115
Min leaf 15 ; Score 0.781864
Min leaf 30 ; Score 0.781953
Min leaf 50 ; Score 0.781664
-------------------------------
2 -fold
Min leaf 5 ; Score 0.811072
Min leaf 15 ; Score 0.812471
Min leaf 30 ; Score 0.812707
Min leaf 50 ; Score 0.813094
-------------------------------
3 -fold
Min leaf 5 ; Score 0.832718
Min leaf 15 ; Score 0.833914
Min leaf 30 ; Score 0.833992
Min leaf 50 ; Score 0.833900
-------------------------------
4 -fold
Min leaf 5 ; Score 0.843997
Min leaf 15 ; Score 0.845583
Min leaf 30 ; Score 0.845649
Min leaf 50 ; Score 0.845007
-------------------------------
5 -fold
Min leaf 5 ; Score 0.833119
Min leaf 15 ; Score 0.833703
Min leaf 30 ; Score 0.834331
Min leaf 50 ; Score 0.834028
-------------------------------
5-fold avg nDCG:
 nDCG_RF-5     0.820204
nDCG_RF-15    0.821507
nDCG_RF-30    0.821726
nDCG_RF-50    0.821539
dtype: float64


### => min_samples_leaf = 30 gives the best average score, although the differences between settings are small.

## Different max leaf nodes

In [21]:
# Compare max_leaf_nodes in {5, 7, 9, 11} for a random forest with
# max_depth=8 and min_samples_leaf=30, using 5-fold nDCG@5.
max_depth = 8
min_leaf = 30

foldnum = 0
fold_results = pd.DataFrame()
# Build the splitter here so the cell does not rely on a `kf` left over from an
# earlier cell. Folds are contiguous and unshuffled, so no random_state is set.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):

    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    for max_leaf_nodes in range(5,12,2):
        # Fixed seed so the forest, and therefore the scores, are repeatable.
        RF = RandomForestClassifier(n_estimators=600,
                                    criterion='gini',
                                    max_depth=max_depth,
                                    min_samples_leaf=min_leaf,
                                    max_leaf_nodes=max_leaf_nodes,
                                    random_state=1)

        RF.fit(tr_data, tr_labels.values.ravel())
        prob_arr_RF = RF.predict_proba(te_data)
        # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
        score_RF = ndcg_score(te_labels.values, prob_arr_RF, k=5)

        # One column per candidate value, one row per fold.
        fold_results.loc[foldnum, 'nDCG_RF-'+str(max_leaf_nodes) ] = score_RF
        print("Max leaf nodes %d ; Score %f" %  (max_leaf_nodes, score_RF))

    print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
Max leaf nodes 5 ; Score 0.766815
Max leaf nodes 7 ; Score 0.766815
Max leaf nodes 9 ; Score 0.766815
Max leaf nodes 11 ; Score 0.770886
-------------------------------
2 -fold
Max leaf nodes 5 ; Score 0.798396
Max leaf nodes 7 ; Score 0.798387
Max leaf nodes 9 ; Score 0.798360
Max leaf nodes 11 ; Score 0.803858
-------------------------------
3 -fold
Max leaf nodes 5 ; Score 0.818011
Max leaf nodes 7 ; Score 0.818011
Max leaf nodes 9 ; Score 0.818011
Max leaf nodes 11 ; Score 0.823768
-------------------------------
4 -fold
Max leaf nodes 5 ; Score 0.828725
Max leaf nodes 7 ; Score 0.828725
Max leaf nodes 9 ; Score 0.828733
Max leaf nodes 11 ; Score 0.830264
-------------------------------
5 -fold
Max leaf nodes 5 ; Score 0.821882
Max leaf nodes 7 ; Score 0.821882
Max leaf nodes 9 ; Score 0.822340
Max leaf nodes 11 ; Score 0.824847
-------------------------------
5-fold avg nDCG:
 nDCG_RF-5     0.806766
nDCG_RF-7     0.806764
nDCG_RF-9     0.806852
nDCG_RF-11    0.810725
dtype

### => Setting max_leaf_nodes decreases the score.

## parameters choosing

In [23]:
# 5-fold nDCG@5 of a random forest with max_depth=9 and min_samples_leaf=30.
foldnum = 0
fold_results = pd.DataFrame()
# Build the splitter here so the cell does not rely on a `kf` left over from an
# earlier cell. Folds are contiguous and unshuffled, so no random_state is set.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):

    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    # Fixed seed so the forest, and therefore the scores, are repeatable.
    RF = RandomForestClassifier(n_estimators=600,
                                criterion='gini',
                                max_depth=9,
                                min_samples_leaf=30,
                                random_state=1)

    RF.fit(tr_data, tr_labels.values.ravel())
    prob_arr_RF = RF.predict_proba(te_data)
    # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
    score_RF = ndcg_score(te_labels.values, prob_arr_RF, k=5)

    fold_results.loc[foldnum, 'nDCG_RF'] = score_RF
    print("Score %f" %  score_RF)

    print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
Score 0.778541
-------------------------------
2 -fold
Score 0.809750
-------------------------------
3 -fold
Score 0.830342
-------------------------------
4 -fold
Score 0.843567
-------------------------------
5 -fold
Score 0.831990
-------------------------------
5-fold avg nDCG:
 nDCG_RF    0.818838
dtype: float64


In [24]:
# 5-fold nDCG@5 of a random forest with min_samples_leaf=30 and no depth limit.
foldnum = 0
fold_results = pd.DataFrame()
# Build the splitter here so the cell does not rely on a `kf` left over from an
# earlier cell. Folds are contiguous and unshuffled, so no random_state is set.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):

    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    # Fixed seed so the forest, and therefore the scores, are repeatable.
    RF = RandomForestClassifier(n_estimators=600,
                                criterion='gini',
                                min_samples_leaf=30,
                                random_state=1)

    RF.fit(tr_data, tr_labels.values.ravel())
    prob_arr_RF = RF.predict_proba(te_data)
    # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
    score_RF = ndcg_score(te_labels.values, prob_arr_RF, k=5)

    fold_results.loc[foldnum, 'nDCG_RF'] = score_RF
    print("Score %f" %  score_RF)

    print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
Score 0.782007
-------------------------------
2 -fold
Score 0.813039
-------------------------------
3 -fold
Score 0.833896
-------------------------------
4 -fold
Score 0.845821
-------------------------------
5 -fold
Score 0.834231
-------------------------------
5-fold avg nDCG:
 nDCG_RF    0.821799
dtype: float64


### The best nDCG score reached with a random forest: 0.8218

# Gradient Boosting

In [25]:
from sklearn.ensemble import GradientBoostingClassifier

## Different depth

In [28]:
# Compare max_depth in {1, 2, 3, 4} for 100-stage gradient boosting with 5-fold nDCG@5.
foldnum = 0
fold_results = pd.DataFrame()
# Build the splitter here so the cell does not rely on a `kf` left over from an
# earlier cell. Folds are contiguous and unshuffled, so no random_state is set.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):

    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    for depth in [1, 2, 3, 4]:
        gra_bo_clf = GradientBoostingClassifier(max_depth=depth,
                                                n_estimators=100,
                                                random_state=1)

        gra_bo_clf.fit(tr_data, tr_labels.values.ravel())
        prob_arr_gra_bo = gra_bo_clf.predict_proba(te_data)

        # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
        score_gb = ndcg_score(te_labels.values, prob_arr_gra_bo, k=5)
        # One column per candidate depth, one row per fold.
        fold_results.loc[foldnum, 'nDCG_GRB' + str(depth)] = score_gb
        print("Depth:%d - nDCG score:%f" % (depth, score_gb))

        print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
Depth:1 - nDCG score:0.780297
-------------------------------
Depth:2 - nDCG score:0.785215
-------------------------------
Depth:3 - nDCG score:0.783353
-------------------------------
Depth:4 - nDCG score:0.780272
-------------------------------
2 -fold
Depth:1 - nDCG score:0.807223
-------------------------------
Depth:2 - nDCG score:0.812087
-------------------------------
Depth:3 - nDCG score:0.813787
-------------------------------
Depth:4 - nDCG score:0.813168
-------------------------------
3 -fold
Depth:1 - nDCG score:0.825931
-------------------------------
Depth:2 - nDCG score:0.831881
-------------------------------
Depth:3 - nDCG score:0.834086
-------------------------------
Depth:4 - nDCG score:0.834455
-------------------------------
4 -fold
Depth:1 - nDCG score:0.839392
-------------------------------
Depth:2 - nDCG score:0.844959
-------------------------------
Depth:3 - nDCG score:0.846099
-------------------------------
Depth:4 - nDCG score:0.843842
--------

## Different number of estimators

In [29]:
# Compare n_estimators in {100, ..., 500} for gradient boosting with 5-fold nDCG@5.
foldnum = 0
fold_results = pd.DataFrame()
# Build the splitter here so the cell does not rely on a `kf` left over from an
# earlier cell. Folds are contiguous and unshuffled, so no random_state is set.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):

    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    for num_est in [100, 200, 300, 400, 500]:
        gra_bo_clf = GradientBoostingClassifier(n_estimators=num_est,
                                                random_state=1)

        gra_bo_clf.fit(tr_data, tr_labels.values.ravel())
        prob_arr_gra_bo = gra_bo_clf.predict_proba(te_data)

        # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
        score_gb = ndcg_score(te_labels.values, prob_arr_gra_bo, k=5)
        # One column per candidate size, one row per fold.
        fold_results.loc[foldnum, 'nDCG_GRB' + str(num_est)] = score_gb
        print("Number of Estimators:%d - nDCG score:%f" % (num_est, score_gb))

        print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
Number of Estimators:100 - nDCG score:0.783353
-------------------------------
Number of Estimators:200 - nDCG score:0.782489
-------------------------------
Number of Estimators:300 - nDCG score:0.781306
-------------------------------
Number of Estimators:400 - nDCG score:0.780595
-------------------------------
Number of Estimators:500 - nDCG score:0.780812
-------------------------------
2 -fold
Number of Estimators:100 - nDCG score:0.813787
-------------------------------
Number of Estimators:200 - nDCG score:0.813489
-------------------------------
Number of Estimators:300 - nDCG score:0.813047
-------------------------------
Number of Estimators:400 - nDCG score:0.813119
-------------------------------
Number of Estimators:500 - nDCG score:0.812628
-------------------------------
3 -fold
Number of Estimators:100 - nDCG score:0.834086
-------------------------------
Number of Estimators:200 - nDCG score:0.834528
-------------------------------
Number of Estimators:300 - n

## Parameters choosing

In [30]:
# 5-fold nDCG@5 of the chosen gradient boosting model: max_depth=3, 100 stages.
foldnum = 0
fold_results = pd.DataFrame()
# Build the splitter here so the cell does not rely on a `kf` left over from an
# earlier cell. Folds are contiguous and unshuffled, so no random_state is set.
kf = KFold(n_splits=5)

for train, test in kf.split(X_train):

    [tr_data, te_data, tr_labels, te_labels] = folds_to_split(X_train, labels, train, test)

    foldnum+=1
    print(foldnum, "-fold")

    gra_bo_clf = GradientBoostingClassifier(max_depth=3,
                                            n_estimators=100,
                                            random_state=1)

    gra_bo_clf.fit(tr_data, tr_labels.values.ravel())
    prob_arr_gra_bo = gra_bo_clf.predict_proba(te_data)

    # .values instead of DataFrame.as_matrix(), which pandas 1.0 removed.
    score_gb = ndcg_score(te_labels.values, prob_arr_gra_bo, k=5)
    fold_results.loc[foldnum, 'nDCG_GRB'] = score_gb
    print("nDCG score:%f" % (score_gb))

    print("-------------------------------")

print("5-fold avg nDCG:\n",fold_results.mean())

1 -fold
nDCG score:0.783353
-------------------------------
2 -fold
nDCG score:0.813787
-------------------------------
3 -fold
nDCG score:0.834086
-------------------------------
4 -fold
nDCG score:0.846099
-------------------------------
5 -fold
nDCG score:0.834181
-------------------------------
5-fold avg nDCG:
 nDCG_GRB    0.822301
dtype: float64


### the best nDCG score can be reached using gradient boosting: 0.8223