In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer


def dcg_score(y_true, y_score, k=5):
    """Compute the discounted cumulative gain (DCG) of the top ``k`` items.

    Items are ranked by descending ``y_score``; the relevance of the item at
    zero-based rank ``i`` contributes ``(2 ** relevance - 1) / log2(i + 2)``.

    Parameters
    ----------
    y_true : array-like
        Relevance label of each item (indexed with the 1-D ranking derived
        from ``y_score``).
    y_score : array-like
        Predicted score of each item; higher scores rank first.
    k : int
        Number of top-ranked items that contribute to the sum.

    Returns
    -------
    score : float
        Sum of the discounted gains of the ``k`` highest-scored items.
    """
    # Indices of the items from highest to lowest predicted score.
    ranking = np.argsort(y_score)[::-1]
    top_relevance = np.take(y_true, ranking[:k])

    # Exponential gain: an item with relevance 0 contributes nothing.
    gains = 2 ** top_relevance - 1

    # Position i (0-based) is discounted by log2(i + 2), so rank 0 divides by 1.
    positions = np.arange(len(top_relevance))
    discounts = np.log2(positions + 2)

    return np.sum(gains / discounts)


def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Each sample's true label is one-hot encoded, the DCG of the predicted
    ranking is divided by the DCG of the ideal ranking, and the per-sample
    ratios are averaged.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represented as integers in
        ``[0, n_classes)``).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities; column ``j`` scores class ``j``.
    k : int
        Rank.

    Returns
    -------
    score : float
        Mean NDCG@k over all samples.

    Raises
    ------
    ValueError
        If ``predictions`` is not two-dimensional.

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> float(ndcg_score(ground_truth, predictions, k=2))
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> float(ndcg_score(ground_truth, predictions, k=2))
    0.6666666666666666
    """
    predictions = np.asarray(predictions)
    if predictions.ndim != 2:
        raise ValueError(
            "predictions must have shape [n_samples, n_classes], got shape %r"
            % (predictions.shape,)
        )

    # The label space is defined by the prediction columns, not by the number
    # of samples.  One extra class is kept so that LabelBinarizer stays in
    # multiclass (one column per class) mode even when n_classes == 2.
    n_classes = predictions.shape[1]
    lb = LabelBinarizer()
    lb.fit(range(n_classes + 1))
    T = lb.transform(ground_truth)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        # An ideal DCG of zero (label outside the class range, or k == 0)
        # means nothing relevant can be ranked; score it 0 instead of
        # dividing by zero.
        scores.append(float(actual) / float(best) if best else 0.0)

    return np.mean(scores)


# Wrap NDCG@5 as a scikit-learn scorer that is fed predicted probabilities.
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

import sklearn.grid_search as gs

# Candidate values for each hyper-parameter; a one-element list pins a setting.
max_depth_values = [5, 6, 7]
learning_rate_values = [0.1, 0.15]
subsample_values = [0.7]
colsample_bytree_values = [0.7]
n_estimators = [100]  # 200 is noted as a further candidate but is disabled

# Search grid (3 x 2 x 1 x 1 x 1 = 6 combinations).
# 'gamma' and 'min_child_weight' are currently left out of the search.
params = dict(
    max_depth=max_depth_values,
    learning_rate=learning_rate_values,
    subsample=subsample_values,
    colsample_bytree=colsample_bytree_values,
    n_estimators=n_estimators,
)

# Exhaustive search over the grid, scored by NDCG@5 with cv=5.
clf = gs.GridSearchCV(model2, params, scoring=ndcg_scorer, cv=5)
clf.fit(train_xvec, train_ytrans)

# Last expression of the cell: display the per-combination search scores.
clf.grid_scores_

In [None]:
# Display the parameter combination the fitted grid search selected as best.
clf.best_params_

In [None]:
# Predicted class probabilities for the test features, obtained through the
# fitted grid-search object.
pred2 = clf.predict_proba(test_xvec)
# Bare expression so the notebook cell displays the predictions.
pred2

In [None]:
test_id = test_m['id']

# Build the long-format submission: for every test id, its (up to) five most
# probable countries, ordered from most to least likely.
ids = []  # list of ids, each repeated once per predicted country
cts = []  # list of countries, aligned element-for-element with ids

# Pair ids and probability rows by position.  np.asarray() drops any
# label-based index test_id may carry (presumably a pandas Series -- confirm),
# so the pairing stays correct even when that index is not 0..n-1.
for idx, probs in zip(np.asarray(test_id), pred2):
    # Encoded classes sorted by descending probability, truncated to the top 5
    # before decoding them back to country labels.
    top_countries = le.inverse_transform(np.argsort(probs)[::-1][:5]).tolist()
    # Repeat the id once per country actually returned so both lists always
    # have the same length, even with fewer than five classes.
    ids.extend([idx] * len(top_countries))
    cts.extend(top_countries)

In [None]:
# Assemble the two aligned lists into a two-column frame (id, country).
sub1 = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
# Preview the first rows; head must be *called* -- printing the bare attribute
# shows the bound method instead of the preview.  The parenthesised form works
# under both the Python 2 print statement and the Python 3 print function.
print(sub1.head())
# Write the submission file without the frame's index column.
sub1.to_csv('sub1.csv',index=False)