In [1]:
import xgboost as xbg
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import pickle
import joblib
from tqdm import tqdm

%matplotlib inline

In [2]:
def load_all(fname):
    """Load and return the object stored in the joblib file ``fname``.

    Args:
        fname: Path handed straight to ``joblib.load``.

    Returns:
        Whatever object ``joblib.load`` reconstructs from the file.
    """
    loaded = joblib.load(fname)
    return loaded

# Model inputs for the training and validation splits, read from joblib dumps
# in the working directory (train is loaded first, then validation).
X_train_all, X_val_all = (
    load_all("X_train_all.jblib"),
    load_all("X_val_all.jblib"),
)

In [3]:
def load_y(fname):
    """Load a pickled label collection and return it as a NumPy array.

    Args:
        fname: Path of the pickle file to read.

    Returns:
        ``np.ndarray`` built from the unpickled object.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only point this at files you produced yourself.
    # The context manager closes the handle even if unpickling fails (the
    # previous bare open() left it to the garbage collector).
    with open(fname, "rb") as handle:
        return np.array(pickle.load(handle))

# Target labels for the training and validation splits (pickled, converted to
# NumPy arrays by load_y).
y_train, y_val = (
    load_y("y_train.p"),
    load_y("y_val.p"),
)

In [4]:
from xgboost import XGBClassifier

# Only the estimator count, job count and log verbosity are set explicitly;
# every other hyperparameter keeps the library default (see the estimator repr
# printed after fitting).
model = XGBClassifier(
    n_estimators=100,
    n_jobs=-1,
    verbosity=2,
)

In [5]:
# Fit on the training split. The validation split is supplied as eval_set, so
# its error is logged after every round (the `validation_0-error` lines below).
model.fit(
    X_train_all,
    y_train,
    eval_set=[(X_val_all, y_val)],
    verbose=True,
)

[0]	validation_0-error:0.4328
[1]	validation_0-error:0.428733
[2]	validation_0-error:0.421067
[3]	validation_0-error:0.423733
[4]	validation_0-error:0.4168
[5]	validation_0-error:0.410667
[6]	validation_0-error:0.411933
[7]	validation_0-error:0.412733
[8]	validation_0-error:0.4022
[9]	validation_0-error:0.400333
[10]	validation_0-error:0.4004
[11]	validation_0-error:0.3972
[12]	validation_0-error:0.3958
[13]	validation_0-error:0.394533
[14]	validation_0-error:0.396333
[15]	validation_0-error:0.3954
[16]	validation_0-error:0.393333
[17]	validation_0-error:0.3902
[18]	validation_0-error:0.388133
[19]	validation_0-error:0.39
[20]	validation_0-error:0.390467
[21]	validation_0-error:0.3908
[22]	validation_0-error:0.3858
[23]	validation_0-error:0.3848
[24]	validation_0-error:0.3868
[25]	validation_0-error:0.3864
[26]	validation_0-error:0.382067
[27]	validation_0-error:0.383467
[28]	validation_0-error:0.382267
[29]	validation_0-error:0.383533
[30]	validation_0-error:0.380467
[31]	validation_0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1, verbosity=2)

In [6]:
from sklearn.metrics import accuracy_score

In [7]:
# Class predictions of the fitted model on the validation split.
preds_val = model.predict(X_val_all)

In [8]:
# Validation accuracy: accuracy_score compares the true labels with the
# predictions; the result is printed as a percentage with two decimals.
accuracy = accuracy_score(y_val, preds_val)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 64.35%


In [9]:
# Held-out test split: inputs from the joblib dump, labels from the pickle.
X_test_all, y_test = (
    load_all("X_test_all.jblib"),
    load_y("y_test.p"),
)

In [10]:
# Same evaluation as for the validation split, now on the test split.
preds_test = model.predict(X_test_all)
# NOTE: this rebinds `accuracy`, so the validation value computed earlier is
# no longer available under that name.
accuracy = accuracy_score(y_test, preds_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 63.76%


In [17]:
# Per-class probabilities for the test split (two columns per row, as the
# array displayed in the next cell shows); column 1 is used for ranking below.
preds_test_probs = model.predict_proba(X_test_all)

In [18]:
# Quick visual check of the probability array (NumPy elides the middle rows).
preds_test_probs

array([[0.5215562 , 0.4784438 ],
       [0.7793901 , 0.22060989],
       [0.5297859 , 0.4702141 ],
       ...,
       [0.43529493, 0.5647051 ],
       [0.43529493, 0.5647051 ],
       [0.43529493, 0.5647051 ]], dtype=float32)

In [19]:
def load_ranking(fname):
    """Load the pickled ranking data stored in ``ranking_<fname>.p``.

    Args:
        fname: Middle part of the file name, e.g. ``"test"`` reads
            ``ranking_test.p`` relative to the current working directory.

    Returns:
        The unpickled object, unchanged.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only use it on files you produced yourself.
    # The context manager closes the handle even if unpickling fails (the
    # previous bare open() left it to the garbage collector).
    with open("ranking_"+fname+".p", "rb") as handle:
        return pickle.load(handle)

In [21]:
# The ranking pickle holds a pair of mappings keyed by query id. As used by
# mrrwrapper: qid2c[qid] is the (int-castable) index of the correct candidate
# and qid2indexmap[qid] lists the rows of the prediction array that belong to
# that query.
qid2c,qid2indexmap = load_ranking("test")

In [47]:
from scipy.stats import rankdata

def mrrs(out, labels):
    """Compute the mean reciprocal rank (MRR) of the correct candidates.

    Each row of ``out`` holds the scores of one query's candidates (higher is
    better). The candidates are ranked by descending score, tied scores share
    their average rank (``rankdata`` default), and the reciprocal of the rank
    obtained by the candidate at position ``labels[i]`` is averaged over all
    queries.

    Args:
        out: 2-D array-like of shape (n_queries, n_candidates) with scores.
        labels: Sequence of length n_queries; ``labels[i]`` is the column
            index of the correct candidate in row ``i``.

    Returns:
        The MRR as a float in (0, 1].

    Raises:
        ValueError: If ``labels`` is empty or its length differs from the
            number of rows in ``out``.
        IndexError: If a label is not a valid column index.
    """
    # asarray lets plain nested lists work too; negating a Python list row
    # (`row * -1`) would silently produce an empty list instead.
    scores = np.asarray(out, dtype=float)
    n_queries = len(labels)
    if n_queries == 0:
        raise ValueError("mrrs() needs at least one query to score")
    if len(scores) != n_queries:
        # zip() would silently drop the surplus while still dividing by
        # len(labels), yielding a wrong average.
        raise ValueError(
            "out has %d rows but %d labels were given" % (len(scores), n_queries)
        )

    reciprocal_sum = 0.0
    for label, row in zip(labels, scores):
        # Negate so that the highest score receives rank 1.
        ranks = rankdata(-row)
        reciprocal_sum += 1.0 / ranks[label]
    return reciprocal_sum / n_queries

def mrrwrapper(qid2c,qid2indexmap,preds_prob,n_candidates=6):
    """Group per-row scores by query and return their mean reciprocal rank.

    For every query id in ``qid2c`` the first ``n_candidates`` row indices
    listed in ``qid2indexmap[qid]`` are looked up in ``preds_prob`` and the
    value in column 1 of each row is used as that candidate's score. Queries
    with fewer than ``n_candidates`` rows are skipped. The collected score
    matrix and labels are then passed to ``mrrs``.

    Args:
        qid2c: Mapping from query id to the index of the correct candidate
            (converted with ``int``); it must be smaller than ``n_candidates``.
        qid2indexmap: Mapping from query id to an iterable of row indices
            into ``preds_prob``.
        preds_prob: Indexable as ``preds_prob[ix][1]``, e.g. the 2-column
            output of ``predict_proba``.
        n_candidates: Number of candidates scored per query. Defaults to 6,
            the value that used to be hard-coded.

    Returns:
        Whatever ``mrrs`` returns for the collected scores and labels.

    Raises:
        ValueError: If ``n_candidates`` is smaller than 1.
    """
    if n_candidates < 1:
        raise ValueError("n_candidates must be at least 1")

    labels = []
    out = []
    for qid, correct in qid2c.items():
        scores = []
        for ix in qid2indexmap[qid]:
            if len(scores) == n_candidates:
                # Enough candidates collected; surplus rows are ignored.
                break
            scores.append(preds_prob[ix][1])
        if len(scores) < n_candidates:
            # Incomplete query: skipping keeps the score matrix rectangular.
            continue
        out.append(scores)
        labels.append(int(correct))
    return mrrs(np.array(out),labels)

In [48]:
mrrwrapper(qid2c,qid2indexmap,preds_test_probs)

0.6008222222222246