In [1]:
import xgboost as xbg
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import pickle
import joblib

%matplotlib inline

In [2]:
def load_cls(fname):
    """Load a pickled object from ``fname`` and return it as a NumPy array.

    Args:
        fname: Path of the pickle file to read.

    Returns:
        ``np.array(obj)`` where ``obj`` is the unpickled content of the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file is not a valid pickle.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only point this at files you produced yourself or otherwise trust.
    # The context manager closes the handle even if unpickling fails.
    with open(fname, "rb") as fh:
        return np.array(pickle.load(fh))

In [17]:
# Load the pickled training and validation inputs as NumPy arrays.
# Presumably each entry is an already-tokenized document (the file names end
# in "_tokens") — TODO confirm against the script that produced the pickles.
X_train = load_cls("X_train_tokens.p")
X_val_cls = load_cls("X_val_tokens.p")

In [18]:
# Load the pickled training and validation targets as NumPy arrays.
# Presumably these are binary class labels aligned row-for-row with the
# corresponding inputs — TODO confirm.
y_train = load_cls("y_train.p")
y_val = load_cls("y_val.p")

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

def my_preprocessor(doc):
    """Return ``doc`` unchanged.

    Passed to ``CountVectorizer(preprocessor=...)`` so that the vectorizer
    applies no preprocessing of its own to the document.
    """
    return doc

# Pass-through tokenizer: returns the document exactly as received. No
# tokenizing or lemmatizing happens here (presumably the inputs are already
# sequences of tokens — TODO confirm against how the *_tokens.p files were built).
def my_tokenizer(doc):
    """Return ``doc`` unchanged; used as ``CountVectorizer(tokenizer=...)``."""
    return doc

# Build a plain bag-of-words matrix over the training documents. Both the
# preprocessor and the tokenizer are pass-through callables, so each document
# is consumed exactly as stored in X_train.
custom_vec = CountVectorizer(preprocessor=my_preprocessor, tokenizer=my_tokenizer)
cwm = custom_vec.fit_transform(X_train)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when the installed version provides it and
# fall back to the legacy method otherwise; `tokens` stays a list either way.
if hasattr(custom_vec, "get_feature_names_out"):
    tokens = custom_vec.get_feature_names_out().tolist()
else:
    tokens = custom_vec.get_feature_names()

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [37]:
# Feature extraction: n-gram counts followed by TF-IDF re-weighting.
# The count step uses the pass-through preprocessor/tokenizer, builds 1- to
# 3-grams, and keeps only terms whose document frequency lies between 0.25%
# and 25% of the training documents.
feature_pipeline = Pipeline(
    steps=[
        (
            'vect',
            CountVectorizer(
                preprocessor=my_preprocessor,
                tokenizer=my_tokenizer,
                ngram_range=(1, 3),
                min_df=.0025,
                max_df=0.25,
            ),
        ),
        ('tfidf', TfidfTransformer()),
    ]
)

In [38]:
# Fit the vocabulary and IDF weights on the training set only, then apply the
# already-fitted pipeline to the validation set (no refit on validation data).
X_train_f = feature_pipeline.fit_transform(X_train)
X_val_f =feature_pipeline.transform(X_val_cls)

In [41]:
# Gradient-boosted tree classifier: 300 boosting rounds on all CPU cores.
# eval_set is supplied only so the per-round validation metric is printed;
# no early-stopping argument is given, so every round is trained.
model = XGBClassifier(
    n_estimators=300,
    n_jobs=-1,
    verbosity=2,
)
model.fit(
    X_train_f,
    y_train,
    eval_set=[(X_val_f, y_val)],
    verbose=True,
)

[0]	validation_0-error:0.448933
[1]	validation_0-error:0.4486
[2]	validation_0-error:0.4462
[3]	validation_0-error:0.4446
[4]	validation_0-error:0.443733
[5]	validation_0-error:0.443333
[6]	validation_0-error:0.4394
[7]	validation_0-error:0.439467
[8]	validation_0-error:0.438867
[9]	validation_0-error:0.4392
[10]	validation_0-error:0.438867
[11]	validation_0-error:0.437867
[12]	validation_0-error:0.437267
[13]	validation_0-error:0.4292
[14]	validation_0-error:0.43
[15]	validation_0-error:0.427933
[16]	validation_0-error:0.428533
[17]	validation_0-error:0.428
[18]	validation_0-error:0.428333
[19]	validation_0-error:0.425733
[20]	validation_0-error:0.421467
[21]	validation_0-error:0.422667
[22]	validation_0-error:0.422
[23]	validation_0-error:0.421
[24]	validation_0-error:0.422133
[25]	validation_0-error:0.418667
[26]	validation_0-error:0.418533
[27]	validation_0-error:0.418533
[28]	validation_0-error:0.4192
[29]	validation_0-error:0.4186
[30]	validation_0-error:0.419333
[31]	validation_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1, verbosity=2)

In [42]:
from sklearn.metrics import accuracy_score
# Predict hard class labels for the validation set and report the share of
# correct predictions as a percentage.
preds_val = model.predict(X_val_f)
accuracy = accuracy_score(y_val, preds_val)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 60.92%


In [43]:
# Load the held-out test inputs and targets as NumPy arrays
# (same pickle format as the train/validation files loaded with load_cls).
X_test = load_cls("X_test_tokens.p")
y_test = load_cls("y_test.p")

In [44]:
# Apply the pipeline fitted on the training data to the test set (transform only).
X_test_f =feature_pipeline.transform(X_test)

In [45]:
# Predict hard class labels for the test set and report accuracy as a percentage.
# Note: this rebinds `accuracy`, replacing the validation value computed earlier.
preds_test = model.predict(X_test_f)
accuracy = accuracy_score(y_test, preds_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 59.37%


In [46]:
from scipy.stats import rankdata

def mrrs(out, labels):
    """Compute the mean reciprocal rank (MRR) of the labelled candidate per row.

    Each row of ``out`` holds the scores of the candidates for one query;
    a higher score means a better (earlier) rank. Tied scores share the
    average of the ranks they span (``scipy.stats.rankdata`` default).

    Args:
        out: 2-D array-like of scores, one row per query and one column per
            candidate.
        labels: Sequence of integer column indices; ``labels[i]`` is the
            position of the correct candidate within row ``i`` of ``out``.

    Returns:
        The mean over all queries of ``1 / rank`` of the correct candidate.

    Raises:
        ValueError: If ``labels`` is empty or its length differs from the
            number of rows in ``out``.
        IndexError: If a label is not a valid column index for its row.
    """
    # Coerce to an array so that negation below is element-wise even when a
    # plain list of lists is passed in.
    out = np.asarray(out, dtype=float)
    if len(labels) == 0:
        raise ValueError("mrrs() needs at least one query to compute an MRR")
    if len(out) != len(labels):
        raise ValueError(
            "out has %d rows but %d labels were given" % (len(out), len(labels))
        )

    total = 0.0
    for label, scores in zip(labels, out):
        # rankdata ranks ascending, so negate to give the top score rank 1.
        ranks = rankdata(-scores)
        total += 1.0 / ranks[label]
    return total / len(labels)

def mrrwrapper(qid2c, qid2indexmap, preds_prob, n_candidates=6):
    """Group per-row class probabilities by query and compute their MRR.

    For every query id, the class-1 probability (column 1 of ``preds_prob``)
    of its first ``n_candidates`` rows is used as that candidate's score.
    Queries with fewer than ``n_candidates`` rows are skipped; rows beyond
    the first ``n_candidates`` are ignored.

    Args:
        qid2c: Mapping from query id to the index of the correct candidate
            (any value accepted by ``int()``); it must be a valid position
            within the ``n_candidates`` scores of that query.
        qid2indexmap: Mapping from query id to an iterable of row indices
            into ``preds_prob``, in candidate order.
        preds_prob: Indexable of per-row class probabilities, where
            ``preds_prob[i][1]`` is the score used for row ``i``.
        n_candidates: Number of candidates scored per query (default 6).

    Returns:
        The mean reciprocal rank as computed by ``mrrs``.

    Raises:
        KeyError: If a query id in ``qid2c`` is missing from ``qid2indexmap``.
    """
    from itertools import islice

    labels = []
    out = []
    for qid, label in qid2c.items():
        # Take at most n_candidates row indices; no need to walk the rest.
        scores = [
            preds_prob[ix][1]
            for ix in islice(qid2indexmap[qid], n_candidates)
        ]
        if len(scores) < n_candidates:
            # Incomplete candidate list: leave this query out of the metric.
            continue
        out.append(scores)
        labels.append(int(label))
    return mrrs(np.array(out), labels)

def load_ranking(fname):
    """Load the pickled ranking data stored in ``ranking_<fname>.p``.

    Args:
        fname: Name fragment inserted between the ``ranking_`` prefix and
            the ``.p`` suffix; the path is resolved relative to the current
            working directory.

    Returns:
        The unpickled object, exactly as stored in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only use it on trusted files. The context manager guarantees the file
    # handle is closed once loading finishes or fails.
    with open("ranking_"+fname+".p", "rb") as fh:
        return pickle.load(fh)

In [47]:
# Per-row class probabilities for the test set; column 1 of each row is what
# mrrwrapper later reads as the candidate's ranking score.
preds_test_probs = model.predict_proba(X_test_f)

In [48]:
# Read "ranking_test.p" and unpack its two parts. As used by mrrwrapper:
# qid2c maps a query id to the index of its correct candidate, and
# qid2indexmap maps a query id to the row indices of its candidates in the
# test predictions (presumably aligned with X_test row order — TODO confirm).
qid2c,qid2indexmap = load_ranking("test")

In [49]:
# Mean reciprocal rank on the test queries; shown as the cell's output value.
mrrwrapper(qid2c,qid2indexmap,preds_test_probs)

0.5230778739778764