In [192]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
import re
from parse import parse
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression

In [193]:
train_data = pd.read_csv('relevance_train.csv', delimiter='\t', index_col=False)
test_data = pd.read_csv('relevance_test.csv')
queries = pd.read_csv('queries.csv', sep='\t', index_col=False).iloc[:-1]

In [194]:
queries

Unnamed: 0,QueryId,Query
0,1.0,what similarity laws must be obeyed when const...
1,2.0,what are the structural and aeroelastic proble...
2,4.0,what problems of heat conduction in composite ...
3,8.0,can a criterion be developed to show empirical...
4,9.0,what chemical kinetic system is applicable to ...
5,10.0,what theoretical and experimental guides do we...
6,12.0,is it possible to relate the available pressur...
7,13.0,what methods -dash exact or approximate -dash ...
8,15.0,papers on internal /slip flow/ heat transfer s...
9,18.0,are real-gas transport properties for air avai...


In [234]:
train_data

Unnamed: 0,QueryId,DocumentId,Relevance
0,1,184,2
1,1,29,2
2,1,31,2
3,1,12,3
4,1,51,3
5,1,102,3
6,1,13,4
7,1,14,4
8,1,15,4
9,1,57,2


In [196]:
test_data.DocumentId.max()

1400

In [197]:
with open('Documents.csv', 'r') as input_data:
    doc_data = input_data.readlines()
    
documents = []
curr_doc = []

id_re = re.compile('.Id \d+\n')
t_re, a_re, b_re, w_re = [re.compile('.' + x + '\n') for x in ['T', 'A', 'B', 'W']]

for index, curr_line in enumerate(doc_data):
    if id_re.match(curr_line):
        documents.append(curr_doc)
        curr_doc = [int(parse('.Id {}\n', curr_line).fixed[0])]
    elif t_re.match(curr_line) or a_re.match(curr_line) or b_re.match(curr_line) or w_re.match(curr_line):
        curr_doc.append('')
    else:
        curr_doc[-1] += curr_line.strip()  
        
documents = documents[1:]
bad_articles = np.arange(len(documents))[np.array([len(x) for x in documents]) != 5]
for i in bad_articles[::-1]:
    del documents[i]

documents = pd.DataFrame(data=documents, columns=['Id', 'T', 'A', 'B', 'W'])

In [198]:
documents

Unnamed: 0,Id,T,A,B,W
0,1,experimental investigation of the aerodynamics...,"brenckman,m.","j. ae. scs. 25, 1958, 324.",experimental investigation of the aerodynamics...
1,2,simple shear flow past a flat plate in an inco...,ting-yili,"department of aeronautical engineering, rensse...",simple shear flow past a flat plate in an inco...
2,3,the boundary layer in simple shear flow past a...,m. b. glauert,"department of mathematics, university of manch...",the boundary layer in simple shear flow past a...
3,4,approximate solutions of the incompressible la...,"yen,k.t.","j. ae. scs. 22, 1955, 728.",approximate solutions of the incompressible la...
4,5,one-dimensional transient heat conduction into...,"wasserman,b.","j. ae. scs. 24, 1957, 924.",one-dimensional transient heat conduction into...
5,6,one-dimensional transient heat flow in a multi...,"campbell,w.f.","j. ae. scs. 25, 1958, 340.",one-dimensional transient heat flow in a multi...
6,7,the effect of controlled three-dimensional rou...,"van driest,e.r. and mccauley,w.d.","j. ae. scs. 27, 1960, 261.",the effect of controlled three-dimensional rou...
7,8,measurements of the effect of two-dimensional ...,"klebanoff,p.s.","j. ae. scs. 22, 1955, 803.",measurements of the effect of two-dimensional ...
8,9,transition studies and skin friction measureme...,"korkegi,r.h.","j. ae. scs. 23, 1956, 97.",transition studies and skin friction measureme...
9,10,the theory of the impact tube at low pressure .,"chambre,p.l. and schaaf,s.a.","j. ae. scs. 15, 1948, 735.",the theory of the impact tube at low pressure ...


In [199]:
documents.loc[239]

Id                                                  241
T     laminar mixing of a non-uniform stream with a ...
A                                             nash,j.f.
B                                      arc 22245, 1960.
W     laminar mixing of a non-uniform stream with a ...
Name: 239, dtype: object

In [228]:
full_string_data = documents['T'].tolist() \
                 + documents['A'].tolist() \
                 + documents['B'].tolist() \
                 + queries['Query'].tolist()
                 # + documents['W'].tolist() \
                 

def create_features(data, documents, queries):
    docs_len = len(documents)
    vectorizer = HashingVectorizer(ngram_range=(1, 4), analyzer='word', n_features=2 ** 14, norm='l2')
    features = vectorizer.fit_transform(data).todense()
    doc_feachures = np.hstack((features[:docs_len], 
                               features[docs_len:2*docs_len], 
                               features[2*docs_len:3*docs_len], 
                               # features[3*docs_len:4*docs_len]
                              ))
    # query_features = features[4*docs_len:]
    query_features = features[3*docs_len:]
    doc_features_pd = pd.DataFrame(index=documents.Id.tolist(), data=doc_feachures)
    query_features_pd = pd.DataFrame(index=queries.QueryId.tolist(), data=query_features)
    return doc_features_pd, query_features_pd

In [229]:
doc_features, query_features = create_features(full_string_data, documents, queries)

In [230]:
features = []
test_features = []
ranks = []

for i in range(len(train_data)):
    doc_id = train_data.DocumentId.iloc[i]
    query_id = train_data.QueryId.iloc[i]
    if doc_id in doc_features.index and query_id in query_features.index:
        features.append(doc_features.loc[doc_id].tolist() + query_features.loc[query_id].tolist())
        ranks.append(train_data.Relevance.iloc[i])

In [231]:
def count_inversion_rate(predicted, real):
    l = len(real)
    result = 0
    for i in range(l):
        for j in range(l):
            if predicted[i] > predicted[j] and real[i] < real[j]:
                result += 1
    result = float(result)
    return result / float(l * (l - 1) / 2)

In [232]:
def dcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k=5, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

In [233]:
model = LogisticRegression(n_jobs=9, C=10, dual=True)
model.fit(features[:-50], ranks[:-50])
predicted = model.predict(features[-50:])
zip_sorted_list = np.array(sorted(list(zip(ranks[-50:], predicted))))
print(ndcg_at_k(zip_sorted_list[:,1]))

  " = {}.".format(self.n_jobs))


0.426201811102


In [227]:
print(predicted)

[ 2  3  3  3  3  3 -1  3  1 -1  3 -1  2  3  3  3  3  3  2  3  3  3  3  3  2
  3  3  3  3  3  3  3  3 -1  3  3  3  2  2  3  3  3  2 -1  3 -1  2  2  2  3]


In [173]:
result = []
for i in range(len(test_data)):
    doc_id = test_data.DocumentId.iloc[i]
    query_id = test_data.QueryId.iloc[i]
    if doc_id in doc_features.index and query_id in query_features.index:
        test_features = doc_features.loc[doc_id].tolist() + query_features.loc[query_id].tolist()
        result.append([query_id, (-1) * model.predict([test_features])[0], doc_id])
    else:
        result.append([query_id, np.random.rand(1)[0], doc_id])
        
result.sort()
result = np.array(result)             

In [174]:
result.tofile

<function ndarray.tofile>

In [175]:
output = pd.DataFrame(data=result, columns=['QueryId', 'Rel', 'DocumentId'])

In [176]:
output = output.drop('Rel', axis=1)
output.to_csv("submission.tsv", sep=',', index=False)