# Assignment 2B: Ranking

This notebook contains the skeleton for training a model and then applying it to produce a document ranking.

## Loading the precomputed features

The code below loads the precomputed features and combines them into feature vectors for query-document pairs.

For this part to work, you'll need to run the `1_Feature_computation` notebook first to generate the sample features JSON files.

In [176]:
import json
from sklearn import preprocessing
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd

In [177]:
QUERIES_FILE = "data/queries.txt"
QRELS_FILE = "data/qrels.csv"
QUERY2_FILE ='data/queries2.txt'


In [178]:
def load_queries(query_file):
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries
queries = load_queries(QUERIES_FILE)

In [179]:
def load_qrels(qrels_file):
    labels = {}
    with open(qrels_file, "r") as fin:
        for line in fin.readlines()[1:]:
            qid, docID, relevance = line.strip().split(",")
            label = qid+"-"+docID
            labels[label] = int(relevance)

    return labels

qrels = load_qrels(QRELS_FILE)

In [180]:
# qrels

In [186]:
def load_features(queries, features_to_load):
    feature_names = []
    features = {}
    for f in features_to_load:
        print("Loading features from {}".format(f['file']))
        feature_names += f['features']
        with open(f['file']) as json_file:
            data = json.load(json_file)
            
            for q, qdocs in data.items():
                for d, feature_values in qdocs.items():
                    key = "{}-{}".format(q, d)
                    for feature_name in f['features']:
                        # Note: no error checking is performed. It is assumed that all feature files
                        # contain the same queries and documents.
                        fvect = features.get(key, [])
                        fvect.append(feature_values[feature_name])
                        features[key] = fvect

    print("Feature vector: {}".format(feature_names))
    new_feature = {}
    for key,value in features.items():
        if sum(value[1:]) > 0:
            new_feature[key] = value
    return new_feature

In [187]:
# Specify the features to be loaded from each file.
# They will make up the feature vector in this exact order.
features_to_load = [
    {
        'file': "data/features_1.json",
        'features': ["qlen"]
    },
    {
        'file': "data/features_2.json",
        'features': ["bm25_title", "bm25_content"]
    },
    #{
    #    'file': "data/features_3.json",
    #    'features': ["mlm"]
    #},
    {
        'file': "data/features_4.json", 
        'features': ["idf_title", "idf_content"]
    } ,
    {
        'file': "data/features_5.json", 
        'features': ["docLen_title", "docLen_content", "docLen_anchor"]
    },
    {
        'file': "data/features_6.json", 
        'features': ["pagerank"]
    } 
]

features = load_features(queries, features_to_load)

Loading features from data/features_1.json
Loading features from data/features_2.json
Loading features from data/features_4.json
Loading features from data/features_5.json
Loading features from data/features_6.json
Feature vector: ['qlen', 'bm25_title', 'bm25_content', 'idf_title', 'idf_content', 'docLen_title', 'docLen_content', 'docLen_anchor', 'pagerank']


In [215]:
# features
feat_data=pd.DataFrame.from_dict(features,orient='index',columns=None)

### load for queries2 features and ids

In [189]:
#X, Y, qids, doc_ids=load_train_data(features2,qrels)

In [190]:
queries2=load_queries(QUERY2_FILE)

In [191]:
features_to_load = [
    {
        'file': "data/queries2/features_1.json",
        'features': ["qlen"]
    },
    {
        'file': "data/queries2/features_2.json",
        'features': ["bm25_title", "bm25_content"]
    },
    #{
    #    'file': "data/features_3.json",
    #    'features': ["mlm"]
    #},
    {
        'file': "data/queries2/features_4.json", 
        'features': ["idf_title", "idf_content"]
    } ,
    {
        'file': "data/queries2/features_5.json", 
        'features': ["docLen_title", "docLen_content", "docLen_anchor"]
    },
    {
        'file': "data/queries2/features_6.json", 
        'features': ["pagerank"]
    } 
]
features2 = load_features(queries2, features_to_load)

Loading features from data/queries2/features_1.json
Loading features from data/queries2/features_2.json
Loading features from data/queries2/features_4.json
Loading features from data/queries2/features_5.json
Loading features from data/queries2/features_6.json
Feature vector: ['qlen', 'bm25_title', 'bm25_content', 'idf_title', 'idf_content', 'docLen_title', 'docLen_content', 'docLen_anchor', 'pagerank']


In [230]:
# features2

## Training a model

Training needs to be done differently based on the scenario:

  * **Scenario 1**: The model is trained using cross-validation, that is on 4/5 of queries, then applied on the remaining 1/5 of queries (repeated 5 times).
  * **Scenario 2**: The model is trained on all available training data.
  
The feature vectors at this point are already created. These should contain both (a) the training queries and (b) the queries on which you want to apply your model.

Train your model on queries (a). For that you'll also need to load the corresponding relevance labels.

##### split the data into train and test set

In [194]:
clf = RandomForestRegressor(max_depth=3, random_state=0)

### scenario-1: train with cross validation

In [195]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [196]:
# TODO
feat_data

Unnamed: 0,0,1,2,3,4,5,6,7,8
201-clueweb12-0000wb-60-01497,2,31.367764,30.951880,18.170073,11.587501,9,3355,0,5.200236e-08
201-clueweb12-0106wb-18-19516,2,0.000000,0.000000,0.000000,0.000000,11,5258,0,1.707008e-08
201-clueweb12-0108wb-22-26598,2,0.000000,0.000000,0.000000,0.000000,9,9304,0,7.371517e-08
201-clueweb12-0301tw-21-03835,2,21.564389,63.968048,18.178139,11.580397,14,835,0,2.063839e-08
201-clueweb12-0307wb-47-02869,2,66.874364,71.499628,18.135870,11.585547,3,611,0,1.835034e-08
...,...,...,...,...,...,...,...,...,...
250-clueweb12-1802wb-92-24344,3,18.869161,75.630484,7.059194,10.523420,5,316,0,1.707008e-08
250-clueweb12-1804wb-22-03257,3,0.000000,0.000000,0.000000,0.000000,0,0,0,1.707008e-08
250-clueweb12-1810wb-24-22652,3,26.850440,34.296282,14.363648,10.504713,8,2424,0,2.016969e-08
250-clueweb12-1913wb-64-15762,3,38.128630,42.417501,14.345173,7.988730,5,943,0,1.707008e-08


In [216]:
feat_data.columns = ['0', '1', '2','3','4','5','6','7','8']

In [217]:
rel=[]
for d,v in qrels.items():
    if d in feat_data.index.values:
        rel.append(v)

In [218]:
feat_data['QueryId'], feat_data['DocumentId'] = feat_data.index.str.split('-', 1).str
feat_data.reset_index(inplace=True)
feat_data.drop('index',axis=1,inplace=True)

In [219]:
feat_data['target']=rel

In [221]:
X_train, X_test, y_train, y_test = train_test_split(feat_data[['0','1','2']], feat_data['target'], test_size=0.20, random_state=7)

In [222]:
clf.fit(X_train,y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [223]:
predictions=clf.predict(X_test)

In [224]:
predictions.shape

(734,)

In [225]:
feat_data.shape

(3668, 12)

In [236]:
feat_data2=pd.DataFrame.from_dict(features2,orient='index')

In [241]:
feat_data2.columns = ['0', '1', '2','3','4','5','6','7']

In [242]:
clf.fit(feat_data[['0', '1', '2','3','4','5','6','7','8']], feat_data['target'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=3,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [243]:
feat_data2['QueryId'], feat_data2['DocumentId'] = feat_data2.index.str.split('-', 1).str
feat_data2.reset_index(inplace=True)
feat_data2.drop('index',axis=1,inplace=True)

In [244]:
predictions_full=clf.predict(feat_data2[['0', '1', '2','3','4','5','6','7']])

ValueError: Number of features of the model must match the input. Model n_features is 9 and input n_features is 8 

In [97]:
np.savetxt('data/sample_predict.txt',predictions_full)

In [166]:
feat_data2['prediction']=predictions_full

In [167]:
feat_data2=feat_data2.sort_values(by=['prediction'])

In [169]:
feat_data2.to_csv('data/myranking.csv',columns=['qid','did'],index=False)