<a href="https://colab.research.google.com/github/stavco9/textretrieval-final-project/blob/main/LightGBMRankingSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [80]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from google.colab import files
from sklearn.model_selection import train_test_split

gbm = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg"
)

In [2]:
!rm -rf textretrieval-final-project
!git clone https://github.com/stavco9/textretrieval-final-project.git

Cloning into 'textretrieval-final-project'...
remote: Enumerating objects: 49, done.[K
remote: Counting objects: 100% (49/49), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 49 (delta 23), reused 31 (delta 9), pack-reused 0 (from 0)[K
Receiving objects: 100% (49/49), 33.57 MiB | 15.62 MiB/s, done.
Resolving deltas: 100% (23/23), done.
Updating files: 100% (14/14), done.


In [8]:
relevant_flags_path = './textretrieval-final-project/files/qrels_50_Queries'
ranked_results_path = './textretrieval-final-project/results/run_lucene_docid_2_bm25.res'

In [19]:
relevant_flags_list = []
ranked_results_list = []
docnum2docid = {}

with open(ranked_results_path, 'r') as f:
  ranked_results = f.readlines()

for result in ranked_results:
  ranked_results_list.append({
      'query_id': result.split()[0],
      'doc_id': result.split()[2],
      'doc_num': result.split()[3],
      'rank': result.split()[4],
      'score': result.split()[5]
  })

  docnum2docid[result.split()[3]] = result.split()[2]

with open(relevant_flags_path, 'r') as f:
  relevant_flags = f.readlines()

for relevant in relevant_flags:
  relevant_flags_list.append({
      'query_id': relevant.split()[0],
      'doc_id': relevant.split()[2],
      'relevance': relevant.split()[3]
  })

In [20]:
ranked_results_df = pd.DataFrame.from_records(ranked_results_list)
relevant_flags_df = pd.DataFrame.from_records(relevant_flags_list)

In [66]:
df = pd.merge(
    left=ranked_results_df,
    right=relevant_flags_df,
    how='left',
    left_on=['query_id', 'doc_id'],
    right_on=['query_id', 'doc_id'],
).drop_duplicates()

In [67]:
df['relevance'] = df['relevance'].fillna(0)

In [68]:
df['query_id'] = df['query_id'].astype(int)
df['relevance'] = df['relevance'].astype(int)
df['rank'] = df['rank'].astype(int)
df['doc_num'] = df['doc_num'].astype(int)
df['score'] = df['score'].astype(float)

In [69]:
N_labeled = 50
N_max = 10000

train_queries = []

all_queries = df['query_id'].unique()
for query_id in all_queries[:N_labeled]:
  train_queries.append(df.loc[(df['query_id'] == query_id)][:int(N_max/N_labeled)])

In [70]:
df_labeled = pd.concat(train_queries)
df_labeled

Unnamed: 0,query_id,doc_id,doc_num,rank,score,relevance
0,301,FBIS4-41991,419770,1,8.2814,0
1,301,FBIS4-38364,287141,2,7.9861,1
2,301,FBIS3-19646,398295,3,7.9419,1
3,301,FBIS3-21961,267222,4,7.9419,1
4,301,FBIS4-19535,149553,5,7.9335,0
...,...,...,...,...,...,...
201476,350,FT943-7679,357397,196,5.5832,0
201477,350,FR940610-1-00047,433957,197,5.5829,0
201478,350,FBIS3-59008,145229,198,5.5788,0
201479,350,LA030889-0016,368591,199,5.5751,0


In [71]:
df_labeled_train = df_labeled[:int(len(df_labeled)*0.8)]
df_labeled_val = df_labeled[int(len(df_labeled)*0.8):]

qids_train = df_labeled_train.groupby("query_id")["query_id"].count().to_numpy()
X_train = df_labeled_train.drop(['doc_id', 'query_id', 'relevance'], axis=1)
y_train = df_labeled_train['relevance']

qids_val = df_labeled_val.groupby("query_id")["query_id"].count().to_numpy()
X_val = df_labeled_val.drop(['doc_id', 'query_id', 'relevance'], axis=1)
y_val = df_labeled_val['relevance']

In [81]:
gbm.fit(
    X=X_train,
    y=y_train,
    group=qids_train,
    eval_set=[(X_val, y_val)],
    eval_group=[qids_val],
    eval_at=10
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 711
[LightGBM] [Info] Number of data points in the train set: 7553, number of used features: 3




In [86]:
results_out = []

for query_id in all_queries:
  df_test = df.loc[(df['query_id'] == query_id)][:1000]
  X_test = df_test.drop(['doc_id', 'query_id', 'relevance'], axis=1)
  test_pred = gbm.predict(X_test)
  X_test["new_score"] = test_pred
  X_test = X_test.sort_values("new_score", ascending=False)
  X_test.insert(0, 'new_rank', range(1, 1 + len(X_test)))

  results_out.extend(f"{query_id} Q0 {docnum2docid[str(int(row['doc_num']))]} {row['new_rank']} {round(row['new_score'], 4)} run4" for i, row in X_test.iterrows())

In [87]:
with open('results.res', 'w') as f:
    for line in results_out:
        f.write(f"{line}\n")

In [88]:
files.download('results.res')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>