<a href="https://colab.research.google.com/github/stavco9/textretrieval-final-project/blob/main/LightGBMRankingSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================================
# Setup and Imports
# ============================================================================

import numpy as np
import pandas as pd
import lightgbm as lgb
import os
import sys
import subprocess
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from google.colab import files
import pyserini
import lightgbm

# Install required packages if not already installed.
# The check uses importlib's spec lookup so that a missing package is actually
# detected; a try block that only prints can never raise ImportError.
# NOTE: the unconditional `import pyserini` / `import lightgbm` statements at
# the top of this cell still execute before this check is reached.
import importlib.util

print("Checking/installing required packages...")


def _ensure_installed(module_name, display_name):
    """Install *module_name* with pip unless it is already importable.

    Args:
        module_name: Importable module name, also used as the pip package name.
        display_name: Human-readable name used in the progress messages.

    Raises:
        subprocess.CalledProcessError: If the pip invocation exits non-zero.
    """
    if importlib.util.find_spec(module_name) is not None:
        print(f"‚úÖ {display_name} already installed")
        return
    print(f"Installing {display_name}...")
    # Use the running interpreter's pip so the package lands in this kernel's
    # environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", module_name])
    print(f"‚úÖ {display_name} installed")


_ensure_installed("pyserini", "Pyserini")
_ensure_installed("lightgbm", "LightGBM")

# List the inputs the remaining cells expect to find on disk.
print("\nüìÅ Required files:")
print("   - rank.py")
print("   - files/qrels_50_Queries")
print("   - files/queriesROBUST.txt")
print("   - index/RobustPyserini/ (entire directory)")
print("\n" + "="*60)

In [None]:
!rm -rf textretrieval-final-project
!git clone https://github.com/shmoshkin/text-retrieval.git


In [None]:
# ============================================================================
# File Upload and Verification
# ============================================================================

print("üì§ File Upload Helper")
print("="*60)
print("\nTo upload files:")
print("1. Use the Files sidebar (üìÅ icon) to drag and drop files")
print("2. Or uncomment files.upload() below to upload interactively")
print("3. Or mount Google Drive and copy files from Drive")
print("\n" + "="*60)

# Uncomment the line below to upload files interactively
# files.upload()

# Verify critical files
print("\nüîç Checking for required files...")
required_files = [
    'rank.py',
    'files/qrels_50_Queries',
    'files/queriesROBUST.txt',
    'index/RobustPyserini'
]

all_files_exist = True
for file_path in required_files:
    if os.path.exists(file_path):
        print(f"‚úÖ {file_path}")
    else:
        print(f"‚ùå {file_path} - MISSING!")
        all_files_exist = False

print("\n" + "="*60)
if all_files_exist:
    print("‚úÖ All required files found! Ready to proceed.")
else:
    print("‚ö†Ô∏è  Some files are missing. Please upload them before continuing.")


In [None]:
# ============================================================================
# CONFIGURATION: Choose which methods to run
# ============================================================================

# Toggle each ranking method independently.
RUN_RM3 = True      # Run 1: RM3 (Relevance Model 3)
RUN_VECTOR = True   # Run 2: Vector-based (TF-IDF Cosine Similarity)
RUN_LIGHTGBM = True # Run 3: LightGBM Learning-to-Rank

# Directory layout (relative to the local repository checkout).
BASE_DIR = "."  # Current directory (adjust if needed)
FILES_DIR = BASE_DIR + "/files"
RESULTS_DIR = BASE_DIR + "/results"

# Relevance judgments used as training labels.
RELEVANT_FLAGS_PATH = FILES_DIR + '/qrels_50_Queries'

# Baseline ranking file for LightGBM.
# Leave as None to pick the first existing candidate below, or set an explicit
# path such as 'run_10_bm25.res', 'run_1_rm3.res' or 'run_2_vector.res'.
BASELINE_RANKING_FILE = None

# Where each run writes its results.
OUTPUT_RUN1 = RESULTS_DIR + '/run_1_rm3.res'
OUTPUT_RUN2 = RESULTS_DIR + '/run_2_vector.res'
OUTPUT_RUN3 = RESULTS_DIR + '/run_3_lightgbm.res'

if BASELINE_RANKING_FILE is not None:
    # An explicit baseline always wins over auto-detection.
    RANKED_RESULTS_PATH = BASELINE_RANKING_FILE
else:
    # Candidates are probed in priority order; the first one on disk is used.
    possible_baselines = [
        RESULTS_DIR + '/' + candidate_name
        for candidate_name in ('run_10_bm25.res', 'run_1_rm3.res', 'run_2_vector.res')
    ]
    detected_baseline = next(
        (candidate for candidate in possible_baselines if os.path.exists(candidate)),
        None,
    )
    if detected_baseline is None:
        RANKED_RESULTS_PATH = RESULTS_DIR + '/run_10_bm25.res'  # Default
        print(f"‚ö†Ô∏è  No baseline found, will use: {RANKED_RESULTS_PATH}")
    else:
        RANKED_RESULTS_PATH = detected_baseline
        print(f"‚úÖ Auto-detected baseline: {detected_baseline}")

# Echo the effective settings.
print("\nConfiguration:")
print(f"  Run RM3: {RUN_RM3}")
print(f"  Run Vector: {RUN_VECTOR}")
print(f"  Run LightGBM: {RUN_LIGHTGBM}")
print(f"  Base directory: {BASE_DIR}")
print(f"  Baseline for LightGBM: {RANKED_RESULTS_PATH}")

In [None]:
# ============================================================================
# Run Baseline Ranking Methods (RM3 and Vector-based)
# ============================================================================

def _baseline_missing():
    """Return True when LightGBM is enabled but its baseline run file is absent."""
    return RUN_LIGHTGBM and not os.path.exists(RANKED_RESULTS_PATH)


if not (RUN_RM3 or RUN_VECTOR or _baseline_missing()):
    print("Skipping baseline methods (RM3 and Vector)")
else:
    # Make the repository's rank.py importable from BASE_DIR.
    import sys
    sys.path.append(BASE_DIR)

    from rank import rank_documents, rank_documents_vector

    rule = "="*60

    # LightGBM re-ranks an existing run, so produce one first if none exists.
    if _baseline_missing():
        print("\n" + rule)
        print("Generating BM25 baseline for LightGBM (Run 10)")
        print(rule)
        rank_documents(run_number=10, method="bm25", top_k=1000)
        # The output path is assumed here rather than returned by the call;
        # presumably rank_documents writes run_10_bm25.res under RESULTS_DIR.
        RANKED_RESULTS_PATH = f'{RESULTS_DIR}/run_10_bm25.res'
        print(f"‚úÖ Baseline generated: {RANKED_RESULTS_PATH}")

    if RUN_RM3:
        print("\n" + rule)
        print("Running RM3 Method (Run 1)")
        print(rule)
        rank_documents(run_number=1, method="rm3", top_k=1000)

    if RUN_VECTOR:
        print("\n" + rule)
        print("Running Vector-based Method (Run 2)")
        print(rule)
        rank_documents_vector(run_number=2, top_k=1000)

    print("\n‚úÖ Baseline methods completed!")

In [None]:
# ============================================================================
# Load Data for LightGBM Training
# ============================================================================

# Read the baseline run file and the relevance judgments into lists of dicts.
# Both files are whitespace-delimited; fields are picked by position.
if RUN_LIGHTGBM:
    # Check if baseline results exist
    if not os.path.exists(RANKED_RESULTS_PATH):
        print(f"‚ö†Ô∏è  Warning: Baseline results not found at {RANKED_RESULTS_PATH}")
        print("   Please run RM3 or BM25 first to generate baseline rankings.")
        print("   Or update RANKED_RESULTS_PATH to point to an existing .res file.")
        # Disable the remaining LightGBM cells, which all test this flag.
        RUN_LIGHTGBM = False
    else:
        print(f"Loading ranked results from: {RANKED_RESULTS_PATH}")
        with open(RANKED_RESULTS_PATH, 'r') as f:
            ranked_results = f.readlines()

        # Split each line once; blank lines (e.g. a trailing newline at the end
        # of the file) yield an empty field list and are skipped instead of
        # raising IndexError.
        ranked_results_list = [
            {
                'query_id': fields[0],
                'doc_id': fields[2],
                'rank': fields[3],
                'score': fields[4],
            }
            for fields in map(str.split, ranked_results)
            if fields
        ]

        print(f"Loading relevance judgments from: {RELEVANT_FLAGS_PATH}")
        with open(RELEVANT_FLAGS_PATH, 'r') as f:
            relevant_flags = f.readlines()

        relevant_flags_list = [
            {
                'query_id': fields[0],
                'doc_id': fields[2],
                'relevance': fields[3],
            }
            for fields in map(str.split, relevant_flags)
            if fields
        ]

        print(f"‚úÖ Loaded {len(ranked_results_list)} ranked results")
        print(f"‚úÖ Loaded {len(relevant_flags_list)} relevance judgments")
else:
    print("Skipping LightGBM (not enabled in configuration)")

In [None]:
# Turn the parsed record lists into DataFrames (all columns are still strings).
if not RUN_LIGHTGBM:
    print("Skipping data preparation (LightGBM not enabled)")
else:
    ranked_results_df, relevant_flags_df = (
        pd.DataFrame.from_records(records)
        for records in (ranked_results_list, relevant_flags_list)
    )

    print(f"Ranked results: {len(ranked_results_df)} rows")
    print(f"Relevance flags: {len(relevant_flags_df)} rows")

In [None]:
# Attach relevance labels to every ranked (query, document) pair.
if not RUN_LIGHTGBM:
    print("Skipping merge (LightGBM not enabled)")
else:
    # Left join: ranked documents without a judgment keep a missing relevance.
    merge_keys = ['query_id', 'doc_id']
    merged = ranked_results_df.merge(relevant_flags_df, how='left', on=merge_keys)
    df = merged.drop_duplicates()

    print(f"Merged dataframe: {len(df)} rows")
    print(f"Unique queries: {df['query_id'].nunique()}")

In [None]:
# Treat documents without a relevance judgment as non-relevant (0).
if RUN_LIGHTGBM:
    # The judged labels were read from the qrels file as strings. Convert to
    # int right after filling so the column has a single type; otherwise the
    # filled integer 0 and the string '0' are counted as two separate values
    # in the distribution printed below.
    df['relevance'] = df['relevance'].fillna(0).astype(int)
    print("Relevance distribution:")
    print(df['relevance'].value_counts())
else:
    print("Skipping relevance fill (LightGBM not enabled)")

In [None]:
# Cast the string columns parsed from the text files to numeric types.
if not RUN_LIGHTGBM:
    print("Skipping type conversion (LightGBM not enabled)")
else:
    column_types = {
        'query_id': int,
        'relevance': int,
        'rank': int,
        'score': float,
    }
    # Converted one column at a time, in the order listed above.
    for column_name, target_type in column_types.items():
        df[column_name] = df[column_name].astype(target_type)

    print("‚úÖ Data types converted")
    print(df.dtypes)

In [None]:
# Collect a capped slice of rows for each of the first N_labeled queries.
if not RUN_LIGHTGBM:
    print("Skipping training data preparation (LightGBM not enabled)")
else:
    N_labeled = 50
    N_max = 10000

    all_queries = df['query_id'].unique()
    print(f"Total unique queries: {len(all_queries)}")
    print(f"Using first {N_labeled} queries for training")

    # Each query contributes at most N_max / N_labeled rows, taken from the
    # top of its rows as they are ordered in df.
    rows_per_query = int(N_max/N_labeled)
    train_queries = [
        df.loc[df['query_id'] == labeled_query].iloc[:rows_per_query]
        for labeled_query in all_queries[:N_labeled]
    ]

    print(f"Prepared {len(train_queries)} training query sets")

In [None]:
# Stack the per-query slices into a single labeled training frame.
if RUN_LIGHTGBM:
    df_labeled = pd.concat(train_queries)
    print(f"Combined training data: {len(df_labeled)} rows")
    print(f"Unique queries in training: {df_labeled['query_id'].nunique()}")
    # A bare expression nested inside an `if` block is not auto-displayed by
    # the notebook, so the preview is printed explicitly.
    print(df_labeled.head())
else:
    print("Skipping (LightGBM not enabled)")

Unnamed: 0,query_id,doc_id,rank,score,relevance
0,301,FBIS4-41991,1,8.2814,0
1,301,FBIS4-38364,2,7.9861,1
2,301,FBIS3-19646,3,7.9419,1
3,301,FBIS3-21961,4,7.9419,1
4,301,FBIS4-19535,5,7.9335,0
...,...,...,...,...,...
44917,350,FT943-7679,196,5.5832,0
44918,350,FR940610-1-00047,197,5.5829,0
44919,350,FBIS3-59008,198,5.5788,0
44920,350,LA030889-0016,199,5.5751,0


In [None]:
# Split the labeled data into training and validation sets and compute the
# per-query group sizes that the ranker needs.
if RUN_LIGHTGBM:
    # Split on query boundaries: the first 80% of queries (in order of
    # appearance) go to training and the rest to validation. Cutting at 80% of
    # the rows could place part of one query in each set.
    labeled_query_ids = df_labeled['query_id'].unique()
    n_train_queries = int(len(labeled_query_ids) * 0.8)
    in_train = df_labeled['query_id'].isin(labeled_query_ids[:n_train_queries])

    df_labeled_train = df_labeled[in_train]
    df_labeled_val = df_labeled[~in_train]

    # Columns that are identifiers or the label, not model features.
    non_feature_columns = ['doc_id', 'query_id', 'relevance']

    # sort=False keeps the group sizes in the same order as the rows; the
    # default key-sorted order would misalign them if query ids do not appear
    # in ascending order.
    qids_train = df_labeled_train.groupby("query_id", sort=False)["query_id"].count().to_numpy()
    X_train = df_labeled_train.drop(non_feature_columns, axis=1)
    y_train = df_labeled_train['relevance']

    qids_val = df_labeled_val.groupby("query_id", sort=False)["query_id"].count().to_numpy()
    X_val = df_labeled_val.drop(non_feature_columns, axis=1)
    y_val = df_labeled_val['relevance']

    print(f"Training set: {len(X_train)} samples, {len(qids_train)} queries")
    print(f"Validation set: {len(X_val)} samples, {len(qids_val)} queries")
    print(f"Features: {list(X_train.columns)}")
else:
    print("Skipping train/val split (LightGBM not enabled)")

In [None]:
# Create the (untrained) LightGBM ranking model.
if not RUN_LIGHTGBM:
    print("Skipping LightGBM initialization (not enabled)")
else:
    # Hyperparameters kept together so they are easy to inspect and adjust.
    ranker_params = {
        "boosting_type": "gbdt",
        "learning_rate": 0.1,
        "n_estimators": 100,
        "objective": "lambdarank",
        "metric": "ndcg",
    }
    gbm = lgb.LGBMRanker(**ranker_params)
    print("‚úÖ LightGBM Ranker initialized")

In [None]:
# Fit the ranker on the training split, evaluating on the validation split.
if RUN_LIGHTGBM:
    print("Training LightGBM model...")
    gbm.fit(
        X=X_train,
        y=y_train,
        # Number of rows per query, in row order.
        group=qids_train,
        eval_set=[(X_val, y_val)],
        eval_group=[qids_val],
        # Evaluation cut-off; given as a list because the LGBMRanker.fit API
        # documents eval_at as a list or tuple of ints.
        eval_at=[10]
    )
    print("‚úÖ Model training completed")
else:
    print("Skipping model training (LightGBM not enabled)")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 456
[LightGBM] [Info] Number of data points in the train set: 7553, number of used features: 2


In [None]:
if RUN_LIGHTGBM:
    print("Generating predictions for all queries...")
    results_out = []

    for query_id in tqdm(all_queries, desc="Predicting", unit="query"):
        df_test = df.loc[(df['query_id'] == query_id)][:1000]
        if len(df_test) == 0:
            continue
            
        X_test = df_test.drop(['doc_id', 'query_id', 'relevance'], axis=1)

        test_pred = gbm.predict(X_test)
        X_test = X_test.copy()
        X_test["new_score"] = test_pred
        X_test = X_test.sort_values("new_score", ascending=False)
        X_test.insert(0, 'new_rank', range(1, 1 + len(X_test)))

        # Map back to doc_ids
        for i, row in X_test.iterrows():
            original_rank = int(row['rank'])
            doc_id = df_test.loc[df_test['rank'] == original_rank, 'doc_id'].iloc[0]
            new_rank = int(row['new_rank'])
            new_score = round(row['new_score'], 4)
            results_out.append(f"{query_id} Q0 {doc_id} {new_rank} {new_score} run3")
    
    print(f"‚úÖ Generated {len(results_out)} predictions")
else:
    print("Skipping predictions (LightGBM not enabled)")

In [None]:
# Write the re-ranked lines to the Run 3 output file and, when running in
# Colab, offer the file for download.
if RUN_LIGHTGBM:
    os.makedirs(RESULTS_DIR, exist_ok=True)

    with open(OUTPUT_RUN3, 'w') as f:
        f.writelines(f"{line}\n" for line in results_out)

    print(f"‚úÖ Results saved to: {OUTPUT_RUN3}")
    print(f"   Total lines: {len(results_out)}")

    # Download if in Colab. Availability is detected by attempting the import
    # here; no IN_COLAB flag is defined anywhere in this notebook, so reading
    # one raised NameError.
    try:
        from google.colab import files as colab_files
    except ImportError:
        colab_files = None

    if colab_files is not None:
        colab_files.download(OUTPUT_RUN3)
        print("‚úÖ File downloaded")
else:
    print("Skipping save (LightGBM not enabled)")

In [None]:
# ============================================================================
# Summary
# ============================================================================

# Report, for each enabled run, whether its output file exists and its size.
print("\n" + "="*60)
print("RUN SUMMARY")
print("="*60)

# (enabled flag, label shown in the report, output path) for each run.
summary_rows = (
    (RUN_RM3, "Run 1 (RM3)", OUTPUT_RUN1),
    (RUN_VECTOR, "Run 2 (Vector)", OUTPUT_RUN2),
    (RUN_LIGHTGBM, "Run 3 (LightGBM)", OUTPUT_RUN3),
)

for run_enabled, run_label, run_path in summary_rows:
    # Disabled runs are left out of the report entirely.
    if not run_enabled:
        continue
    if not os.path.exists(run_path):
        print(f"‚ùå {run_label}: Not generated")
        continue
    size = os.path.getsize(run_path) / (1024*1024)  # MB
    print(f"‚úÖ {run_label}: {run_path} ({size:.2f} MB)")

print("="*60)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>