<a href="https://colab.research.google.com/github/jaredzrks0/DSAN5400_final_project/blob/main/DSAN_5400_Final_News_Recommender_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
#!pip install ujson

Collecting ujson
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ujson
Successfully installed ujson-5.10.0


In [13]:
import os
import sys
import pandas as pd
import re
import logging
import time
import json

In [2]:
# Install ColBERT and dependencies
try:
    import google.colab
    !git -C ColBERT/ pull || git clone https://github.com/stanford-futuredata/ColBERT.git > /dev/null 2>&1
    !pip install -q fsspec==2024.9.0 faiss-gpu torch torchvision torchaudio > /dev/null 2>&1
    sys.path.insert(0, 'ColBERT/')
except Exception:
    raise RuntimeError("Failed to install ColBERT. Ensure you are in Google Colab or set it up manually.")

# Verify GPU availability
import torch
if not torch.cuda.is_available():
    print("Warning: GPU is not available. Performance may be slower.")
else:
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")


Already up to date.
Using GPU: Tesla T4


In [3]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ColBERT")

# Download ColBERT Checkpoint
logger.info("Downloading ColBERT checkpoint...")
checkpoint_url = "https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz"
!wget -O colbertv2.0.tar.gz $checkpoint_url
!mkdir -p checkpoints  # Create the 'checkpoints' directory if it doesn't exist
!tar -xvzf colbertv2.0.tar.gz -C checkpoints


--2024-12-07 00:37:25--  https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405924985 (387M) [application/octet-stream]
Saving to: ‘colbertv2.0.tar.gz’


2024-12-07 00:38:40 (5.18 MB/s) - ‘colbertv2.0.tar.gz’ saved [405924985/405924985]

colbertv2.0/
colbertv2.0/artifact.metadata
colbertv2.0/vocab.txt
colbertv2.0/tokenizer.json
colbertv2.0/special_tokens_map.json
colbertv2.0/tokenizer_config.json
colbertv2.0/config.json
colbertv2.0/pytorch_model.bin


In [4]:
# ColBERT Imports
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

# Load and Clean Data
logger.info("Loading and cleaning data...")
tsv_file = "collection.tsv"

df = pd.read_csv(tsv_file, sep='\t', header=None)
texts = df[0]

# Primary layout: each value of the first column is itself an "<id>\t<text>"
# string, and the article text is the field after the first tab.
# The isinstance() guard skips NaN / numeric cells, on which `'\t' in text`
# would raise TypeError.
articles = [
    text.split('\t')[1]
    for text in texts
    if isinstance(text, str) and '\t' in text
]

# Fallback: the file parsed as a conventional multi-column TSV, so the first
# column holds no embedded tabs; take the second column as the article text.
# NOTE(review): assumes column 1 is the passage text in that layout - confirm.
if not articles and df.shape[1] > 1:
    articles = df[1].dropna().astype(str).tolist()

# Fail here with a clear message rather than later inside indexing.
if not articles:
    raise ValueError(f"No articles could be parsed from {tsv_file!r}.")

logger.info(f"Loaded {len(articles)} articles.")

In [5]:
# Preprocess Query Function
def preprocess_query(query):
    """Normalize a raw query string before it is searched.

    The query is lower-cased, then every character that is neither a word
    character (letters, digits, underscore) nor whitespace is removed.
    Whitespace is left untouched, so gaps left by removed punctuation are
    not collapsed.

    Args:
        query: The raw query text.

    Returns:
        The lower-cased query with punctuation stripped.
    """
    lowered = query.lower()
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', lowered)


In [6]:
# Indexing Articles
# Builds a ColBERT index over the first 500 loaded articles, using the
# checkpoint directory extracted earlier in the notebook.
logger.info("Starting indexing...")
nbits = 1  # encode each dimension with 1 bit (this value is embedded in index_name below)
doc_maxlen = 300  # truncate passages at 300 tokens
index_name = f'dsan5400_project_nbits={nbits}'
checkpoint = 'checkpoints/colbertv2.0'  # directory the colbertv2.0 tarball is extracted into

# Index articles
with Run().context(RunConfig(nranks=1, experiment='DSAN5400')):  # nranks specifies the number of GPUs to use
    # NOTE(review): kmeans_niters=4 presumably caps the k-means iterations used
    # while building the index - confirm against the ColBERT documentation.
    config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4)
    indexer = Indexer(checkpoint=checkpoint, config=config)
    # Only the first 500 articles are indexed; the searcher cell passes the same
    # articles[:500] slice as its collection, so the two must stay in sync.
    # NOTE(review): overwrite=True presumably replaces an existing index with
    # the same name - confirm.
    indexer.index(name=index_name, collection=articles[:500], overwrite=True)

logger.info("Indexing complete.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.




[Dec 07, 00:43:05] #> Creating directory /content/experiments/DSAN5400/indexes/dsan5400_project_nbits=1 


#> Starting...
#> Joined...


In [14]:
# Search Articles Dynamically
# Interactive loop: reads a query, normalizes it with preprocess_query(), and
# prints the best-matching passages. The most recent `query` and `results`
# remain defined as notebook globals after the loop ends.
logger.info("Setting up searcher...")
with Run().context(RunConfig(experiment='DSAN5400')):
    # The collection slice must match the slice that was indexed (articles[:500]).
    searcher = Searcher(index=index_name, collection=articles[:500])

top_k = 3  # number of passages to retrieve; also used in the results heading

print("\nEnter your query below (or type 'exit' to quit):")
while True:
    # get user query; leave the loop cleanly if the prompt is interrupted or
    # the input stream is closed, instead of surfacing a traceback
    try:
        user_query = input("\nQuery: ").strip()
    except (EOFError, KeyboardInterrupt):
        print("\nExiting search...")
        break
    if user_query.lower() == 'exit':
        print("Exiting search...")
        break

    # Preprocess Query
    cleaned_query = preprocess_query(user_query)
    if not cleaned_query.strip():
        # Blank or punctuation-only input: nothing meaningful to search for.
        # `query` / `results` are deliberately left untouched so they keep
        # describing the last search that actually ran.
        print("Empty query after preprocessing; please enter some text.")
        continue
    query = cleaned_query

    # Perform search
    start_time = time.perf_counter()  # monotonic timer for elapsed-time measurement
    results = searcher.search(query, k=top_k)  # Retrieve top-k results
    print(f"\nQuery executed in {time.perf_counter() - start_time:.2f} seconds")

    # Display Results
    print(f"\nTop-{top_k} Results:\n")
    # `results` is unpacked into parallel sequences of ids, ranks and scores.
    for passage_id, passage_rank, passage_score in zip(*results):
        print(f"Passage ID: {passage_id}")
        print(f"Rank: {passage_rank}")
        print(f"Score: {passage_score:.2f}")
        print(f"Content: {searcher.collection[passage_id][:200]}...")
        print("\n---\n")

[Dec 07, 01:00:28] #> Loading codec...
[Dec 07, 01:00:28] #> Loading IVF...
[Dec 07, 01:00:28] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 4240.95it/s]

[Dec 07, 01:00:29] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 464.95it/s]



Enter your query below (or type 'exit' to quit):

Query: election fraud

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: election fraud, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([ 101,    1, 2602, 9861,  102,  103,  103,  103,  103,  103,  103,  103,
         103,  103,  103,  103,  103,  103,  103,  103,  103,  103,  103,  103,
         103,  103,  103,  103,  103,  103,  103,  103], device='cuda:0')
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')


Query executed in 0.03 seconds

Top-3 Results:

Passage ID: 94
Rank: 1
Score: 22.11
 When Donald Trump lost his re-election bid in 2020, many Republicans — Trump inclu...

---

Passage ID: 56
Rank: 2
Score: 21.27
 On election night 2020, then-President Donald Trump prematurely...

---

Passage ID: 110
Rank: 3
Score: 21.20
 Just hours after the polls closed in the 2020 Unite

In [15]:
# Save Results
# Writes the most recent query and its search results to a JSON file.
import json

output_file = "search_results.json"

# `query` and `results` hold whatever the last executed search left behind.
payload = {"query": query, "results": results}

with open(output_file, "w") as out_fh:
    json.dump(payload, out_fh, indent=4)

logger.info(f"Results saved to {output_file}.")
