# Document Reranking Demo

In [1]:
import torch

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    DistilBertForMaskedLM,
)

from IPython.display import Markdown, display
import html


## Load Trained Model

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
tokenizer_options = {
    "return_tensors": "pt",
    "truncation": True,
    "padding": True,
    "max_length": 512,
}

model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
model.load_state_dict(torch.load("demo_model.pt", map_location=device))
model.to(device)
model.eval()

print("success!")

success!


## Generate Testing Queries and Documents

In [4]:
queries = [
    "What is UBC?", 
    "UBC MDS", 
    "number of students at UBC", 
    "What is Vancouver?", 
    "best programming language"
]
documents = [
    (
        "The University of British Columbia is a global centre "
        "for teaching, learning and research, consistently ranked among "
        "the top 20 public universities in the world and recently recognized "
        "as North America’s most international university."
    ),
    (
        "The University of British Columbia attracts, nurtures and proactively transform more than "
        "58,000 students from Canada and 140 countries."
    ),
    (
        "University of British Columbia Master of Data Science (MDS) is a 10-month, "
        "full-time, accelerated professional graduate program offered at both "
        "the University of British Columbia Vancouver and Okanagan campuses. "
    ),
    (
        "Vancouver is a major city in western Canada, located in the Lower Mainland region "
        "of British Columbia. As the most populous city in the province, the 2016 census "
        "recorded 631,486 people in the city, up from 603,502 in 2011. "
        "The Greater Vancouver area had a population of 2,463,431 in 2016, making it the "
        "third-largest metropolitan area in Canada."
    ),
    (
        "Python is a programming language that lets you work more quickly and "
        "integrate your systems more effectively. Python can be easy to pick up whether "
        "you're a first time programmer or you're experienced with other languages. "
    ),
    (
        "R is a programming language and free software environment for statistical "
        "computing and graphics supported by the R Foundation for Statistical Computing. "
        "The R language is widely used among statisticians and data miners for "
        "developing statistical software and data analysis."
    )
]

## Ranking Query-Documents

In [5]:
with torch.no_grad():
    inputs = queries + documents

    # Tokenize queries and documents 
    encodings = tokenizer(inputs, **tokenizer_options).to(device)
    ids, masks = encodings["input_ids"], encodings["attention_mask"]

    # Inference pass through model
    outputs = model.distilbert(ids, masks)
    outputs_hidden = outputs.last_hidden_state.mean(dim=1) #[:, 0]
    vec_queries = outputs_hidden[:len(queries)].unsqueeze(1)
    vec_documents = outputs_hidden[len(queries):].unsqueeze(0)

    # Compute pairwise distances between queries and documents
    all_distances, all_indices = (vec_queries - vec_documents).norm(dim=-1).sort(dim=-1)
    
    # Print nicely the results
    for qdistances, dindices, query in zip(all_distances, all_indices, queries):
        display(Markdown(f"### {query}"))
        for dist, dindex in zip(qdistances, dindices):
            display(Markdown(f"**{dist:.2f}**: {documents[dindex]}"))

### What is UBC?

**11.24**: The University of British Columbia is a global centre for teaching, learning and research, consistently ranked among the top 20 public universities in the world and recently recognized as North America’s most international university.

**12.71**: University of British Columbia Master of Data Science (MDS) is a 10-month, full-time, accelerated professional graduate program offered at both the University of British Columbia Vancouver and Okanagan campuses. 

**13.59**: The University of British Columbia attracts, nurtures and proactively transform more than 58,000 students from Canada and 140 countries.

**14.97**: Vancouver is a major city in western Canada, located in the Lower Mainland region of British Columbia. As the most populous city in the province, the 2016 census recorded 631,486 people in the city, up from 603,502 in 2011. The Greater Vancouver area had a population of 2,463,431 in 2016, making it the third-largest metropolitan area in Canada.

**16.76**: R is a programming language and free software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing. The R language is widely used among statisticians and data miners for developing statistical software and data analysis.

**17.12**: Python is a programming language that lets you work more quickly and integrate your systems more effectively. Python can be easy to pick up whether you're a first time programmer or you're experienced with other languages. 

### UBC MDS

**10.27**: University of British Columbia Master of Data Science (MDS) is a 10-month, full-time, accelerated professional graduate program offered at both the University of British Columbia Vancouver and Okanagan campuses. 

**11.85**: The University of British Columbia is a global centre for teaching, learning and research, consistently ranked among the top 20 public universities in the world and recently recognized as North America’s most international university.

**13.93**: The University of British Columbia attracts, nurtures and proactively transform more than 58,000 students from Canada and 140 countries.

**15.84**: Vancouver is a major city in western Canada, located in the Lower Mainland region of British Columbia. As the most populous city in the province, the 2016 census recorded 631,486 people in the city, up from 603,502 in 2011. The Greater Vancouver area had a population of 2,463,431 in 2016, making it the third-largest metropolitan area in Canada.

**16.16**: R is a programming language and free software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing. The R language is widely used among statisticians and data miners for developing statistical software and data analysis.

**17.07**: Python is a programming language that lets you work more quickly and integrate your systems more effectively. Python can be easy to pick up whether you're a first time programmer or you're experienced with other languages. 

### number of students at UBC

**13.42**: The University of British Columbia is a global centre for teaching, learning and research, consistently ranked among the top 20 public universities in the world and recently recognized as North America’s most international university.

**13.93**: The University of British Columbia attracts, nurtures and proactively transform more than 58,000 students from Canada and 140 countries.

**14.43**: University of British Columbia Master of Data Science (MDS) is a 10-month, full-time, accelerated professional graduate program offered at both the University of British Columbia Vancouver and Okanagan campuses. 

**15.58**: Vancouver is a major city in western Canada, located in the Lower Mainland region of British Columbia. As the most populous city in the province, the 2016 census recorded 631,486 people in the city, up from 603,502 in 2011. The Greater Vancouver area had a population of 2,463,431 in 2016, making it the third-largest metropolitan area in Canada.

**18.90**: R is a programming language and free software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing. The R language is widely used among statisticians and data miners for developing statistical software and data analysis.

**19.31**: Python is a programming language that lets you work more quickly and integrate your systems more effectively. Python can be easy to pick up whether you're a first time programmer or you're experienced with other languages. 

### What is Vancouver?

**8.43**: Vancouver is a major city in western Canada, located in the Lower Mainland region of British Columbia. As the most populous city in the province, the 2016 census recorded 631,486 people in the city, up from 603,502 in 2011. The Greater Vancouver area had a population of 2,463,431 in 2016, making it the third-largest metropolitan area in Canada.

**12.24**: The University of British Columbia is a global centre for teaching, learning and research, consistently ranked among the top 20 public universities in the world and recently recognized as North America’s most international university.

**13.55**: The University of British Columbia attracts, nurtures and proactively transform more than 58,000 students from Canada and 140 countries.

**13.61**: University of British Columbia Master of Data Science (MDS) is a 10-month, full-time, accelerated professional graduate program offered at both the University of British Columbia Vancouver and Okanagan campuses. 

**17.91**: R is a programming language and free software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing. The R language is widely used among statisticians and data miners for developing statistical software and data analysis.

**18.24**: Python is a programming language that lets you work more quickly and integrate your systems more effectively. Python can be easy to pick up whether you're a first time programmer or you're experienced with other languages. 

### best programming language

**14.23**: R is a programming language and free software environment for statistical computing and graphics supported by the R Foundation for Statistical Computing. The R language is widely used among statisticians and data miners for developing statistical software and data analysis.

**14.43**: Python is a programming language that lets you work more quickly and integrate your systems more effectively. Python can be easy to pick up whether you're a first time programmer or you're experienced with other languages. 

**16.95**: University of British Columbia Master of Data Science (MDS) is a 10-month, full-time, accelerated professional graduate program offered at both the University of British Columbia Vancouver and Okanagan campuses. 

**17.38**: The University of British Columbia is a global centre for teaching, learning and research, consistently ranked among the top 20 public universities in the world and recently recognized as North America’s most international university.

**17.56**: The University of British Columbia attracts, nurtures and proactively transform more than 58,000 students from Canada and 140 countries.

**17.77**: Vancouver is a major city in western Canada, located in the Lower Mainland region of British Columbia. As the most populous city in the province, the 2016 census recorded 631,486 people in the city, up from 603,502 in 2011. The Greater Vancouver area had a population of 2,463,431 in 2016, making it the third-largest metropolitan area in Canada.