In [1]:
from google.colab import drive
drive.mount('/content/drive')

!pip install transformers
!pip install torch
!pip install sentencepiece

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
movie_file_path = '/content/drive/MyDrive/Colab Notebooks/ir_project/movie_data.csv'
query_file_path = '/content/drive/MyDrive/Colab Notebooks/ir_project/generated_queries.csv'

## Synthetic Query Generation

### Load in Movie Plots

In [3]:
import pandas as pd

In [4]:
import re

def process_plot(plots):
    processed_plots = []
    
    for plot in plots:
      processed_plot = re.sub(r'\[.*?\]', '', plot)
      processed_plots.append(processed_plot)
    
    return processed_plots

In [5]:
movies = pd.read_csv(movie_file_path,header=0)
plots = list(movies['Plot'].to_numpy())

In [6]:
plots = process_plot(plots)

### Loading T5 Model

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

tokenizer = T5Tokenizer.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
model = T5ForConditionalGeneration.from_pretrained('BeIR/query-gen-msmarco-t5-large-v1')
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [8]:
#Select the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

### Generate Queries

In [9]:
from tqdm import tqdm
import csv

In [10]:
# Parameters for query generation
batch_size = 16
queries_per_plot = 5 # generate 5 queries per movie plot
max_query_length = 20 # max length for generated query
max_length_plot = 512 # max length for movie plot (anythign after length of 512 is truncated)

In [11]:
def RemoveNonAscii(s): 
    return "".join(i for i in s if ord(i) < 128)

In [12]:
torch.cuda.empty_cache()

# start = 5000
# end = 10000
# with open(query_file_path, 'a+') as file:  
with open(query_file_path, 'w') as file:
    writer = csv.writer(file)
    writer.writerow(["Query", "Plot"])

    for start_idx in tqdm(range(0, len(plots), batch_size)):
        # process plots in batches
        sub_plots = plots[start_idx:start_idx+batch_size]

        #tokenise inputs
        inputs = tokenizer.prepare_seq2seq_batch(sub_plots, max_length=max_length_plot, truncation=True, return_tensors='pt').to(device)

        # generate outputs
        outputs = model.generate(**inputs, max_length=max_query_length, do_sample=True, top_p=0.95, num_return_sequences=queries_per_plot)

        for idx, output in enumerate(outputs):
            # decode each output to get the query
            decoded_query = tokenizer.decode(output, skip_special_tokens=True)
            # remove any non-ASCII character from generated query
            query = RemoveNonAscii(decoded_query)
            plot = sub_plots[int(idx/queries_per_plot)]
            # remove any non-ASCII character from plot
            plot = RemoveNonAscii(plot)
            # write generated query to file
            writer.writerow([query, plot])

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    la