* https://huggingface.co/learn/nlp-course/en/chapter5/6?fw=pt
* https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter5/section6_pt.ipynb

In [2]:
!pip install datasets evaluate transformers[sentencepiece]



In [30]:
# faiss-gpu does not work with CUDA 12, so we install faiss-gpu-cu12: https://pypi.org/project/faiss-gpu-cu12/
!pip install faiss-gpu-cu12

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




# Load the dataset

In [4]:
from datasets import load_dataset

issues_dataset = load_dataset("renwei2024/nemo-github-issues", split="train")
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
    num_rows: 10000
})

# Prepare the dataset

In [6]:
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and x["comments"])
)
issues_dataset

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'sub_issues_summary', 'active_lock_reason', 'draft', 'pull_request', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason', 'is_pull_request'],
    num_rows: 783
})

In [7]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 783
})

In [8]:
issues_dataset.set_format("pandas")
df = issues_dataset[:]

In [9]:
df["comments"][0].tolist()

['And if I change `punctuation_en_distilbert.nemo` to `punctuation_en_bert.nemo` than .onnx model has 3 inputs, but the issue with \r\n`onnxruntime.capi.onnxruntime_pybind11_state.InvalidArgument: [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Unexpected input data type. Actual: (tensor(float)) , expected: (tensor(int64))` for `"attention_mask"` still exist',
 'In forward DistilBERT takes only input_ids and attention_mask (no token_type_ids as in BERT).\r\n@borisfom could you help with the ONNX part?',
 'Well, finally I ended up with the next code:\r\n\r\nExport .onnx and config yaml:\r\n\r\n```\r\nmodel = PunctuationCapitalizationModel.restore_from(restore_path="punctuation_en_distilbert.nemo")\r\nmodel.export("punctuation_en_distilbert.onnx")\r\n\r\nmodel = PunctuationCapitalizationModel.restore_from(restore_path="punctuation_en_distilbert.nemo", return_config = True)\r\ntextfile = open("punctuation_en_distilbert.yaml", "w")\r\ntextfile.write(str(OmegaConf.to_yaml(model)))\r\ntextfile.c

In [10]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/NVIDIA/NeMo/issues/2327,Exporting PunctuationCapitalizationModel model...,And if I change `punctuation_en_distilbert.nem...,I am trying to export the nemo_nlp Punctuation...
1,https://github.com/NVIDIA/NeMo/issues/2327,Exporting PunctuationCapitalizationModel model...,In forward DistilBERT takes only input_ids and...,I am trying to export the nemo_nlp Punctuation...
2,https://github.com/NVIDIA/NeMo/issues/2327,Exporting PunctuationCapitalizationModel model...,"Well, finally I ended up with the next code:\r...",I am trying to export the nemo_nlp Punctuation...
3,https://github.com/NVIDIA/NeMo/issues/2327,Exporting PunctuationCapitalizationModel model...,I am still facing the same issue after followi...,I am trying to export the nemo_nlp Punctuation...


In [11]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2825
})

In [12]:
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())}
)

Map:   0%|          | 0/2825 [00:00<?, ? examples/s]

In [13]:
comments_dataset = comments_dataset.filter(lambda x: x["comment_length"] > 15)
comments_dataset

Filter:   0%|          | 0/2825 [00:00<?, ? examples/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2234
})

In [16]:
def concatenate_text(examples):
    title = examples["title"] if examples["title"] else ""
    body = examples["body"] if examples["body"] else ""
    comments = examples["comments"] if examples["comments"] else ""
    return {"text": title + " \n " + body + " \n " + comments}

In [17]:
comments_dataset = comments_dataset.map(concatenate_text)

Map:   0%|          | 0/2234 [00:00<?, ? examples/s]

# Create text embeddings

In [18]:
from transformers import AutoModel, AutoTokenizer

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [20]:
import torch

device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [22]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [23]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [24]:
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

torch.Size([1, 768])

In [25]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/2234 [00:00<?, ? examples/s]

In [27]:
embeddings_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2234
})

# Use FAISS for efficient similarity search

In [29]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embeddings'],
    num_rows: 2234
})

In [35]:
question = "How do I build a NeMo docker image?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [36]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [37]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [38]:
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.comments}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.title}")
    print(f"URL: {row.html_url}")
    print("=" * 50)
    print()

COMMENT: @wheynelau thanks for your reply.
That is what I'm doing. Here is my dockerfile:

```dorckerfile
FROM nvcr.io/nvidia/nemo:24.12 AS base
EXPOSE 8080

# Install PowerShell Core
RUN apt-get update && apt-get install -y wget \
    && wget -q https://packages.microsoft.com/config/ubuntu/24.04/packages-microsoft-prod.deb \
    && dpkg -i packages-microsoft-prod.deb \
    && apt-get update \
    && apt-get install -y powershell \
    && rm packages-microsoft-prod.deb \
    && rm -rf /var/lib/apt/lists/*

# Install PowerShell Core
RUN apt-get update && apt-get install -y wget \
    && wget -q https://packages.microsoft.com/config/ubuntu/24.04/packages-microsoft-prod.deb \
    && dpkg -i packages-microsoft-prod.deb \
    && apt-get update \
    && apt-get install -y powershell \
    && rm packages-microsoft-prod.deb \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies aspnetcore
RUN apt-get update && apt-get install -y \
    apt-transport-https \
    software-properties-common
