docs: Resolve many typos in documentation, comments and tutorials (#1701)

* chore: Resolve many typos in documentation, comments and tutorials

Found via codespell (would recommend)

* chore: run pre-commit --all to clean up all files

(cherry picked from commit 5d9340b)
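
For reference, a sweep like the one described in this commit message can be reproduced with a sketch along these lines. The exact codespell flags, skip patterns, and target paths are assumptions for illustration; they are not taken from this commit or the repository's configuration:

```shell
# Install the tools used for the cleanup (illustrative versions/paths).
pip install codespell pre-commit

# Scan the docs, source, and test trees for common misspellings;
# the skip pattern here is a hypothetical example.
codespell --skip="*.lock,*.min.js" docs src tests

# Run every configured pre-commit hook against all files,
# not just the ones currently staged for commit.
pre-commit run --all-files
```

codespell exits non-zero when it finds misspellings, so the scan step doubles as a CI gate if added to a workflow.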
tomaarsen authored and frascuchon committed Oct 5, 2022
1 parent 1cde0d0 commit f05e1c1
Showing 16 changed files with 53 additions and 34 deletions.
20 changes: 20 additions & 0 deletions .github/workflows/dependency-review.yml
@@ -0,0 +1,20 @@
# Dependency Review Action
#
# This Action will scan dependency manifest files that change as part of a Pull Request, surfacing known-vulnerable versions of the packages declared or updated in the PR. Once installed, if the workflow run is marked as required, PRs introducing known-vulnerable packages will be blocked from merging.
#
# Source repository: https://github.com/actions/dependency-review-action
# Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement
name: 'Dependency Review'
on: [pull_request]

permissions:
  contents: read

jobs:
  dependency-review:
    runs-on: ubuntu-latest
    steps:
      - name: 'Checkout Repository'
        uses: actions/checkout@v3
      - name: 'Dependency Review'
        uses: actions/dependency-review-action@v1
4 changes: 2 additions & 2 deletions README.md
@@ -165,7 +165,7 @@ To better understand what's possible take a look at Rubrix's [Cookbook](https://

## Example

-Let's see Rubrix in action with a quick example: _Bootstraping data annotation with a zero-shot classifier_
+Let's see Rubrix in action with a quick example: _Bootstrapping data annotation with a zero-shot classifier_

**Why**:

@@ -186,7 +186,7 @@ Let's see Rubrix in action w

### 1. Predict and log

-Let's load the zero-shot pipeline and the dataset (we are using the AGNews dataset for demonstration, but this could be your own dataset). Then, let's go over the dataset records and log them using `rb.log()`. This will create a Rubrix dataset, accesible from the web app.
+Let's load the zero-shot pipeline and the dataset (we are using the AGNews dataset for demonstration, but this could be your own dataset). Then, let's go over the dataset records and log them using `rb.log()`. This will create a Rubrix dataset, accessible from the web app.

```python
from transformers import pipeline
2 changes: 1 addition & 1 deletion docs/getting_started/advanced_setup_guides.md
@@ -297,7 +297,7 @@ NAME ACTIVE DRIVER STATE URL
rubrix-aws - amazonec2 Running tcp://52.213.178.33:2376 v20.10.7
```

-### Save asigned machine ip
+### Save assigned machine ip

In our case, the assigned ip is `52.213.178.33`

4 changes: 2 additions & 2 deletions docs/getting_started/basics.ipynb
@@ -624,7 +624,7 @@
"metadata": {},
"outputs": [],
"source": [
-"# Log the datset to the Rubrix web app\n",
+"# Log the dataset to the Rubrix web app\n",
"rb.log(dataset_rb, \"coffee-reviews\")"
]
},
@@ -789,7 +789,7 @@
"metadata": {},
"outputs": [],
"source": [
-"# Log the datset to the Rubrix web app\n",
+"# Log the dataset to the Rubrix web app\n",
"rb.log(dataset_rb, \"ecdc_en\")"
]
},
6 changes: 3 additions & 3 deletions docs/guides/cookbook.ipynb
@@ -1426,7 +1426,7 @@
"# Creating the pipeline\n",
"nlp = stanza.Pipeline(lang=\"en\", processors=\"tokenize,sentiment\")\n",
"\n",
-"# Analizing the input text\n",
+"# Analyzing the input text\n",
"doc = nlp(text)\n",
"\n",
"# This model returns 0 for negative, 1 for neutral and 2 for positive outcome.\n",
@@ -1505,7 +1505,7 @@
"# Creating the pipeline\n",
"nlp = stanza.Pipeline(lang=\"ca\", processors=\"tokenize,mwt,pos\")\n",
"\n",
-"# Analizing the input text\n",
+"# Analyzing the input text\n",
"doc = nlp(input_text)\n",
"\n",
"# Creating the prediction entity as a list of tuples (tag, start_char, end_char)\n",
@@ -1564,7 +1564,7 @@
"# Creating the pipeline\n",
"nlp = stanza.Pipeline(lang=\"ru\", processors=\"tokenize,ner\")\n",
"\n",
-"# Analizing the input text\n",
+"# Analyzing the input text\n",
"doc = nlp(input_text)\n",
"\n",
"# Creating the prediction entity as a list of tuples (entity, start_char, end_char)\n",
2 changes: 1 addition & 1 deletion docs/guides/dataset_settings.ipynb
@@ -33,7 +33,7 @@
"# Define labeling schema\n",
"settings = rb.TextClassificationSettings(label_schema=[\"A\", \"B\", \"C\"])\n",
"\n",
-"# Apply seetings to a new or already existing dataset\n",
+"# Apply settings to a new or already existing dataset\n",
"rb.configure_dataset(name=\"my_dataset\", settings=settings)\n",
"\n",
"# Logging to the newly created dataset triggers the validation checks\n",
5 changes: 2 additions & 3 deletions docs/guides/queries.md
@@ -18,9 +18,8 @@ For a complete list of available fields and their content, have a look at the fi

```{note}
The default behavior when not specifying any fields in the query string changed in version `>=0.16.0`.
-Before this version, Rubrix searched in a mixture of the deprecated `word` and `word.extended` fields that allowed searches for special characters like `!` and `.`.
-If you want to search for special characters now, you have to spcify the `text.exact` field.
+Before this version, Rubrix searched in a mixture of the the deprecated `word` and `word.extended` fields that allowed searches for special characters like `!` and `.`.
+If you want to search for special characters now, you have to specify the `text.exact` field.
For example, this is the query if you want to search for words with an exclamation mark in the end: `text.exact:*\!`
If you do not retrieve any results after a version update, you should use the `words` and `words.extended` fields in your search query for old datasets instead of the `text` and `text.exact` ones.
2 changes: 1 addition & 1 deletion docs/guides/weak-supervision.ipynb
@@ -362,7 +362,7 @@
"id": "405c93ec-b136-43cf-af50-96c956b65f12",
"metadata": {},
"source": [
-"## 3. Building and analizing weak labels"
+"## 3. Building and analyzing weak labels"
]
},
{
6 changes: 3 additions & 3 deletions docs/tutorials/01-labeling-finetuning.ipynb
@@ -167,7 +167,7 @@
"\n",
"For this tutorial, we'll use the **original labeling scheme** defined by the pre-trained model which is composed of two labels: `POSITIVE` and `NEGATIVE`. We could have added the `NEUTRAL` label, but let's keep it simple. \n",
"\n",
-"Another important issue when approaching a data annotaion project are the **annotation guidelines**, which explain how to assign the labels to specific examples. As we'll see later, the messages we'll be labeling are mostly questions with a neutral sentiment, which we'll label with the `POSITIVE` label, and some other are negative questions which we'll label with the `NEGATIVE` label. Later on, we'll show some examples of each label.\n",
+"Another important issue when approaching a data annotation project are the **annotation guidelines**, which explain how to assign the labels to specific examples. As we'll see later, the messages we'll be labeling are mostly questions with a neutral sentiment, which we'll label with the `POSITIVE` label, and some other are negative questions which we'll label with the `NEGATIVE` label. Later on, we'll show some examples of each label.\n",
"\n"
]
},
@@ -582,7 +582,7 @@
"metadata": {},
"outputs": [],
"source": [
-"# split the data into a training and evalutaion set\n",
+"# split the data into a training and evaluation set\n",
"train_dataset, eval_dataset = tokenized_train_ds.train_test_split(test_size=0.2, seed=42).values()"
]
},
@@ -951,7 +951,7 @@
"\n",
"Although this is somehow a toy example, you will be able to apply this workflow to your own projects to adapt existing models or building them from scratch. \n",
"\n",
-"In this tutorial, we've covered one way of building training sets: **hand labeling**. If you are interested in other methods, which could be combined witth hand labeling, checkout the following:\n",
+"In this tutorial, we've covered one way of building training sets: **hand labeling**. If you are interested in other methods, which could be combined with hand labeling, checkout the following:\n",
"\n",
"- [Building a news classifier with weak supervision](weak-supervision-with-rubrix.ipynb)\n",
"- [Active learning with ModAL and scikit-learn](05-active_learning.ipynb)"
2 changes: 1 addition & 1 deletion docs/tutorials/02-spacy.ipynb
@@ -520,7 +520,7 @@
"metadata": {},
"source": [
"## Summary\n",
-"In this tutorial, you learned how to log and explore differnt `spaCy` NER models with Rubrix. Now you can:\n",
+"In this tutorial, you learned how to log and explore different `spaCy` NER models with Rubrix. Now you can:\n",
"\n",
"- Build custom dashboards using Kibana to monitor and visualize spaCy models.\n",
"- Build training sets using pre-trained spaCy models."
10 changes: 5 additions & 5 deletions docs/tutorials/09-automatic_fastapi_log.ipynb
@@ -108,7 +108,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"For more informations about using the `transformers` library with Rubrix, check the tutorial [How to label your data and fine-tune a 🤗 sentiment classifier](01-labeling-finetuning.ipynb)"
+"For more information about using the `transformers` library with Rubrix, check the tutorial [How to label your data and fine-tune a 🤗 sentiment classifier](01-labeling-finetuning.ipynb)"
]
},
{
@@ -146,16 +146,16 @@
"metadata": {},
"source": [
"Looks like the `predictions` is a list containing lists of two elements : \n",
-"- The first dictionnary containing the `NEGATIVE` sentiment label and its score.\n",
-"- The second dictionnary containing the same data but for `POSITIVE` sentiment."
+"- The first dictionary containing the `NEGATIVE` sentiment label and its score.\n",
+"- The second dictionary containing the same data but for `POSITIVE` sentiment."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Convert output to Rubrix format\n",
-"To log the output to Rubrix, we should supply a list of dictionnaries, each dictonnary containing two keys:\n",
+"To log the output to Rubrix, we should supply a list of dictionaries, each dictionary containing two keys:\n",
"- `labels` : value is a list of strings, each string being the label of the sentiment.\n",
"- `scores` : value is a list of floats, each float being the probability of the sentiment."
]
@@ -353,7 +353,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
-"### Launch the appplication"
+"### Launch the application"
]
},
{
2 changes: 1 addition & 1 deletion docs/tutorials/extend_weak_labels_with_embeddings.ipynb
@@ -600,7 +600,7 @@
"source": [
"### Generate sentence embeddings\n",
"\n",
-"Let's generate sentence embeddings for each record of our weak labels matrix. Best results will be achieved through powerful general-purpose pretrained embeddings, or by embeddings especifically pretrained for the domain of the task at hand. \n",
+"Let's generate sentence embeddings for each record of our weak labels matrix. Best results will be achieved through powerful general-purpose pretrained embeddings, or by embeddings specifically pretrained for the domain of the task at hand. \n",
"\n",
"Here we choose the `all-mpnet-base-v2` embeddings from the well-known [Sentence Transformers library](https://www.sbert.net/). Rubrix allows us to experiment with embeddings from any source, as long as they are provided to the weak labels matrix as a two-dimensional array.\n",
"\n",
6 changes: 3 additions & 3 deletions docs/tutorials/skweak.ipynb
@@ -36,7 +36,7 @@
"source": [
"## Introduction\n",
"\n",
-"Our goal is to show you how you can incorporate Rubrix into data programming workflows to programatically build training data with a human-in-the-loop approach. We will use the [skweak](https://github.com/NorskRegnesentral/skweak) library.\n",
+"Our goal is to show you how you can incorporate Rubrix into data programming workflows to programmatically build training data with a human-in-the-loop approach. We will use the [skweak](https://github.com/NorskRegnesentral/skweak) library.\n",
"\n",
"### What is weak supervision? and skweak?\n",
"Weak supervision is a branch of machine learning based on getting lower quality labels more efficiently. We can achieve this by using [skweak](https://github.com/NorskRegnesentral/skweak), a library for programmatically building and managing training datasets without manual labeling.\n",
@@ -416,7 +416,7 @@
"source": [
"### Annotating with generic rules\n",
"\n",
-"We can also write rules that are a litle bit more generic. \n",
+"We can also write rules that are a little bit more generic. \n",
"\n",
"For instance, organizations often are presented as a series of capitalized words that either start or end with a certain keyword. We write a generator called `title_detector` to capture them. \n",
"\n",
@@ -1030,7 +1030,7 @@
"id": "cdd6a6ef",
"metadata": {},
"source": [
-"Although here we are using the majority voter in a rather simple way to vote for a single `ORG` label, it is possible to attribute weights to the vote of each labelling function and even define complex hierarchies between labels. These details are explained in the majority voter [documention](https://github.com/NorskRegnesentral/skweak/wiki/Step-2:-Aggregation) and [code](https://github.com/NorskRegnesentral/skweak/blob/de888c26d7faaa7fe15746f2aed5e574685bbfad/skweak/aggregation.py#L193) on the skweak repository.\n",
+"Although here we are using the majority voter in a rather simple way to vote for a single `ORG` label, it is possible to attribute weights to the vote of each labelling function and even define complex hierarchies between labels. These details are explained in the majority voter [documentation](https://github.com/NorskRegnesentral/skweak/wiki/Step-2:-Aggregation) and [code](https://github.com/NorskRegnesentral/skweak/blob/de888c26d7faaa7fe15746f2aed5e574685bbfad/skweak/aggregation.py#L193) on the skweak repository.\n",
"\n",
"\n",
"![Visualization of the majority voter in Rubrix](../_static/tutorials/skweak/skweak_6.png)"
2 changes: 1 addition & 1 deletion docs/tutorials/weak-supervision-multi-label.ipynb
@@ -1896,7 +1896,7 @@
"metadata": {},
"outputs": [],
"source": [
-"# Extact the title and split the data\n",
+"# Extract the title and split the data\n",
"\n",
"import pandas as pd\n",
"import rubrix as rb\n",
8 changes: 4 additions & 4 deletions src/rubrix/metrics/token_classification/metrics.py
@@ -105,9 +105,9 @@ def token_length(name: str, query: Optional[str] = None) -> MetricSummary:
def token_capitalness(name: str, query: Optional[str] = None) -> MetricSummary:
"""Computes the token capitalness distribution
-``UPPER``: All charactes in the token are upper case.
+``UPPER``: All characters in the token are upper case.
-``LOWER``: All charactes in the token are lower case.
+``LOWER``: All characters in the token are lower case.
``FIRST``: The first character in the token is upper case.
@@ -308,9 +308,9 @@ def entity_capitalness(
) -> MetricSummary:
"""Computes the entity capitalness. The entity capitalness splits the entity mention shape in 4 groups:
-``UPPER``: All charactes in entity mention are upper case.
+``UPPER``: All characters in entity mention are upper case.
-``LOWER``: All charactes in entity mention are lower case.
+``LOWER``: All characters in entity mention are lower case.
``FIRST``: The first character in the mention is upper case.
6 changes: 3 additions & 3 deletions tests/client/test_api.py
@@ -86,7 +86,7 @@ def mock_get(*args, **kwargs):


def test_init_correct(mock_response_200):
-"""Testing correct default initalization
+"""Testing correct default initialization
It checks if the _client created is a RubrixClient object.
"""
@@ -107,7 +107,7 @@ def test_init_correct(mock_response_200):


def test_init_evironment_url(mock_response_200, monkeypatch):
-"""Testing initalization with api_url provided via environment variable
+"""Testing initialization with api_url provided via environment variable
It checks the url in the environment variable gets passed to client.
"""
@@ -125,7 +125,7 @@ def test_init_evironment_url(mock_response_200, monkeypatch):


def test_trailing_slash(mock_response_200):
-"""Testing initalization with provided api_url via environment variable and argument
+"""Testing initialization with provided api_url via environment variable and argument
It checks the trailing slash is removed in all cases
"""
