Skip to content

Commit

Permalink
docs(clean): add user guide for clean_duplication
Browse files Browse the repository at this point in the history
Added the user guide for the clean_duplication function.
  • Loading branch information
ryanwdale committed Apr 7, 2021
1 parent b6997b8 commit d834e85
Show file tree
Hide file tree
Showing 4 changed files with 218 additions and 11 deletions.
5 changes: 4 additions & 1 deletion dataprep/clean/clean_duplication.py
Expand Up @@ -20,6 +20,8 @@ def clean_duplication(
"""
Cleans and standardizes duplicate values in a DataFrame.
Read more in the :ref:`User Guide <duplication_userguide>`.
Parameters
----------
df
Expand All @@ -45,6 +47,7 @@ def clean_duplication(
>>> df = pd.DataFrame({'city': ['New York', 'new york']})
>>> clean_duplication(df, 'city')
city
0 New York
1 New York
Expand Down Expand Up @@ -185,7 +188,7 @@ def __init__(self, df: pd.DataFrame, col_name: str, df_name: str, page_size: int
# VBox containing a VBox with all the clusters in the first row and an optional
# second row containing next and previous page buttons
self._cluster_and_next_prev = VBox()
self._cluster_vbox = VBox(layout=Layout(height="600px", flex_flow="row wrap"))
self._cluster_vbox = VBox(layout=Layout(height="450px", flex_flow="row wrap"))

footer = HBox([self._sel_all, merge_and_recluster, finish])

Expand Down
34 changes: 25 additions & 9 deletions dataprep/clean/clean_duplication_utils.py
Expand Up @@ -15,6 +15,15 @@
from metaphone import doublemetaphone
from Levenshtein import distance

# JavaScript helper injected into the notebook before any generated code is
# decoded.  Plain atob() mangles non-ASCII characters (it yields a byte string,
# not unicode), so the bytestream is re-encoded as percent-escapes and passed
# through decodeURIComponent to recover the original unicode text.
DECODE_FUNC = """
function b64DecodeUnicode(str) {
// Going backwards: from bytestream, to percent-encoding, to original string.
return decodeURIComponent(atob(str).split('').map(function(c) {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
}).join(''));
}
"""


class Clusterer:
"""
Expand All @@ -31,7 +40,7 @@ class Clusterer:

def __init__(self, df: pd.DataFrame, col_name: str, df_name: str):
self.clusters = pd.Series()
self._df = df
self._df = df.copy(deep=True)
self._df_name = df_name
self._col = col_name
self._ngram = 2
Expand Down Expand Up @@ -192,6 +201,7 @@ def _ngram_finger_print_key(self, val: str) -> str:
def _create_replace_calls(
self, cluster_page: pd.Series, do_merge: List[bool], new_values: List[str]
) -> str:

"""
Creates a string containing the required replace function calls.
Expand Down Expand Up @@ -224,13 +234,15 @@ def live_export_code(
"""
code = self._create_replace_calls(cluster_page, do_merge, new_values)
encoded_code = (b64encode(str.encode(code))).decode()

code = """
{0}
var ind = IPython.notebook.get_selected_index();
var cell = IPython.notebook.get_cell(ind);
var text = cell.get_text();
cell.set_text(text.concat(atob("{0}")));
cell.set_text(text.concat(b64DecodeUnicode("{1}")));
""".format(
encoded_code
DECODE_FUNC, encoded_code
)
display(Javascript(code))

Expand All @@ -256,13 +268,17 @@ def final_df(self) -> None:
"""
code = "# dataframe with cleaned string values\ndf_clean"
encoded_code = (b64encode(str.encode(code))).decode()
json = self._df.to_json(force_ascii=False)
execute_code = f"import pandas as pd\ndf_clean = pd.read_json('{json}')"
encoded_execute = (b64encode(str.encode(execute_code))).decode()
code = """
IPython.notebook.kernel.execute("df_clean = {0}.copy()");
var code = IPython.notebook.insert_cell_below('code');
code.set_text(atob("{1}"));
code.execute();
""".format(
self._df_name, encoded_code
{0}
IPython.notebook.kernel.execute(b64DecodeUnicode('{1}'));
var code = IPython.notebook.insert_cell_below('code');
code.set_text(b64DecodeUnicode("{2}"));
code.execute();
""".format(
DECODE_FUNC, encoded_execute, encoded_code
)
display(Javascript(code))

Expand Down
187 changes: 187 additions & 0 deletions docs/source/user_guide/clean/clean_duplication.ipynb
@@ -0,0 +1,187 @@
{
"cells": [
{
"cell_type": "raw",
"metadata": {
"raw_mimetype": "text/restructuredtext"
},
"source": [
".. _duplication_userguide:\n",
"\n",
"Duplicate Values\n",
"================"
]
},
{
"cell_type": "raw",
"metadata": {
"raw_mimetype": "text/restructuredtext",
"scrolled": false
},
"source": [
"Introduction\n",
"------------\n",
"\n",
"The function :func:`clean_duplication() <dataprep.clean.clean_duplication.clean_duplication>` creates a user interface that clusters duplicate values and allows the user to merge them into standardized values. The following clustering methods are provided:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### fingerprint\n",
"This is the process for creating a fingerprint key:\n",
"\n",
"* remove leading and trailing whitespace\n",
"* convert to lowercase\n",
"* remove punctuation and control characters\n",
"* normalize extended western characters to ASCII\n",
"* split into whitespace separated tokens\n",
"* sort tokens and remove duplicates\n",
"* join tokens back together\n",
"\n",
"### ngram-fingerprint\n",
    "This is the process for creating an n-gram fingerprint key:\n",
"\n",
"* convert to lowercase\n",
"* remove punctuation, whitespace and control characters\n",
"* get string n-grams\n",
"* sort n-grams and remove duplicates\n",
    "* join sorted n-grams back together\n",
"* normalize extended western characters to ASCII\n",
"\n",
"A textbox is provided for choosing the n-gram size.\n",
"\n",
"### phonetic-fingerprint\n",
"Uses the double metaphone algorithm for generating phonetic-fingerprint keys. The [metaphone](https://github.com/oubiwann/metaphone) library is used.\n",
"\n",
"### levenshtein distance\n",
"Blocking is used to speed up the process, blocks are obtained where strings in the same\n",
"block share a substring of a given blocking size. Only strings within the same block are\n",
"compared using the levenshtein distance function. If two values have a distance less than \n",
"or equal to the given radius they are added to the same cluster. Textboxes are provided for choosing the block size and the radius.\n",
"\n",
"The [python-Levenshtein](https://github.com/ztane/python-Levenshtein) library is used for a fast levenshtein distance implementation.\n",
"\n",
    "Clustering methods are taken from the [OpenRefine](https://github.com/OpenRefine/OpenRefine) project and the [simile-vicino](https://code.google.com/archive/p/simile-vicino) project, you can read more about these clustering methods [here](https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth).\n",
"\n",
"The `df_var_name` parameter sets the variable name to be used for the dataframe when creating replace function calls.\n",
"\n",
"The `page_size` parameter can be used to set the number of clusters that are displayed on each page of the user interface.\n",
"\n",
"Most of the functionality is provided through an interactive user interface which will be introduced shortly."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## An example dirty dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"df = pd.DataFrame(\n",
" {\n",
" \"city\": [\n",
" \"Québec\",\n",
" \"Quebec\",\n",
" \"Vancouver\",\n",
" \"Vancouver\",\n",
" \"vancouver\",\n",
" \" Vancuver \",\n",
" \"Toronto\",\n",
" \"Toront\",\n",
" \"Tronto\",\n",
" \"Ottowa\",\n",
" \"otowa\"\n",
" ]\n",
" }\n",
")\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Default `clean_duplication()`\n",
"\n",
"By default the `df_var_name` parameter is set to \"df\" and the `page_size` variable is set to 5. Clustering methods can be toggled using the dropdown menu at the top of the GUI. Select which clusters you would like to merge using the checkboxes under the \"Merge?\" heading. Then press the \"Merge and Re-Cluster\" button to merge the cluster. If the \"export code\" checkbox is selected, code for merging the clusters will be created and added to the notebook cell. Finally, you can press the \"finish\" button to close the GUI and see the final DataFrame created."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dataprep.clean import clean_duplication\n",
"clean_duplication(df, \"city\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. `df_var_name` parameter\n",
"\n",
"Pandas Series.replace function calls are created and added to the current notebook cell when merging a cluster with the \"export code\" checkbox selected. This parameter allows for changing the DataFrame variable name used in the exported code."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clean_duplication(df, \"city\", df_var_name=\"dataframe\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. `page_size` parameter\n",
"This parameter allows for changing the number of clusters that are displayed on each page of the user interface."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"clean_duplication(df, \"city\", page_size=1)"
]
}
],
"metadata": {
"celltoolbar": "Raw Cell Format",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
3 changes: 2 additions & 1 deletion docs/source/user_guide/clean/introduction.ipynb
Expand Up @@ -33,6 +33,7 @@
" * [Column Headers](clean_headers.ipynb)\n",
" * [Country Names](clean_country.ipynb)\n",
" * [Dates and Times](clean_date.ipynb)\n",
" * [Duplicate Values](clean_duplication.ipynb)\n",
" * [Email Addresses](clean_email.ipynb)\n",
    " * [Geographic Coordinates](clean_lat_long.ipynb)\n",
" * [IP Addresses](clean_ip.ipynb)\n",
Expand All @@ -59,7 +60,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.7.3"
}
},
"nbformat": 4,
Expand Down

0 comments on commit d834e85

Please sign in to comment.