In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _, names in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<center style="font-family:verdana;"><h1 style="font-size:200%; padding: 20px; background: #001f3f;"><i><b style="color:orange;">Dataprep Clean</b></i></h1></center>



DataPrep.Clean provides functions for quickly and easily cleaning and validating your data.

Section Contents: Column Headers, Country Names, Dates and Times, Duplicate Values, Email Addresses,

Geographic Coordinates, IP Addresses, Phone Numbers, Text, URLs, US Street Addresses, Whole DataFrame

Dataprep is an initiative by SFU Data Science Research Group to speed up Data Science.

Acknowledgements

<h1 style="font-size:180%; color:orange;"><i><b>SFU Data Science Research Group - SIMON FRASER UNIVERSITY</b></i></h1>

https://docs.dataprep.ai/user_guide/clean/introduction.html

https://docs.dataprep.ai/user_guide/clean/clean_text.html

https://dataprep.ai/

In [None]:
!pip install dataprep

In [None]:
# Read train.csv from the commonlitreadabilityprize input folder and preview the first rows.
df = pd.read_csv(
    '../input/commonlitreadabilityprize/train.csv',
    sep=',',
    encoding='utf-8',
)
df.head()

![](https://coursereport-production.imgix.net/uploads/image/file/70/Guide_to_Data_Cleaning_-_infographic.png?auto=compress%2Cformat&w=800&h=1184)coursereport.com

# Text - Introduction

The function clean_text() cleans text data in a DataFrame column.

Using a default or customized pipeline, the function performs a series of cleaning operations on the data.

The following sections demonstrate the functionality of clean_text().

https://docs.dataprep.ai/user_guide/clean/clean_text.html

In [None]:
# Positional (zero-based) lookup: row index 2814 (the 2815th row), column index 3 (the fourth column).

df.iloc[2814,3]

#We must wrap each part in quotation marks and separate the parts with commas, so that each part becomes its own row of the DataFrame.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# One-column toy frame: 35 short text fragments followed by three
# non-sentence values (the int 123, NaN, and the literal string "NULL").
df = pd.DataFrame(
    dict(
        text=[
            "Augmented reality (AR) is a live direct",
            "or indirect view of a physical",
            "real-world environment whose elements are augmented",
            "(or supplemented) by computer-generated sensory",
            "input such as sound",
            "video, graphics or GPS data.",
            "It is related to",
            "a more general concept",
            "called mediated reality",
            "in which a view of reality is modified",
            "(possibly even diminished ",
            "rather than augmented) by a computer.",
            "As a result",
            "the technology functions by enhancing",
            "one's current perception of reality.",
            "By contrast",
            "virtual reality replaces the real world",
            "with a simulated one.",
            "Augmentation is conventionally",
            "in real-time and in semantic context",
            "with environmental elements",
            "such as sports scores",
            "on TV during a match.",
            "With the help of advanced AR technology",
            "(e.g. adding computer vision and object recognition)",
            "the information about the surrounding",
            "real world of the user",
            "becomes interactive",
            "and digitally manipulable.",
            "Information about the environment and its objects",
            "is overlaid on the real world.",
            "This information can be virtual or real",
            "e.g. seeing other real sensed or measured information",
            "such as electromagnetic radio waves overlaid",
            "in exact alignment with where they actually are in space.",
            123,
            np.nan,
            "NULL",
        ]
    )
)
df

<h1 style="font-size:180%; color:orange;"><i><b>Default clean_text()</b></i></h1>


The default pipeline for the clean_text() function is the following:

fillna: Replace all null values with NaN.

lowercase: Convert all characters to lowercase.

remove_digits: Remove numbers.

remove_html: Remove HTML tags.

remove_urls: Remove URLs.

remove_punctuation: Remove punctuation marks.

remove_accents: Remove accent marks.

remove_stopwords: Remove stopwords.

remove_whitespace: Remove extra spaces, and tabs and newlines.

https://docs.dataprep.ai/user_guide/clean/clean_text.html

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

from dataprep.clean import clean_text
# No pipeline argument is passed, so the default cleaning pipeline is applied to the "text" column.
clean_text(df, "text")

#By default, the stopwords removed are the set of words in NLTK’s English stopwords. To remove a different set of words, pass the set into the stopwords parameter.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

#Include your stopwords. I've just written some of them as an example.

from dataprep.clean import clean_text

# Supply a small custom stopword set through the stopwords parameter.
clean_text(
    df,
    "text",
    stopwords={"and", "as", "a", "of", "or"},
)

<h1 style="font-size:180%; color:orange;"><i><b>Custom pipeline</b></i></h1>


Users can pass in a custom pipeline to clean_text() using the pipeline parameter.

https://docs.dataprep.ai/user_guide/clean/clean_text.html

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Custom pipeline made of three built-in operators, listed in the order given here.
custom_pipeline = [
    dict(operator=step)
    for step in ("lowercase", "remove_digits", "remove_whitespace")
]
clean_text(df, "text", pipeline=custom_pipeline)

#Users can also define and pass in their own functions using the pipeline parameter.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

import re

def split(text: str) -> list:
    """Split ``text`` on runs of whitespace and return the resulting tokens.

    The input is passed through ``str()`` first, so a non-string value is
    tokenized by its string representation.

    Returns:
        A list of substrings; empty when the text is empty or whitespace-only.
    """
    return str(text).split()

def replace_z(text: str, value: str) -> str:
    """Return ``str(text)`` with every "z" (upper or lower case) substituted by ``value``."""
    z_any_case = re.compile(r"z", re.IGNORECASE)
    return z_any_case.sub(value, str(text))

# Built-in operators are named by string; user-defined callables are passed directly,
# with their extra keyword arguments under "parameters".
# Note: split returns a list, which replace_z then converts back to text via str().
custom_pipeline = [
    dict(operator="lowercase"),
    dict(operator="remove_digits"),
    dict(operator=split),
    dict(operator=replace_z, parameters=dict(value="*")),
    dict(operator="remove_whitespace"),
]
clean_text(df, "text", pipeline=custom_pipeline)

#In general, custom pipelines can be defined using the form:

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Placeholder template showing the general shape of one pipeline entry.
custom_pipeline = [
    dict(
        operator="<operator_name>",
        parameters={"<parameter_name>": "<parameter_value>"},
    ),
]

#To get the default pipeline in the form of a list, call default_text_pipeline().

This can be used as a template to build a list of cleaning operations to be passed into the pipeline parameter.

In [None]:
from dataprep.clean import default_text_pipeline
# Bare last expression: the notebook displays the default pipeline returned by the call.
default_text_pipeline()

<h1 style="font-size:180%; color:orange;"><i><b>Built-in functions</b></i></h1>


This section demonstrates the built-in cleaning operations which can be called using the pipeline parameter.

clean_text() assumes the DataFrame column contains text data. As such, any int values will be cast to str after applying a cleaning function.

<h1 style="font-size:180%; color:orange;"><i><b>fillna</b></i></h1>


By default, fillna replaces all null values with NaN.

https://docs.dataprep.ai/user_guide/clean/clean_text.html


In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: fillna with its default replacement value.
custom_pipeline = [
    dict(operator="fillna"),
]
clean_text(df, "text", pipeline=custom_pipeline)

#To specify a specific value to replace null values, use the value parameter.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# fillna with an explicit replacement value for null entries.
custom_pipeline = [
    dict(operator="fillna", parameters=dict(value="<NAN>")),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>lowercase</b></i></h1>

Convert all characters to lowercase.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: convert all characters to lowercase.
custom_pipeline = [
    dict(operator="lowercase"),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>sentence_case</b></i></h1>


Convert the first character of the string to uppercase and all remaining characters to lowercase.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: sentence case (first character upper, the rest lower).
custom_pipeline = [
    dict(operator="sentence_case"),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>title_case</b></i></h1>


Convert the first character of each word to uppercase and the remaining characters to lowercase.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: title case (first character of each word upper-cased).
custom_pipeline = [
    dict(operator="title_case"),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>uppercase</b></i></h1>


Convert all characters to uppercase.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: convert all characters to uppercase.
custom_pipeline = [
    dict(operator="uppercase"),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>remove_accents</b></i></h1>


Remove accents (diacritic marks) from the text.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: strip accents (diacritic marks).
# NOTE(review): the sample rows appear to contain no accented characters, so this may show no visible change.
custom_pipeline = [
    dict(operator="remove_accents"),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>remove_bracketed</b></i></h1>

Remove text between brackets.

The style of the brackets can be specified using the brackets parameter:

“angle”: <>

“curly”: {}

“round”: ()

“square”: []

By default, the inclusive parameter is set to True and the brackets are removed along with the text in between.

https://docs.dataprep.ai/user_guide/clean/clean_text.html

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Remove round-bracketed text; inclusive is left at its default.
custom_pipeline = [
    dict(operator="remove_bracketed", parameters=dict(brackets="round")),
]
clean_text(df, "text", pipeline=custom_pipeline)

#To remove the text but keep the brackets, set inclusive to False.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# inclusive=False: remove the text inside round brackets but keep the brackets themselves.
custom_pipeline = [
    dict(
        operator="remove_bracketed",
        parameters=dict(brackets="round", inclusive=False),
    ),
]
clean_text(df, "text", pipeline=custom_pipeline)

#The brackets parameter can also take in a set, which allows multiple bracket styles to be specified at a time.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Passing a set of styles handles several bracket types in a single step.
custom_pipeline = [
    dict(
        operator="remove_bracketed",
        parameters=dict(brackets={"round", "square", "curly", "angle"}),
    ),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>remove_digits</b></i></h1>


Remove all digits.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: remove all digits.
custom_pipeline = [
    dict(operator="remove_digits"),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>remove_html</b></i></h1>


Remove HTML tags, including the non-breaking space &nbsp;.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: remove HTML tags.
# NOTE(review): the sample rows appear to contain no HTML, so this may show no visible change.
custom_pipeline = [
    dict(operator="remove_html"),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>remove_prefixed</b></i></h1>


Remove substrings that start with the prefix(es) specified in the prefix parameter.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Remove substrings that start with the "#" prefix.
# NOTE(review): the sample rows appear to contain no "#"-prefixed tokens, so this may show no visible change.
custom_pipeline = [
    dict(operator="remove_prefixed", parameters=dict(prefix="#")),
]
clean_text(df, "text", pipeline=custom_pipeline)

#To specify multiple prefixes, pass in a set of the prefixes to the prefix parameter.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# A set of prefixes removes substrings starting with any of them.
custom_pipeline = [
    dict(operator="remove_prefixed", parameters=dict(prefix={"@", "#"})),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>remove_punctuation</b></i></h1>


Remove all punctuation marks defined in Python’s string.punctuation.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: remove punctuation marks.
custom_pipeline = [
    dict(operator="remove_punctuation"),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>remove_stopwords</b></i></h1>


Remove common words. By default, the set of stopwords to remove is NLTK’s English stopwords.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: remove stopwords using the default stopword set.
custom_pipeline = [
    dict(operator="remove_stopwords"),
]
clean_text(df, "text", pipeline=custom_pipeline)

#To use a custom set of words, pass the set into the stopwords parameter.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Remove only the words in this custom stopword set.
custom_pipeline = [
    dict(
        operator="remove_stopwords",
        parameters=dict(stopwords={"and", "as", "a", "of", "or"}),
    ),
]
clean_text(df, "text", pipeline=custom_pipeline)

#Alternatively, expand upon the default set of stopwords by importing dataprep.assets.english_stopwords and adding custom words.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

from dataprep.assets.english_stopwords import english_stopwords

# Work on a copy so the imported stopword collection itself is not modified.
custom_stopwords = english_stopwords.copy()
custom_stopwords.update(("or", "and"))

custom_pipeline = [
    dict(operator="remove_stopwords", parameters=dict(stopwords=custom_stopwords)),
]
clean_text(df, "text", pipeline=custom_pipeline)

In [None]:
#remove_urls
#Remove URLs. Substrings that start with “http” or “www” are considered URLs.

#custom_pipeline = [{"operator": "remove_urls"}]
#clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>remove_whitespace</b></i></h1>


Remove extra spaces (two or more) along with tabs and newlines. Leading and trailing spaces are also removed.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Single-step pipeline: collapse extra whitespace and trim leading/trailing spaces.
custom_pipeline = [
    dict(operator="remove_whitespace"),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>replace_bracketed</b></i></h1>


Replace text between brackets with the value.

The style of the brackets can be specified using the brackets parameter:

“angle”: <>

“curly”: {}

“round”: ()

“square”: []

By default, the inclusive parameter is set to True and the brackets are also replaced by the value along with the text in between.

https://docs.dataprep.ai/user_guide/clean/clean_text.html

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Replace bracketed text (brackets included) with a fixed marker.
# The sample rows only use round brackets, so "round" is the style that
# produces a visible replacement; "square" matches nothing in this column.
custom_pipeline = [
    {
        "operator": "replace_bracketed",
        "parameters": {"brackets": "round", "value": "**SPOILERS**"},
    }
]
clean_text(df, "text", pipeline=custom_pipeline)

#To replace the text, but keep the brackets, set inclusive to False.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# inclusive=False: replace only the text between the brackets and keep the brackets.
# The sample rows only use round brackets, so "round" is the style that
# produces a visible replacement; "square" matches nothing in this column.
custom_pipeline = [
    {
        "operator": "replace_bracketed",
        "parameters": {
            "brackets": "round",
            "value": "**SPOILERS**",
            "inclusive": False,
        },
    }
]
clean_text(df, "text", pipeline=custom_pipeline)

#The brackets parameter can also take in a set, which allows multiple bracket styles to be specified at a time.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# One replacement value applied to every listed bracket style.
custom_pipeline = [
    dict(
        operator="replace_bracketed",
        parameters=dict(
            brackets={"round", "square", "curly", "angle"},
            value="<REDACTED>",
        ),
    ),
]
clean_text(df, "text", pipeline=custom_pipeline)

#To assign different replacement values to different bracket styles, chain together replace_bracketed operations.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Chain one replace_bracketed step per bracket style to give each its own replacement.
# NOTE(review): the sample rows appear to contain only round brackets, so neither
# step may change anything here — confirm against the displayed output.
custom_pipeline = [
    dict(
        operator="replace_bracketed",
        parameters=dict(brackets=style, value=replacement),
    )
    for style, replacement in (
        ("square", "**SPOILERS**"),
        ("curly", "in every aspect."),
    )
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>replace_digits</b></i></h1>


Replace all digits with the value. By default, the block parameter is set to True and only blocks of digits, i.e. tokens composed solely of numbers, are replaced.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Replace digits with "X"; block is left at its default.
custom_pipeline = [
    dict(operator="replace_digits", parameters=dict(value="X")),
]
clean_text(df, "text", pipeline=custom_pipeline)

#To replace all digits appearing in the text, set block to False.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# block=False: replace every digit wherever it appears in the text.
custom_pipeline = [
    dict(operator="replace_digits", parameters=dict(value="X", block=False)),
]
clean_text(df, "text", pipeline=custom_pipeline)

In [None]:
#replace_prefixed

#Replace all substrings that start with the prefix(es) specified in the prefix parameter with the value.

#custom_pipeline = [
 #   {
  #      "operator": "replace_prefixed",
   #     "parameters": {"prefix": "#", "value": "<HASHTAG>"},
   # }
#]
#clean_text(df, "text", pipeline=custom_pipeline)

In [None]:
#To replace substrings of different prefixes with the same value, pass in a set of the prefixes to the prefix parameter.

#custom_pipeline = [
 #   {
  #      "operator": "replace_prefixed",
   #     "parameters": {"prefix": {"#", "@"}, "value": "<TAG>"},
    #}
#]
#clean_text(df, "text", pipeline=custom_pipeline)


<h1 style="font-size:180%; color:orange;"><i><b>replace_punctuation</b></i></h1>

Replace all punctuation marks defined in string.punctuation with the value.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Replace each punctuation mark with a visible marker.
custom_pipeline = [
    dict(operator="replace_punctuation", parameters=dict(value="<PUNC>")),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>replace_stopwords</b></i></h1>


Replace common words with the value. By default, the set of stopwords to replace is NLTK’s English stopwords.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Replace stopwords from the default set with a marker.
custom_pipeline = [
    dict(operator="replace_stopwords", parameters=dict(value="<S>")),
]
clean_text(df, "text", pipeline=custom_pipeline)

#To use a custom set of words, pass the set into the stopwords parameter.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Replace a custom set of stopwords with a marker.
# The words are chosen from the sample text ("reality", "world") so the
# replacement is visible; "imdb" and "film" do not occur in this column.
custom_pipeline = [
    {
        "operator": "replace_stopwords",
        "parameters": {"stopwords": {"reality", "world"}, "value": "<S>"},
    }
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>replace_text</b></i></h1>


Replace a sequence of characters with another according to the mapping specified in the value parameter. By default, block is set to True and only blocks of text, i.e. tokens composed solely of the specified sequence of characters, are replaced.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# Map each listed character sequence to its replacement; block is left at its default.
custom_pipeline = [
    dict(
        operator="replace_text",
        parameters=dict(value={"graphics": "charts", "real": "existent"}),
    ),
]
clean_text(df, "text", pipeline=custom_pipeline)

#To replace the sequence of characters wherever they appear in the text, set block to False.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

# block=False: replace the sequences wherever they occur, including inside longer tokens.
custom_pipeline = [
    dict(
        operator="replace_text",
        parameters=dict(
            value={"graphics": "charts", "real": "existent"},
            block=False,
        ),
    ),
]
clean_text(df, "text", pipeline=custom_pipeline)

<h1 style="font-size:180%; color:orange;"><i><b>replace_urls</b></i></h1>

Replace URLs with the value. Substrings that start with “http” or “www” are considered URLs.

In [None]:
#Code by https://docs.dataprep.ai/user_guide/clean/clean_text.html

#custom_pipeline = [{"operator": "replace_urls", "parameters": {"value": "<URL>"}}]
#clean_text(df, "text", pipeline=custom_pipeline)

#Thanks Again to Dataprep Clean - SFU Data Science Research Group - SIMON FRASER UNIVERSITY

#https://docs.dataprep.ai/user_guide/clean/introduction.html