<a href="https://colab.research.google.com/github/rahiakela/nlp-research-and-practice/blob/main/nlp-in-real-world/01-data-processing-and-modeling/01_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
%%capture

!pip -q install spacy
!pip install textblob==0.17.1
!pip install pyspellchecker==0.7.0
!pip install pyenchant==3.2.2
!python -m spacy download en_core_web_sm

In [16]:
import re
import spacy

from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import word_tokenize, pos_tag

from textblob import TextBlob
from spellchecker import SpellChecker

In [3]:
import nltk

# Data packages used by sent_tokenize / word_tokenize, the stopword list and
# pos_tag. Recent NLTK releases (3.9+) look up "punkt_tab" and
# "averaged_perceptron_tagger_eng" instead of the older pickled "punkt" and
# "averaged_perceptron_tagger" packages. NLTK is not pinned in this notebook,
# so both generations are fetched to keep the cells below working either way.
NLTK_RESOURCES = [
  "punkt",
  "punkt_tab",
  "stopwords",
  "averaged_perceptron_tagger",
  "averaged_perceptron_tagger_eng",
]

# A list (not a generator) is built on purpose so that every download is
# attempted even if an earlier one fails; the cell's value is True only when
# all of them succeeded.
all([nltk.download(resource) for resource in NLTK_RESOURCES])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Segmentation

In [4]:
# Load the small English spaCy pipeline once; `nlp` is reused by later cells.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline on a short text and print each detected sentence on its own line.
doc = nlp("Hi!. I like NLP. Do you??")
for sentence in doc.sents:
  print(sentence.text)

Hi!.
I like NLP.
Do you??


In [5]:
# Sentence segmentation with NLTK: the result is a plain list of strings.
sample = "I like it. Did you like it too?"
sentences = sent_tokenize(sample)
print(sentences)

['I like it.', 'Did you like it too?']


### Word tokenization

In [6]:
# Word tokenization with TextBlob. In the recorded output of this cell the
# punctuation tokens are absent and "U.K." comes back as "U.K".
text = "Hi! I like NLP. Do you?? Do you live in the U.K.?"

blob = TextBlob(text)
tokens = blob.words
print(tokens)

['Hi', 'I', 'like', 'NLP', 'Do', 'you', 'Do', 'you', 'live', 'in', 'the', 'U.K']


In [7]:
# Word tokenization with NLTK on the same sentence. In the recorded output of
# this cell punctuation marks are separate tokens and "U.K." stays intact.
text = "Hi! I like NLP. Do you?? Do you live in the U.K.?"
tokens = word_tokenize(text)

print(tokens)

['Hi', '!', 'I', 'like', 'NLP', '.', 'Do', 'you', '?', '?', 'Do', 'you', 'live', 'in', 'the', 'U.K.', '?']


In [8]:
# Word tokenization with spaCy: iterating a Doc yields its Token objects.
# Relies on `nlp`, the pipeline loaded in the segmentation section.
text = "Hi! I like NLP. Do you?? Do you live in the U.K.?"
doc = nlp(text)

print(list(doc))

[Hi, !, I, like, NLP, ., Do, you, ?, ?, Do, you, live, in, the, U.K., ?]


### Part-of-speech tagging

In [9]:
# Part-of-speech tagging with NLTK: tokenize first, then tag the token list.
# pos_tag returns (token, tag) pairs, as the recorded output shows.
sentence = "Can you please buy me an Arizona Ice Tea? It's $0.57."
tokens = word_tokenize(sentence)
pos = pos_tag(tokens)

print(pos)

[('Can', 'MD'), ('you', 'PRP'), ('please', 'VB'), ('buy', 'VB'), ('me', 'PRP'), ('an', 'DT'), ('Arizona', 'NNP'), ('Ice', 'NNP'), ('Tea', 'NNP'), ('?', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('$', '$'), ('0.57', 'CD'), ('.', '.')]


### N-grams

In [10]:
# Bigrams (n=2) of a three-word phrase; the cell's value is the list of
# WordList objects, one per n-gram.
text = "natural language processing"
blob = TextBlob(text)
blob.ngrams(n=2)

[WordList(['natural', 'language']), WordList(['language', 'processing'])]

## Cleaning

In [11]:
# Punctuation removal
# Every character that is neither a word character (\w) nor whitespace (\s)
# is deleted. Note that "_" counts as a word character, so underscores survive.
NON_WORD_NON_SPACE = re.compile(r"[^\w\s]")

text = "Hi. I like NLP, do you?"
punc_cleaned = NON_WORD_NON_SPACE.sub("", text)
print(punc_cleaned)

Hi I like NLP do you


In [12]:
# URL removal
# Two alternatives are stripped: anything starting with http:// or https://,
# and anything starting with "www."; each match runs up to the next whitespace.
# The e-mail address matches neither alternative and is left in place.
URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")

text = """
Check it out on https://google.com or www.google.com for more information. Reach out to abc@xyz.com for inquiries.
"""

url_cleaned = URL_PATTERN.sub("", text)
print(url_cleaned)


Check it out on  or  for more information. Reach out to abc@xyz.com for inquiries.



In [13]:
# Emoji removal
# The character class spans U+10000..U+10FFFF, i.e. every code point outside
# the Basic Multilingual Plane. That covers the emoji used here, but it is not
# limited to emoji and does not match symbols that live inside the BMP.
SUPPLEMENTARY_PLANE_CHARS = re.compile(r"[\U00010000-\U0010ffff]")

text = "What does 😲 emoji mean?"
emoji_cleaned = SUPPLEMENTARY_PLANE_CHARS.sub("", text)
print(emoji_cleaned)

What does  emoji mean?


## Spelling correction

In [18]:
spell = SpellChecker()

# Ask the checker which of these candidate words it considers possibly misspelled.
candidates = ["mispell", "craazy", "craaaazy"]
misspelled = spell.unknown(candidates)

# Print the single most likely correction for each flagged word.
# In the recorded output "craaaazy" gets no suggestion and prints as None.
for word in misspelled:
  print(f"{word} -> {spell.correction(word)}")

craazy -> crazy
mispell -> misspell
craaaazy -> None


In [19]:
# Spelling correction with TextBlob on two variants of the same sentence.
# The second variant has one more "u" in "suuuree"; in the recorded output
# that is enough to turn the suggestion from "sure" into "suture".
for data in (
  "Are yu suuree about your decisiion?",
  "Are yu suuuree about your decisiion?",
):
  output = TextBlob(data).correct()
  print(output)

Are you sure about your decision?
Are you suture about your decision?


In [None]:
# Alternative spell-checking example based on pyenchant, kept commented out.
# NOTE(review): if this cell is re-enabled as written, its import rebinds the
# name `SpellChecker`, which the imports cell already binds to
# `spellchecker.SpellChecker`; import it under an alias to avoid breaking the
# pyspellchecker cell on re-run.

# from enchant.checker import SpellChecker

# # Creating the SpellChecker object
# chkr = SpellChecker("en_US")

# # Spelling error detection
# chkr.set_text("This is sme sample txt with erors.")

# for err in chkr:
#     corrections = chkr.suggest(err.word)
#     if len(corrections) > 0:
#         # Get top likely correction
#         correction = corrections[0]
#         print("ERROR:", err.word, "Correction:", correction)

## Stopwords removal