In [1]:
import nltk
from nltk.corpus import gutenberg
from pprint import pprint

# Download necessary datasets
nltk.download('gutenberg')
nltk.download('punkt')

# Load "Alice in Wonderland" text
alice = gutenberg.raw(fileids='carroll-alice.txt')


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Sample text
sample_text = (
    'We will discuss briefly about the basic syntax, structure and design philosophies. '
    'There is a defined hierarchical syntax for Python code which you should remember '
    'when writing code! Python is a really powerful programming language!'
)

In [3]:
# Default sentence tokenizer
default_st = nltk.sent_tokenize
alice_sentences = default_st(text=alice)
sample_sentences = default_st(text=sample_text)

In [4]:
# Print results
print('Total sentences in sample_text:', len(sample_sentences))
print('Sample text sentences:')
pprint(sample_sentences)


Total sentences in sample_text: 3
Sample text sentences:
['We will discuss briefly about the basic syntax, structure and design '
 'philosophies.',
 'There is a defined hierarchical syntax for Python code which you should '
 'remember when writing code!',
 'Python is a really powerful programming language!']


In [5]:
print('\nTotal sentences in alice:', len(alice_sentences))
print('First 5 sentences in alice:')
pprint(alice_sentences[0:3])


Total sentences in alice: 1625
First 5 sentences in alice:
["[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I.",
 'Down the Rabbit-Hole\n'
 '\n'
 'Alice was beginning to get very tired of sitting by her sister on the\n'
 'bank, and of having nothing to do: once or twice she had peeped into the\n'
 'book her sister was reading, but it had no pictures or conversations in\n'
 "it, 'and what is the use of a book,' thought Alice 'without pictures or\n"
 "conversation?'",
 'So she was considering in her own mind (as well as she could, for the\n'
 'hot day made her feel very sleepy and stupid), whether the pleasure\n'
 'of making a daisy-chain would be worth the trouble of getting up and\n'
 'picking the daisies, when suddenly a White Rabbit with pink eyes ran\n'
 'close by her.']


In [6]:
# Word tokenization
sentence = "The brown fox wasn't that quick and he couldn't win the race"

In [7]:
from nltk.tokenize import word_tokenize

sentence = "Don't you know that the U.S. is a big country?"
words = word_tokenize(sentence)
print(words)


['Do', "n't", 'you', 'know', 'that', 'the', 'U.S.', 'is', 'a', 'big', 'country', '?']


In [8]:
from nltk.tokenize import TreebankWordTokenizer

treebank_wt = TreebankWordTokenizer()
sentence = "Don't you know that the U.S. is a big country?"
words = treebank_wt.tokenize(sentence)
print(words)


['Do', "n't", 'you', 'know', 'that', 'the', 'U.S.', 'is', 'a', 'big', 'country', '?']


In [9]:
# Regex word tokenizer
TOKEN_PATTERN = r'\w+'
regex_wt = nltk.RegexpTokenizer(pattern=TOKEN_PATTERN, gaps=False)
words = regex_wt.tokenize( "Don't you know that the U.S. is a big country?")
print(words)

['Don', 't', 'you', 'know', 'that', 'the', 'U', 'S', 'is', 'a', 'big', 'country']


In [10]:
# Another sample sentence
sentence = 'The brown fox is quick and he is jumping over the lazy dog'

In [11]:
# Downloading necessary datasets for tagging
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


True

In [12]:
# Part-of-speech tagging
tokens = nltk.word_tokenize(sentence)
tagged_sent = nltk.pos_tag(tokens, tagset='universal')
print(tagged_sent)

[('The', 'DET'), ('brown', 'ADJ'), ('fox', 'NOUN'), ('is', 'VERB'), ('quick', 'ADJ'), ('and', 'CONJ'), ('he', 'PRON'), ('is', 'VERB'), ('jumping', 'VERB'), ('over', 'ADP'), ('the', 'DET'), ('lazy', 'ADJ'), ('dog', 'NOUN')]


In [13]:
# Building your own tagger
from nltk.corpus import treebank
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


True

In [14]:
# Preparing the data
data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]

print(train_data[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


In [15]:
# Default tagger
from nltk.tag import DefaultTagger
dt = DefaultTagger('NN')
print(dt.evaluate(test_data))
print(dt.tag(tokens))

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print(dt.evaluate(test_data))


0.1454158195372253
[('The', 'NN'), ('brown', 'NN'), ('fox', 'NN'), ('is', 'NN'), ('quick', 'NN'), ('and', 'NN'), ('he', 'NN'), ('is', 'NN'), ('jumping', 'NN'), ('over', 'NN'), ('the', 'NN'), ('lazy', 'NN'), ('dog', 'NN')]


In [16]:
# Default tagger
from nltk.tag import DefaultTagger
dt = DefaultTagger('NN')
print(dt.evaluate(test_data))
print(dt.tag(tokens))

  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print(dt.evaluate(test_data))


0.1454158195372253
[('The', 'NN'), ('brown', 'NN'), ('fox', 'NN'), ('is', 'NN'), ('quick', 'NN'), ('and', 'NN'), ('he', 'NN'), ('is', 'NN'), ('jumping', 'NN'), ('over', 'NN'), ('the', 'NN'), ('lazy', 'NN'), ('dog', 'NN')]


In [17]:
# Text normalization
import re
import string

corpus = [
    "The brown fox wasn't that quick and he couldn't win the race",
    "Hey that's a great deal! I just bought a phone for $199",
    "@@You'll (learn) a **lot** in the book. Python is an amazing language!@@"
]


In [18]:
def tokenize_text(text):
    sentences = nltk.sent_tokenize(text)
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
    return word_tokens

token_list = [tokenize_text(text) for text in corpus]
pprint(token_list)

[[['The',
   'brown',
   'fox',
   'was',
   "n't",
   'that',
   'quick',
   'and',
   'he',
   'could',
   "n't",
   'win',
   'the',
   'race']],
 [['Hey', 'that', "'s", 'a', 'great', 'deal', '!'],
  ['I', 'just', 'bought', 'a', 'phone', 'for', '$', '199']],
 [['@',
   '@',
   'You',
   "'ll",
   '(',
   'learn',
   ')',
   'a',
   '*',
   '*',
   'lot',
   '*',
   '*',
   'in',
   'the',
   'book',
   '.'],
  ['Python', 'is', 'an', 'amazing', 'language', '!'],
  ['@', '@']]]


In [19]:
# Removing extra characters
def remove_characters_after_tokenization(tokens):
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub('', token) for token in tokens])
    return filtered_tokens

filtered_list_1 = [
    filter(None, [remove_characters_after_tokenization(tokens) for tokens in sentence_tokens])
    for sentence_tokens in token_list
]

print(filtered_list_1)

[<filter object at 0x000002C32F007670>, <filter object at 0x000002C32F006F50>, <filter object at 0x000002C32F007A60>]


In [20]:
# Case conversion
print(corpus[0].lower())
print(corpus[0].upper())

nltk.download('stopwords')


the brown fox wasn't that quick and he couldn't win the race
THE BROWN FOX WASN'T THAT QUICK AND HE COULDN'T WIN THE RACE


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [21]:
# Removing stopwords
def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens

cleaned_corpus_tokens = [
    remove_stopwords([token for tokens in sentence_tokens for token in tokens])
    for sentence_tokens in filtered_list_1
]

pprint(cleaned_corpus_tokens)


[['The', 'brown', 'fox', 'nt', 'quick', 'could', 'nt', 'win', 'race'],
 ['Hey', 'great', 'deal', 'I', 'bought', 'phone', '199'],
 ['You', 'learn', 'lot', 'book', 'Python', 'amazing', 'language']]


In [22]:
pip install contractions


Collecting contractions
  Obtaining dependency information for contractions from https://files.pythonhosted.org/packages/bb/e4/725241b788963b460ce0118bfd5c505dd3d1bdd020ee740f9f39044ed4a7/contractions-0.1.73-py2.py3-none-any.whl.metadata
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Obtaining dependency information for textsearch>=0.0.21 from https://files.pythonhosted.org/packages/e2/0f/6f08dd89e9d71380a369b1f5b6c97a32d62fc9cfacc1c5b8329505b9e495/textsearch-0.0.24-py2.py3-none-any.whl.metadata
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Obtaining dependency information for anyascii from https://files.pythonhosted.org/packages/4f/7b/a9a747e0632271d855da379532b05a62c58e979813814a57fa3b3afeb3a4/anyascii-0.3.2-py3-none-any.whl.metadata
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick 

In [23]:
import contractions

# Example sentence with contractions
sentence = "You're going to do it, aren't you?"

# Expand contractions using the `fix` function
expanded_sentence = contractions.fix(sentence)

print(expanded_sentence)


You are going to do it, are not you?


In [24]:
pip install textblob


Collecting textblob
  Obtaining dependency information for textblob from https://files.pythonhosted.org/packages/02/07/5fd2945356dd839974d3a25de8a142dc37293c21315729a41e775b5f3569/textblob-0.18.0.post0-py3-none-any.whl.metadata
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
    --------------------------------------- 10.2/626.3 kB ? eta -:--:--
    --------------------------------------- 10.2/626.3 kB ? eta -:--:--
    --------------------------------------- 10.2/626.3 kB ? eta -:--:--
   - ------------------------------------- 30.7/626.3 kB 163.8 kB/s eta 0:00:04
   --- ----------------------------------- 61.4/626.3 kB 252.2 kB/s eta 0:00:03
   ---- ---------------------------------- 71.7/626.3 kB 280.5 kB/s eta 0:00:02
   -------- ----------------------------- 1

In [25]:
from textblob import TextBlob

def normalize_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Perform spell checking and correction
    text = str(TextBlob(text).correct())
    
    return text

# Example usage
sentence = "I have a speling mistake in this sentense."
normalized_sentence = normalize_text(sentence)

print(normalized_sentence)


i have a spelling mistake in this sentence.
