### Text Cleaning and Tokenization ###

In [1]:
"""
Tokenization splits a sentence into its components
Cleaning occurs through the use of regular expressions
A regular expression is a set of characters in a given order that represents a pattern
"""

import re

sentence = "Sunil tweeted, 'Witnessing 70th Republic Day of India from Rajpath, \
           New Dehli.  Mesmerizing performance by Indian Army!  Awesome airshow! @india_official \
           @indian_army #India #70thRepublic_Day.  For more photos ping me @sunil@photoking.com :)'"

In [2]:
regextext = re.sub(r"([^\s\w]|_)+", " ", sentence).split()
regextext

['Sunil',
 'tweeted',
 'Witnessing',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath',
 'New',
 'Dehli',
 'Mesmerizing',
 'performance',
 'by',
 'Indian',
 'Army',
 'Awesome',
 'airshow',
 'india',
 'official',
 'indian',
 'army',
 'India',
 '70thRepublic',
 'Day',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil',
 'photoking',
 'com']

### Extraction of N-Grams ###

In [3]:
#Definition of a Function for Extraction
import re

def n_gram_extractor(sentence, n):
    tokens = re.sub(r"([^\s\w]|_)+", " ", sentence).split()
    for i in range(len(tokens)- n + 1):
        print(tokens[i: i + n])

In [4]:
#Extraction of Bi-Grams
bgex = n_gram_extractor("The cute little girl is playing with the kitten.", 2)
bgex

['The', 'cute']
['cute', 'little']
['little', 'girl']
['girl', 'is']
['is', 'playing']
['playing', 'with']
['with', 'the']
['the', 'kitten']


In [5]:
#Tri-Grams
tgex = n_gram_extractor("The cute little girl is playing with the kitten.", 3)
tgex

['The', 'cute', 'little']
['cute', 'little', 'girl']
['little', 'girl', 'is']
['girl', 'is', 'playing']
['is', 'playing', 'with']
['playing', 'with', 'the']
['with', 'the', 'kitten']


In [6]:
#NLTK and Bi-Grams
from nltk import ngrams
bgnltk = list(ngrams("The cute little girl is playing with the kitten.".split(), 2))
bgnltk

[('The', 'cute'),
 ('cute', 'little'),
 ('little', 'girl'),
 ('girl', 'is'),
 ('is', 'playing'),
 ('playing', 'with'),
 ('with', 'the'),
 ('the', 'kitten.')]

In [7]:
#NLTK and Tri-Grams
from nltk import ngrams
tgnltk = list(ngrams("The cute little girl is playing with the kitten.".split(), 3))
tgnltk

[('The', 'cute', 'little'),
 ('cute', 'little', 'girl'),
 ('little', 'girl', 'is'),
 ('girl', 'is', 'playing'),
 ('is', 'playing', 'with'),
 ('playing', 'with', 'the'),
 ('with', 'the', 'kitten.')]

In [8]:
#TextBlob and Bi-Grams
from textblob import TextBlob
blob = TextBlob("The cute little girl is playing with the kitten.")
bgblob = blob.ngrams(n=2)
bgblob

[WordList(['The', 'cute']),
 WordList(['cute', 'little']),
 WordList(['little', 'girl']),
 WordList(['girl', 'is']),
 WordList(['is', 'playing']),
 WordList(['playing', 'with']),
 WordList(['with', 'the']),
 WordList(['the', 'kitten'])]

In [9]:
#TextBlob and Tri-Grams
from textblob import TextBlob
blob = TextBlob("The cute little girl is playing with the kitten.")
tgblob = blob.ngrams(n=3)
tgblob

[WordList(['The', 'cute', 'little']),
 WordList(['cute', 'little', 'girl']),
 WordList(['little', 'girl', 'is']),
 WordList(['girl', 'is', 'playing']),
 WordList(['is', 'playing', 'with']),
 WordList(['playing', 'with', 'the']),
 WordList(['with', 'the', 'kitten'])]

### Tokenization of Text with Keras and TextBlob ###

In [10]:
from keras.preprocessing.text import text_to_word_sequence
from textblob import TextBlob
sentence = "Sunil tweeted, 'Witnessing the 70th Republic Day of India from Rajpath, \
           New Dehli.  Mesmerizing performance by the Indian Army!  Awesome airshow!  @india_official \
           @indian_army #India #70thRepublic_Day.  For more photos ping me sunil@photoking.com :)'"

Using TensorFlow backend.


In [11]:
#Tokenization with Keras
kerastoken = text_to_word_sequence(sentence)
kerastoken

['sunil',
 'tweeted',
 "'witnessing",
 'the',
 '70th',
 'republic',
 'day',
 'of',
 'india',
 'from',
 'rajpath',
 'new',
 'dehli',
 'mesmerizing',
 'performance',
 'by',
 'the',
 'indian',
 'army',
 'awesome',
 'airshow',
 'india',
 'official',
 'indian',
 'army',
 'india',
 '70threpublic',
 'day',
 'for',
 'more',
 'photos',
 'ping',
 'me',
 'sunil',
 'photoking',
 'com',
 "'"]

In [12]:
#Tokenization with TextBlob
tbtoken = TextBlob(sentence)
blob = tbtoken.words
blob

WordList(['Sunil', 'tweeted', "'Witnessing", 'the', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', 'New', 'Dehli', 'Mesmerizing', 'performance', 'by', 'the', 'Indian', 'Army', 'Awesome', 'airshow', 'india_official', 'indian_army', 'India', '70thRepublic_Day', 'For', 'more', 'photos', 'ping', 'me', 'sunil', 'photoking.com'])

### Tokenization of Text with Various Tokenizers ###

In [13]:
"""
Tweet Tokenizer
MWE (Multi-Word Expression) Tokenizer
Regular Expression Tokenizer
Whitespace Tokenizer
Word Punkt Tokenizer
"""

sentence = "Sunil tweeted, 'Witnessing the 70th Republic Day of India from Rajpath, \
           New Dehli.  Mesmerizing performance by the Indian Army!  Awesome airshow!  @india_official \
           @indian_army #India #70thRepublic_Day.  For more photos ping me sunil@photoking.com :)'"

In [14]:
#Tweet Tokenizer
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()
tttoken = tweet_tokenizer.tokenize(sentence)
tttoken

['Sunil',
 'tweeted',
 ',',
 "'",
 'Witnessing',
 'the',
 '70th',
 'Republic',
 'Day',
 'of',
 'India',
 'from',
 'Rajpath',
 ',',
 'New',
 'Dehli',
 '.',
 'Mesmerizing',
 'performance',
 'by',
 'the',
 'Indian',
 'Army',
 '!',
 'Awesome',
 'airshow',
 '!',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day',
 '.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil@photoking.com',
 ':)',
 "'"]

In [15]:
#MWE (Multi-Word Expression) Tokenizer
from nltk.tokenize import MWETokenizer
mwe_tokenizer = MWETokenizer([("Republic", "Day")])
mwe_tokenizer.add_mwe(("Indian", "Army"))
mwetoken = mwe_tokenizer.tokenize(sentence.replace("!", "").split())
mwetoken

['Sunil',
 'tweeted,',
 "'Witnessing",
 'the',
 '70th',
 'Republic_Day',
 'of',
 'India',
 'from',
 'Rajpath,',
 'New',
 'Dehli.',
 'Mesmerizing',
 'performance',
 'by',
 'the',
 'Indian_Army',
 'Awesome',
 'airshow',
 '@india_official',
 '@indian_army',
 '#India',
 '#70thRepublic_Day.',
 'For',
 'more',
 'photos',
 'ping',
 'me',
 'sunil@photoking.com',
 ":)'"]