In [14]:
#Tokenization with Regex
import re

In [15]:
raw="""'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
... though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
... well without--Maybe it's always pepper that makes people hot-tempered,'..."""

In [16]:
re.split(r' ',raw)
["'When", "I'M", 'a', "Duchess,'", 'she', 'said', 'to', 'herself,', '(not', 'in',
'a', 'very', 'hopeful', 'tone\nthough),', "'I", "won't", 'have', 'any', 'pepper',
'in', 'my', 'kitchen', 'AT', 'ALL.', 'Soup', 'does', 'very\nwell', 'without--Maybe',
"it's", 'always', 'pepper', 'that', 'makes', 'people', "hot-tempered,'..."]
re.split(r'[ \t\n]+',raw)

["'When",
 "I'M",
 'a',
 "Duchess,'",
 'she',
 'said',
 'to',
 'herself,',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though),',
 "'I",
 "won't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL.',
 'Soup',
 'does',
 'very',
 'well',
 'without--Maybe',
 "it's",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 "hot-tempered,'..."]

In [20]:
re.split(r'\W+', raw)##We can use \W in a simple regular expression to split the input on anything other than a word character

['',
 'When',
 'I',
 'M',
 'a',
 'Duchess',
 'she',
 'said',
 'to',
 'herself',
 'not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 'I',
 'won',
 't',
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 'Maybe',
 'it',
 's',
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot',
 'tempered',
 '']

In [21]:
re.findall(r'\w+|\S\w*',raw)

["'When",
 'I',
 "'M",
 'a',
 'Duchess',
 ',',
 "'",
 'she',
 'said',
 'to',
 'herself',
 ',',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 ')',
 ',',
 "'I",
 'won',
 "'t",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 '.',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 '-',
 '-Maybe',
 'it',
 "'s",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot',
 '-tempered',
 ',',
 "'",
 '.',
 '.',
 '.']

In [22]:
#Let's generalize the \w+ in the preceding expression to permit word-internal hyphens
#and apostrophes: «\w+([-']\w+)*». This expression means \w+ followed by zero or more
#instances of [-']\w+; it would match hot-tempered and it's.(We need to include ?: in
#this expression for reasons discussed earlier.) We'll also add a pattern to match quote
#characters so these are kept separate from the text they enclose.
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*",raw))

["'", 'When', "I'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'", 'I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '--', 'Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered', ',', "'", '...']


In [23]:
#Natural Language Processing with Python NLTK
#Install nltk pip install NLTK

In [38]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [39]:
import nltk

In [40]:
nltk.__version__

'3.8.1'

In [33]:
#pip list gives you the list of installed packages

In [44]:
#nltk.download()#download all packages for NLTK

In [35]:
#DiCaprio's Oscars Acceptance Speech
#https://www.bustle.com/articles/144803-transcript-of-leonardo-dicaprios-oscars-acceptance-speech-gets-political-about-climate-change-video

paragraph="""Thank you all so very much. Thank you to the Academy.
               Thank you to all of you in this room. I have to congratulate
               the other incredible nominees this year. The Revenant was
               the product of the tireless efforts of an unbelievable cast
               and crew. First off, to my brother in this endeavor, Mr. Tom
               Hardy. Tom, your talent on screen can only be surpassed by
               your friendship off screen … thank you for creating a t
               ranscendent cinematic experience. Thank you to everybody at
               Fox and New Regency … my entire team. I have to thank
               everyone from the very onset of my career … To my parents;
               none of this would be possible without you. And to my
               friends, I love you dearly; you know who you are. And lastly,
               I just want to say this: Making The Revenant was about
               man's relationship to the natural world. A world that we
               collectively felt in 2015 as the hottest year in recorded
               history. Our production needed to move to the southern
               tip of this planet just to be able to find snow. Climate
               change is real, it is happening right now. It is the most
               urgent threat facing our entire species, and we need to work
               collectively together and stop procrastinating. We need to
               support leaders around the world who do not speak for the
               big polluters, but who speak for all of humanity, for the
               indigenous people of the world, for the billions and
               billions of underprivileged people out there who would be
               most affected by this. For our children’s children, and
               for those people out there whose voices have been drowned
               out by the politics of greed. I thank you all for this
               amazing award tonight. Let us not take this planet for
               granted. I do not take tonight for granted. Thank you so very much."""

In [36]:
paragraph

"Thank you all so very much. Thank you to the Academy. \n               Thank you to all of you in this room. I have to congratulate \n               the other incredible nominees this year. The Revenant was \n               the product of the tireless efforts of an unbelievable cast\n               and crew. First off, to my brother in this endeavor, Mr. Tom \n               Hardy. Tom, your talent on screen can only be surpassed by \n               your friendship off screen … thank you for creating a t\n               ranscendent cinematic experience. Thank you to everybody at \n               Fox and New Regency … my entire team. I have to thank \n               everyone from the very onset of my career … To my parents; \n               none of this would be possible without you. And to my \n               friends, I love you dearly; you know who you are. And lastly,\n               I just want to say this: Making The Revenant was about\n               man's relationship to the nat

In [42]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [43]:
#Tokenizing sentences
sentences=nltk.sent_tokenize(paragraph)

In [45]:
len(sentences)

21

In [46]:
type(sentences)

list

In [47]:
sentences[0]

'Thank you all so very much.'

In [48]:
for x in sentences:
  print(x)

Thank you all so very much.
Thank you to the Academy.
Thank you to all of you in this room.
I have to congratulate 
               the other incredible nominees this year.
The Revenant was 
               the product of the tireless efforts of an unbelievable cast
               and crew.
First off, to my brother in this endeavor, Mr. Tom 
               Hardy.
Tom, your talent on screen can only be surpassed by 
               your friendship off screen … thank you for creating a t
               ranscendent cinematic experience.
Thank you to everybody at 
               Fox and New Regency … my entire team.
I have to thank 
               everyone from the very onset of my career … To my parents; 
               none of this would be possible without you.
And to my 
               friends, I love you dearly; you know who you are.
And lastly,
               I just want to say this: Making The Revenant was about
               man's relationship to the natural world.
A world that we
  

In [49]:
#Tokenizing words
words=nltk.word_tokenize(paragraph)

In [50]:
len(words)

347

In [51]:
words

['Thank',
 'you',
 'all',
 'so',
 'very',
 'much',
 '.',
 'Thank',
 'you',
 'to',
 'the',
 'Academy',
 '.',
 'Thank',
 'you',
 'to',
 'all',
 'of',
 'you',
 'in',
 'this',
 'room',
 '.',
 'I',
 'have',
 'to',
 'congratulate',
 'the',
 'other',
 'incredible',
 'nominees',
 'this',
 'year',
 '.',
 'The',
 'Revenant',
 'was',
 'the',
 'product',
 'of',
 'the',
 'tireless',
 'efforts',
 'of',
 'an',
 'unbelievable',
 'cast',
 'and',
 'crew',
 '.',
 'First',
 'off',
 ',',
 'to',
 'my',
 'brother',
 'in',
 'this',
 'endeavor',
 ',',
 'Mr.',
 'Tom',
 'Hardy',
 '.',
 'Tom',
 ',',
 'your',
 'talent',
 'on',
 'screen',
 'can',
 'only',
 'be',
 'surpassed',
 'by',
 'your',
 'friendship',
 'off',
 'screen',
 '…',
 'thank',
 'you',
 'for',
 'creating',
 'a',
 't',
 'ranscendent',
 'cinematic',
 'experience',
 '.',
 'Thank',
 'you',
 'to',
 'everybody',
 'at',
 'Fox',
 'and',
 'New',
 'Regency',
 '…',
 'my',
 'entire',
 'team',
 '.',
 'I',
 'have',
 'to',
 'thank',
 'everyone',
 'from',
 'the',
 've