In [2]:
from nltk.tokenize import RegexpTokenizer

In [3]:
s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
tokenizer.tokenize(s) 

['Good',
 'muffins',
 'cost',
 '$3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [4]:
tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokenizer.tokenize(s) 

['Good',
 'muffins',
 'cost',
 '$3.88',
 'in',
 'New',
 'York.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them.',
 'Thanks.']

In [5]:
capword_tokenizer = RegexpTokenizer(r'[A-Z]\w+')
capword_tokenizer.tokenize(s)

['Good', 'New', 'York', 'Please', 'Thanks']

In [6]:
from nltk.tokenize import BlanklineTokenizer

In [7]:
# Uses '\s*\n\s*\n\s*':
BlanklineTokenizer().tokenize(s) 

['Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.',
 'Thanks.']

In [8]:
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize

In [9]:
regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+')

['Good',
 'muffins',
 'cost',
 '$3.88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [10]:
wordpunct_tokenize(s) 

['Good',
 'muffins',
 'cost',
 '$',
 '3',
 '.',
 '88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

In [11]:
blankline_tokenize(s)

['Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.',
 'Thanks.']

In [12]:
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

In [13]:
from nltk.tokenize import WhitespaceTokenizer

In [14]:
s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
WhitespaceTokenizer().tokenize(s) 

['Good',
 'muffins',
 'cost',
 '$3.88',
 'in',
 'New',
 'York.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them.',
 'Thanks.']

In [15]:
from nltk.tokenize import WordPunctTokenizer

In [16]:
s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
WordPunctTokenizer().tokenize(s) 

['Good',
 'muffins',
 'cost',
 '$',
 '3',
 '.',
 '88',
 'in',
 'New',
 'York',
 '.',
 'Please',
 'buy',
 'me',
 'two',
 'of',
 'them',
 '.',
 'Thanks',
 '.']

### Stemming

There is one root word, but there are many variations of the same words. For example, the root word is “eat” and it’s variations are “eats, eating, eaten and like so”. 

In the same way, with the help of Stemming in Python, we can find the root word of any variations.

In [17]:
# gives the root word
# PorterStemmer is an algorithm for stemming words

from nltk.stem import PorterStemmer
e_words= ["wait", "waiting", "waited", "waits"]
ps =PorterStemmer()
for w in e_words:
    rootWord=ps.stem(w)
    print(rootWord)

wait
wait
wait
wait


In [18]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

sentence="Hello Guru99, You have to build a very good site and I love visiting your site."

# tokenize the sentence
words = word_tokenize(sentence)

# calling the PorterStemmer algorithm
ps = PorterStemmer()

# printing the root words in the sentence 
for w in words:
	rootWord=ps.stem(w)
	print(rootWord)

hello
guru99
,
you
have
to
build
a
veri
good
site
and
i
love
visit
your
site
.


### Lemmatization

In [19]:
import nltk
from nltk.stem.porter import PorterStemmer
porter_stemmer  = PorterStemmer()
text = "studies studying cries cry"

# tokenize the string
tokenization = nltk.word_tokenize(text)


for w in tokenization:
    print("Stemming for {} is {}".format(w,porter_stemmer.stem(w)))  

Stemming for studies is studi
Stemming for studying is studi
Stemming for cries is cri
Stemming for cry is cri


In [20]:
# contextual analysis 

import nltk
from nltk.stem import 	WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
    print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))  

Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


In [21]:
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

text = "guru99 is a totally new kind of learning experience."
tokens = word_tokenize(text)
lemma_function = WordNetLemmatizer()
for token, tag in pos_tag(tokens):
    lemma = lemma_function.lemmatize(token, tag_map[tag[0]])
    print(token, "=>", lemma)

guru99 => guru99
is => be
a => a
totally => totally
new => new
kind => kind
of => of
learning => learn
experience => experience
. => .


In [22]:
import re
from nltk.tokenize import RegexpTokenizer

s = "We can expect the profitability of IT companies can increase by 10.0%. The profit of IT company TCS increased by 7.2% this year."
tokenizer = RegexpTokenizer(r'\bIT\s+compan(?:y|ies)|[\d\.%]+')
tokens = tokenizer.tokenize(s)
print(tokens)

['IT companies', '10.0%.', 'IT company', '7.2%', '.']


In [23]:
type(tokens)

list

In [24]:
from nltk.tokenize import RegexpTokenizer
pattern = r'[a-zA-Z0-9]+'
tokenizer = RegexpTokenizer(pattern)
input_str = "abcABC12456A1B2C3D5"
alphanumeric_values = tokenizer.tokenize(input_str)
print(alphanumeric_values)

['abcABC12456A1B2C3D5']


In [25]:
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.freshbooks.com/en-au/hub/other/business-vs-company'
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()
    business_pattern = r'\b(business|enterprise|company|industry)\b'
    if re.search(business_pattern, text, re.IGNORECASE):
        print("The text contains business-related content.")
    else:
        print("The text does not contain business-related content.")
else:
    print("Failed to retrieve the webpage.")

The text contains business-related content.


In [26]:
print(text)








Business Vs Company: What’s the Difference?



































 







×

 

FreshBooks
Official App
Free - Google Play


Get it



 



Fresh Year. Fresher Books. 🥳 Get 50% Off for 3 Months. BUY NOW & SAVE


				📣 Only 
 Left to Get 50% Off for 3 Months BUY NOW & SAVE 


50% Off for 3 Months  Buy Now & Save







				You're currently on our Australian site. Select your regional site here:			




Select your region
Australia
Canada
European Union
Germany
Mexico
New Zealand
Singapore
South Africa
United Kingdom
United States





×







Back

 

Try It Free
Login

  




Features Bill & Get PaidInvoicingWow clients with professional invoices that take seconds to create
PaymentsQuick and easy online, recurring, and invoice-free payment options
Time TrackingAutomated, to accurately track time and easily log billable hours

Accounting & TaxesAccountingReports and tools to track money in and out, so you know where you stand
Expenses & ReceiptsEasily log expenses a

In [27]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\b(business|enterprise|company|industry)\b')
s = tokenizer.tokenize(text) 

In [28]:
len(text)

16146

In [29]:
len(s)

77

In [30]:
for words in s:
    print (words)

business
business
company
business
business
company
business
enterprise
business
company
business
business
business
business
business
company
business
industry
business
company
industry
business
company
business
company
industry
business
industry
company
business
industry
industry
business
business
company
business
business
business
company
business
business
business
business
company
business
company
business
business
business
business
business
business
business
company
business
company
company
business
business
business
company
company
business
business
company
business
company
industry
company
business
business
company
company
company
business
company
business


By: Prajukta Dey 

Roll: 21052263