In [29]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

In [30]:
ps = PorterStemmer()

example_words = ["python", "pythonic", "pythoner", "pythoning", "pythoned", "pythonly"]

In [31]:
for w in example_words:
    print(ps.stem(w))

python
python
python
python
python
pythonli


In [32]:
new_text = "It is very important to be pythonly while you're pythoning with python. All pythoners should be pythonic. All pythoners have pythoned poorly atleast once."

In [33]:
for w in word_tokenize(new_text):
    print(ps.stem(w))

it
is
veri
import
to
be
pythonli
while
you
're
python
with
python
.
all
python
should
be
python
.
all
python
have
python
poorli
atleast
onc
.


In [34]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

porter = PorterStemmer()
snowball = SnowballStemmer(language='english')
lanc = LancasterStemmer()

In [36]:
porter.stem('cats')
porter.stem('amazing')
porter.stem('amazement')
porter.stem('amaze')
porter.stem('amazed')
porter.stem('amazon')
porter.stem('nation')
porter.stem('premonition')

# Comparison between Porter and Snowball
porter.stem('loudly')
snowball.stem('loudly')

# Comparison between Snowball and Lancaster
porter.stem('salty')
snowball.stem('salty')

'salti'

In [8]:
# tokenization, stop word, stem 

for w in word_tokenize(new_text):
    #tokens_without_sw = [word for word in w if not word in stop_words]
    print(porter.stem(w))

it
is
veri
import
to
be
pythonli
while
you
're
python
with
python
.
all
python
should
be
python
.
all
python
have
python
poorli
atleast
onc
.


In [9]:
words_filtered = []
words = wordpunct_tokenize(new_text)
for w in words:
    if w not in stop_words:
        words_filtered.append(w)

print(words_filtered)

['It', 'important', 'pythonly', "'", 'pythoning', 'python', '.', 'All', 'pythoners', 'pythonic', '.', 'All', 'pythoners', 'pythoned', 'poorly', 'atleast', '.']


In [10]:
for w in word_tokenize(new_text):
    print(lanc.stem(w))

it
is
very
import
to
be
python
whil
you
're
python
with
python
.
al
python
should
be
python
.
al
python
hav
python
poor
atleast
ont
.


In [11]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [12]:
example_words = ["word", "wordy", "wording", "cacti", "rocks", "catty", "demonic", "geese", "ravishing", "better", "best", "run"]

In [13]:
for w in example_words:
    print(lemmatizer.lemmatize(w, pos="a"))

word
wordy
wording
cacti
rocks
catty
demonic
geese
ravishing
good
best
run


In [14]:
for w in example_words:
    print(lemmatizer.lemmatize(w, pos="v"))

word
wordy
word
cacti
rock
catty
demonic
geese
ravish
better
best
run


In [15]:
for w in example_words:
    print(lemmatizer.lemmatize(w))

word
wordy
wording
cactus
rock
catty
demonic
goose
ravishing
better
best
run


In [16]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

text = "She jumped into the river and breathed heavily"
wordnet = WordNetLemmatizer()
tokenizer = word_tokenize(text)

for token in tokenizer:
    print(token,"--->",wordnet.lemmatize(token))

She ---> She
jumped ---> jumped
into ---> into
the ---> the
river ---> river
and ---> and
breathed ---> breathed
heavily ---> heavily


In [17]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

text = "I am running and I usually use to runs"

wordnet = WordNetLemmatizer()
tokenizer = word_tokenize(text)

for token in tokenizer:
    print(token,"--->",wordnet.lemmatize(token))

I ---> I
am ---> am
running ---> running
and ---> and
I ---> I
usually ---> usually
use ---> use
to ---> to
runs ---> run


In [18]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

text = "She jumped into the river and breathed heavily"
wordnet = WordNetLemmatizer()

for token,tag in pos_tag(word_tokenize(text)):
    pos=tag[0].lower()
        
    if pos not in ['a', 'r', 'n', 'v']:
        pos='n'
    
    print(token,"--->",wordnet.lemmatize(token,pos))

She ---> She
jumped ---> jump
into ---> into
the ---> the
river ---> river
and ---> and
breathed ---> breathe
heavily ---> heavily


In [19]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,pos_tag

text = "I am running and I usually use to runs"
wordnet = WordNetLemmatizer()

for token,tag in pos_tag(word_tokenize(text)):
    pos=tag[0].lower()
        
    if pos not in ['a', 'r', 'n', 'v']:
        pos='n'
    
    print(token,"--->",wordnet.lemmatize(token,pos))

I ---> I
am ---> be
running ---> run
and ---> and
I ---> I
usually ---> usually
use ---> use
to ---> to
runs ---> run


In [37]:
from nltk.stem import SnowballStemmer,PorterStemmer,WordNetLemmatizer
from nltk import word_tokenize,pos_tag

snowball = SnowballStemmer(language='english')
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

text = ["better","Caring","are","am","worse","struggling",'meeting']
print("{0:10}{1:20}{2:30}{3:40}".format("Word","Snowball Stemmer","porter stemmer", "Wordnet Lemmatizer",))
for token,tag in pos_tag(text):
    
    pos=tag[0].lower()
    if pos not in ['a', 'r', 'n', 'v']:
        pos='n'
        
    print("{0:10}{1:20}{2:30}{3:40}".format(token,snowball.stem(token),porter.stem(token),wordnet.lemmatize(token,pos)))

Word      Snowball Stemmer    porter stemmer                Wordnet Lemmatizer                      
better    better              better                        well                                    
Caring    care                care                          Caring                                  
are       are                 are                           be                                      
am        am                  am                            be                                      
worse     wors                wors                          worse                                   
strugglingstruggl             struggl                       struggle                                
meeting   meet                meet                          meeting                                 


Tokenizing, stop word removal and lemmatization

### We are going to do end to end  sentence pre-processing
### Step 1  We need to do 2 time tokenization to remove the special characteres and tokenize cleanly 

In [21]:
import re, nltk
tokens = nltk.word_tokenize(new_text)
print(tokens)
# Remove special characters from each token
cleaned_tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', token) for token in tokens]
    
    # If you want to remove leading/trailing whitespaces and combine them into a string
cleaned_text = ' '.join(cleaned_tokens)
print(cleaned_text)

['It', 'is', 'very', 'important', 'to', 'be', 'pythonly', 'while', 'you', "'re", 'pythoning', 'with', 'python', '.', 'All', 'pythoners', 'should', 'be', 'pythonic', '.', 'All', 'pythoners', 'have', 'pythoned', 'poorly', 'atleast', 'once', '.']
It is very important to be pythonly while you re pythoning with python  All pythoners should be pythonic  All pythoners have pythoned poorly atleast once 


### Step 2 : We need to do remove the stops

In [22]:
words_filtered = []
words = wordpunct_tokenize(cleaned_text)
for w in words:
    if w not in stop_words:
        words_filtered.append(w)

print(words_filtered)

['It', 'important', 'pythonly', 'pythoning', 'python', 'All', 'pythoners', 'pythonic', 'All', 'pythoners', 'pythoned', 'poorly', 'atleast']


### Step 3 :  part of speech tagging 

In [23]:
from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()
for word, pos in pos_tag(words_filtered):
    print(word, '|', pos)

It | PRP
important | JJ
pythonly | RB
pythoning | VBG
python | IN
All | NNP
pythoners | NNS
pythonic | VBP
All | DT
pythoners | NNS
pythoned | VBD
poorly | RB
atleast | JJ


### Step 4 : lemmatizing the word and check whether we are able to prepare the root word 

In [24]:

for token,tag in pos_tag(words_filtered):
    pos=tag[0].lower()
        
    if pos not in ['a', 'r', 'n', 'v']:
        pos='n'
    
    print(token,"--->",wordnet.lemmatize(token,pos))

It ---> It
important ---> important
pythonly ---> pythonly
pythoning ---> pythoning
python ---> python
All ---> All
pythoners ---> pythoners
pythonic ---> pythonic
All ---> All
pythoners ---> pythoners
pythoned ---> pythoned
poorly ---> poorly
atleast ---> atleast


In [25]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer

porter = PorterStemmer()
snowball = SnowballStemmer(language='english')
lanc = LancasterStemmer()

In [26]:
eu_definition = '''
The European Union (EU) is a political and economic union of 27 member states that are located primarily in Europe. 
Its members have a combined area of 4,233,255.3 km2 (1,634,469.0 sq mi) and an estimated total population of about 447 million. 
The EU has developed an internal single market through a standardised system of laws that apply in all member states in those matters, 
and only those matters, where members have agreed to act as one. EU policies aim to ensure the free movement of people, goods, 
services and capital within the internal market; enact legislation in justice and home affairs; and maintain common policies on trade, 
agriculture, fisheries and regional development. Passport controls have been abolished for travel within the Schengen Area. 
A monetary union was established in 1999, coming into full force in 2002, and is composed of 19 EU member states which use the euro 
currency. The EU has often been described as a sui generis political entity (without precedent or comparison).
The EU and European citizenship were established when the Maastricht Treaty came into force in 1993. 
The EU traces its origins to the European Coal and Steel Community (ECSC) and the European Economic Community (EEC), established, 
respectively, by the 1951 Treaty of Paris and 1957 Treaty of Rome. The original members of what came to be known as the European 
Communities were the Inner Six: Belgium, France, Italy, Luxembourg, the Netherlands, and West Germany. The Communities and their 
successors have grown in size by the accession of new member states and in power by the addition of policy areas to their remit. 
The United Kingdom became the first member state to leave the EU on 31 January 2020. Before this, three territories of member states 
had left the EU or its forerunners. The latest major amendment to the constitutional basis of the EU, the Treaty of Lisbon, 
came into force in 2009.
'''

In [28]:

sentence_example = (
  'This is definitely a controversy as the attorney labeled the case "extremely controversial"'
)

# Porter Stemmed version of sentence example
stemmed_sentence = [
  porter.stem(word) for word in word_tokenize(sentence_example)
]
print(stemmed_sentence)


# Tokenizing and Stemming the eu_definition
tokenized_eu = word_tokenize(eu_definition)
porter_eu = [porter.stem(word) for word in tokenized_eu]
print(porter_eu)

['thi', 'is', 'definit', 'a', 'controversi', 'as', 'the', 'attorney', 'label', 'the', 'case', '``', 'extrem', 'controversi', "''"]
['the', 'european', 'union', '(', 'eu', ')', 'is', 'a', 'polit', 'and', 'econom', 'union', 'of', '27', 'member', 'state', 'that', 'are', 'locat', 'primarili', 'in', 'europ', '.', 'it', 'member', 'have', 'a', 'combin', 'area', 'of', '4,233,255.3', 'km2', '(', '1,634,469.0', 'sq', 'mi', ')', 'and', 'an', 'estim', 'total', 'popul', 'of', 'about', '447', 'million', '.', 'the', 'eu', 'ha', 'develop', 'an', 'intern', 'singl', 'market', 'through', 'a', 'standardis', 'system', 'of', 'law', 'that', 'appli', 'in', 'all', 'member', 'state', 'in', 'those', 'matter', ',', 'and', 'onli', 'those', 'matter', ',', 'where', 'member', 'have', 'agre', 'to', 'act', 'as', 'one', '.', 'eu', 'polici', 'aim', 'to', 'ensur', 'the', 'free', 'movement', 'of', 'peopl', ',', 'good', ',', 'servic', 'and', 'capit', 'within', 'the', 'intern', 'market', ';', 'enact', 'legisl', 'in', 'justic