In [3]:
import nltk
import nltk.corpus

In [4]:
# Fetch the 'punkt' tokenizer package into the local nltk_data directory.
# The call's return value (True in the recorded run) is what this cell displays.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
# Sample passage about gold; it is the input text that gets tokenized into s1_tokens.
s1 = '''Gold is a chemical element with symbol Au (from Latin: aurum) and atomic number 79, making it one of the higher atomic number elements that occur naturally.
In its purest form, it is a bright, slightly reddish yellow, dense, soft, malleable, and ductile metal.
Chemically, gold is a transition metal and a group 11 element. It is one of the least reactive chemical elements and is
solid under standard conditions.
Gold often occurs in free elemental (native) form, as nuggets or grains, in rocks, in veins, and in alluvial deposits.
It occurs in a solid solution series with the native element silver (as electrum) and also naturally alloyed with copper and palladium.
Less commonly, it occurs in minerals as gold compounds, often with tellurium (gold tellurides).'''

**Tokenization**

In [6]:
from nltk.tokenize import word_tokenize
# Split the passage into tokens; punctuation marks and numbers come out as
# separate tokens. s1_tokens is reused by the frequency and clean-up cells.
s1_tokens = word_tokenize(s1)
print(s1_tokens)

['Gold', 'is', 'a', 'chemical', 'element', 'with', 'symbol', 'Au', '(', 'from', 'Latin', ':', 'aurum', ')', 'and', 'atomic', 'number', '79', ',', 'making', 'it', 'one', 'of', 'the', 'higher', 'atomic', 'number', 'elements', 'that', 'occur', 'naturally', '.', 'In', 'its', 'purest', 'form', ',', 'it', 'is', 'a', 'bright', ',', 'slightly', 'reddish', 'yellow', ',', 'dense', ',', 'soft', ',', 'malleable', ',', 'and', 'ductile', 'metal', '.', 'Chemically', ',', 'gold', 'is', 'a', 'transition', 'metal', 'and', 'a', 'group', '11', 'element', '.', 'It', 'is', 'one', 'of', 'the', 'least', 'reactive', 'chemical', 'elements', 'and', 'is', 'solid', 'under', 'standard', 'conditions', '.', 'Gold', 'often', 'occurs', 'in', 'free', 'elemental', '(', 'native', ')', 'form', ',', 'as', 'nuggets', 'or', 'grains', ',', 'in', 'rocks', ',', 'in', 'veins', ',', 'and', 'in', 'alluvial', 'deposits', '.', 'It', 'occurs', 'in', 'a', 'solid', 'solution', 'series', 'with', 'the', 'native', 'element', 'silver', '(',

In [7]:
# Total number of tokens in the passage, punctuation tokens included.
len(s1_tokens)

156

**Frequency of words**

In [8]:
from nltk.probability import FreqDist

# Build the case-insensitive frequency table in one step by handing the
# lower-cased tokens straight to the FreqDist constructor. Punctuation tokens
# are counted too, since nothing has been filtered out yet.
fdist = FreqDist(token.lower() for token in s1_tokens)

# Spot-check: occurrences of 'gold', number of distinct tokens, occurrences of 'a'.
gold_count, distinct_tokens, a_count = fdist['gold'], len(fdist), fdist['a']
print(gold_count, distinct_tokens, a_count)

5 80 5


In [9]:
# Five most frequent tokens; punctuation is still part of these counts.
fdist.most_common(5)

[(',', 14), ('and', 7), ('.', 7), ('in', 7), ('gold', 5)]

**N-grams: sequences of n consecutive tokens**

In [10]:
from nltk.util import ngrams, bigrams, trigrams

# Example sentence for the n-gram demonstration.
s2 = "The Mona Lisa is a half length portrait painting by the Italian Renaissance artist Leonardo da Vinci"
s2_tokens = word_tokenize(s2)

# With n=2 every tuple holds two adjacent tokens; the result of ngrams() is
# collected into a list so the pairs can be printed.
token_pairs = list(ngrams(s2_tokens, 2))
print(token_pairs)

[('The', 'Mona'), ('Mona', 'Lisa'), ('Lisa', 'is'), ('is', 'a'), ('a', 'half'), ('half', 'length'), ('length', 'portrait'), ('portrait', 'painting'), ('painting', 'by'), ('by', 'the'), ('the', 'Italian'), ('Italian', 'Renaissance'), ('Renaissance', 'artist'), ('artist', 'Leonardo'), ('Leonardo', 'da'), ('da', 'Vinci')]


**Stemming**

In [11]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

# One instance of each stemmer; results are printed in the order
# Porter, Lancaster, Snowball.
ps = PorterStemmer()
ls = LancasterStemmer()
ss = SnowballStemmer('english')

words_to_stem = ['give', 'giving', 'given', 'gave', 'data', 'curricula', 'corpora']

for term in words_to_stem:
    # Run the same word through all three stemmers to compare them side by side.
    stems = [stemmer.stem(term) for stemmer in (ps, ls, ss)]
    print(term, "-->", *stems)

give --> give giv give
giving --> give giv give
given --> given giv given
gave --> gave gav gave
data --> data dat data
curricula --> curricula curricul curricula
corpora --> corpora corpor corpora


**Lemmatization**

In [12]:
# Fetch the 'wordnet' package into the local nltk_data directory.
# The call's return value (True in the recorded run) is what this cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...


True

In [13]:
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()

# lemmatize() is called without a pos argument, so each word is looked up with
# the default part of speech (noun). That is why the verb forms 'giving',
# 'given' and 'gave' come back unchanged, while the plural nouns are reduced.
for term in words_to_stem:
    lemma = wl.lemmatize(term)
    print(term, "-->", lemma)

give --> give
giving --> giving
given --> given
gave --> gave
data --> data
curricula --> curriculum
corpora --> corpus


**Stop words**

In [14]:
# Fetch the 'stopwords' corpus into the local nltk_data directory.
# The call's return value (True in the recorded run) is what this cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\praka\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [15]:
from nltk.corpus import stopwords

# English stop-word list, printed in full and followed by its length on the
# next line. stp_words is reused by the stop-word filtering cell.
stp_words = stopwords.words('english')
print(stp_words, len(stp_words), sep="\n")

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [16]:
import re

# Two pre-compiled patterns used to clean tokens:
#   non_alnum matches any single non-word character (\W);
#   non_num   matches any single digit (\d). Despite its name, it is the
#             pattern that finds digits so they can be stripped.
non_alnum, non_num = map(re.compile, (r'\W', r'\d'))

In [17]:
# Reduce every token to its alphabetic characters and keep the non-empty results.
# str.isalpha() is used rather than stripping \W and then \d: \W does not match
# the underscore (nor non-decimal numerics such as superscript digits), so a
# token like '_' or 'foo_bar' would otherwise survive in a letters-only list.
alpha_only = []

for token in s1_tokens:
    letters = "".join(ch for ch in token if ch.isalpha())
    if letters:  # pure punctuation / numbers collapse to "" and are dropped
        alpha_only.append(letters)

print(alpha_only)
print(len(alpha_only))

['Gold', 'is', 'a', 'chemical', 'element', 'with', 'symbol', 'Au', 'from', 'Latin', 'aurum', 'and', 'atomic', 'number', 'making', 'it', 'one', 'of', 'the', 'higher', 'atomic', 'number', 'elements', 'that', 'occur', 'naturally', 'In', 'its', 'purest', 'form', 'it', 'is', 'a', 'bright', 'slightly', 'reddish', 'yellow', 'dense', 'soft', 'malleable', 'and', 'ductile', 'metal', 'Chemically', 'gold', 'is', 'a', 'transition', 'metal', 'and', 'a', 'group', 'element', 'It', 'is', 'one', 'of', 'the', 'least', 'reactive', 'chemical', 'elements', 'and', 'is', 'solid', 'under', 'standard', 'conditions', 'Gold', 'often', 'occurs', 'in', 'free', 'elemental', 'native', 'form', 'as', 'nuggets', 'or', 'grains', 'in', 'rocks', 'in', 'veins', 'and', 'in', 'alluvial', 'deposits', 'It', 'occurs', 'in', 'a', 'solid', 'solution', 'series', 'with', 'the', 'native', 'element', 'silver', 'as', 'electrum', 'and', 'also', 'naturally', 'alloyed', 'with', 'copper', 'and', 'palladium', 'Less', 'commonly', 'it', 'occu

In [18]:
# Lower-case every alphabetic token and keep only those that are not English
# stop words. stp_words is a list, so it is converted to a set once up front:
# each membership test is then a hash lookup instead of a scan of the whole list.
stop_word_set = set(stp_words)

alpha_without_stop_words = [
    lowered
    for lowered in (token.lower() for token in alpha_only)
    if lowered not in stop_word_set
]

print(alpha_without_stop_words)
print(len(alpha_without_stop_words))

['gold', 'chemical', 'element', 'symbol', 'au', 'latin', 'aurum', 'atomic', 'number', 'making', 'one', 'higher', 'atomic', 'number', 'elements', 'occur', 'naturally', 'purest', 'form', 'bright', 'slightly', 'reddish', 'yellow', 'dense', 'soft', 'malleable', 'ductile', 'metal', 'chemically', 'gold', 'transition', 'metal', 'group', 'element', 'one', 'least', 'reactive', 'chemical', 'elements', 'solid', 'standard', 'conditions', 'gold', 'often', 'occurs', 'free', 'elemental', 'native', 'form', 'nuggets', 'grains', 'rocks', 'veins', 'alluvial', 'deposits', 'occurs', 'solid', 'solution', 'series', 'native', 'element', 'silver', 'electrum', 'also', 'naturally', 'alloyed', 'copper', 'palladium', 'less', 'commonly', 'occurs', 'minerals', 'gold', 'compounds', 'often', 'tellurium', 'gold', 'tellurides']
78


In [19]:
# Frequency table over the cleaned tokens (lower-cased, letters only, stop
# words removed), built directly from the list by the FreqDist constructor.
fdist2 = FreqDist(alpha_without_stop_words)

# Ten most frequent content words.
fdist2.most_common(10)

[('gold', 5),
 ('element', 3),
 ('occurs', 3),
 ('chemical', 2),
 ('atomic', 2),
 ('number', 2),
 ('one', 2),
 ('elements', 2),
 ('naturally', 2),
 ('form', 2)]