# Lexicon Score

In [18]:
from nltk.corpus import stopwords
import nltk

In [19]:
# Load NLTK's English stop-word list.
# NOTE(review): `sw` is not referenced by any other cell visible here — confirm it is used later.
sw = stopwords.words('english')

In [20]:
# Fetch the CMU Pronouncing Dictionary as (word, phoneme-list) entries.
entries = nltk.corpus.cmudict.entries()
# Printing the whole collection only shows a truncated preview (see the output).
print(entries)

[('a', ['AH0']), ('a.', ['EY1']), ('a', ['EY1']), ...]


In [21]:
# Print a 25-entry window from the pronouncing dictionary, one (word, phonemes) pair per line.
sample_window = slice(10000, 10025)
for word, phonemes in entries[sample_window]:
    print((word, phonemes))

('belford', ['B', 'EH1', 'L', 'F', 'ER0', 'D'])
('belfry', ['B', 'EH1', 'L', 'F', 'R', 'IY0'])
('belgacom', ['B', 'EH1', 'L', 'G', 'AH0', 'K', 'AA0', 'M'])
('belgacom', ['B', 'EH1', 'L', 'JH', 'AH0', 'K', 'AA0', 'M'])
('belgard', ['B', 'EH0', 'L', 'G', 'AA1', 'R', 'D'])
('belgarde', ['B', 'EH0', 'L', 'G', 'AA1', 'R', 'D', 'IY0'])
('belge', ['B', 'EH1', 'L', 'JH', 'IY0'])
('belger', ['B', 'EH1', 'L', 'G', 'ER0'])
('belgian', ['B', 'EH1', 'L', 'JH', 'AH0', 'N'])
('belgians', ['B', 'EH1', 'L', 'JH', 'AH0', 'N', 'Z'])
('belgique', ['B', 'EH0', 'L', 'ZH', 'IY1', 'K'])
("belgique's", ['B', 'EH0', 'L', 'JH', 'IY1', 'K', 'S'])
('belgium', ['B', 'EH1', 'L', 'JH', 'AH0', 'M'])
("belgium's", ['B', 'EH1', 'L', 'JH', 'AH0', 'M', 'Z'])
('belgo', ['B', 'EH1', 'L', 'G', 'OW2'])
('belgrade', ['B', 'EH1', 'L', 'G', 'R', 'EY0', 'D'])
('belgrade', ['B', 'EH1', 'L', 'G', 'R', 'AA2', 'D'])
("belgrade's", ['B', 'EH1', 'L', 'G', 'R', 'EY0', 'D', 'Z'])
("belgrade's", ['B', 'EH1', 'L', 'G', 'R', 'AA2', 'D', 'Z'])

In [22]:
from nltk.corpus import wordnet as wn

In [23]:
# Look up the WordNet synsets that contain the word 'motorcar'.
wn.synsets('motorcar')

[Synset('car.n.01')]

In [24]:
# List the lemma names grouped under the synset 'car.n.01'.
wn.synset('car.n.01').lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

## NLTK Pipelines

In [25]:
# One-element list of sample documents consumed by the loops that iterate over `textSet`.
textSet = ["""Henry William Dalgliesh Cavill (/ˈkævəl/; born 5 May 1983) is an English actor. He began his career with roles in the feature adaptations of The Count of Monte Cristo (2002) and I Capture the Castle (2003). He later appeared in supporting roles in several television series, including BBC's The Inspector Lynley Mysteries, ITV's Midsomer Murders, and Showtime's The Tudors. He has since appeared in numerous major Hollywood films, such as Tristan & Isolde (2006), Stardust (2007), Blood Creek (2009), and Immortals (2011)."""]

In [26]:
# Echo every sample document so the raw input is visible before it is processed.
for document in textSet:
    print(document)

Henry William Dalgliesh Cavill (/ˈkævəl/; born 5 May 1983) is an English actor. He began his career with roles in the feature adaptations of The Count of Monte Cristo (2002) and I Capture the Castle (2003). He later appeared in supporting roles in several television series, including BBC's The Inspector Lynley Mysteries, ITV's Midsomer Murders, and Showtime's The Tudors. He has since appeared in numerous major Hollywood films, such as Tristan & Isolde (2006), Stardust (2007), Blood Creek (2009), and Immortals (2011).


In [27]:
# NLTK pipeline per document: sentence split -> word tokenize -> POS tag.
# One list of (token, tag) pairs is printed for each sentence.
for document in textSet:
    for sentence in nltk.sent_tokenize(document):
        tokens = nltk.word_tokenize(sentence)
        print(nltk.pos_tag(tokens))

[('Henry', 'NNP'), ('William', 'NNP'), ('Dalgliesh', 'NNP'), ('Cavill', 'NNP'), ('(', '('), ('/ˈkævəl/', 'NNP'), (';', ':'), ('born', 'VBN'), ('5', 'CD'), ('May', 'NNP'), ('1983', 'CD'), (')', ')'), ('is', 'VBZ'), ('an', 'DT'), ('English', 'JJ'), ('actor', 'NN'), ('.', '.')]
[('He', 'PRP'), ('began', 'VBD'), ('his', 'PRP$'), ('career', 'NN'), ('with', 'IN'), ('roles', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('feature', 'NN'), ('adaptations', 'NNS'), ('of', 'IN'), ('The', 'DT'), ('Count', 'NNP'), ('of', 'IN'), ('Monte', 'NNP'), ('Cristo', 'NNP'), ('(', '('), ('2002', 'CD'), (')', ')'), ('and', 'CC'), ('I', 'PRP'), ('Capture', 'VBP'), ('the', 'DT'), ('Castle', 'NNP'), ('(', '('), ('2003', 'CD'), (')', ')'), ('.', '.')]
[('He', 'PRP'), ('later', 'RB'), ('appeared', 'VBD'), ('in', 'IN'), ('supporting', 'VBG'), ('roles', 'NNS'), ('in', 'IN'), ('several', 'JJ'), ('television', 'NN'), ('series', 'NN'), (',', ','), ('including', 'VBG'), ('BBC', 'NNP'), ("'s", 'POS'), ('The', 'DT'), ('Inspector', 

# Implementing Tokenization

### Tweet Tokenizer

In [28]:
# Sample tweet text (ends with a hashtag) used as input for the tweet tokenizer.
trumpTweet = """Democrats are now the party of high taxes, high crime, open borders, late-term abortion, socialism, and blatant corruption. The Republican Party is the party of the American Worker, the American Family, and the American Dream! #KAG2020"""

In [29]:
# Import the TweetTokenizer class from nltk.tokenize
from nltk.tokenize import TweetTokenizer 
  
# Create a TweetTokenizer instance using its default settings
tk = TweetTokenizer() 

In [30]:
# Split the tweet into tokens; the recorded output keeps 'late-term' and '#KAG2020' as single tokens.
tweet = tk.tokenize(trumpTweet)

In [31]:
# Show the resulting token list.
print(tweet)

['Democrats', 'are', 'now', 'the', 'party', 'of', 'high', 'taxes', ',', 'high', 'crime', ',', 'open', 'borders', ',', 'late-term', 'abortion', ',', 'socialism', ',', 'and', 'blatant', 'corruption', '.', 'The', 'Republican', 'Party', 'is', 'the', 'party', 'of', 'the', 'American', 'Worker', ',', 'the', 'American', 'Family', ',', 'and', 'the', 'American', 'Dream', '!', '#KAG2020']


## Frequency Distribution

In [32]:
from nltk.corpus import brown

In [33]:
# Words of the Brown corpus restricted to the "news" category.
news = brown.words(categories="news")
# Printing the collection shows only a truncated preview (see the output).
print(news)


['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]


## Performance of Stemmers


In [8]:
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [20]:
porterstemmer = PorterStemmer()
# Stem all sample words in a single final expression so every result is shown:
# a notebook cell echoes only its last expression, so separate bare stem()
# calls would silently discard all but the final result.
[porterstemmer.stem(word) for word in ('happiness', 'functionality', 'variety')]

['happi', 'function', 'varieti']

## Passed Words

In [10]:
# Lancaster stemmer; the recorded output shows 'Sing' coming back lower-cased as 'sing'.
lancaster = LancasterStemmer()
lancaster.stem('Sing')

'sing'

In [11]:
from nltk.stem import RegexpStemmer

In [12]:
# Regex-based stemmer that strips whatever the given pattern matches.
# NOTE(review): the pattern 'ing' is unanchored and no minimum length is set; the
# recorded output reduces 'sing' to 's'. If only suffix stripping is intended, an
# anchored pattern such as 'ing$' with `min=4` would presumably leave short words
# alone — confirm whether the overstemming is the point of this demo.
regex = RegexpStemmer('ing')
regex.stem('sing')


's'

In [13]:
from nltk.stem import SnowballStemmer

In [15]:
# Count the entries in SnowballStemmer.languages (16 in the recorded output).
len(SnowballStemmer.languages)

16

In [16]:
# Snowball stemmer configured for French.
frenchStemmer = SnowballStemmer('french')

In [17]:
# Stem the French word 'manges'; the recorded output is 'mang'.
frenchStemmer.stem('manges')

'mang'

In [21]:
from nltk.stem import WordNetLemmatizer 

In [22]:
# WordNet-based lemmatizer reused by the lemmatization cells.
lemmatizer = WordNetLemmatizer()
# Called without `pos`; the recorded output leaves "Am" unchanged.
# NOTE(review): presumably because the default POS is noun and the lookup is
# case-sensitive — compare with lemmatize("am", pos="v") to confirm.
print(lemmatizer.lemmatize("Am"))

Am


In [23]:
# Irregular plural: the recorded output maps "cacti" to "cactus".
print(lemmatizer.lemmatize("cacti"))

cactus


In [24]:
# Irregular plural: the recorded output maps "corpora" to "corpus".
print(lemmatizer.lemmatize("corpora"))

corpus


In [25]:
# Passing pos="a" (adjective) lets the lemmatizer resolve "better" to "good".
print(lemmatizer.lemmatize("better",pos="a"))

good


In [27]:
# Sample paragraph bound to `example` as a plain string; input for the stemming cell.
example = """Henry William Dalgliesh Cavill (/ˈkævəl/; born 5 May 1983) is an English actor. He began his career with roles in the feature adaptations of The Count of Monte Cristo (2002) and I Capture the Castle (2003). He later appeared in supporting roles in several television series, including BBC's The Inspector Lynley Mysteries, ITV's Midsomer Murders, and Showtime's The Tudors. He has since appeared in numerous major Hollywood films, such as Tristan & Isolde (2006), Stardust (2007), Blood Creek (2009), and Immortals (2011)."""

In [28]:
# Stem each space-separated token with the Porter stemmer.
# NOTE(review): this rebinds `example` from a str to a list, so re-running this cell
# without first re-running the cell that defines the string fails on `.split`.
# Splitting on ' ' also leaves punctuation attached to tokens (the recorded output
# contains "bbc'" and "mysteries,"); a tokenizer would avoid that.
example = [porterstemmer.stem(token) for token in example.split(' ')]

In [29]:
# Re-join the token list with single spaces for display.
print(" ".join(example))

henri william dalgliesh cavil (/ˈkævəl/; born 5 may 1983) is an english actor. He began hi career with role in the featur adapt of the count of mont cristo (2002) and I captur the castl (2003). He later appear in support role in sever televis series, includ bbc' the inspector lynley mysteries, itv' midsom murders, and showtime' the tudors. He ha sinc appear in numer major hollywood films, such as tristan & isold (2006), stardust (2007), blood creek (2009), and immort (2011).


In [31]:
# Sample paragraph, kept as a plain string so `example` never changes type within this cell.
example = """Henry William Dalgliesh Cavill (/ˈkævəl/; born 5 May 1983) is an English actor. He began his career with roles in the feature adaptations of The Count of Monte Cristo (2002) and I Capture the Castle (2003). He later appeared in supporting roles in several television series, including BBC's The Inspector Lynley Mysteries, ITV's Midsomer Murders, and Showtime's The Tudors. He has since appeared in numerous major Hollywood films, such as Tristan & Isolde (2006), Stardust (2007), Blood Creek (2009), and Immortals (2011)."""
# Lemmatize each space-separated token into a separate list instead of rebinding `example`.
# lemmatize() is called without `pos`, so every token is looked up as a noun; that is why
# the printed text contains "ha" (from "has") and "a" (from "as").
example_lemmas = [lemmatizer.lemmatize(token) for token in example.split(' ')]
print(" ".join(example_lemmas))

Henry William Dalgliesh Cavill (/ˈkævəl/; born 5 May 1983) is an English actor. He began his career with role in the feature adaptation of The Count of Monte Cristo (2002) and I Capture the Castle (2003). He later appeared in supporting role in several television series, including BBC's The Inspector Lynley Mysteries, ITV's Midsomer Murders, and Showtime's The Tudors. He ha since appeared in numerous major Hollywood films, such a Tristan & Isolde (2006), Stardust (2007), Blood Creek (2009), and Immortals (2011).


In [32]:
# Bind the raw sample paragraph to `example` as a plain string again.
# NOTE(review): presumably the input for the TF-IDF section that follows — confirm.
example = """Henry William Dalgliesh Cavill (/ˈkævəl/; born 5 May 1983) is an English actor. He began his career with roles in the feature adaptations of The Count of Monte Cristo (2002) and I Capture the Castle (2003). He later appeared in supporting roles in several television series, including BBC's The Inspector Lynley Mysteries, ITV's Midsomer Murders, and Showtime's The Tudors. He has since appeared in numerous major Hollywood films, such as Tristan & Isolde (2006), Stardust (2007), Blood Creek (2009), and Immortals (2011)."""

# TF-IDF and Vectorization

In [37]:
!pip3 install sklearn

