In [23]:
import nltk
# Calling nltk.download() with no arguments opens NLTK's download manager,
# which shows all available and installed resources.
# NOTE(review): the manager is interactive, so this presumably stalls an
# unattended "Restart & Run All" - confirm before automating this notebook.
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


[nltk_data] Downloading package shakespeare to
[nltk_data]     /Users/sabinbasnet/nltk_data...
[nltk_data]   Unzipping corpora/shakespeare.zip.


True

In [30]:
# Quick way to fetch specific resources without the interactive manager:
# pass a list of package names to nltk.download().
# "shakespeare" is listed because a later cell reads nltk.corpus.shakespeare;
# otherwise that corpus is only present if it was picked by hand in the
# download manager.
nltk.download([
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
    "shakespeare",
])

[nltk_data] Downloading package names to
[nltk_data]     /Users/sabinbasnet/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sabinbasnet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     /Users/sabinbasnet/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/sabinbasnet/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/sabinbasnet/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sabinbasnet/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexic

True

In [None]:
# The Shakespeare corpus is stored as one XML file per play, and its reader's
# .words() accepts only a single file id, so calling it with no argument
# raises TypeError. Collect the words play by play instead.
w = [
    token
    for play_id in nltk.corpus.shakespeare.fileids()
    for token in nltk.corpus.shakespeare.words(play_id)
]

In [32]:
# Compiling data: keep only the purely alphabetic tokens of the
# State of the Union corpus.
words = list(
    filter(lambda token: token.isalpha(), nltk.corpus.state_union.words())
)

In [34]:
# English stop-word list from NLTK's stopwords corpus.
stopwords = nltk.corpus.stopwords.words("english")

In [35]:
# Remove stop words from the original word list.
# The stop words are copied into a set first: a membership test on the list
# rescans it for every token, while a set lookup is constant time.
stopword_set = set(stopwords)
words = [w for w in words if w.lower() not in stopword_set]

In [36]:
from pprint import pprint
# Sample passage for a quick tokenization demo. It is written line by line so
# the leading newline and the trailing spaces of the text stay visible.
text = (
    "\n"
    "For some quick analysis, creating a corpus could be overkill. \n"
    "If all you need is a word list, \n"
    "there are simpler ways to achieve that goal."
)
# Pretty-print the token list, wrapped compactly at 79 columns.
pprint(nltk.word_tokenize(text), compact=True, width=79)

['For', 'some', 'quick', 'analysis', ',', 'creating', 'a', 'corpus', 'could',
 'be', 'overkill', '.', 'If', 'all', 'you', 'need', 'is', 'a', 'word', 'list',
 ',', 'there', 'are', 'simpler', 'ways', 'to', 'achieve', 'that', 'goal', '.']


In [37]:
#Creating Frequency Distributions
# Tokenize `text` and count how often each token occurs.
# NOTE(review): this rebinds `words`, which earlier cells filled from the
# state_union corpus, so `fd` counts whatever `text` holds when this runs -
# confirm that is the intended input.
words: list[str] = nltk.word_tokenize(text)
fd = nltk.FreqDist(words)

In [38]:
# The three most frequent tokens, each paired with its count.
fd.most_common(3)

[(',', 2), ('a', 2), ('.', 2)]

In [39]:
# Print the three most frequent tokens and their counts as a table.
fd.tabulate(3)

, a . 
2 2 2 


In [19]:
# Count stored for the exact token "America"; a token that was never counted
# reports 0.
fd["America"]

0

In [20]:
# Lookups use the exact string, so the lowercase spelling is its own key.
fd["america"]

0

In [21]:
# The all-caps spelling is looked up as its own key as well.
fd["AMERICA"]

0

In [40]:
# Build a new frequency distribution from the initial one with every word
# normalized to lowercase. Iterating over `fd` directly yields each distinct
# word only once, which would throw the counts away, so the original counts
# are carried over and summed for words that differ only by case.
lower_fd = nltk.FreqDist()
for word, count in fd.items():
    lower_fd[word.lower()] += count

In [41]:
#Extracting Concordance and Collocations
# Wrap the State of the Union tokens in an nltk.Text and print five
# occurrences of "america" together with their surrounding context.
# NOTE(review): `text` held the sample string in earlier cells; from here on
# it is an nltk.Text.
text = nltk.Text(nltk.corpus.state_union.words())
text.concordance("america", lines=5)

Displaying 5 of 1079 matches:
 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace
beyond any shadow of a doubt , that America will continue the fight for freedom
 to make complete victory certain , America will never become a party to any pl
nly in law and in justice . Here in America , we have labored long and hard to 


In [44]:
"""since .concordance() only prints information to the console, it’s 
not ideal for data manipulation. To obtain a usable list that will also give you
information about the location of each occurrence, use .concordance_list()"""
# Keep the first two matches as objects, then print the context line that
# each match carries.
concordance_list = text.concordance_list("america", lines=2)
for context_line in (match.line for match in concordance_list):
    print(context_line)

 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace


In [45]:
"""Revisiting nltk.word_tokenize(), check out how quickly you can create a c
ustom nltk.Text instance and an accompanying frequency distribution"""
# Tokenize three short sentences into a flat list of tokens.
words: list[str] = nltk.word_tokenize(
     """Beautiful is better than ugly.
     Explicit is better than implicit.
     Simple is better than complex."""
)
# Wrap the tokens in an nltk.Text and take its frequency distribution.
text = nltk.Text(words)
fd = text.vocab()  # Equivalent to fd = nltk.FreqDist(words)
# Print the three most frequent tokens and their counts as a table.
fd.tabulate(3)

    is better   than 
     3      3      3 


In [46]:
#NLTK provides specific classes to find collocations in your text. Following the pattern seen so far, these classes are also built from lists of words.
# The alphabetic State of the Union tokens feed the trigram collocation finder.
words = list(
    filter(lambda token: token.isalpha(), nltk.corpus.state_union.words())
)
finder = nltk.collocations.TrigramCollocationFinder.from_words(words)

In [48]:
# finder.ngram_fd holds the frequency distribution of the trigrams the finder
# collected; most_common(2) lists the two most frequent ones with their counts.
finder.ngram_fd.most_common(2)

[(('the', 'United', 'States'), 294), (('the', 'American', 'people'), 185)]

In [49]:
# Print the two most frequent trigrams and their counts as a table.
finder.ngram_fd.tabulate(2)

  ('the', 'United', 'States') ('the', 'American', 'people') 
                          294                           185 


In [50]:
#Using NLTK’s Pre-Trained Sentiment Analyzer
#To use VADER, first create an instance of nltk.sentiment.SentimentIntensityAnalyzer, then use .polarity_scores() on a raw string
from nltk.sentiment import SentimentIntensityAnalyzer
# One analyzer instance is created here; later cells reuse it through `sia`.
sia = SentimentIntensityAnalyzer()
# The result is a dict with 'neg', 'neu', 'pos' and 'compound' scores.
sia.polarity_scores("Wow, NLTK is really powerful!")

{'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}

In [51]:
# Load the twitter_samples corpus as a list of strings. Dropping the ":" from
# "://" leaves URLs inactive, so they cannot be clicked by accident.
tweets = [
    raw_tweet.replace("://", "//")
    for raw_tweet in nltk.corpus.twitter_samples.strings()
]

In [52]:
#use the .polarity_scores() function of your SentimentIntensityAnalyzer instance to classify tweets
from random import shuffle

def is_positive(tweet: str) -> bool:
    """Report whether VADER gives ``tweet`` a compound score above zero."""
    compound_score = sia.polarity_scores(tweet)["compound"]
    return compound_score > 0

# Shuffle in place (no seed is set), so the ten tweets shown differ per run.
shuffle(tweets)
# Print the verdict next to each of the first ten tweets.
for tweet in tweets[:10]:
    print(">", is_positive(tweet), tweet)

> False RT @mygibbo: I hate Tories. And yes, it's tribal | Gary Younge http//t.co/eaqyVXIqCy
> True @Awaishooo Shahid Afridi tou young hi hai na abhi. talk about Mahnor baloch. she will never going to be old. :)
> False Y is no one up :-(
> False i miss them :(
> False @KEEMSTARx And people ask why I don't leave my house, the world is scary and fucked. 
Shootings and explosions everywhere :(
> False what usually happens :( https//t.co/6o3ZgNOnvh
> True @SpongeZim .. Thank you Mario... Have a wonderful Friday.... :))
> True RT @NicolaSturgeon: If Miliband is going to let Tories in rather than work with SNP, we will definitely need lots of SNP MPs to protect Sco…
> False RT @billbanjos: I'm #SNPbecause we have witnessed tonight on tv Ed Miliband finally betraying the legacy of James Keir Hardie - #scottishla…
> True RT @beaubeau888: Ed Milliband came on top on the Sun's Twitter worm. Bet they won't mention that in their paper tomorrow. http//t.co/QdBJr…


In [53]:
# File ids of the movie reviews, per label; individual reviews are looked up
# by these ids later on.
positive_review_ids, negative_review_ids = (
    nltk.corpus.movie_reviews.fileids(categories=[label])
    for label in ("pos", "neg")
)
all_review_ids = [*positive_review_ids, *negative_review_ids]

In [54]:
#redefine is_positive() to work on an entire review. we need to obtain that specific review using its file ID 
#and then split it into sentences before rating
from statistics import mean

def is_positive(review_id: str) -> bool:
    """Decide whether a movie review reads as positive overall.

    The raw review is split into sentences, each sentence is scored with
    VADER, and the review counts as positive when the average compound
    score is above zero.

    Args:
        review_id: File id of a review in the movie_reviews corpus.

    Returns:
        True if the mean sentence compound score is positive. A review
        that yields no sentences is reported as not positive.
    """
    # Named review_text so the notebook-level `text` is not shadowed.
    review_text = nltk.corpus.movie_reviews.raw(review_id)
    scores = [
        sia.polarity_scores(sentence)["compound"]
        for sentence in nltk.sent_tokenize(review_text)
    ]
    # statistics.mean() raises StatisticsError on an empty list.
    if not scores:
        return False
    return mean(scores) > 0

In [55]:
# Rate every review with VADER and report how often the verdict matches the
# corpus label.
shuffle(all_review_ids)
# Sets make each label check a constant-time lookup; testing membership in
# the id lists rescans the list for every review.
positive_id_set = set(positive_review_ids)
negative_id_set = set(negative_review_ids)
correct = 0
for review_id in all_review_ids:
    # A verdict is correct when the review belongs to the predicted label.
    predicted_ids = positive_id_set if is_positive(review_id) else negative_id_set
    if review_id in predicted_ids:
        correct += 1

print(F"{correct / len(all_review_ids):.2%} correct")

64.00% correct


In [56]:
#Customizing NLTK’s Sentiment Analysis
# Words to ignore: English stop words plus lowercased first names.
# Kept as a set because skip_unwanted() tests membership once per corpus
# token, and rescanning a list for each one is needlessly slow.
unwanted = set(nltk.corpus.stopwords.words("english"))
unwanted.update(w.lower() for w in nltk.corpus.names.words())

def skip_unwanted(pos_tuple):
    """Filter predicate for (word, tag) pairs.

    Returns True only for a word that is alphabetic, is not listed in
    ``unwanted`` and whose tag does not start with "NN"; returns False
    for every other pair.
    """
    token, tag_label = pos_tuple
    # The alphabetic check runs first, exactly as before.
    is_candidate = token.isalpha() and token not in unwanted
    return is_candidate and not tag_label.startswith("NN")

# Tag every token of the positive / negative reviews and keep the words whose
# (word, tag) pair passes skip_unwanted().
positive_words = [
    tagged[0]
    for tagged in nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))
    if skip_unwanted(tagged)
]
negative_words = [
    tagged[0]
    for tagged in nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))
    if skip_unwanted(tagged)
]

In [57]:
# Many words are present in both the positive and the negative set. Find the
# common ones and remove them from both distributions before ranking.
positive_fd, negative_fd = (
    nltk.FreqDist(positive_words),
    nltk.FreqDist(negative_words),
)

common_set = set(positive_fd) & set(negative_fd)

for shared_word in common_set:
    del positive_fd[shared_word], negative_fd[shared_word]

# The 100 most frequent remaining words on each side, without their counts.
top_100_positive = {entry[0] for entry in positive_fd.most_common(100)}
top_100_negative = {entry[0] for entry in negative_fd.most_common(100)}

In [58]:
# Setting up the positive and negative bigram finders.
# The ignore list is built as a set: it is checked once per corpus token, and
# a set lookup is constant time where a list would be rescanned every time.
unwanted = set(nltk.corpus.stopwords.words("english"))
unwanted.update(w.lower() for w in nltk.corpus.names.words())

# Each finder sees only alphabetic tokens that are not in the ignore set.
positive_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["pos"])
    if w.isalpha() and w not in unwanted
])
negative_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["neg"])
    if w.isalpha() and w not in unwanted
])

In [62]:
#Training and Using a Classifier
#for positive movie reviews, focus on the features that indicate positivity, including VADER scores
def extract_features(text):
    """Build the feature dict used to classify one movie review.

    Args:
        text: Raw review text.

    Returns:
        A dict with three entries:
        "mean_compound" - average VADER compound score per sentence, plus 1;
        "mean_positive" - average VADER "pos" score per sentence;
        "wordcount" - number of tokens whose lowercase form is in
        ``top_100_positive``.
        Text that yields no sentences gets the neutral values 1.0, 0.0 and 0
        instead of raising StatisticsError.
    """
    wordcount = 0
    compound_scores = []
    positive_scores = []

    for sentence in nltk.sent_tokenize(text):
        wordcount += sum(
            word.lower() in top_100_positive
            for word in nltk.word_tokenize(sentence)
        )
        # Score each sentence once and read both values from that one result
        # rather than running the analyzer twice on the same sentence.
        sentence_scores = sia.polarity_scores(sentence)
        compound_scores.append(sentence_scores["compound"])
        positive_scores.append(sentence_scores["pos"])

    # mean() raises StatisticsError on empty data, so fall back to 0.0.
    mean_compound = mean(compound_scores) if compound_scores else 0.0
    mean_positive = mean(positive_scores) if positive_scores else 0.0

    return {
        # Adding 1 to the final compound score to always have positive numbers
        # since some classifiers used later don't work with negative numbers.
        "mean_compound": mean_compound + 1,
        "mean_positive": mean_positive,
        "wordcount": wordcount,
    }

In [63]:
#In order to train and evaluate a classifier, we’ll need to build a list of features for each text you’ll analyze
# One (feature dict, label) pair per review: every "pos" review first, then
# every "neg" review.
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), label)
    for label in ("pos", "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=[label])
]

In [64]:
"""Training the classifier involves splitting the feature set so that one portion can be
used for training and the other for evaluation, then calling .train()"""
# Use 1/4 of the set for training
train_count = len(features) // 4
# Shuffle in place before slicing: `features` was built with every "pos"
# entry ahead of every "neg" entry. No seed is set, so results vary per run.
shuffle(features)
# Train on the first quarter; the remaining entries are kept for evaluation.
classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier.show_most_informative_features(10)

Most Informative Features
               wordcount = 2                 pos : neg    =      3.2 : 1.0
               wordcount = 4                 pos : neg    =      3.0 : 1.0
               wordcount = 0                 neg : pos    =      1.6 : 1.0
               wordcount = 1                 pos : neg    =      1.5 : 1.0


In [65]:
# Accuracy of the trained classifier on the entries not used for training.
nltk.classify.accuracy(classifier, features[train_count:])

0.6633333333333333

In [None]:
"""To classify new data, find a movie review somewhere and pass it to
classifier.classify(). we can also use extract_features() to tell us
exactly how it was scored"""
# Stand-in review text; replace it with any review you want to classify.
new_review = (
    "A warm, funny and beautifully acted film. "
    "I enjoyed every minute of it."
)
# classifier.classify() expects the feature dict that extract_features()
# produces, not the raw text, so the features are extracted first.
new_review_features = extract_features(new_review)
print(classifier.classify(new_review_features))
# Display the features that led to that label.
new_review_features

In [67]:
#Comparing Additional Classifiers
#The following classifiers are a subset of all classifiers available to you. These will work within NLTK for sentiment analysis
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [68]:
#To aid in accuracy evaluation, it’s helpful to have a mapping of classifier names and their instances
# Name -> instance mapping used for the accuracy comparison; each key is the
# estimator's own class name.
classifiers = {
    type(estimator).__name__: estimator
    for estimator in (
        BernoulliNB(),
        ComplementNB(),
        MultinomialNB(),
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        LogisticRegression(),
        MLPClassifier(max_iter=1000),
        AdaBoostClassifier(),
    )
}

In [69]:
#Using scikit-learn Classifiers With NLTK
# Rebuild the (feature dict, label) pairs: every "pos" review first, then
# every "neg" review.
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), label)
    for label in ("pos", "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=[label])
]

In [70]:
# `features` lists every positive review before any negative one, so it is
# shuffled first. Each scikit-learn classifier is then trained on the first
# quarter of the list and scored on the rest.
train_count = len(features) // 4
shuffle(features)
for name in classifiers:
    estimator = classifiers[name]
    classifier = nltk.classify.SklearnClassifier(estimator)
    classifier.train(features[:train_count])
    accuracy = nltk.classify.accuracy(classifier, features[train_count:])
    print(F"{accuracy:.2%} - {name}")

66.27% - BernoulliNB
65.73% - ComplementNB
65.00% - MultinomialNB
68.87% - KNeighborsClassifier
62.73% - DecisionTreeClassifier
68.67% - RandomForestClassifier
71.80% - LogisticRegression
72.47% - MLPClassifier
69.53% - AdaBoostClassifier


