In [1]:
# pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\I522400\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\twitter_samples.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\I522400\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\I522400\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\I522400\AppData\Roaming\nltk_data...


True

In [3]:
from nltk.corpus import twitter_samples
import random

# Load twitter samples dataset
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# Combine the datasets and create labels
tweets = positive_tweets + negative_tweets
labels = ['Positive'] * len(positive_tweets) + ['Negative'] * len(negative_tweets)

# Shuffle the dataset
combined = list(zip(tweets, labels))
random.shuffle(combined)
tweets, labels = zip(*combined)

In [20]:
combined

[("@tammysoffices You can! :) Here's a quick rundown: https://t.co/ajnqMJww1e -Mary",
  'Positive'),
 ('IM NOT SCREENSHOTTING IM SO PROUD OF MYSELF :(((', 'Negative'),
 ('I really really really really really really like you :) :)\n\n#PSYGustoKita http://t.co/0fRd6bgkjE',
  'Positive'),
 ("@monolifemusic @DJANORAK I'm getting there! It's been super slow because of the move. Definitely want a Mono Life remix when it's done :-))",
  'Positive'),
 ('@KjorObbinsL @wideglide96 @vivijo86 @beingsomebody11 @sophlicious84 @shaymc3796 @ShaymcSharon @ParaJanitor @kelseabellum  thanks :)',
  'Positive'),
 ('@MSaito6 @rekoinmanila @AdeccoWaytoWork \nI will keep fighting for what i wanted to be :) \n\n#KunoriforCEO #CEO1Month',
  'Positive'),
 ('Deepthroat was a good movie :)) https://t.co/rtz4SsXA0O', 'Positive'),
 ("justinbieber I bet $20 to a friend that you will follow ✧｡ Chelny ｡✧ before the end of the month. Don't disappoint me! :(",
  'Negative'),
 ('@achiralk thanks for the feedback. Here I w

In [4]:
from nltk.tokenize import word_tokenize

sample_text = "NLTK is a powerful library for NLP."
tokens = word_tokenize(sample_text)
print(tokens)

['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP', '.']


In [5]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

filtered_tokens = remove_stopwords(tokens)
print(filtered_tokens)

['NLTK', 'powerful', 'library', 'NLP', '.']


In [6]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print(stemmed_tokens)
print(lemmatized_tokens)

['nltk', 'power', 'librari', 'nlp', '.']
['NLTK', 'powerful', 'library', 'NLP', '.']


In [7]:
from nltk.probability import FreqDist

all_words = [word.lower() for tweet in tweets for word in word_tokenize(tweet)]
all_words_freq = FreqDist(all_words)

# Select the top 2000 words as features
word_features = list(all_words_freq.keys())[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

# Create feature sets for training and testing
feature_sets = [(document_features(word_tokenize(tweet)), label) for (tweet, label) in zip(tweets, labels)]
train_set, test_set = feature_sets[1000:], feature_sets[:1000]

In [8]:
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train_set)

In [9]:
import nltk.classify.util

accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 99.30%


In [10]:
classifier.show_most_informative_features(10)

Most Informative Features
             contains()) = True           Positi : Negati =     68.0 : 1.0
             contains(() = True           Negati : Positi =     64.8 : 1.0
           contains(sad) = True           Negati : Positi =     27.1 : 1.0
          contains(miss) = True           Negati : Positi =     20.4 : 1.0
         contains(loves) = True           Positi : Negati =     18.2 : 1.0
     contains(community) = True           Positi : Negati =     17.6 : 1.0
       contains(arrived) = True           Positi : Negati =     17.0 : 1.0
          contains(blog) = True           Positi : Negati =     15.6 : 1.0
          contains(glad) = True           Positi : Negati =     15.6 : 1.0
        contains(thanks) = True           Positi : Negati =     12.7 : 1.0


In [13]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
text = "NLTK is a great library for text processing!"
sentiment_scores = sid.polarity_scores(text)
print(sentiment_scores)

{'neg': 0.0, 'neu': 0.577, 'pos': 0.423, 'compound': 0.6588}


In [14]:
# Classifying a new sentence using the trained classifier
test_sentence = "This is an amazing movie!"
test_features = document_features(word_tokenize(test_sentence))
classification = classifier.classify(test_features)
print(classification)

Positive
