In [1]:
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\sarth\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sarth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sarth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sarth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sarth\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
from nltk.corpus import twitter_samples
import random

positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

tweets = positive_tweets + negative_tweets
labels = ['Positive'] * len(positive_tweets) + ['Negative'] * len(negative_tweets)

combined = list(zip(tweets, labels))
random.shuffle(combined)
tweets, labels = zip(*combined)

In [3]:
from nltk.tokenize import word_tokenize

sample_text = "This is a very good test message!"
tokens = word_tokenize(sample_text)
print(tokens)

['This', 'is', 'a', 'very', 'good', 'test', 'message', '!']


In [4]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word.lower() not in stop_words]

filtered_tokens = remove_stopwords(tokens)
print(filtered_tokens)


['good', 'test', 'message', '!']


In [5]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print(stemmed_tokens)
print(lemmatized_tokens)

['good', 'test', 'messag', '!']
['good', 'test', 'message', '!']


In [15]:
from nltk.probability import FreqDist

all_words = [word.lower() for tweet in tweets for word in word_tokenize(tweet)]
all_words_freq = FreqDist(all_words)

word_features = list(all_words_freq.keys())[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

feature_sets = [(document_features(word_tokenize(tweet)), label) for (tweet, label) in zip(tweets, labels)]
train_set, test_set = feature_sets[700:], feature_sets[:700]


In [16]:
from nltk.classify import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_set)

In [17]:
import nltk.classify.util

accuracy = nltk.classify.util.accuracy(classifier, test_set)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 99.29%


In [18]:
classifier.show_most_informative_features(10)

Most Informative Features
             contains()) = True           Positi : Negati =     65.8 : 1.0
             contains(() = True           Negati : Positi =     63.1 : 1.0
           contains(sad) = True           Negati : Positi =     29.1 : 1.0
          contains(miss) = True           Negati : Positi =     21.2 : 1.0
     contains(community) = True           Positi : Negati =     19.8 : 1.0
       contains(arrived) = True           Positi : Negati =     17.7 : 1.0
          contains(blog) = True           Positi : Negati =     15.8 : 1.0
          contains(lost) = True           Negati : Positi =     14.2 : 1.0
         contains(loves) = True           Positi : Negati =     12.8 : 1.0
           contains(via) = True           Positi : Negati =     12.5 : 1.0


In [19]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
text = "this is amazing!"
sentiment_scores = sid.polarity_scores(text)
print(sentiment_scores)


{'neg': 0.0, 'neu': 0.328, 'pos': 0.672, 'compound': 0.6239}


In [28]:
# Classifying a new sentence using the trained classifier
test_sentence = "this is a missed opportunity"
test_features = document_features(word_tokenize(test_sentence))
classification = classifier.classify(test_features)
print(classification)


Positive


In [None]:
# Visualizing Sentiment Scores
import matplotlib.pyplot as plt
import seaborn as sns

def plot_sentiment_scores(sentiment_scores):
    plt.figure(figsize=(8, 4))
    sns.barplot(x=list(sentiment_scores.keys()), y=list(sentiment_scores.values()))
    plt.title('Sentiment Scores')
    plt.ylabel('Score')
    plt.show()

plot_sentiment_scores(sentiment_scores)
