Skip to content


Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

85 lines (61 sloc) 2.437 kb
An implementation of the Twitter analysis tool mentioned on
happy.txt contains 80 tweets that are happy (positive),
sad.txt contains 80 tweets that are sad (negative).
happy_test.txt and sad_test.txt contains 10 positive
and 10 negative tweets respectively to test the
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
def get_words_in_tweets(tweets):
all_words = []
for (words, sentiment) in tweets:
return all_words
def get_word_features(wordlist):
wordlist = nltk.FreqDist(wordlist)
word_features = wordlist.keys()
return word_features
def read_tweets(fname, t_type):
tweets = []
f = open(fname, 'r')
line = f.readline()
while line != '':
tweets.append([line, t_type])
line = f.readline()
return tweets
def extract_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
def classify_tweet(tweet):
return \
# read in postive and negative training tweets
pos_tweets = read_tweets('happy.txt', 'positive')
neg_tweets = read_tweets('sad.txt', 'negative')
# filter away words that are less than 3 letters to form the training data
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
tweets.append((words_filtered, sentiment))
# extract the word features out from the training data
word_features = get_word_features(\
# get the training set and train the Naive Bayes Classifier
training_set = nltk.classify.util.apply_features(extract_features, tweets)
classifier = NaiveBayesClassifier.train(training_set)
# read in the test tweets and check accuracy
# to add your own test tweets, add them in the respective files
test_tweets = read_tweets('happy_test.txt', 'positive')
test_tweets.extend(read_tweets('sad_test.txt', 'negative'))
total = accuracy = float(len(test_tweets))
for tweet in test_tweets:
if classify_tweet(tweet[0]) != tweet[1]:
accuracy -= 1
print('Total accuracy: %f%% (%d/20).' % (accuracy / total * 100, accuracy))
Jump to Line
Something went wrong with that request. Please try again.