## 基于机器学习的情感分析

注：graphlab 不支持32位系统

In [2]:
import nltk

pos_tweets = [('I love this car', 'positive'),
    ('This view is amazing', 'positive'),
    ('I feel great this morning', 'positive'),
    ('I am so excited about the concert', 'positive'),
    ('He is my best friend', 'positive')]

neg_tweets = [('I do not like this car', 'negative'),
    ('This view is horrible', 'negative'),
    ('I feel tired this morning', 'negative'),
    ('I am not looking forward to the concert', 'negative'),
    ('He is my enemy', 'negative')]

In [3]:
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))
tweets[:2]

[(['love', 'this', 'car'], 'positive'),
 (['this', 'view', 'amazing'], 'positive')]

In [4]:
test_tweets = [
    (['feel', 'happy', 'this', 'morning'], 'positive'),
    (['larry', 'friend'], 'positive'),
    (['not', 'like', 'that', 'man'], 'negative'),
    (['house', 'not', 'great'], 'negative'),
    (['your', 'song', 'annoying'], 'negative')]

In [5]:
# get the word lists of tweets
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

# get the unique word from the word list	
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

word_features = get_word_features(get_words_in_tweets(tweets))
' '.join(word_features)

'forward great like love concert tired this car about morning looking feel amazing friend horrible not the enemy excited best view'

In [6]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [7]:
training_set = nltk.classify.util.apply_features(extract_features, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [8]:
# 定义train
# You may want to know how to define the ‘train’ method in NLTK here:

def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)
    # Create the P(fval|label, fname) distribution
    feature_probdist = {}
    model = NaiveBayesClassifier(label_probdist, feature_probdist)
    return model

In [9]:
tweet_positive = 'Larry is my friend'
print classifier.classify(extract_features(tweet_positive.split()))

positive


In [10]:
tweet_negative = 'Larry is not my friend'
print classifier.classify(extract_features(tweet_negative.split()))

negative


In [11]:
# 另一个方法
# Don’t be too positive, let’s try another example:

tweet_negative2 = 'Your song is annoying'
print classifier.classify(extract_features(tweet_negative2.split()))

positive


In [12]:
def classify_tweet(tweet):
    return classifier.classify(extract_features(tweet)) 
    # nltk.word_tokenize(tweet)

total = accuracy = float(len(test_tweets))

for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1

print('Total accuracy: %f%% (%d/20).' % (accuracy / total * 100, accuracy))

Total accuracy: 80.000000% (4/20).


### nltk分类器

In [13]:
nltk_classifiers = dir(nltk)
for i in nltk_classifiers:
    if 'Classifier' in i:
        print i

ClassifierBasedPOSTagger
ClassifierBasedTagger
ClassifierI
ConditionalExponentialClassifier
DecisionTreeClassifier
MaxentClassifier
MultiClassifierI
NaiveBayesClassifier
PositiveNaiveBayesClassifier
SklearnClassifier
WekaClassifier


In [14]:
from sklearn.svm import LinearSVC
from nltk.classify.scikitlearn import SklearnClassifier
classif = SklearnClassifier(LinearSVC())
svm_classifier = classif.train(training_set)

In [15]:
tweet_negative2 = 'Your song is annoying'
print svm_classifier.classify(extract_features(tweet_negative2.split()))

negative


使用https://github.com/victorneo/Twitter-Sentimental-Analysis 所提供的推特数据进行情感分析

代码 https://github.com/victorneo/Twitter-Sentimental-Analysis/blob/master/classification.py

In [16]:
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier

In [17]:
def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
      all_words.extend(words)
    return all_words

In [18]:
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features


In [19]:

def read_tweets(fname, t_type):
    tweets = []
    f = open(fname, 'r')
    line = f.readline()
    while line != '':
        tweets.append([line, t_type])
        line = f.readline()
    f.close()
    return tweets

In [20]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
      features['contains(%s)' % word] = (word in document_words)
    return features

In [21]:
def classify_tweet(tweet):
    return \
        classifier.classify(extract_features(nltk.word_tokenize(tweet)))

In [22]:
# read in postive and negative training tweets
pos_tweets = read_tweets('F:/test/TwitterSentimentalAnalysisMaster/happy.txt','positive')
neg_tweets = read_tweets('F:/test/TwitterSentimentalAnalysisMaster/sad.txt','negative')

In [23]:
# filter away words that are less than 3 letters to form the training data
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))

In [24]:
# extract the word features out from the training data
word_features = get_word_features(\
                    get_words_in_tweets(tweets))

In [25]:
# get the training set and train the Naive Bayes Classifier
training_set = nltk.classify.util.apply_features(extract_features, tweets)
classifier = NaiveBayesClassifier.train(training_set)


In [26]:
# read in the test tweets and check accuracy
# to add your own test tweets, add them in the respective files
test_tweets = read_tweets('F:/test/TwitterSentimentalAnalysisMaster/happy_test.txt', 'positive')
test_tweets.extend(read_tweets('F:/test/TwitterSentimentalAnalysisMaster/sad_test.txt', 'negative'))
total = accuracy = float(len(test_tweets))

for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1

print('Total accuracy: %f%% (%d/20).' % (accuracy / total * 100, accuracy))

Total accuracy: 90.000000% (18/20).
