In [48]:
import json
import pandas as pd

In [49]:
WorldCup_labeled = pd.read_csv('WCSemi_Sentiment_Labeled.csv', encoding='latin1', header=0)

In [50]:
WorldCup_labeled.head(n=10)

Unnamed: 0,id,screenName,userId,text,location,multi-team,sentiment
0,0,Shrupti,60885990.0,Its coming home #England https://t.co/jGuIRBj46I,"Arlington, VA",False,pos
1,1,PropSwap,2862105000.0,SOLD! This Croatia to Win the #WorldCup ticket...,"Enterprise, NV",False,
2,2,MentorPlanet,86502570.0,#France is pledging 1.5 billion pure governme...,"Minneapolis, MN",False,
3,3,NFLMarquise,9.08e+17,.@MDbankroll Thanks to you for being my 300th ...,"Moon, PA",False,
4,4,FassiCarlo,915223600.0,.@GianluigiBuffon says France is the most ser...,"Winter Haven, FL",False,pos
5,5,Caitlin11790,916015200.0,People at my work are emailing and calling ask...,"Clinton, MI",False,pos
6,6,faustaLV,260340700.0,??Pure moment of love ??#motherandson #love #m...,"Las Vegas, NV",False,
7,7,Miiikey,15495590.0,"@DannyWelbeck Welbz, how many retweets for a W...","San Diego, CA",False,pos
8,8,TheRealPolyG,539082200.0,These World Snooze off days have been throwing...,"Philadelphia, PA",False,
9,9,vivathematadors,17922360.0,We made it #worldcup https://t.co/tIxmyhAbGz,"Lubbock, TX",False,


In [51]:
#Use list comprehensions to make lists of positive and negative tweets
pos_tweets = [(WorldCup_labeled.loc[row,'text'],'positive') for row in range(len(WorldCup_labeled)) if \
              WorldCup_labeled.loc[row,'sentiment'] == 'pos']

neg_tweets = [(WorldCup_labeled.loc[row,'text'],'negative') for row in range(len(WorldCup_labeled)) if \
              WorldCup_labeled.loc[row,'sentiment'] == 'neg']

print('Number of tweets labeled positive: %d' % len(pos_tweets))
print('Number of tweets labeled negative: %d' % len(neg_tweets))

Number of tweets labeled positive: 544
Number of tweets labeled negative: 15


In [52]:
#half the negative tweets go in training
#Downsampling the positive tweets at 1 pos:1 neg
len_train = int(round(len(neg_tweets)/2)*2)
train_tweets = neg_tweets[:int(len_train/2)] + pos_tweets[:int(len_train/2)]

#half of the remaining half go in cv
cv_neg_cutoff = int( (len_train/2) + round((len(neg_tweets) - len_train/2)/2) )
cv_pos_cutoff = int( (len_train/2) + round((len(pos_tweets) - len_train/2)/2) )
cv_tweets =  neg_tweets[int(len_train/2):cv_neg_cutoff] +  pos_tweets[int(len_train/2):cv_pos_cutoff]  

#rest go into testing
test_tweets = neg_tweets[cv_neg_cutoff:] +  pos_tweets[cv_pos_cutoff:]

In [53]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\patrick.smyth.TPLGIS\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package stopwords is already up-to-date!


True

In [54]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

In [55]:
from nltk.sentiment.util import mark_negation

In [56]:
example = "I do not like #England".split()
nltk.sentiment.util.mark_negation(example)

['I', 'do', 'not', 'like_NEG', '#England_NEG']

In [57]:
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
import string
import itertools

In [58]:
#Set to exclude punctuation marks
exclude = set(string.punctuation)

#List to exclude words that can identify the team from list of features
excluded_words = ['england','belgium','france','croatia',\
                  'eng','bel','fra','cro',\
                  'itscominghome','lions','bleus','devils','blues']

#Function that provides a list of filtered unigrams and bigrams from each tweet
def filter_tweets(tweets):
    filtered_tweets = []
    
    #Get a list of words, and the sentiment for each tweet
    for (words, sentiment) in tweets: 
        words_filtered=[]
        
        #For each word in the list of words, filter on our feature requirements. 
        for word in words.split(): 
            
            #Remove punctuation
            word = ''.join(ch for ch in word if ch not in exclude)

            #Remove zero letter "words"
            if len(word) >= 1: 
                
                    #treat URLs the same
                    if word[:4] == 'http':
                        word='http'
                        
                    #remove hashtags
                    if word[0] == '#': 
                        word=word[1:]
                        
                    #remove team identifiers
                    if (word.lower() not in excluded_words):
        
                        #require lower case
                        words_filtered.append(word.lower()) 

        #Identify top 200 bigams in the filtered word list using chi_sq measure of importance
        bigram_finder = BigramCollocationFinder.from_words(words_filtered)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 200)      

        #Add the final bigrams and unigrams for this tweet to the filtered list
        filtered_tweets.append(([ngram for ngram in itertools.chain(words_filtered, bigrams)],sentiment))

    #Return the filtered list for all tweets
    return filtered_tweets

In [59]:
#Filter each set of data
train_tweets = filter_tweets(train_tweets)
cv_tweets = filter_tweets(cv_tweets)
test_tweets = filter_tweets(test_tweets)

In [60]:
train_tweets[9]

(['gianluigibuffon',
  'says',
  '\x93france',
  'is',
  'the',
  'most',
  'serious',
  'semifinalist',
  'capable',
  'of',
  'winning',
  'the',
  'fifaworldcup\x94',
  'based',
  'on',
  'thei',
  'http',
  ('based', 'on'),
  ('capable', 'of'),
  ('fifaworldcup\x94', 'based'),
  ('gianluigibuffon', 'says'),
  ('most', 'serious'),
  ('of', 'winning'),
  ('on', 'thei'),
  ('says', '\x93france'),
  ('semifinalist', 'capable'),
  ('serious', 'semifinalist'),
  ('thei', 'http'),
  ('\x93france', 'is'),
  ('is', 'the'),
  ('the', 'fifaworldcup\x94'),
  ('the', 'most'),
  ('winning', 'the')],
 'positive')

In [61]:
#Function that builds a list of features from the list of unigrams and bigrams
#Requires each unigram or bigram to show up some minimum number of times to be considered a feature
def get_word_features(tweets,min_freq):

    #Create a list of ALL unigrams and bigrams
    wordlist = []
    for (words, sentiment) in tweets:
        wordlist.extend(words)
    
    #Count the frequency of each unigram and bigram
    wordlist = nltk.FreqDist(wordlist)
    
    #Sort the list of unigrams and bigrams based on frequency
    sorted_word_list = sorted(wordlist.items(), key=lambda x: x[1], reverse=True)
    
    #Only include the unigrams and bigrams as features if they appear at least min_freq times
    word_features = [sorted_word_list[word][0] for word in \
    	range(len(sorted_word_list)) if sorted_word_list[word][1] >= min_freq]
    
    #Return the list of features
    return word_features

word_features = get_word_features(train_tweets,3)

In [62]:
#Feature extractor - determines which word features are in each tweet
def extract_features(filtered_tweet):

    #list of unigrams and bigrams in the tweet
    filtered_tweet_words = set(filtered_tweet)
    
    #Define a features dictionary
    features = {}

    #Loop of all word features
    for word in word_features:
        
        #Set 'contains(word_feature)' as a key in the dictionary
        #Set the value for that key to True or False
        features['contains(%s)' % str(word)] = (word in filtered_tweet_words)

    #Return the final features dictionary for that tweet
    return features

In [63]:
#Extract features from each tweets
training_set = nltk.classify.apply_features(extract_features, train_tweets)
cv_set = nltk.classify.apply_features(extract_features, cv_tweets)
test_set = nltk.classify.apply_features(extract_features, test_tweets)

In [64]:
tweet_number=1
training_set[tweet_number][0]

{"contains(('coming', 'home'))": False,
 "contains(('worldcup', 'http'))": False,
 'contains(a)': True,
 'contains(coming)': False,
 'contains(fans)': False,
 'contains(home)': False,
 'contains(http)': False,
 'contains(is)': True,
 'contains(of)': False,
 'contains(on)': False,
 'contains(the)': False,
 'contains(win)': False,
 'contains(world)': False,
 'contains(worldcup)': False}

In [65]:
#Train the classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [66]:
from nltk.metrics import precision as prec
from nltk.metrics import recall as rec
from nltk.metrics import f_measure as fmeas
import collections

In [67]:
#Function to evaluate our classifier on the given subset of data
def eval_classifier(data_set):

    #Use .accuracy method to calculate accuracy
    cross_valid_accuracy = nltk.classify.accuracy(classifier, data_set)

    #Create two sets which we'll use to count positive and negative tweets
    ref_set = collections.defaultdict(set)
    obs_set = collections.defaultdict(set)


    #Loop over each tweet in our cross validation set
    for i, (feats, label) in enumerate(data_set):

        #Classify the tweet by feeding the classifier the tweet's features
        observed = classifier.classify(feats)

        #Add the current tweet to the "reference" set under the actual class
        ref_set[label].add(i)

        #Add the current tweet to the "observation" set under the predicted class
        obs_set[observed].add(i)


    #Calculate F score, precision, an recall for positive and negative labels
    #Also calculate accuracy and NTC improvement
    print ('Accuracy:', cross_valid_accuracy)
    print ('F-measure [negative]:', fmeas(ref_set['negative'], obs_set['negative']))
    print ('F-measure [positive]:', fmeas(ref_set['positive'], obs_set['positive']))
    print ('Precision [negative]:', prec(ref_set['negative'], obs_set['negative']))
    print ('Precision [positive]:', prec(ref_set['positive'], obs_set['positive']))
    rec_neg=rec(ref_set['negative'], obs_set['negative'])
    rec_pos=rec(ref_set['positive'], obs_set['positive'])
    print ('Recall [negative]:', rec_neg)
    print ('Recall [positive]:', rec_pos)
    total_neg=len(neg_tweets)
    total_pos=len(pos_tweets)
    ntc_improvement = 100*((total_pos + total_neg)/total_neg)*( (total_neg/(total_neg+total_pos)) - \
    	(total_neg*(1-rec_neg))/(total_neg*(1-rec_neg) + total_pos*rec_pos))
    print ('Negative contamination improved by ', ntc_improvement, 'percent')

In [68]:
eval_classifier(cv_set)

Accuracy: 0.8860294117647058
F-measure [negative]: 0.06060606060606061
F-measure [positive]: 0.939334637964775
Precision [negative]: 0.034482758620689655
Precision [positive]: 0.9876543209876543
Recall [negative]: 0.25
Recall [positive]: 0.8955223880597015
Negative contamination improved by  15.88321167883211 percent


In [69]:
#Show the 5 most important features of our classifier
print (classifier.show_most_informative_features(5))

Most Informative Features
contains(('worldcup', 'http')) = True           negati : positi =      3.0 : 1.0
          contains(fans) = False          positi : negati =      2.4 : 1.0
            contains(is) = True           positi : negati =      2.3 : 1.0
      contains(worldcup) = False          positi : negati =      1.8 : 1.0
            contains(on) = True           positi : negati =      1.7 : 1.0
None


In [70]:
import pickle

#Save the classifier for later use
f = open('WorldCup_tweet_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

#Save document_words as well
with open('WorldCup_classifier_feats.pickle', 'wb') as f:
    pickle.dump(word_features, f)

In [71]:
#Evaluating classifier on test set
eval_classifier(test_set)

Accuracy: 0.9188191881918819
F-measure [negative]: 0
F-measure [positive]: 0.9576923076923076
Precision [negative]: 0.0
Precision [positive]: 0.9880952380952381
Recall [negative]: 0.0
Recall [positive]: 0.9291044776119403
Negative contamination improved by  -7.410593937308235 percent


In [72]:
#Calculating initial and final negative contamination
total_neg=len(neg_tweets)
total_pos=len(pos_tweets)

print('Original systematic error from uncut negative tweets was: ', \
      round(10000*(total_neg/(total_neg + total_pos)))/100, 'percent')
      
print('Improved systematic error from uncut negative tweets is: ', \
      round((1-0.714)*10000*(total_neg/(total_neg + total_pos)))/100, 'percent')

Original systematic error from uncut negative tweets was:  2.68 percent
Improved systematic error from uncut negative tweets is:  0.77 percent
