In [10]:
import pandas as pd
import numpy as np

In [11]:
#Load the full collection of World Cup tweets
WorldCup_tweets = pd.read_csv('WCSemi_Sentiment_AllTweets.csv', encoding='latin1')
WorldCup_tweets.drop('Unnamed: 0', axis=1, inplace=True)
WorldCup_tweets.head(n=5)

Unnamed: 0,screenName,userId,text,location,multi-team
0,Shrupti,60885990,Its coming home #England https://t.co/jGuIRBj46I,"Arlington, VA",False
1,PropSwap,2862104585,SOLD! This Croatia to Win the #WorldCup ticket...,"Enterprise, NV",False
2,MentorPlanet,86502570,#France is pledging 1.5 billion pure governme...,"Minneapolis, MN",False
3,NFLMarquise,907610098801025024,.@MDbankroll Thanks to you for being my 300th ...,"Moon, PA",False
4,FassiCarlo,915223567,.@GianluigiBuffon says France is the most ser...,"Winter Haven, FL",False


In [12]:
#load the classifier
import pickle
f = open('WorldCup_tweet_classifier.pickle', 'rb')
classifier = pickle.load(f)
f.close()

#load the list of features used in the classifier
with open('WorldCup_classifier_feats.pickle', 'rb') as f:
    word_features = pickle.load(f)

In [13]:
#Load the required modules
import nltk
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
import itertools
import string

#Set of punctuation to exclude from unigrams and bigrams
exclude = set(string.punctuation)

#List to exclude words that can identify the team from list of features
excluded_words = ['england','belgium','france','croatia',\
                  'eng','bel','fra','cro',\
                  'itscominghome','lions','bleus','devils','blues']


#Function that provides a list of filtered unigrams and bigrams from each tweet
def filter_tweets(tweet_text):
    words_filtered=[]

    #For each word in the tweet, filter on our feature requirements.
    for word in tweet_text.split(): 

        #Remove punctuation
        word = ''.join(ch for ch in word if ch not in exclude)

        #Remove one letter words
        if len(word) >= 1: 

                #treat URLs the same
                if word[:4] == 'http':
                    word='http'

                #remove hashtags
                if word[0] == '#': 
                    word=word[1:]

                #remove team identifiers
                if (word.lower() not in excluded_words):

                    #require lower case
                    words_filtered.append(word.lower()) 
    
    #If the word list contains only duplicates of one word, it causes problems for bigram finder
    #In this case, don't bother trying to find bigrams, just find the unigram since there are no bigrams anyway
    if len(set(words_filtered)) == 1:
        tweet_feats = words_filtered[0]
    else:  
        #Identify top 200 bigams in the filtered word list using chi_sq measure of importance
        bigram_finder = BigramCollocationFinder.from_words(words_filtered)
        bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 200)  

        tweet_feats = [ngram for ngram in itertools.chain(words_filtered, bigrams)]

    return tweet_feats


#Feature extractor - determines which word features are in each tweet
def extract_features(filtered_tweet):

    #list of unigrams and bigrams in the tweet
    filtered_tweet_words = set(filtered_tweet)
    
    #Define a features dictionary
    features = {}

    #Loop of all word features
    for word in word_features:
        
        #Set 'contains(word_feature)' as a key in the dictionary
        #Set the value for that key to True or False
        features['contains(%s)' % str(word)] = (word in filtered_tweet_words)

    #Return the final features dictionary for that tweet
    return features

In [14]:
#Classifying tweets
def ClassifyTweets(dFrame):
    for row in range(len(dFrame)):
        
        #Get filtered unigrams and bigrams
        filtered_text = filter_tweets(dFrame.loc[row,'text'])
        
        #Evaluate the contains(word) statements
        tweet_feats = extract_features(filtered_text)
        
        #Add result to new sentimenet column in the dataframe
        dFrame.loc[row,'sentiment']=classifier.classify(tweet_feats) 

In [15]:
ClassifyTweets(WorldCup_tweets)
WorldCup_tweets.head(n=5)

Unnamed: 0,screenName,userId,text,location,multi-team,sentiment
0,Shrupti,60885990,Its coming home #England https://t.co/jGuIRBj46I,"Arlington, VA",False,positive
1,PropSwap,2862104585,SOLD! This Croatia to Win the #WorldCup ticket...,"Enterprise, NV",False,positive
2,MentorPlanet,86502570,#France is pledging 1.5 billion pure governme...,"Minneapolis, MN",False,positive
3,NFLMarquise,907610098801025024,.@MDbankroll Thanks to you for being my 300th ...,"Moon, PA",False,positive
4,FassiCarlo,915223567,.@GianluigiBuffon says France is the most ser...,"Winter Haven, FL",False,positive


In [16]:
#Creating a new data frame where we do not apply sentiment analysis
WorldCup_tweets_nosenti = WorldCup_tweets

#Applying the positive sentiment restriction on the original dataframe
WorldCup_tweets = WorldCup_tweets[WorldCup_tweets['sentiment'] == 'positive']
WorldCup_tweets = WorldCup_tweets.reset_index(drop=True)

In [17]:
def FindUniqueUsers(dFrame):
    unique_users=[]
    
    #Loop through the tweets in our dataframe
    for row in range(len(dFrame)):
        
        #If we have already seen this Twitter user, flag the tweet as a repeat
        if dFrame.loc[row,'screenName'] in unique_users:
            dFrame.loc[row,'repeatUser'] = True
            
        #Otherwise, add the user to a list so we can identify if they tweet again 
        else:
            dFrame.loc[row,'repeatUser'] = False
            unique_users.append(dFrame.loc[row,'screenName'])

In [18]:
#Apply the unique user function to both of our data frames
FindUniqueUsers(WorldCup_tweets_nosenti)
FindUniqueUsers(WorldCup_tweets)

In [19]:
#Remove all but the first tweet from each user in our data frames
WorldCup_tweets = WorldCup_tweets[WorldCup_tweets['repeatUser'] == False]
WorldCup_tweets_nosenti = WorldCup_tweets_nosenti[WorldCup_tweets_nosenti['repeatUser'] == False]

#Reindex the data frames
WorldCup_tweets = WorldCup_tweets.reset_index(drop=True)
WorldCup_tweets_nosenti = WorldCup_tweets_nosenti.reset_index(drop=True)

In [20]:
#Dictionary of state abbreviations and names
states = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}

#Create a list of state abbreviations
state_abbrevs=[state for state in states] 

#Create a list of state names
state_names=[states[state] for state in states] 

In [21]:
#Function to determine the state that each tweet originated in
def GetTweetState(dFrame):
    
    #Get a list of all the tweet's locations from our dataframe
    loc_list=list(dFrame['location'])
    
    state_list=[]
    
    #Loop over the location info for each tweet
    for info in loc_list:
        
        #See if a state abbreviation or name is in the location information
        new_state = [state for state in state_names if state in info] + \
                    [states[state] for state in states if state in info]

        #Handle cases like "New York, NY" and "Alabama, New York"
        if (len(new_state) > 1) and new_state[0] != 'District of Columbia':
            new_state = new_state[1].split('junk')
            
        #Handle "District of Columbia, Washington"     
        elif len(new_state) > 1 and new_state[0] == 'District of Columbia':
            new_state = new_state[0].split('junk')
            
        #Handle cases where no state is mentioned    
        if not new_state:
            new_state = ['None']

        #After we determine the state, add it to a list
        state_list+=new_state
     
    #Once we have the entire list of states, add it to the dataframe
    dFrame['state']=state_list

In [23]:
GetTweetState(WorldCup_tweets)
GetTweetState(WorldCup_tweets_nosenti)

WorldCup_tweets.tail(n=3)

Unnamed: 0,screenName,userId,text,location,multi-team,sentiment,repeatUser,state
4592,jenvargas,14707086,#WorldCup! BELvJPN (@ #worldbunnydomination HQ...,"Williamsburg, FL",False,positive,False,Florida
4593,daviddesola,22278349,Yup... This is (theoretically) the easiest pat...,"Los Angeles, CA",False,positive,False,California
4594,BTS_NoonasRock,23631270,@DungNgu33711964 @FIFAWorldCup Please remember...,"Severn, MD",False,positive,False,Maryland


In [24]:
#stateInfo and stateInfo_ns
def MakeStateInfo(dFrame):
    dFrame['State']=states.values()
    dFrame['Num Tweets'] = 0
    dFrame['Num France'] = 0
    dFrame['Num Belgium'] = 0
    dFrame['Num England'] = 0
    dFrame['Num Croatia'] = 0

In [25]:
stateInfo = pd.DataFrame()
stateInfo_nosenti = pd.DataFrame()

MakeStateInfo(stateInfo)
MakeStateInfo(stateInfo_nosenti)

stateInfo.head(n=5)

Unnamed: 0,State,Num Tweets,Num France,Num Belgium,Num England,Num Croatia
0,Alaska,0,0,0,0,0
1,Alabama,0,0,0,0,0
2,Arkansas,0,0,0,0,0
3,Arizona,0,0,0,0,0
4,California,0,0,0,0,0


In [26]:
stateInfo = stateInfo.set_index(['State'])
stateInfo_nosenti = stateInfo_nosenti.set_index(['State'])

stateInfo.head(n=5)

Unnamed: 0_level_0,Num Tweets,Num France,Num Belgium,Num England,Num Croatia
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alaska,0,0,0,0,0
Alabama,0,0,0,0,0
Arkansas,0,0,0,0,0
Arizona,0,0,0,0,0
California,0,0,0,0,0


In [27]:
#Lists of team identifying phrases
france_text = ['#france','#lesbleus','les bleus','france','#fra']
belgium_text = ['#belgium','#reddevils','red devils','belgium','#bel']
england_text = ['#england','#threelions','three lions','england','#itscominghome','#eng']
croatia_text = ['#croatia','#hrvatska','hrvatska','croatia','#cro']

#Function to populate the stateInfo data frame
def PopStateInfo(state_dFrame,tweet_dFrame):
    
    #Loop over the tweet collection
    for idx in range(len(tweet_dFrame)):
        
        #If we have state information for the tweet
        if tweet_dFrame.loc[idx,'state'] != 'None':
            
            #Search for the team identifying phrases and
            #Add one count to that team in state where the tweet came from 
            if any(hashtag in tweet_dFrame.loc[idx,'text'].lower() for hashtag in france_text):
                state_dFrame.loc[tweet_dFrame.loc[idx,'state'],'Num France'] += 1
            elif any(hashtag in tweet_dFrame.loc[idx,'text'].lower() for hashtag in belgium_text):
                state_dFrame.loc[tweet_dFrame.loc[idx,'state'],'Num Belgium'] += 1
            elif any(hashtag in tweet_dFrame.loc[idx,'text'].lower() for hashtag in england_text):
                state_dFrame.loc[tweet_dFrame.loc[idx,'state'],'Num England'] += 1
            elif any(hashtag in tweet_dFrame.loc[idx,'text'].lower() for hashtag in croatia_text):
                state_dFrame.loc[tweet_dFrame.loc[idx,'state'],'Num Croatia'] += 1

    #After we've process all tweets
    #Count all the tweets in each state by adding france, belgium, england, croatia
    state_dFrame['Num Tweets'] = state_dFrame.sum(axis=1)

In [28]:
PopStateInfo(stateInfo,WorldCup_tweets)
PopStateInfo(stateInfo_nosenti,WorldCup_tweets_nosenti)

stateInfo.head(n=5)

Unnamed: 0_level_0,Num Tweets,Num France,Num Belgium,Num England,Num Croatia
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alaska,3,0,1,0,2
Alabama,15,2,2,5,6
Arkansas,7,0,2,1,4
Arizona,38,1,10,11,16
California,394,36,80,126,152


In [29]:
#Function to populate the stateInfo data frame
def CalcDominance(dFrame):
    dFrame['Fra Frac']=dFrame['Num France']/dFrame['Num Tweets']
    dFrame['Bel Frac']=dFrame['Num Belgium']/dFrame['Num Tweets']
    dFrame['Eng Frac']=dFrame['Num England']/dFrame['Num Tweets']
    dFrame['Cro Frac']=dFrame['Num Croatia']/dFrame['Num Tweets']

In [30]:
CalcDominance(stateInfo)
CalcDominance(stateInfo_nosenti)

stateInfo.head(n=5)

Unnamed: 0_level_0,Num Tweets,Num France,Num Belgium,Num England,Num Croatia,Fra Frac,Bel Frac,Eng Frac,Cro Frac
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alaska,3,0,1,0,2,0.0,0.333333,0.0,0.666667
Alabama,15,2,2,5,6,0.133333,0.133333,0.333333,0.4
Arkansas,7,0,2,1,4,0.0,0.285714,0.142857,0.571429
Arizona,38,1,10,11,16,0.026316,0.263158,0.289474,0.421053
California,394,36,80,126,152,0.091371,0.203046,0.319797,0.385787


In [31]:
def PopErrInfo(dFrame,sys_err):
    
    #Systematic errors from negative tweet contamination
    f_sys_err = sys_err*dFrame['Num France']
    b_sys_err = sys_err*dFrame['Num Belgium']
    e_sys_err = sys_err*dFrame['Num England']
    c_sys_err = sys_err*dFrame['Num Croatia']

    #Statistical errors from Poisson counting
    f_stat_err = dFrame['Num France'] ** (1/2)
    b_stat_err = dFrame['Num Belgium'] ** (1/2)
    e_stat_err = dFrame['Num England'] ** (1/2)
    c_stat_err = dFrame['Num Croatia'] ** (1/2)

    #Total errors
    f_tot_err = (f_stat_err ** 2 + f_sys_err **2) ** (1/2)
    b_tot_err = (b_stat_err ** 2 + b_sys_err **2) ** (1/2)
    e_tot_err = (e_stat_err ** 2 + e_sys_err **2) ** (1/2)
    c_tot_err = (c_stat_err ** 2 + c_sys_err **2) ** (1/2)


    #Propagation of total errors to the team dominance metric
    dFrame['F Frac Err']= ( (f_tot_err ** 2)*(dFrame['Num Belgium'] + dFrame['Num England'] + dFrame['Num Croatia'])**2 + \
                                dFrame['Num France']**2 * (b_tot_err**2 + e_tot_err + c_tot_err**2) ) ** (1/2) / \
                             (dFrame['Num France'] + dFrame['Num Belgium'] + dFrame['Num England'] + dFrame['Num Croatia'])**2

    dFrame['B Frac Err']= ( (b_tot_err ** 2)*(dFrame['Num France'] + dFrame['Num England'] + dFrame['Num Croatia'])**2 + \
                                dFrame['Num Belgium']**2 * (f_tot_err**2 + e_tot_err + c_tot_err**2) ) ** (1/2) / \
                             (dFrame['Num France'] + dFrame['Num Belgium'] + dFrame['Num England'] + dFrame['Num Croatia'])**2


    dFrame['E Frac Err']= ( (e_tot_err ** 2)*(dFrame['Num France'] + dFrame['Num Belgium'] + dFrame['Num Croatia'])**2 + \
                                dFrame['Num England']**2 * (f_tot_err**2 + b_tot_err + c_tot_err**2) ) ** (1/2) / \
                             (dFrame['Num France'] + dFrame['Num Belgium'] + dFrame['Num England'] + dFrame['Num Croatia'])**2
        
    dFrame['C Frac Err']= ( (c_tot_err ** 2)*(dFrame['Num France'] + dFrame['Num Belgium'] + dFrame['Num England'])**2 + \
                                dFrame['Num Croatia']**2 * (f_tot_err**2 + b_tot_err + e_tot_err**2) ) ** (1/2) / \
                             (dFrame['Num France'] + dFrame['Num Belgium'] + dFrame['Num England'] + dFrame['Num Croatia'])**2

In [32]:
sys_err_nosenti= 0.0892
sys_err_senti  = 0.0240

PopErrInfo(stateInfo,sys_err_senti)
PopErrInfo(stateInfo_nosenti,sys_err_nosenti)

In [33]:
stateInfo.head(n=5)

Unnamed: 0_level_0,Num Tweets,Num France,Num Belgium,Num England,Num Croatia,Fra Frac,Bel Frac,Eng Frac,Cro Frac,F Frac Err,B Frac Err,E Frac Err,C Frac Err
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Alaska,3,0,1,0,2,0.0,0.333333,0.0,0.666667,0.0,0.27227,0.0,0.272244
Alabama,15,2,2,5,6,0.133333,0.133333,0.333333,0.4,0.086574,0.086574,0.120689,0.125016
Arkansas,7,0,2,1,4,0.0,0.285714,0.142857,0.571429,0.0,0.170864,0.131383,0.176425
Arizona,38,1,10,11,16,0.026316,0.263158,0.289474,0.421053,0.025905,0.069019,0.071058,0.074963
California,394,36,80,126,152,0.091371,0.203046,0.319797,0.385787,0.014472,0.019979,0.02328,0.023986


In [41]:
#relative error = frac_err/frac
def AverageRelErr(dFrame):
    row_errs = []
    for row in range(len(dFrame)): 
        rel_errs = []
        if dFrame.loc[row,'Fra Frac'] != 0:
            rel_errs.append(dFrame.loc[row,'F Frac Err']/dFrame.loc[row,'Fra Frac'])

        if dFrame.loc[row,'Bel Frac'] != 0:
            rel_errs.append(dFrame.loc[row,'B Frac Err']/dFrame.loc[row,'Bel Frac'])

        if dFrame.loc[row,'Eng Frac'] != 0:
            rel_errs.append(dFrame.loc[row,'E Frac Err']/dFrame.loc[row,'Eng Frac'])
            
        if dFrame.loc[row,'Cro Frac'] != 0:
            rel_errs.append(dFrame.loc[row,'C Frac Err']/dFrame.loc[row,'Cro Frac'])

        row_errs.append(np.average(rel_errs))
    return row_errs

In [43]:
#stateInfo['Average Rel Err']=AverageRelErr(stateInfo)
#stateInfo_nosenti['Average Rel Err']=AverageRelErr(stateInfo_nosenti)

stateInfo.to_csv('stateInfo.csv')
stateInfo_nosenti.to_csv('stateInfo_nosenti.csv')