# Tweet Collection, Cleaning, and Sentiment Analysis

## Due to the time it took to collect and process the tweets, it was necessary to run most of this code in a loop to both collect, process, and save the tweets to files

In [None]:
import pandas as pd
import nltk
import twint
import numpy as np
from nltk.corpus import stopwords 
set(stopwords.words('english'))
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import WordPunctTokenizer
from bs4 import BeautifulSoup
import string, re
import sys
import twint
import preprocessor as p
import time
sys.setrecursionlimit(1500)

In [None]:
#Creating function to clean tweets

tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))

def tweet_cleaner(text):
    stripped = re.sub(combined_pat, '', text)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

In [None]:
#Creating empty lists and setting up sentiment analyzer
tweetList = []
sentimentList = []
posList = []
negList = []
neuList = []
sid = SentimentIntensityAnalyzer()
p.set_options(p.OPT.URL, p.OPT.EMOJI)
pos_cnt = 0
neu_cnt = 0
neg_cnt = 0
sb = pd.read_csv('SuperBowl.csv')

#Creating dictionary to run the tweet collection loop 
sbDict = {col: list(sb[col]) for col in sb.columns}
sbFinalDF = pd.DataFrame()
for game in sbDict:
    i = 0
    while i < 17:   
        print('running')
        #Twint collection code
        c = twint.Config()
        c.Hide_output = True
        c.Lang = 'en'
        c.Format = "{id}|{date}|{time}|{tweet}|{nlikes}|{nretweets}|{hashtags}"
        c.Search = '#SuperBowl'
        #Refers to dictionary based on datafile that contains the events and dates 
        c.Since = sbDict[game][i+1]
        c.Until = sbDict[game][i+2]
        c.Pandas = True
        c.Store_pandas = True
        twint.run.Search(c)
        gameDF = twint.storage.panda.Tweets_df
        #Changing date into datetime format
        gameDF['date'] = pd.to_datetime(gameDF['date'], format = '%Y-%m-%d')
        #Dropping irrelevant columns
        gameDF = gameDF.drop(columns=['id', 'conversation_id', 'created_at', 'timezone','place', 'cashtags', 'user_id', 'user_id_str', 'username', 'name', 'day', 'hour', 'link', 'retweet', 'nreplies', 'quote_url', 'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt', 'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src', 'trans_dest'])
        sbFinalDF = pd.concat([sbFinalDF, gameDF], ignore_index=True)
        i+=1
        print('Loop Done' + str(i))
        if i < 17:
            #Sleeping for 8 minutes to get around Twitter's rate limiter (14,000 tweets per ~8 minutes)
            time.sleep(480)
        else:
            seconds = time.time()
            local_time = time.ctime(seconds)
            print(local_time)
            break
    #Setting tweet column to a list for tweet cleaning and sentiment analysis
    tweets = sbFinalDF['tweet'].values.tolist()
    #Cleaning tweets, taking the sentiment polarity of each tweet, and counting the number positive, negative, and neutral
    for t in tweets:
        cleaned = p.clean(t)
        cleanedTweet = tweet_cleaner(cleaned)
        tweetList.append(cleanedTweet)
        sentimentScore = sid.polarity_scores(cleanedTweet)
        sentimentList.append(sentimentScore['compound'])
        if sentimentScore['compound'] > 0:
            pos_cnt += 1
        elif sentimentScore['compound'] == 0:
            neu_cnt += 1
        elif sentimentScore['compound'] < 0:
            neg_cnt += 1
    cnt = {'positive_count': pos_cnt, 'neutral_count': neu_cnt, 'negative_count': neg_cnt}
    countHeader = ['positive_count', 'neutral_count', 'negative_count']
    countDF = pd.DataFrame(cnt, index = [0])
    countDF.columns = countHeader
    #Creating new file with the number positive, negative, and neutral
    countDF.to_csv('SentimentCountFile.csv', mode = 'a', header = True)
    sbFinalDF['cleaned_tweet'] = tweetList
    sbFinalDF['sentiment_polarity'] = sentimentList  
    sbFinalDF = sbFinalDF.sort_values(by=['date'], ascending = True)
    #Resampling the dataframe by day to find the mean polarity, the number of tweets, likes, and retweets
    sbFinalDF = sbFinalDF.set_index(['date'])
    sbByDay = sbFinalDF.resample('D').agg({'sentiment_polarity':'mean','tweet':'count','nlikes':'sum','nretweets':'sum'})
    #Saving each game to its own file
    gameFile = game + '.csv'
    dailyFile = game + 'daily.csv'
    sbFinalDF.to_csv(gameFile,sep ='|', mode = 'a', header = True)
    sbByDay.to_csv(dailyFile, mode = 'a', header = True)

# Term Frequency and Bigram Analysis

In [None]:
import pandas as pd
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.util import ngrams
import pandas as pd
import numpy as np
import preprocessor as p

In [None]:
#Opening Files and setting stop words / cleaning preferences
sbFile = pd.read_csv('C:/Users/elffa/Desktop/Project_final/SBXLVII.csv', sep = '|')
sbFile = sbFile.drop(columns=['hashtags','nlikes','nretweets','cleaned_tweet','sentiment_polarity'])
sbFile['date'] = pd.to_datetime(sbFile['date'])
sbFile['date'] = [d.date() for d in sbFile['date']]
sbFile = sbFile.groupby('date')['tweet'].apply(' '.join).reset_index()
cachedStopWords = stopwords.words("english")
posTags = nltk.pos_tag(tokens)
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION)

In [None]:
#Creating loop to open file, tokenize, remove stop words, take term frequency, and bigram frequency
for index, row in sbFile.iterrows():
    name = row['date']
    bowfile = str(name) + '.csv'
    bigfile = str(name) + '_bigram.csv'
    #Cleaning tweet using preprocessor package
    ct = p.clean(row['tweet'])

    #Tokenizing
    tokens = nltk.word_tokenize(ct)
    #Making each word lowercase, removing non alphabetic characters, and removing stop words
    words = [w.lower() for w in tokens if w.isalpha() if w.lower().replace('[^\w\s]',' ').replace('\s\s+', ' ') not in cachedStopWords]

    #Finding frequency distribution
    freq = FreqDist(words)
    sortedFreq = sorted(freq.items(),key = lambda k:k[1], reverse = True)

    #Adding Frequency Distribution to dataframe and saving to file
    frequencyDataframe = pd.DataFrame(sortedFreq)
    frequencyDataframe = frequencyDataframe.rename(columns = {0:'Word', 1:'Frequency'}) 
    frequencyDataframe.to_csv(bowfile, mode = 'a', index=False, header=0)
    
    #Assigning pos tags for each word in the word list - pulling all nouns and adjectives
    posTagListBigrams = [(word) for (word,tag) in posTags if (tag.startswith('NN') or tag.startswith('J'))]
    #Creating list of bigrams
    Bigrams=list(ngrams(posTagListBigrams,2))

    #Finding frequency distribution of bigrams
    freqBigrams = FreqDist(Bigrams)
    sortedFreqBigrams = sorted(freqBigrams.items(),key = lambda k:k[1], reverse = True)
    
    #Adding Frequency Distribution to dataframe and saving to file
    frequencyBiDataframe = pd.DataFrame(sortedFreqBigrams)
    frequencyBiDataframe = frequencyBiDataframe.rename(columns = {0:'Bigram', 1:'Frequency'}) 
    frequencyBiDataframe.to_csv(bigfile, mode = 'a', sep = '|', index=False, header=0)

# Topic Modeling 

In [None]:
import pandas as pd 
import gensim
from gensim import corpora,models
from gensim.models import LdaModel, LsiModel
import warnings
warnings.filterwarnings("ignore")

## 1. Preprocessing 

In [29]:
# Read data
df = pd.read_excel('SBXLV.xlsx')
df

Unnamed: 0,date,tweet,hashtags,nlikes,nretweets,cleaned_tweet,sentiment_polarity
0,2011-01-23 16:59:27,@DanFenner I rescind my previous comment about...,"['#bears', '#jets', '#superbowl']",0,0,i rescind my previous comment about a bears je...,0.0258
1,2011-01-23 16:59:38,Its looking more and more like Greenbay is goi...,['#superbowl'],0,0,its looking more and more like greenbay is goi...,0.4690
2,2011-01-23 16:59:45,@1dizzyb no way quite the opposite it's the #N...,"['#nfl', '#superbowl']",0,0,no way quite the opposite it s the nfl playoff...,0.2263
3,2011-01-23 16:59:54,@LordBieber we gotta wait to see the champs he...,['#superbowl'],0,0,we gotta wait to see the champs here we go sup...,0.4215
4,2011-01-23 17:00:04,I love the way the packers/bears game is going...,['#superbowl'],0,0,i love the way the packers bears game is going...,0.6369
...,...,...,...,...,...,...,...
141278,2011-02-06 18:29:59,#superbowl commercials qo hard. lls,['#superbowl'],0,0,superbowl commercials qo hard lls,-0.1027
141279,2011-02-06 18:29:59,SUPERBOWL TIME #superbowl,['#superbowl'],0,0,superbowl time superbowl,0.0000
141280,2011-02-06 18:29:59,This feels so #Epic. Fuck You British Sport. #...,"['#epic', '#superbowl', '#steelers', '#aguiler...",0,0,this feels so epic fuck you british sport supe...,-0.5829
141281,2011-02-06 18:29:59,Ok red carpet is over...starlets *singing* is ...,['#superbowl'],0,0,ok red carpet is over starlets singing is done...,0.5423


In [30]:
#convert all tweet text into list format
tweets = df['cleaned_tweet'].tolist()
tweets

['i rescind my previous comment about a bears jets superbowl it s such a big event allegiance doesn t matter',
 'its looking more and more like greenbay is going to the superbowl',
 'no way quite the opposite it s the nfl playoffs it s the semi finals n the winners go to the superbowl baby',
 'we gotta wait to see the champs here we go superbowl go jets',
 'i love the way the packers bears game is going packers jets superbowl',
 'man i don t even care who wins cuz eagles been out but i am havin a superbowl party to get fucked up teamana lol',
 'cutler out collins in not looking good for the bears plus side the nfc north will be at the superbowl nfl',
 'green bay superbowl enoughsaid',
 'rt the superbowl will be jets vs packers soserious',
 'packers and jets superbowl',
 'nfl doesnt want to see black coaches n the superbowl they want the golden boy rodgers vs the rapist roethlisberger',
 'i can t wait to watch the packers vs jets in this year s superbowl nfl',
 'collins in packers super

In [31]:
#remove stop words, to lowercase and tokenize
from nltk.corpus import stopwords
mystopwords = stopwords.words('english')

tokens_list = [[word for word in tweet.split(' ') if word not in mystopwords and word.isalpha()]
         for tweet in tweets]

#remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)

for tokens in tokens_list:
    for token in tokens:
        frequency[token] += 1
        
tokens_list = [[token for token in tokens if frequency[token]>1]
              for tokens in tokens_list]

print(tokens_list)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## 2. Build Corpus

In [32]:
# generate token dictionary class
dictionary = corpora.Dictionary(tokens_list) 
print(dictionary)

Dictionary(23391 unique tokens: ['allegiance', 'bears', 'big', 'comment', 'event']...)


In [33]:
# generate a unique token list 
sort_token = sorted(dictionary.items(),key=lambda k:k[0], reverse = False)
unique_token = [token for (ID,token) in sort_token]

In [34]:
# build a corpus
corpus = [dictionary.doc2bow(tokens) for tokens in tokens_list]
print(corpus) 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## 3. Topic modeling using LDA

In [35]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5) #fit lda model

lda.print_topics(10) # Topic matrix (V matrix)

[(0,
  '0.114*"superbowl" + 0.039*"rt" + 0.030*"sing" + 0.024*"bowl" + 0.023*"super" + 0.022*"singing" + 0.020*"today" + 0.016*"sunday" + 0.015*"lea" + 0.015*"banner"'),
 (1,
  '0.130*"superbowl" + 0.030*"yellow" + 0.021*"black" + 0.019*"rt" + 0.017*"green" + 0.016*"glee" + 0.012*"mess" + 0.011*"game" + 0.010*"day" + 0.009*"tomorrow"'),
 (2,
  '0.149*"superbowl" + 0.040*"go" + 0.039*"steelers" + 0.035*"packers" + 0.014*"rt" + 0.014*"words" + 0.011*"win" + 0.011*"let" + 0.008*"think" + 0.007*"sb"'),
 (3,
  '0.130*"superbowl" + 0.092*"christina" + 0.062*"anthem" + 0.049*"national" + 0.049*"aguilera" + 0.017*"love" + 0.016*"like" + 0.011*"xtina" + 0.010*"rt" + 0.008*"michael"'),
 (4,
  '0.158*"superbowl" + 0.018*"commercials" + 0.017*"watch" + 0.014*"watching" + 0.014*"time" + 0.013*"get" + 0.013*"football" + 0.011*"game" + 0.010*"like" + 0.009*"party"')]

## 4. Visualization of LDA topics using pyLDAvis

In [36]:
import pyLDAvis.gensim

In [37]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, corpus, dictionary)

In [44]:
view = pd.read_excel('superbowls-2010-2020.xlsx')
view
view['TV Vwrs'] = view['TV Vwrs'].str.replace('M','')
view['A18-49 Vwrs'] = view['A18-49 Vwrs'].str.replace('M','')
view
view.to_csv('viewers.csv', index = False)