In [1]:
import pandas as pd
import numpy as np
import gensim
import re
from emoji import UNICODE_EMOJI
from textblob import TextBlob
from collections import Counter
import altair as alt

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# The stopword list lives in its own NLTK corpus, separate from 'punkt'.
# Fetch it explicitly (quietly) so a fresh environment does not fail on lookup.
nltk.download('stopwords', quiet=True)
stops = nltk.corpus.stopwords.words('english')

Slow version of gensim.models.doc2vec is being used
Slow version of Fasttext is being used
[nltk_data] Downloading package punkt to /opt/conda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Monthly tweet exports, listed in chronological order.
tweet_files = [
    'Brady 2021_Sep.csv',
    'Brady 2021_Oct.csv',
    'Brady 2021_Nov.csv',
    'Brady 2021_Dec.csv',
    'Brady 2022_Jan.csv',
]

# ignore_index=True gives every row a unique label; without it each monthly
# frame keeps its own labels and the combined index contains duplicates.
df = pd.concat([pd.read_csv(path) for path in tweet_files], ignore_index=True)

# Tweets without text cannot be cleaned or scored, so drop them up front and
# renumber the remaining rows.
df = df.dropna(subset=['Text']).reset_index(drop=True)
df.head(3)

Unnamed: 0,Created_Date,Tweet Id,Text,Rendered Tweet,User_Name,Followers,Friends,Favourites,Media,Location,Replys,Retweets,Quotes,Likes,Language,Place,Hashtags,Source,Mentions
0,2021-09-09 23:59:56+00:00,1436117197399674881,@badlangel27 He should be playing against Tom ...,@badlangel27 He should be playing against Tom ...,randysavage6699,2528,1999.0,39564.0,860,on the road,1.0,0.0,0.0,1.0,en,,,"<a href=""http://twitter.com/#!/download/ipad"" ...","[User(username='badlangel27', id=2743291521, d..."
1,2021-09-09 23:59:54+00:00,1436117188939862019,@JHugo13 Tom Brady,@JHugo13 Tom Brady,Persona956,1127,92.0,3.0,96,"Texas, USA",0.0,0.0,0.0,1.0,en,,,"<a href=""http://twitter.com/download/android"" ...","[User(username='JHugo13', id=75009909, display..."
2,2021-09-09 23:59:53+00:00,1436117184623882242,RIGHT NOW: The 2021 NFL season is starting and...,RIGHT NOW: The 2021 NFL season is starting and...,NBC10Boston,33683,442.0,1517.0,18036,"Boston, MA",0.0,1.0,0.0,3.0,en,,,"<a href=""http://www.socialflow.com"" rel=""nofol...",


In [3]:
def extract_tags(text):
    """Return the hashtags found in ``text`` as a list of strings.

    A hashtag is a ``#`` followed by 1-50 ASCII letters, digits or
    underscores; the leading ``#`` is not part of the returned values.
    """
    hashtag_matches = re.finditer("#([a-zA-Z0-9_]{1,50})", text)
    return [match.group(1) for match in hashtag_matches]

def clean_tweet(txt):
    """Strip @mentions, #hashtags, URLs and emoji characters from a tweet.

    Returns the remaining text. Whitespace left behind by the removed pieces
    is not collapsed.
    """
    # Removal order: @mentions, then #hashtags, then anything starting "http".
    stripped = txt
    for pattern in ("@[A-Za-z0-9_]+", "#[A-Za-z0-9_]+", r"http\S+"):
        stripped = re.sub(pattern, "", stripped)

    # Emoji are filtered one character at a time against UNICODE_EMOJI['en'].
    kept_chars = [ch for ch in stripped if ch not in UNICODE_EMOJI['en']]
    return ''.join(kept_chars)


def sentiment(cleaned_tweet):
    """Return the TextBlob sentiment polarity score of ``cleaned_tweet``."""
    return TextBlob(cleaned_tweet).sentiment.polarity
    
def sentiment_label(score):
    """Map a sentiment score to one of five labels.

    Lower bounds (inclusive): 0.75 -> 'POSITIVE', 0.2 -> 'positive',
    -0.2 -> 'neutral', -0.75 -> 'negative'; anything below is 'NEGATIVE'.
    """
    # Ordered from the highest lower bound down; the first match wins.
    lower_bounds = (
        (0.75, 'POSITIVE'),
        (0.2, 'positive'),
        (-0.2, 'neutral'),
        (-0.75, 'negative'),
    )
    for floor, name in lower_bounds:
        if score >= floor:
            return name
    return 'NEGATIVE'
    
def word_tokens(text):
    """Tokenize ``text`` and return its alphabetic, non-stopword tokens.

    Both checks are made on the lower-cased token, but each kept token is
    returned with its original casing.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if not lowered.isalpha():
            continue
        if lowered in stops:
            continue
        kept.append(token)
    return kept

In [4]:
# Derive the per-tweet features. Each helper takes a single value, so apply it
# to the source column directly (as is done for `words`) instead of iterating
# over whole rows with axis=1.
df['tags'] = df['Text'].apply(extract_tags)

df['clean_text'] = df['Text'].apply(clean_tweet)
df['sentiment_score'] = df['clean_text'].apply(sentiment)
df['senti_label'] = df['sentiment_score'].apply(sentiment_label)
df['words'] = df['clean_text'].apply(word_tokens)

df[['tags','clean_text','sentiment_score','senti_label']].head()

Unnamed: 0,tags,clean_text,sentiment_score,senti_label
0,[],He should be playing against Tom Brady should...,0.0,neutral
1,[],Tom Brady,0.0,neutral
2,[],RIGHT NOW: The 2021 NFL season is starting and...,0.054762,neutral
3,[],Let’s go Old Man aka Tom Brady!,0.125,neutral
4,[],I can’t back Tom Brady! I don’t like him!,0.0,neutral


### Latent Dirichlet Allocation

In [5]:
# TF-IDF features over the cleaned tweets, with the vocabulary capped at
# 1000 terms.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=10,
    max_df=0.9,
    max_features=1000,
)

data_matrix = vectorizer.fit_transform(df['clean_text'])
data_matrix

<690518x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 4520834 stored elements in Compressed Sparse Row format>

In [6]:
lda_model = LatentDirichletAllocation(
    n_components=5,
    learning_method='online',
    random_state=695,
    n_jobs=-1,
)

lda_output = lda_model.fit_transform(data_matrix)

# Fetch the vocabulary once instead of once per printed word. Newer
# scikit-learn releases only provide get_feature_names_out(); fall back to
# get_feature_names() where the newer method is not available.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# topic index -> its 25 highest-weighted words, in ascending weight order.
topic_dict = dict()

for topic_idx, topic in enumerate(lda_model.components_):
    top_words = [feature_names[word_idx] for word_idx in topic.argsort()[-25:]]
    topic_dict[topic_idx] = top_words
    print(f'Top 25 words for topic #{topic_idx}:')
    print(top_words)
    print('\n')

Top 25 words for topic #0:
['allen', 'record', 'greatest', 'career', 'season', 'ball', 'stop', 'way', 'sure', 'pass', 'going', 'big', 'gets', 'playoff', 'td', 'nfl', 'didn', 'ab', 'time', 'got', 'right', 'aaron', 'rodgers', 'tom', 'brady']


Top 25 words for topic #1:
['want', 'playoffs', 'vs', 'defense', 'nfl', 'hate', 'buccaneers', 'tampa', 'won', 'patriots', 'great', 'best', 'new', 'year', 'qb', 'mvp', 'game', 'bowl', 'win', 'goat', 'team', 'lol', 'super', 'tom', 'brady']


Top 25 words for topic #2:
['half', 'nfc', 'lebron', 'home', 'touchdown', 'beat', '20', 'run', 'guys', 'lose', 'needs', 'doing', 'lost', 'come', 'bucs', 'gonna', 'fans', 'saints', 'antonio', 'need', 'eagles', 'brown', 'fuck', 'tom', 'brady']


Top 25 words for topic #3:
['game', 'yes', 'night', 'pats', 'football', 'refs', 'fucking', 've', 'jets', 'people', 'day', 'throw', 'bad', 'know', 'shit', 'don', 'yards', 'did', 'today', 'think', 'say', 'just', 'good', 'tom', 'brady']


Top 25 words for topic #4:
['bro', 'sa

In [7]:
# Fit diagnostics of the LDA model on the matrix it was trained on,
# displayed as (score, perplexity).
lda_score = lda_model.score(data_matrix)
lda_perplexity = lda_model.perplexity(data_matrix)
lda_score, lda_perplexity

(-10552521.310721088, 1047.637129252472)

In [8]:
# Assign each tweet to the topic with the largest value in its LDA output row.
df['Topic'] = np.argmax(lda_output, axis=1)
df['Topic'].head()

0    4
1    2
2    0
3    4
4    4
Name: Topic, dtype: int64

In [9]:
# Preview the first rows, now including the derived feature columns and Topic.
df.head(3)

Unnamed: 0,Created_Date,Tweet Id,Text,Rendered Tweet,User_Name,Followers,Friends,Favourites,Media,Location,...,Place,Hashtags,Source,Mentions,tags,clean_text,sentiment_score,senti_label,words,Topic
0,2021-09-09 23:59:56+00:00,1436117197399674881,@badlangel27 He should be playing against Tom ...,@badlangel27 He should be playing against Tom ...,randysavage6699,2528,1999.0,39564.0,860,on the road,...,,,"<a href=""http://twitter.com/#!/download/ipad"" ...","[User(username='badlangel27', id=2743291521, d...",[],He should be playing against Tom Brady should...,0.0,neutral,"[playing, Tom, Brady, give, motivation]",4
1,2021-09-09 23:59:54+00:00,1436117188939862019,@JHugo13 Tom Brady,@JHugo13 Tom Brady,Persona956,1127,92.0,3.0,96,"Texas, USA",...,,,"<a href=""http://twitter.com/download/android"" ...","[User(username='JHugo13', id=75009909, display...",[],Tom Brady,0.0,neutral,"[Tom, Brady]",2
2,2021-09-09 23:59:53+00:00,1436117184623882242,RIGHT NOW: The 2021 NFL season is starting and...,RIGHT NOW: The 2021 NFL season is starting and...,NBC10Boston,33683,442.0,1517.0,18036,"Boston, MA",...,,,"<a href=""http://www.socialflow.com"" rel=""nofol...",,[],RIGHT NOW: The 2021 NFL season is starting and...,0.054762,neutral,"[RIGHT, NFL, season, starting, surprise, Tom, ...",0


In [10]:
# Share of tweets carrying the two extreme sentiment labels.
total_tweets = df.shape[0]

POS_count = int((df['senti_label'] == 'POSITIVE').sum())
NEG_count = int((df['senti_label'] == 'NEGATIVE').sum())

POS_ratio = POS_count / total_tweets
NEG_ratio = NEG_count / total_tweets

print('Amid the Brady tweets from 2021/09/09 ~ 2022/01/19 :')
print(POS_count,'tweets are highly positive, that is',round(POS_ratio*100,3),'% of the total tweets')
print(NEG_count,'tweets are highly negative, that is',round(NEG_ratio*100,3),'% of the total tweets')

Amid the Brady tweets from 2021/09/09 ~ 2022/01/19 :
28424 tweets are highly positive, that is 4.116 % of the total tweets
8097 tweets are highly negative, that is 1.173 % of the total tweets


### Topic/Sentiment Exploration

In [11]:
# One sub-frame per LDA topic, keyed 'topic <id>' in order of first appearance.
dic = {
    'topic ' + str(topic_id): df[df['Topic'] == topic_id]
    for topic_id in df['Topic'].unique()
}

dic.keys()

dict_keys(['topic 4', 'topic 2', 'topic 0', 'topic 1', 'topic 3'])

In [12]:
def topic_summary(idx):
    """Print a top-word, hashtag and sentiment summary for one LDA topic.

    The topic's tweets are read from the module-level ``dic`` under the key
    ``'topic <idx>'`` and its top words from ``topic_dict[idx]``; a missing
    key in either raises ``KeyError``. Nothing is returned.
    """
    data = dic['topic ' + str(idx)]

    # Tally hashtags across every tweet in the topic; keep the 25 most common.
    tag_counts = Counter()
    for tags in data['tags']:
        tag_counts.update(tags)
    top_tags = [tag for tag, _ in tag_counts.most_common(25)]

    top_words = topic_dict[idx]

    total = data.shape[0]
    if total == 0:
        # No tweets means no ratios to compute; report the size and stop.
        print('There are a total of', total, f'tweets in topic #{idx}')
        return

    # Reindex over all five labels so that a label with no tweets in this
    # topic counts as 0 instead of raising KeyError on lookup.
    senti_group = (
        data['senti_label']
        .value_counts()
        .reindex(['POSITIVE', 'positive', 'neutral', 'negative', 'NEGATIVE'], fill_value=0)
    )
    POS = senti_group['POSITIVE']
    NEG = senti_group['NEGATIVE']
    pos = senti_group['positive']
    neg = senti_group['negative']
    neu = senti_group['neutral']
    pos_ratio = round(100*(POS + pos) / total,3)
    neg_ratio = round(100*(NEG + neg) / total,3)
    neu_ratio = round(100*neu/total,3)

    score = round(data['sentiment_score'].mean(),3)

    print('There are a total of', total, f'tweets in topic #{idx}')
    print(' ')
    print('Top 25 words : ',top_words)
    print(' ')
    print('Top 25 tags : ',top_tags)
    print(' ')
    print(round(100*POS/total,3), '% of the tweets in this topic cluster are highly positive,')
    print('while', round(100*NEG/total,3), '% of the tweets are highly negative')
    print(' ')
    print(POS + pos, 'tweets are scored as positive, while',NEG + neg , 'tweets are negative')
    print(pos_ratio,'% positive tweets,',neu_ratio,'% neutral, and', neg_ratio,'% negative')
    print('The average sentiment score is', score)

In [13]:
topic_summary(0)

There are a total of 146085 tweets in topic #0
 
Top 25 words :  ['allen', 'record', 'greatest', 'career', 'season', 'ball', 'stop', 'way', 'sure', 'pass', 'going', 'big', 'gets', 'playoff', 'td', 'nfl', 'didn', 'ab', 'time', 'got', 'right', 'aaron', 'rodgers', 'tom', 'brady']
 
Top 25 tags :  ['NFL', 'TomBrady', 'GoBucs', 'dogelon', 'dogelonmars', 'DraftKingsNFT', 'Bucs', 'Buccaneers', 'Patriots', 'tombrady', 'nfl', 'GOAT', 'AutographNFT', 'thehobby', 'TB12', 'NFLTwitter', '1', 'ForeverNE', 'Eagles', 'FlyEaglesFly', 'Bitcoin', 'TheHobby', 'eBay', 'SportsCards', 'Auction']
 
3.404 % of the tweets in this topic cluster are highly positive,
while 0.799 % of the tweets are highly negative
 
39559 tweets are scored as positive, while 12098 tweets are negative
27.079 % positive tweets, 64.639 % neutral, and 8.281 % negative
The average sentiment score is 0.087


In [14]:
topic_summary(1)

There are a total of 233621 tweets in topic #1
 
Top 25 words :  ['want', 'playoffs', 'vs', 'defense', 'nfl', 'hate', 'buccaneers', 'tampa', 'won', 'patriots', 'great', 'best', 'new', 'year', 'qb', 'mvp', 'game', 'bowl', 'win', 'goat', 'team', 'lol', 'super', 'tom', 'brady']
 
Top 25 tags :  ['NFL', 'TomBrady', 'GoBucs', 'Patriots', 'Bucs', 'Buccaneers', 'nfl', 'tombrady', 'ForeverNE', 'NFLTwitter', 'GOAT', '1', 'TB12', 'Saints', 'TBvsNE', 'TampaBayBuccaneers', 'Eagles', 'BillsMafia', 'Brady', 'TheReturn', 'SNF', 'FlyEaglesFly', 'Cowboys', 'thehobby', 'football']
 
5.795 % of the tweets in this topic cluster are highly positive,
while 1.251 % of the tweets are highly negative
 
83902 tweets are scored as positive, while 20684 tweets are negative
35.914 % positive tweets, 55.233 % neutral, and 8.854 % negative
The average sentiment score is 0.135


In [15]:
topic_summary(4)

There are a total of 106075 tweets in topic #4
 
Top 25 words :  ['bro', 'says', 'dude', 'help', 'yeah', 'amp', 'real', 'just', 'card', 'watching', 'ass', 'guy', 'really', 'years', 'old', 'better', 'look', 'let', 'said', 'does', 'love', 'man', 'like', 'brady', 'tom']
 
Top 25 tags :  ['TomBrady', 'NFL', 'GoBucs', 'Bucs', 'GOAT', 'Buccaneers', 'Patriots', 'nfl', 'tombrady', 'thehobby', 'TB12', 'ForeverNE', 'DALvsTB', 'NFLTwitter', 'ManInTheArena', 'NFLKickoff', 'Saints', 'whodoyoucollect', 'LFG', '1', 'TampaBayBuccaneers', 'BillsMafia', 'TheReturn', 'Bitcoin', 'football']
 
3.136 % of the tweets in this topic cluster are highly positive,
while 0.898 % of the tweets are highly negative
 
33046 tweets are scored as positive, while 11674 tweets are negative
31.153 % positive tweets, 57.841 % neutral, and 11.005 % negative
The average sentiment score is 0.092


### The topic perspective did not work out that well
#### Try analyzing from the sentiment perspective

In [16]:
def senti_summary(label):
    """Print top words, top hashtags and topic distribution for one sentiment label.

    Works on the rows of the module-level ``df`` whose ``senti_label`` equals
    ``label``. Nothing is returned.
    """
    data = df[df['senti_label'] == label]

    # 25 most frequent hashtags among the selected tweets.
    tag_counts = Counter()
    for tags in data['tags']:
        tag_counts.update(tags)
    top_tags = [tag for tag, _ in tag_counts.most_common(25)]

    # 25 most frequent tokens among the selected tweets (case-sensitive).
    word_counts = Counter()
    for words in data['words']:
        word_counts.update(words)
    top_words = [word for word, _ in word_counts.most_common(25)]

    overall = df.shape[0]
    total = data.shape[0]
    # Number of selected tweets per topic.
    topic_group = data.groupby('Topic').count()['senti_label']

    print('Tweets labeled as',label,'account for',np.round(100*total/overall,3),'% of the total tweets about Tom Brady')
    print(' ')
    print('For the tweets in this sentiment label: ')
    print('Top 25 words : ',top_words)
    print(' ')
    print('Top 25 tags : ',top_tags)
    print(' ')
    print('the topic label distribution looks like: ', topic_group)


In [17]:
senti_summary('POSITIVE')

Tweets labeled as POSITIVE account for 4.116 % of the total tweets about Tom Brady
 
For the tweets in this sentiment label: 
Top 25 words :  ['Brady', 'Tom', 'win', 'best', 'lol', 'great', 'greatest', 'ever', 'time', 'QB', 'like', 'Lol', 'NFL', 'good', 'would', 'one', 'team', 'Great', 'Bucs', 'still', 'brady', 'tom', 'football', 'get', 'amp']
 
Top 25 tags :  ['TomBrady', 'NFL', 'GoBucs', 'Bucs', 'GOAT', 'Buccaneers', 'Patriots', 'tombrady', 'nfl', 'ForeverNE', 'NFLTwitter', 'TB12', 'NFT', 'LFG', 'TampaBayBuccaneers', '1', 'MadTrooper', 'Solana', 'NFLKickoff', 'TheReturn', 'DALvsTB', 'TampaBay', 'ManInTheArena', 'FlyEaglesFly', 'football']
 
the topic label distribution looks like:  Topic
0     4973
1    13539
2     1856
3     4729
4     3327
Name: senti_label, dtype: int64


In [18]:
senti_summary('NEGATIVE')

Tweets labeled as NEGATIVE account for 1.173 % of the total tweets about Tom Brady
 
For the tweets in this sentiment label: 
Top 25 words :  ['Brady', 'Tom', 'hate', 'like', 'worst', 'tom', 'brady', 'stupid', 'one', 'insane', 'fucking', 'Terrible', 'would', 'throw', 'terrible', 'TOM', 'BRADY', 'still', 'shocking', 'idiot', 'video', 'fan', 'tablet', 'team', 'Microsoft']
 
Top 25 tags :  ['TomBrady', 'NFL', 'GoBucs', 'Buccaneers', 'GOAT', 'Bucs', 'FlyEaglesFly', 'nfl', 'tombrady', 'DALvsTB', 'Patriots', 'TampaBayBuccaneers', 'Saints', 'BillsMafia', 'TB12', 'Jets', 'Diet', 'NFLTwitter', 'MNF', 'thehobby', 'NFLKickoff', 'firekellenmoore', '1', 'LFG', 'Eagles']
 
the topic label distribution looks like:  Topic
0    1167
1    2923
2    1249
3    1805
4     953
Name: senti_label, dtype: int64


In [19]:
# Save the complete dataframe as a pickle for later visualisation work.
# Saving as CSV would lose some features, such as the tag lists.
topic_df_path = 'Brady_topicdf.pkl'
df.to_pickle(topic_df_path)

### PyLDAvis 

For a detailed explanation of the plot, please see the official pyLDAvis documentation.

In [20]:
import pyLDAvis
import pyLDAvis.sklearn

In [21]:
# Switch pyLDAvis into notebook mode before the visualisation is prepared.
pyLDAvis.enable_notebook()

In [22]:
# Prepare the pyLDAvis data from the fitted model, the TF-IDF matrix and the
# vectorizer, then write it out as an HTML file.
ldavis_html_path = 'Brady_LDAvis.html'

vis = pyLDAvis.sklearn.prepare(lda_model, data_matrix, vectorizer)
pyLDAvis.save_html(vis, ldavis_html_path)

  and should_run_async(code)
