In [1]:
import pandas as pd
import numpy as np
import gensim
import re
from emoji import UNICODE_EMOJI
from textblob import TextBlob
from collections import Counter
import altair as alt

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# The English stop-word list is a separate NLTK corpus from the 'punkt'
# tokenizer data; fetch it explicitly so the lookup below does not raise
# LookupError on a machine where the corpus is not installed yet.
# nltk.download skips the transfer when the package is already up to date.
nltk.download('stopwords')
stops = nltk.corpus.stopwords.words('english')

Slow version of gensim.models.doc2vec is being used
Slow version of Fasttext is being used
[nltk_data] Downloading package punkt to /opt/conda/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Monthly tweet exports, oldest first (paths are relative to the working directory).
TWEET_FILES = [
    'Lebron 2021_Oct.csv',
    'Lebron 2021_Nov.csv',
    'Lebron 2021_Dec.csv',
    'Lebron 2022_Jan.csv',
]

# Each file carries its own 0-based index, so a plain concat would leave
# duplicate index labels; ignore_index=True gives the combined frame one
# unique, continuous index.
df = pd.concat([pd.read_csv(path) for path in TWEET_FILES], ignore_index=True)

# Rows without tweet text cannot be cleaned, scored or vectorised later on.
df = df.dropna(subset=['Text'])
df.head(3)

Unnamed: 0,Created_Date,Tweet Id,Text,Rendered Tweet,User_Name,Followers,Friends,Favourites,Media,Location,Replys,Retweets,Quotes,Likes,Language,Place,Hashtags,Source,Mentions
0,2021-10-19 23:59:55+00:00,1450612709205159939,@Lakers @KingJames Coming into the arena looki...,@Lakers @KingJames Coming into the arena looki...,SteveNe13350562,1,35,109,0,,0.0,0,0.0,0,en,,,"<a href=""http://twitter.com/download/android"" ...","[User(username='Lakers', id=20346956, displayn..."
1,2021-10-19 23:59:44+00:00,1450612663503917056,@Awesemo_Com @AwesemoNBA @NBA @NBAonTNT @Steph...,@Awesemo_Com @AwesemoNBA @NBA @NBAonTNT @Steph...,markFin73763459,27,156,516,18,,0.0,0,0.0,0,en,,,"<a href=""http://twitter.com/download/iphone"" r...","[User(username='Awesemo_Com', id=9561989702037..."
2,2021-10-19 23:59:42+00:00,1450612655090278402,@RealSkipBayless where you at #COAWRD? @KDTrey...,@RealSkipBayless where you at #COAWRD? @KDTrey...,ByrdyStrokes,2,17,269,32,,0.0,0,0.0,0,en,,"['COAWRD', 'WEAK']","<a href=""http://twitter.com/download/iphone"" r...","[User(username='RealSkipBayless', id=43139414,..."


In [3]:
def extract_tags(text):
    """Collect the hashtags that appear in a tweet.

    A hashtag is a ``#`` followed by 1 to 50 ASCII letters, digits or
    underscores; a longer run is cut off after its 50th character.

    Args:
        text: raw tweet text.

    Returns:
        list of str: the tag names in order of appearance, without the
        leading ``#``. Empty when the text has no hashtag.
    """
    hashtag_pattern = "#([a-zA-Z0-9_]{1,50})"
    return [match.group(1) for match in re.finditer(hashtag_pattern, text)]

def clean_tweet(txt):
    """Strip mentions, hashtags, URLs and emoji characters from a tweet.

    The removals run in a fixed order: ``@mentions``, then ``#hashtags``,
    then anything starting with ``http`` up to the next whitespace, and
    finally every single character that is a key of ``UNICODE_EMOJI['en']``.
    Whitespace that surrounded a removed piece is left as it is.

    Args:
        txt: raw tweet text.

    Returns:
        str: the tweet without any mention/tag/url/emoji.
    """
    # NOTE(review): UNICODE_EMOJI is not exported by recent releases of the
    # emoji package (2.x) -- confirm the version this notebook is pinned to.
    stripped = txt
    for pattern in ("@[A-Za-z0-9_]+", "#[A-Za-z0-9_]+", r"http\S+"):
        stripped = re.sub(pattern, "", stripped)

    # The emoji check is per character, so only single-character emoji keys
    # can be matched here.
    kept_chars = []
    for ch in stripped:
        if ch not in UNICODE_EMOJI['en']:
            kept_chars.append(ch)
    return ''.join(kept_chars)


def sentiment(cleaned_tweet):
    """Return the sentiment score of a tweet.

    Args:
        cleaned_tweet: tweet text, expected to have been cleaned already.

    Returns:
        The ``sentiment.polarity`` value TextBlob computes for the text
        (presumably a float in [-1, 1] -- see the TextBlob documentation).
    """
    return TextBlob(cleaned_tweet).sentiment.polarity
    
def sentiment_label(score):
    """Label a tweet based on its sentiment score.

    Bands (lower bound inclusive):
        score >= 0.75   -> 'POSITIVE' (highly positive)
        score >= 0.2    -> 'positive'
        score >= -0.2   -> 'neutral'
        score >= -0.75  -> 'negative'
        anything else   -> 'NEGATIVE' (highly negative)

    Args:
        score: numeric sentiment score.

    Returns:
        str: one of the five labels above.
    """
    # Lower bounds are tried from the highest down; the first one the score
    # reaches decides the label.
    lower_bounds = (
        (0.75, 'POSITIVE'),
        (0.2, 'positive'),
        (-0.2, 'neutral'),
        (-0.75, 'negative'),
    )
    for bound, label in lower_bounds:
        if score >= bound:
            return label
    return 'NEGATIVE'
    
def word_tokens(text):
    """Split a text into word tokens, dropping stop words and non-words.

    A token is kept when its lower-cased form is purely alphabetic and is
    not in the module-level ``stops`` list. Kept tokens retain their
    original casing.

    Args:
        text: the input text string.

    Returns:
        list of str: the kept tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered.isalpha() and lowered not in stops:
            kept.append(token)
    return kept

In [4]:
# Derive the per-tweet features. Every step maps exactly one column, so
# Series.apply is used instead of row-wise DataFrame.apply(axis=1): the values
# are the same, but no row Series has to be built for each tweet.
df['tags'] = df['Text'].apply(extract_tags)

df['clean_text'] = df['Text'].apply(clean_tweet)
df['sentiment_score'] = df['clean_text'].apply(sentiment)
df['senti_label'] = df['sentiment_score'].apply(sentiment_label)
df['words'] = df['clean_text'].apply(word_tokens)

df[['tags','clean_text','sentiment_score','senti_label']].head()

Unnamed: 0,tags,clean_text,sentiment_score,senti_label
0,[],Coming into the arena looking like a boss,0.0,neutral
1,[],Curry,0.0,neutral
2,"[COAWRD, WEAK]",where you at ? playing. Ready to ride back ...,0.1,neutral
3,[LebronJames],Let's go Lakers,0.0,neutral
4,[],Steph Cury,0.0,neutral


In [5]:
# Turn the cleaned tweets into a TF-IDF document-term matrix.
# NOTE(review): scikit-learn's LDA examples are fitted on raw term counts
# (CountVectorizer is imported in this notebook but never used) -- confirm
# that TF-IDF weights are the intended input for the topic model.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=10,          # drop terms that occur in fewer than 10 tweets
    max_df=0.9,         # drop terms that occur in more than 90% of tweets
    max_features=1000,  # cap the vocabulary at 1000 terms
)

data_matrix = vectorizer.fit_transform(df['clean_text'])
data_matrix

<690111x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 3514010 stored elements in Compressed Sparse Row format>

In [6]:
# Fit a 5-topic LDA model; random_state is fixed so the topics are reproducible.
lda_model = LatentDirichletAllocation(n_components=5, learning_method='online', random_state=695, n_jobs = -1)

# One row per tweet, one column per topic.
lda_output = lda_model.fit_transform(data_matrix)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever the installed version has.
# The vocabulary is fetched once instead of once per printed word, and cast to
# plain str so the printed lists look the same with either method.
feature_name_getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
feature_names = [str(name) for name in feature_name_getter()]

# topic index -> its 25 highest-weighted words, in ascending weight order
# (the strongest word comes last).
topic_dict = dict()

for i, topic in enumerate(lda_model.components_):
    top_words = [feature_names[j] for j in topic.argsort()[-25:]]
    topic_dict[i] = top_words
    print(f'Top 25 words for topic #{i}:')
    print(top_words)
    print('\n')

Top 25 words for topic #0:
['little', 'player', 'maybe', 'played', 'world', 'oh', 'nba', 'ain', 'like', 'bad', 'playing', 'bitch', 'new', 'covid', 'look', 'ass', 'fuck', 'love', 'didn', 'did', 'best', 'right', 'birthday', 'james', 'lebron']


Top 25 words for topic #1:
['gonna', 'shut', 'dude', 'lmao', 'won', 'old', 'fan', 'like', 'think', 'people', 'real', 'guy', 'year', 'stop', 'say', 'really', 'just', 'said', 'day', 'man', 'don', 'bron', 'good', 'know', 'goat']


Top 25 words for topic #2:
['today', 'russ', '37', 'shot', 'big', 'kobe', 'amp', '10', 'play', 'games', 'vote', 'curry', 'star', 'points', 'season', 'westbrook', 'jordan', 'nba', 'win', 'team', 'game', 'better', 'lakers', 'james', 'lebron']


Top 25 words for topic #3:
['great', 'thing', 'kyle', 'people', 'years', 'doing', 'point', 'tweet', 'come', 've', 'does', 'don', 'fans', 'want', 'make', 'captain', 'shit', 'china', 'like', 'soccer', 'bro', 'just', 'let', 'lol', 'happy']


Top 25 words for topic #4:
['believe', 'making'

In [7]:
# Fit diagnostics on the training matrix: (approximate log-likelihood, perplexity).
lda_model.score(data_matrix), lda_model.perplexity(data_matrix)

(-9596240.871564606, 1160.679581178665)

In [8]:
# Assign every tweet to the topic with the largest weight in its row of
# lda_output (the column position is the topic index).
df['Topic'] = np.argmax(lda_output, axis=1)
df['Topic'].head()

0    3
1    2
2    4
3    3
4    2
Name: Topic, dtype: int64

In [9]:
# Preview the frame with the derived columns (clean_text, sentiment_score, Topic, tags, senti_label, words).
df.head(3)

Unnamed: 0,Created_Date,Tweet Id,Text,Rendered Tweet,User_Name,Followers,Friends,Favourites,Media,Location,...,Place,Hashtags,Source,Mentions,clean_text,sentiment_score,Topic,tags,senti_label,words
0,2021-10-19 23:59:55+00:00,1450612709205159939,@Lakers @KingJames Coming into the arena looki...,@Lakers @KingJames Coming into the arena looki...,SteveNe13350562,1,35,109,0,,...,,,"<a href=""http://twitter.com/download/android"" ...","[User(username='Lakers', id=20346956, displayn...",Coming into the arena looking like a boss,0.0,3,[],neutral,"[Coming, arena, looking, like, boss]"
1,2021-10-19 23:59:44+00:00,1450612663503917056,@Awesemo_Com @AwesemoNBA @NBA @NBAonTNT @Steph...,@Awesemo_Com @AwesemoNBA @NBA @NBAonTNT @Steph...,markFin73763459,27,156,516,18,,...,,,"<a href=""http://twitter.com/download/iphone"" r...","[User(username='Awesemo_Com', id=9561989702037...",Curry,0.0,2,[],neutral,[Curry]
2,2021-10-19 23:59:42+00:00,1450612655090278402,@RealSkipBayless where you at #COAWRD? @KDTrey...,@RealSkipBayless where you at #COAWRD? @KDTrey...,ByrdyStrokes,2,17,269,32,,...,,"['COAWRD', 'WEAK']","<a href=""http://twitter.com/download/iphone"" r...","[User(username='RealSkipBayless', id=43139414,...",where you at ? playing. Ready to ride back ...,0.1,4,"[COAWRD, WEAK]",neutral,"[playing, Ready, ride, back, play, pick, amp, ..."


In [10]:
# Share of tweets at the two extremes of the sentiment scale.
n_tweets = len(df)
label_counts = df['senti_label'].value_counts()

POS_count = int(label_counts.get('POSITIVE', 0))
NEG_count = int(label_counts.get('NEGATIVE', 0))

POS_ratio = POS_count / n_tweets
NEG_ratio = NEG_count / n_tweets

print('Amid the Lebron tweets from 2021/10/19 ~ 2022/01/19 :')
print(POS_count,'tweets are highly positive, that is',round(POS_ratio*100,3),'% of the total tweets')
print(NEG_count,'tweets are highly negative, that is',round(NEG_ratio*100,3),'% of the total tweets')

Amid the Lebron tweets from 2021/10/19 ~ 2022/01/19 :
30901 tweets are highly positive, that is 4.478 % of the total tweets
9607 tweets are highly negative, that is 1.392 % of the total tweets


### Topic/Sentiment Exploration

In [11]:
# Split the tweets by assigned topic: 'topic <n>' -> DataFrame of that topic's
# tweets. Keys follow the order in which the topics first appear in df.
dic = {
    'topic ' + str(topic_id): df[df['Topic'] == topic_id]
    for topic_id in df['Topic'].unique()
}

dic.keys()

dict_keys(['topic 3', 'topic 2', 'topic 4', 'topic 1', 'topic 0'])

In [12]:
def topic_summary(idx):
    """Print a hashtag, keyword and sentiment summary for one LDA topic.

    Reads two notebook-level globals: ``dic`` (maps ``'topic <idx>'`` to the
    DataFrame of tweets assigned to that topic) and ``topic_dict`` (maps the
    topic index to its list of top words).

    Args:
        idx: index of the topic to summarise.

    Returns:
        None. The summary is printed.

    Raises:
        KeyError: if no tweets were assigned to the topic, or ``topic_dict``
            has no entry for ``idx``.
    """
    label = 'topic ' + str(idx)
    if label not in dic:
        raise KeyError(f'no tweets are assigned to topic {idx!r}; available: {sorted(dic)}')
    data = dic[label]

    # Count hashtags across all tweets of the topic; Counter.update accepts
    # the tag list directly.
    tag_counts = Counter()
    for tags in data['tags']:
        tag_counts.update(tags)
    top_tags = [tag for tag, _ in tag_counts.most_common(25)]

    top_words = topic_dict[idx]

    total = data.shape[0]

    # Reindex over all five labels so a label that never occurs in this topic
    # counts as 0 instead of raising KeyError on lookup.
    senti_group = (
        data['senti_label']
        .value_counts()
        .reindex(['POSITIVE', 'positive', 'neutral', 'negative', 'NEGATIVE'], fill_value=0)
    )
    POS = senti_group['POSITIVE']
    NEG = senti_group['NEGATIVE']
    pos = senti_group['positive']
    neg = senti_group['negative']
    neu = senti_group['neutral']

    # "positive"/"negative" ratios include the highly positive/negative tweets.
    pos_ratio = round(100*(POS + pos) / total,3)
    neg_ratio = round(100*(NEG + neg) / total,3)
    neu_ratio = round(100*neu/total,3)

    score = round(data['sentiment_score'].mean(),3)

    print('There are a total of', total, f'tweets in topic #{idx}')
    print(' ')
    print('Top 25 words : ',top_words)
    print(' ')
    print('Top 25 tags : ',top_tags)
    print(' ')
    print(round(100*POS/total,3), '% of the tweets in this topic cluster are highly positive,')
    print('while', round(100*NEG/total,3), '% of the tweets are highly negative')
    print(' ')
    print(POS + pos, 'tweets are scored as positive, while',NEG + neg , 'tweets are negative')
    print(pos_ratio,'% positive tweets,',neu_ratio,'% neutral, and', neg_ratio,'% negative')
    print('The average sentiment score is', score)

In [13]:
# Print the word, hashtag and sentiment summary for topic 0.
topic_summary(0)

There are a total of 172300 tweets in topic #0
 
Top 25 words :  ['little', 'player', 'maybe', 'played', 'world', 'oh', 'nba', 'ain', 'like', 'bad', 'playing', 'bitch', 'new', 'covid', 'look', 'ass', 'fuck', 'love', 'didn', 'did', 'best', 'right', 'birthday', 'james', 'lebron']
 
Top 25 tags :  ['LeBronJames', 'NBAAllStar', 'NBA', 'LakeShow', 'Lakers', 'lebronjames', 'nba', 'lakers', 'NBA75', 'LebronJames', 'LeBron', 'lebron', 'thehobby', 'NBATwitter', 'basketball', 'LakersNation', 'lakeshow', 'KingJames', 'Lebron', 'KyleRittenhouse', 'KaranKundrra', 'eBay', 'LeSnitch', 'KaranIsTheBoss', 'KKundrraSquad']
 
5.879 % of the tweets in this topic cluster are highly positive,
while 1.132 % of the tweets are highly negative
 
36742 tweets are scored as positive, while 18732 tweets are negative
21.324 % positive tweets, 67.804 % neutral, and 10.872 % negative
The average sentiment score is 0.061


In [14]:
# Print the word, hashtag and sentiment summary for topic 1.
topic_summary(1)

There are a total of 147965 tweets in topic #1
 
Top 25 words :  ['gonna', 'shut', 'dude', 'lmao', 'won', 'old', 'fan', 'like', 'think', 'people', 'real', 'guy', 'year', 'stop', 'say', 'really', 'just', 'said', 'day', 'man', 'don', 'bron', 'good', 'know', 'goat']
 
Top 25 tags :  ['LeBronJames', 'NBAAllStar', 'NBA', 'LakeShow', 'Lakers', 'LebronJames', 'NBA75', 'nba', 'lebronjames', 'KyleRittenhouse', '1', 'LeBron', 'LakersNation', 'lakers', 'NBATwitter', 'LeSnitch', 'GOAT', 'China', 'basketball', 'KingJames', 'NBAALLStar', 'NBAAIIStar', 'lebron', 'twitter', 'Lebron']
 
3.105 % of the tweets in this topic cluster are highly positive,
while 2.405 % of the tweets are highly negative
 
37884 tweets are scored as positive, while 19758 tweets are negative
25.603 % positive tweets, 61.043 % neutral, and 13.353 % negative
The average sentiment score is 0.052


In [15]:
# Print the word, hashtag and sentiment summary for topic 4.
# NOTE(review): topics 2 and 3 are not summarised in this notebook -- confirm that is intended.
topic_summary(4)

There are a total of 73983 tweets in topic #4
 
Top 25 words :  ['believe', 'making', 'hope', 'baby', 'night', 'hey', 'needs', 'ratio', 'kevin', 'finals', 'life', 'basketball', 'sure', 'getting', 'retweet', 'll', 'player', 'need', 'greatest', 'trade', 'got', 'james', 'lebron', 'time', 'king']
 
Top 25 tags :  ['LeBronJames', 'NBAAllStar', 'NBA', 'Lakers', 'LakeShow', 'lebronjames', 'nba', 'NBA75', 'NBAALLSTAR', 'LebronJames', 'lakers', 'LakersNation', 'basketball', 'LeBron', 'NBATwitter', 'NBAAIIStar', 'sports', 'COVID19', 'KyleRittenhouse', 'KingJames', 'NBATopShotThis', '1', 'GOAT', 'TheBlackSeries', 'LeSnitch']
 
5.59 % of the tweets in this topic cluster are highly positive,
while 0.93 % of the tweets are highly negative
 
19147 tweets are scored as positive, while 7350 tweets are negative
25.88 % positive tweets, 64.185 % neutral, and 9.935 % negative
The average sentiment score is 0.085


### The topic perspective didn't work out that well
#### Try analyzing from the sentiment perspective instead

In [16]:
def senti_summary(label):
    """Print a word, hashtag and topic summary for one sentiment label.

    Reads the notebook-level DataFrame ``df`` and uses its ``senti_label``,
    ``tags``, ``words`` and ``Topic`` columns.

    Args:
        label: sentiment label to select, compared for equality against the
            ``senti_label`` column (e.g. 'POSITIVE').

    Returns:
        None. The summary is printed.
    """
    data = df[df['senti_label'] == label]

    # 25 most frequent hashtags among the selected tweets; Counter.update
    # accepts the tag list directly.
    tag_counts = Counter()
    for tags in data['tags']:
        tag_counts.update(tags)
    top_tags = [tag for tag, _ in tag_counts.most_common(25)]

    # 25 most frequent word tokens among the selected tweets (case-sensitive).
    word_counts = Counter()
    for words in data['words']:
        word_counts.update(words)
    top_words = [word for word, _ in word_counts.most_common(25)]

    overall = df.shape[0]
    total = data.shape[0]
    # Number of selected tweets per assigned topic.
    topic_group = data.groupby('Topic').count()['senti_label']

    print('Tweets labeled as',label,'account for',np.round(100*total/overall,3),'% of the total tweets about Lebron')
    print(' ')
    print('For the tweets in this sentiment label: ')
    print('Top 25 words : ',top_words)
    print(' ')
    print('Top 25 tags : ',top_tags)
    print(' ')
    print('the topic label distribution looks like: ', topic_group)


In [17]:
# Summarise the tweets labeled 'POSITIVE' (highly positive).
senti_summary('POSITIVE')

Tweets labeled as POSITIVE account for 4.478 % of the total tweets about Lebron
 
For the tweets in this sentiment label: 
Top 25 words :  ['James', 'Happy', 'LeBron', 'birthday', 'best', 'lol', 'player', 'Lebron', 'Birthday', 'win', 'greatest', 'time', 'basketball', 'great', 'Lol', 'happy', 'ever', 'Lakers', 'one', 'like', 'King', 'NBA', 'still', 'would', 'GOAT']
 
Top 25 tags :  ['LeBronJames', 'NBAAllStar', 'LakeShow', 'NBA', 'LebronJames', 'Lakers', 'NBA75', 'NBAALLStar', 'KingJames', 'GOAT', 'lebronjames', 'HappyBirthdayLeBronJames', 'nba', 'StriveForGreatness', 'LakersNation', 'lakers', 'LeBron', 'NBATwitter', 'basketball', 'lebron', 'Lebron', 'sports', '1', 'goat', 'NBAAIIStar']
 
the topic label distribution looks like:  Topic
0    10129
1     4594
2     5640
3     6402
4     4136
Name: senti_label, dtype: int64


In [18]:
# Summarise the tweets labeled 'NEGATIVE' (highly negative).
senti_summary('NEGATIVE')

Tweets labeled as NEGATIVE account for 1.392 % of the total tweets about Lebron
 
For the tweets in this sentiment label: 
Top 25 words :  ['James', 'LeBron', 'idiot', 'hate', 'stupid', 'Lebron', 'like', 'Base', 'moron', 'worst', 'Set', 'bought', 'pathetic', 'Series', 'ETH', 'Floor', 'disgusting', 'people', 'get', 'fucking', 'one', 'terrible', 'insane', 'horrible', 'man']
 
Top 25 tags :  ['LakeShow', 'LeBronJames', 'NBAAllStar', 'NBA', 'lebronjames', 'Lakers', 'NBA75', 'lakers', 'nba', 'LeBron', 'LeSnitch', 'SmartNews', 'LakersNation', 'KyleRittenhouse', 'NBATwitter', 'LebronJames', 'KingJames', 'thehobby', 'IsaiahStewart', 'basketball', 'China', 'GOAT', 'lakeshow', 'pistons', 'player']
 
the topic label distribution looks like:  Topic
0    1950
1    3558
2    1548
3    1863
4     688
Name: senti_label, dtype: int64


In [19]:
# Save the complete DataFrame in pickle format for later visualization.
# Saving as CSV would lose some features (such as the tag lists), hence pickle.
df.to_pickle('Lebron_topicdf.pkl')

### PyLDAvis 

For a detailed explanation of the plot, please refer to the official pyLDAvis documentation.

In [20]:
import pyLDAvis
import pyLDAvis.sklearn

In [21]:
# Let pyLDAvis render its visualisations inline in the notebook.
pyLDAvis.enable_notebook()

In [22]:
# Build the pyLDAvis data from the fitted LDA model, the document-term matrix
# and the vectorizer, then export the interactive plot as an HTML file.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# pyLDAvis.lda_model instead of pyLDAvis.sklearn -- confirm the installed version.
vis = pyLDAvis.sklearn.prepare(lda_model, data_matrix, vectorizer)

pyLDAvis.save_html(vis,'Lebron_LDAvis.html')

  and should_run_async(code)
