In [1]:
import pandas as pd
import numpy as np
import gensim
import re
from emoji import UNICODE_EMOJI
from textblob import TextBlob
from collections import Counter
import altair as alt

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

stops = nltk.corpus.stopwords.words('english')

Slow version of gensim.models.doc2vec is being used
Slow version of Fasttext is being used
[nltk_data] Downloading package punkt to /opt/conda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df1 = pd.read_csv('Dame 2021_Oct.csv')
df2 = pd.read_csv('Dame 2021_Nov.csv')
df3 = pd.read_csv('Dame 2021_Dec.csv')
df4 = pd.read_csv('Dame 2022_Jan.csv')

df = pd.concat([df1,df2,df3,df4])
df.dropna(subset=['Text'],inplace=True)
df.head(3)

Unnamed: 0,Created_Date,Tweet Id,Text,Rendered Tweet,User_Name,Followers,Friends,Favourites,Media,Location,Replys,Retweets,Quotes,Likes,Language,Place,Hashtags,Source,Mentions
0,2021-10-19 23:59:03+00:00,1450612491860463616,@beebinton Is that @Dame_Lillard ?,@beebinton Is that @Dame_Lillard ?,Thomas_Ember,2892,457,18914,1752,ðŸ‡¨ðŸ‡¦,1,0,0,1,en,,,"<a href=""http://twitter.com/download/android"" ...","[User(username='beebinton', id=145747165857547..."
1,2021-10-19 23:57:26+00:00,1450612083976925185,@CHold @Dame_Lillard I need to tell myself thi...,@CHold @Dame_Lillard I need to tell myself thi...,marc_fuller,47,171,6740,73,,0,0,0,3,en,,,"<a href=""http://twitter.com/download/iphone"" r...","[User(username='CHold', id=14328072, displayna..."
2,2021-10-19 23:56:05+00:00,1450611743890182146,Day 25 for asking @Dame_Lillard for a follow back,Day 25 for asking @Dame_Lillard for a follow back,EdwinSiu10,1126,2282,1715,171,Block = I own you,0,0,0,0,en,,,"<a href=""http://twitter.com/download/iphone"" r...","[User(username='Dame_Lillard', id=267425142, d..."


In [3]:
def extract_tags(text):
    return re.findall("#([a-zA-Z0-9_]{1,50})", text)

    # will return a list of tags

def clean_tweet(txt): 
    temp = re.sub("@[A-Za-z0-9_]+","", txt)
    temp1 = re.sub("#[A-Za-z0-9_]+","", temp)
    temp2 = re.sub(r"http\S+", "", temp1)
    result = ''.join(i for i in temp2 if i not in UNICODE_EMOJI['en'])
    return result

    # return the cleaned tweet without any mention/tag/url/emoji


def sentiment(cleaned_tweet):
    blob = TextBlob(cleaned_tweet)
    return blob.sentiment.polarity

    # return the sentiment score
    
def sentiment_label(score):
    if score >= 0.75:
        return 'POSITIVE'
    elif score >= 0.2:
        return 'positive'
    elif score >= -0.2:
        return 'neutral'
    elif score >= -0.75:
        return 'negative'
    else:
        return 'NEGATIVE'
    
    # label the tweet based on sentiment score
    
def word_tokens(text): 
    tokens = word_tokenize(text)
    result = []
    for tok in tokens: 
        if (tok.lower().isalpha()) & (tok.lower() not in stops):
            result.append(tok)
    
    return result
    # return a list of tokens for the input text string

In [4]:
df['tags']= df.apply(lambda row: extract_tags(row['Text']), axis=1)

df['clean_text']= df.apply(lambda row: clean_tweet(row['Text']), axis=1)
df['sentiment_score']= df.apply(lambda row: sentiment(row['clean_text']), axis=1)
df['senti_label']= df.apply(lambda row: sentiment_label(row['sentiment_score']), axis=1)
df['words'] = df['clean_text'].apply(word_tokens)

df[['tags','clean_text','sentiment_score','senti_label']].head()

Unnamed: 0,tags,clean_text,sentiment_score,senti_label
0,[],Is that ?,0.0,neutral
1,[],I need to tell myself this sometimes.,0.0,neutral
2,[],Day 25 for asking for a follow back,0.0,neutral
3,[],yall hyping up KD for being a 7 ft Damian Lill...,0.275,positive
4,[],God is Moving for You,0.0,neutral


### Latent Dirichlet Allocation

In [5]:
vectorizer = TfidfVectorizer(min_df=10, stop_words='english',lowercase=True,max_df=0.9,max_features=1000)

data_matrix = vectorizer.fit_transform(df.clean_text)
data_matrix

<62153x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 347528 stored elements in Compressed Sparse Row format>

In [6]:
lda_model = LatentDirichletAllocation(n_components=5, learning_method='online', random_state=695, n_jobs = -1)

lda_output = lda_model.fit_transform(data_matrix)
topic_dict = dict()

for i, topic in enumerate(lda_model.components_):
    print(f'Top 25 words for topic #{i}:')
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-25:]])
    topic_dict[i] = [vectorizer.get_feature_names()[i] for i in topic.argsort()[-25:]]
    print('\n')

Top 25 words for topic #0:
['help', 'hope', 'happy', 'ass', 'way', 'really', 'new', 'watch', 'guy', 'won', 've', 'make', 'right', 'great', 'amp', 'injury', 'dame', 'surgery', 'lakers', 'day', 'year', 'play', 'game', 'bro', 'come']


Top 25 words for topic #1:
['nice', 'today', 'gotta', 'run', 'mccollum', 'isn', 'didn', 'nba', 'trade', 'tonight', 'real', 'going', 'gonna', 'vs', 'fans', 'cj', 'abdominal', 'said', 'love', 'trail', 'long', 'portland', 'blazers', 'damian', 'lillard']


Top 25 words for topic #2:
['2021', 'start', '75', 'raiders', 'needs', 'stop', 'soon', 'lmao', 'point', 'damian', 'just', 'star', 'thank', 'lillard', 'playing', 'll', 'game', 'don', 'lol', 'like', 'got', 'dame', 'let', 'know', 'time']


Top 25 words for topic #3:
['carr', 'don', 'portland', 'people', 'years', 'look', 'ain', 'philly', 'fan', 'win', 'like', 'shit', 'did', 'say', 'trade', 'team', 'need', 'vote', 'best', 'want', 'good', 'think', 'just', 'man', 'dame']


Top 25 words for topic #4:
['anthony', 'pla

In [7]:
lda_model.score(data_matrix), lda_model.perplexity(data_matrix)

(-884721.4157279303, 1043.0058024739137)

In [8]:
df['Topic'] = lda_output.argmax(axis=1)
df['Topic'].head()

0    0
1    3
2    0
3    4
4    4
Name: Topic, dtype: int64

In [9]:
df.head(3)

Unnamed: 0,Created_Date,Tweet Id,Text,Rendered Tweet,User_Name,Followers,Friends,Favourites,Media,Location,...,Place,Hashtags,Source,Mentions,tags,clean_text,sentiment_score,senti_label,words,Topic
0,2021-10-19 23:59:03+00:00,1450612491860463616,@beebinton Is that @Dame_Lillard ?,@beebinton Is that @Dame_Lillard ?,Thomas_Ember,2892,457,18914,1752,ðŸ‡¨ðŸ‡¦,...,,,"<a href=""http://twitter.com/download/android"" ...","[User(username='beebinton', id=145747165857547...",[],Is that ?,0.0,neutral,[],0
1,2021-10-19 23:57:26+00:00,1450612083976925185,@CHold @Dame_Lillard I need to tell myself thi...,@CHold @Dame_Lillard I need to tell myself thi...,marc_fuller,47,171,6740,73,,...,,,"<a href=""http://twitter.com/download/iphone"" r...","[User(username='CHold', id=14328072, displayna...",[],I need to tell myself this sometimes.,0.0,neutral,"[need, tell, sometimes]",3
2,2021-10-19 23:56:05+00:00,1450611743890182146,Day 25 for asking @Dame_Lillard for a follow back,Day 25 for asking @Dame_Lillard for a follow back,EdwinSiu10,1126,2282,1715,171,Block = I own you,...,,,"<a href=""http://twitter.com/download/iphone"" r...","[User(username='Dame_Lillard', id=267425142, d...",[],Day 25 for asking for a follow back,0.0,neutral,"[Day, asking, follow, back]",0


In [10]:
POS_count = df[df['senti_label']=='POSITIVE'].shape[0]
POS_ratio = POS_count/df.shape[0]

NEG_count = df[df['senti_label']=='NEGATIVE'].shape[0]
NEG_ratio = NEG_count/df.shape[0]

print('Amid the Damian Lillard tweets from 2021/10/19 ~ 2022/01/19 :')
print(POS_count,'tweets are highly positive, that is',round(POS_ratio*100,3),'% of the total tweets')
print(NEG_count,'tweets are highly negative, that is',round(NEG_ratio*100,3),'% of the total tweets')

Amid the Damian Lillard tweets from 2021/10/19 ~ 2022/01/19 :
2305 tweets are highly positive, that is 3.709 % of the total tweets
493 tweets are highly negative, that is 0.793 % of the total tweets


### Topic/Sentiment Exploration

In [11]:
dic = dict()

for i in df.Topic.unique():
    label = 'topic ' + str(i)
    data = df[df['Topic']==i]
    dic[label] = data

dic.keys()

dict_keys(['topic 0', 'topic 3', 'topic 4', 'topic 1', 'topic 2'])

In [12]:
def topic_summary(idx):
    label = 'topic ' + str(idx)
    data = dic[label]
    
    c = Counter()
    for i in data['tags']:
        c.update(Counter(i))
        
    top = c.most_common(25)
    top_tags = []
    for combo in top:
        top_tags.append(combo[0])
    
    top_words = topic_dict[idx]
    
    total = data.shape[0]
    senti_group = data.groupby('senti_label').count()['Topic']
    POS = senti_group['POSITIVE']
    NEG = senti_group['NEGATIVE']
    pos = senti_group['positive']
    neg = senti_group['negative']
    neu = senti_group['neutral']
    pos_ratio = round(100*(POS + pos) / total,3)
    neg_ratio = round(100*(NEG + neg) / total,3)
    neu_ratio = round(100*neu/total,3)
    
    score = round(data['sentiment_score'].mean(),3)
    
    print('There are a total of', total, f'tweets in topic #{idx}')
    print(' ')
    print('Top 25 words : ',top_words)
    print(' ')
    print('Top 25 tags : ',top_tags)
    print(' ')
    print(round(100*POS/total,3), '% of the tweets in this topic cluster are highly positive,')
    print('while', round(100*NEG/total,3), '% of the tweets are highly negative')
    print(' ')
    print(POS + pos, 'tweets are scored as positive, while',NEG + neg , 'tweets are negative')
    print(pos_ratio,'% positive tweets,',neu_ratio,'% neutral, and', neg_ratio,'% negative')
    print('The average sentiment score is', score)

In [13]:
topic_summary(0)

There are a total of 14741 tweets in topic #0
 
Top 25 words :  ['help', 'hope', 'happy', 'ass', 'way', 'really', 'new', 'watch', 'guy', 'won', 've', 'make', 'right', 'great', 'amp', 'injury', 'dame', 'surgery', 'lakers', 'day', 'year', 'play', 'game', 'bro', 'come']
 
Top 25 tags :  ['NBAAllStar', 'DamianLillard', 'RipCity', 'NBA', 'NBA75', 'ripcity', 'damianlillard', 'Blazers', 'NBATwitter', 'DameTime', 'nba', 'dametime', 'thehobby', 'basketball', 'GamblingTwitter', 'SUGA', 'trailblazers', 'RaiderNation', 'LakeShow', '039', 'Portland', 'damedolla', 'Trailblazers', 'NBATopShotThis', 'Sixers']
 
4.036 % of the tweets in this topic cluster are highly positive,
while 0.834 % of the tweets are highly negative
 
3932 tweets are scored as positive, while 1242 tweets are negative
26.674 % positive tweets, 64.901 % neutral, and 8.425 % negative
The average sentiment score is 0.088


In [14]:
topic_summary(1)

There are a total of 11334 tweets in topic #1
 
Top 25 words :  ['nice', 'today', 'gotta', 'run', 'mccollum', 'isn', 'didn', 'nba', 'trade', 'tonight', 'real', 'going', 'gonna', 'vs', 'fans', 'cj', 'abdominal', 'said', 'love', 'trail', 'long', 'portland', 'blazers', 'damian', 'lillard']
 
Top 25 tags :  ['RipCity', 'NBA', 'NBAAllStar', 'NBATopShotThis', 'NBATwitter', 'NBA75', 'DamianLillard', 'Blazers', 'Waterfall', 'TrailBlazers', 'WATERFALL', 'waterfall', 'sports', 'nba', 'feedly', 'ripcity', 'trailblazers', 'PortlandTrailBlazers', 'damianlillard', 'thehobby', 'Airdrop', 'basketball', 'Portland', 'Sixers', 'NBA2K22MYTEAM']
 
2.603 % of the tweets in this topic cluster are highly positive,
while 0.6 % of the tweets are highly negative
 
2511 tweets are scored as positive, while 829 tweets are negative
22.155 % positive tweets, 70.531 % neutral, and 7.314 % negative
The average sentiment score is 0.067


In [15]:
topic_summary(4)

There are a total of 13297 tweets in topic #4
 
Top 25 words :  ['anthony', 'players', 'paul', 'season', 'harden', 'career', 'westbrook', 'list', '10', 'lebron', 'james', 'nba', 'simmons', 'team', 'ben', 'points', 'steph', 'distance', 'signature', 'curry', 'better', 'voting', 'shooting', 'damian', 'lillard']
 
Top 25 tags :  ['DamianSignatureMove', 'NBA', 'NBA75', 'RipCity', 'NBAAllStar', 'DamianLillard', 'NBATwitter', 'Blazers', 'Sixers', 'GamblingTwitter', '1', 'DubNation', 'nba', 'sports', 'ripcity', 'TTFL', 'NBATopShotThis', 'damianlillard', 'Trailblazers', 'LakeShow', 'feedly', 'TrailBlazers', 'Lakers', 'TopShotThanksgiving', '76ers']
 
2.068 % of the tweets in this topic cluster are highly positive,
while 0.79 % of the tweets are highly negative
 
4104 tweets are scored as positive, while 1038 tweets are negative
30.864 % positive tweets, 61.33 % neutral, and 7.806 % negative
The average sentiment score is 0.113


### Didn't work out that well from topic perspective
#### Try analyze from the sentiment perspective

In [16]:
def senti_summary(label):
    data = df[df['senti_label'] == label]
    
    c = Counter()
    for i in data['tags']:
        c.update(Counter(i))
    
    top = c.most_common(25)
    top_tags = []
    
    for combo in top:
        top_tags.append(combo[0])
    
    tags_df = pd.DataFrame(top,columns = ['Tag','count'])
    
    c1 = Counter()
    for i in data['words']:
        c1.update(Counter(i))
        
    top1 = c1.most_common(25) 
    top_words = []
    for combo in top1:
        top_words.append(combo[0])
    
    overall = df.shape[0]
    total = data.shape[0]
    topic_group = data.groupby('Topic').count()['senti_label']
    
    print('Tweets labeled as',label,'account for',np.round(100*total/overall,3),'% of the total tweets about Damian Lillard')
    print(' ')
    print('For the tweets in this sentiment label: ')
    print(f'Top 25 words : ',top_words)
    print(' ')
    print(f'Top 25 tags : ',top_tags)
    print(' ')
    print('the topic label distribution looks like: ', topic_group)


In [17]:
senti_summary('POSITIVE')

Tweets labeled as POSITIVE account for 3.709 % of the total tweets about Damian Lillard
 
For the tweets in this sentiment label: 
Top 25 words :  ['Damian', 'Lillard', 'win', 'lol', 'best', 'great', 'Dame', 'Lol', 'Blazers', 'NBA', 'Portland', 'team', 'time', 'happy', 'get', 'would', 'player', 'good', 'greatest', 'one', 'Best', 'amp', 'like', 'na', 'PTS']
 
Top 25 tags :  ['NBAAllStar', 'RipCity', 'DamianLillard', 'NBA75', 'NBA', 'ripcity', 'DameTime', 'damianlillard', 'SUGA', 'NBATopShotThis', 'Sixers', 'DFS', 'InvestDFS', 'TimeToInvest', 'BTS', 'sports', 'RaiderNation', 'nba', 'thehobby', 'NBATwitter', 'feedly', 'truth', 'WeTheNorth', 'TTFL', 'TopShotThanksgiving']
 
the topic label distribution looks like:  Topic
0    595
1    295
2    487
3    653
4    275
Name: senti_label, dtype: int64


In [18]:
senti_summary('NEGATIVE')

Tweets labeled as NEGATIVE account for 0.793 % of the total tweets about Damian Lillard
 
For the tweets in this sentiment label: 
Top 25 words :  ['Damian', 'Lillard', 'Base', 'Set', 'bought', 'worst', 'hate', 'Series', 'ETH', 'Floor', 'terrible', 'shooting', 'Blazers', 'Dame', 'lillard', 'horrible', 'stupid', 'like', 'team', 'Portland', 'need', 'Moment', 'get', 'start', 'na']
 
Top 25 tags :  ['RipCity', 'thehobby', 'tradingcards', 'whodoyoucollect', '1', '1218', 'NBA75', 'NBA', '1009', 'sports', '869', '397', 'NBAAllStar', '1404', '1204', 'NBATopShotThis', '75', '1323', '447', '824', '4', '28', '951', '1067', '868']
 
the topic label distribution looks like:  Topic
0    123
1     68
2    129
3     68
4    105
Name: senti_label, dtype: int64


### PyLDAvis 

For the detailed explanation of the plot, please check out the official documentation, not going to explain everything here.

In [19]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [20]:
vis = pyLDAvis.sklearn.prepare(lda_model, data_matrix, vectorizer)
vis

  and should_run_async(code)
