In [1]:
import pandas as pd
import numpy as np
import gensim
import re
from emoji import UNICODE_EMOJI
from textblob import TextBlob
from collections import Counter
import altair as alt

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

stops = nltk.corpus.stopwords.words('english')

Slow version of gensim.models.doc2vec is being used
Slow version of Fasttext is being used
[nltk_data] Downloading package punkt to /opt/conda/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
df1 = pd.read_csv('Curry 2021_Oct.csv')
df2 = pd.read_csv('Curry 2021_Nov.csv')
df3 = pd.read_csv('Curry 2021_Dec.csv')
df4 = pd.read_csv('Curry 2022_Jan.csv')

df = pd.concat([df1,df2,df3,df4])
df.dropna(subset=['Text'],inplace=True)
df.head(3)

Unnamed: 0,Created_Date,Tweet Id,Text,Rendered Tweet,User_Name,Followers,Friends,Favourites,Media,Location,Replys,Retweets,Quotes,Likes,Language,Place,Hashtags,Source,Mentions
0,2021-10-19 23:59:44+00:00,1450612663503917056,@Awesemo_Com @AwesemoNBA @NBA @NBAonTNT @Steph...,@Awesemo_Com @AwesemoNBA @NBA @NBAonTNT @Steph...,markFin73763459,28,156,522,18,,0,0,0,0,en,,,"<a href=""http://twitter.com/download/iphone"" r...","[User(username='Awesemo_Com', id=9561989702037..."
1,2021-10-19 23:58:31+00:00,1450612355662897159,@AwesemoNBA @Awesemo_Com @NBA @NBAonTNT @Steph...,@AwesemoNBA @Awesemo_Com @NBA @NBAonTNT @Steph...,Iceman100001,25,365,1200,672,,0,0,0,0,en,,,"<a href=""http://twitter.com/download/iphone"" r...","[User(username='AwesemoNBA', id=11040884038413..."
2,2021-10-19 23:57:35+00:00,1450612122661031940,With @StephenCurry30 being in best player in t...,With @StephenCurry30 being in best player in t...,TheErbaEffect,85,518,706,17,"Brooklyn, NY",1,0,0,0,en,,,"<a href=""https://mobile.twitter.com"" rel=""nofo...","[User(username='StephenCurry30', id=42562446, ..."


In [3]:
def extract_tags(text):
    return re.findall("#([a-zA-Z0-9_]{1,50})", text)

    # will return a list of tags

def clean_tweet(txt): 
    temp = re.sub("@[A-Za-z0-9_]+","", txt)
    temp1 = re.sub("#[A-Za-z0-9_]+","", temp)
    temp2 = re.sub(r"http\S+", "", temp1)
    result = ''.join(i for i in temp2 if i not in UNICODE_EMOJI['en'])
    return result

    # return the cleaned tweet without any mention/tag/url/emoji


def sentiment(cleaned_tweet):
    blob = TextBlob(cleaned_tweet)
    return blob.sentiment.polarity

    # return the sentiment score
    
def sentiment_label(score):
    if score >= 0.75:
        return 'POSITIVE'
    elif score >= 0.2:
        return 'positive'
    elif score >= -0.2:
        return 'neutral'
    elif score >= -0.75:
        return 'negative'
    else:
        return 'NEGATIVE'
    
    # label the tweet based on sentiment score
    
def word_tokens(text): 
    tokens = word_tokenize(text)
    result = []
    for tok in tokens: 
        if (tok.lower().isalpha()) & (tok.lower() not in stops):
            result.append(tok)
    
    return result
    # return a list of tokens for the input text string

In [4]:
df['tags']= df.apply(lambda row: extract_tags(row['Text']), axis=1)

df['clean_text']= df.apply(lambda row: clean_tweet(row['Text']), axis=1)
df['sentiment_score']= df.apply(lambda row: sentiment(row['clean_text']), axis=1)
df['senti_label']= df.apply(lambda row: sentiment_label(row['sentiment_score']), axis=1)
df['words'] = df['clean_text'].apply(word_tokens)

df[['tags','clean_text','sentiment_score','senti_label']].head()

Unnamed: 0,tags,clean_text,sentiment_score,senti_label
0,[],Curry,0.0,neutral
1,[],Steph Cury,0.0,neutral
2,[],"With being in best player in the league form,...",0.435,positive
3,[],will lose to tonight facts,0.0,neutral
4,[],Me too!!! Can’t wait!!!! ️,0.0,neutral


### Latent Dirichlet Allocation

In [5]:
vectorizer = TfidfVectorizer(min_df=10, stop_words='english',lowercase=True,max_df=0.9,max_features=1000)

data_matrix = vectorizer.fit_transform(df.clean_text)
data_matrix

<144069x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 564653 stored elements in Compressed Sparse Row format>

In [6]:
lda_model = LatentDirichletAllocation(n_components=5, learning_method='online', random_state=695, n_jobs = -1)

lda_output = lda_model.fit_transform(data_matrix)
topic_dict = dict()

for i, topic in enumerate(lda_model.components_):
    print(f'Top 25 words for topic #{i}:')
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-25:]])
    topic_dict[i] = [vectorizer.get_feature_names()[i] for i in topic.argsort()[-25:]]
    print('\n')

Top 25 words for topic #0:
['does', 'game', 'shoot', 'wait', 'going', 'god', 'steph', 'points', 'thanks', 'amazing', 'curry', 'win', 'look', 'lebron', 'gonna', 'come', 'thank', 'tonight', 'right', 'mvp', 'good', 'let', 'man', 'congratulations', 'congrats']


Top 25 words for topic #1:
['games', 'today', 'legend', 'real', 'people', 'day', 've', 'll', 'say', 'just', 'game', 'night', 'shot', 'nft', 'watch', 'ball', 'shit', 'king', 'play', 'basketball', 'great', 'history', 'lol', 'better', 'goat']


Top 25 words for topic #2:
['vs', 'chef', 'hit', 'ur', 'allen', 'like', 'shots', 'wow', 'doing', 'ray', 'steph', 'dude', 'bad', 'elon', 'damn', 'mars', 'lmao', 'break', 'nice', 'said', 'stephen', 'guy', 'join', 'really', 'curry']


Top 25 words for topic #3:
['hey', 'doesn', 'respect', 'steph', 'yes', 'playing', 'drop', 'ratio', 'happy', 'amp', 'think', 'warriors', 'big', 'new', 'fan', 'stop', 'klay', 'don', 'just', 'got', 'need', 'like', 'game', 'bro', 'love']


Top 25 words for topic #4:
['co

In [7]:
lda_model.score(data_matrix), lda_model.perplexity(data_matrix)

(-1749645.596907002, 1151.628922928819)

In [8]:
df['Topic'] = lda_output.argmax(axis=1)
df['Topic'].head()

0    2
1    4
2    4
3    0
4    0
Name: Topic, dtype: int64

In [9]:
df.head(3)

Unnamed: 0,Created_Date,Tweet Id,Text,Rendered Tweet,User_Name,Followers,Friends,Favourites,Media,Location,...,Place,Hashtags,Source,Mentions,tags,clean_text,sentiment_score,senti_label,words,Topic
0,2021-10-19 23:59:44+00:00,1450612663503917056,@Awesemo_Com @AwesemoNBA @NBA @NBAonTNT @Steph...,@Awesemo_Com @AwesemoNBA @NBA @NBAonTNT @Steph...,markFin73763459,28,156,522,18,,...,,,"<a href=""http://twitter.com/download/iphone"" r...","[User(username='Awesemo_Com', id=9561989702037...",[],Curry,0.0,neutral,[Curry],2
1,2021-10-19 23:58:31+00:00,1450612355662897159,@AwesemoNBA @Awesemo_Com @NBA @NBAonTNT @Steph...,@AwesemoNBA @Awesemo_Com @NBA @NBAonTNT @Steph...,Iceman100001,25,365,1200,672,,...,,,"<a href=""http://twitter.com/download/iphone"" r...","[User(username='AwesemoNBA', id=11040884038413...",[],Steph Cury,0.0,neutral,"[Steph, Cury]",4
2,2021-10-19 23:57:35+00:00,1450612122661031940,With @StephenCurry30 being in best player in t...,With @StephenCurry30 being in best player in t...,TheErbaEffect,85,518,706,17,"Brooklyn, NY",...,,,"<a href=""https://mobile.twitter.com"" rel=""nofo...","[User(username='StephenCurry30', id=42562446, ...",[],"With being in best player in the league form,...",0.435,positive,"[best, player, league, form, still, versatile,...",4


In [10]:
POS_count = df[df['senti_label']=='POSITIVE'].shape[0]
POS_ratio = POS_count/df.shape[0]

NEG_count = df[df['senti_label']=='NEGATIVE'].shape[0]
NEG_ratio = NEG_count/df.shape[0]

print('Amid the Stephen Curry tweets from 2021/09/09 ~ 2022/01/19 :')
print(POS_count,'tweets are highly positive, that is',round(POS_ratio*100,3),'% of the total tweets')
print(NEG_count,'tweets are highly negative, that is',round(NEG_ratio*100,3),'% of the total tweets')

Amid the Stephen Curry tweets from 2021/10/19 ~ 2022/01/19 :
10351 tweets are highly positive, that is 7.185 % of the total tweets
1191 tweets are highly negative, that is 0.827 % of the total tweets


### Topic/Sentiment Exploration

In [11]:
dic = dict()

for i in df.Topic.unique():
    label = 'topic ' + str(i)
    data = df[df['Topic']==i]
    dic[label] = data

dic.keys()

dict_keys(['topic 2', 'topic 4', 'topic 0', 'topic 1', 'topic 3'])

In [12]:
def topic_summary(idx):
    label = 'topic ' + str(idx)
    data = dic[label]
    
    c = Counter()
    for i in data['tags']:
        c.update(Counter(i))
        
    top = c.most_common(25)
    top_tags = []
    for combo in top:
        top_tags.append(combo[0])
    
    top_words = topic_dict[idx]
    
    total = data.shape[0]
    senti_group = data.groupby('senti_label').count()['Topic']
    POS = senti_group['POSITIVE']
    NEG = senti_group['NEGATIVE']
    pos = senti_group['positive']
    neg = senti_group['negative']
    neu = senti_group['neutral']
    pos_ratio = round(100*(POS + pos) / total,3)
    neg_ratio = round(100*(NEG + neg) / total,3)
    neu_ratio = round(100*neu/total,3)
    
    score = round(data['sentiment_score'].mean(),3)
    
    print('There are a total of', total, f'tweets in topic #{idx}')
    print(' ')
    print('Top 25 words : ',top_words)
    print(' ')
    print('Top 25 tags : ',top_tags)
    print(' ')
    print(round(100*POS/total,3), '% of the tweets in this topic cluster are highly positive,')
    print('while', round(100*NEG/total,3), '% of the tweets are highly negative')
    print(' ')
    print(POS + pos, 'tweets are scored as positive, while',NEG + neg , 'tweets are negative')
    print(pos_ratio,'% positive tweets,',neu_ratio,'% neutral, and', neg_ratio,'% negative')
    print('The average sentiment score is', score)

In [13]:
topic_summary(0)

There are a total of 38329 tweets in topic #0
 
Top 25 words :  ['does', 'game', 'shoot', 'wait', 'going', 'god', 'steph', 'points', 'thanks', 'amazing', 'curry', 'win', 'look', 'lebron', 'gonna', 'come', 'thank', 'tonight', 'right', 'mvp', 'good', 'let', 'man', 'congratulations', 'congrats']
 
Top 25 tags :  ['DubNation', 'NBA75', 'NBA', 'StephenCurry', 'NBAAllStar', 'Warriors', 'StephCurry', 'NBATwitter', 'MVP', 'nba', 'stephcurry', 'Dubnation', 'dubnation', 'NFT', '30', 'warriors', 'KlayDay', '1', 'NBAonTNT', 'NFTs', 'goldenstatewarriors', 'TopShotThanksgiving', 'GOAT', 'Curry', 'NBATopShotThis']
 
4.075 % of the tweets in this topic cluster are highly positive,
while 0.59 % of the tweets are highly negative
 
9540 tweets are scored as positive, while 2209 tweets are negative
24.89 % positive tweets, 69.347 % neutral, and 5.763 % negative
The average sentiment score is 0.096


In [14]:
topic_summary(1)

There are a total of 30610 tweets in topic #1
 
Top 25 words :  ['games', 'today', 'legend', 'real', 'people', 'day', 've', 'll', 'say', 'just', 'game', 'night', 'shot', 'nft', 'watch', 'ball', 'shit', 'king', 'play', 'basketball', 'great', 'history', 'lol', 'better', 'goat']
 
Top 25 tags :  ['DubNation', 'NBA', 'NBA75', 'StephenCurry', 'StephCurry', 'Warriors', 'NBAAllStar', 'NFT', 'NBATwitter', 'NFTs', 'nba', 'dubnation', 'nft', 'stephcurry', 'MVP', 'KlayDay', 'NFTCommunity', 'GOAT', 'warriors', '1', 'NBATopShotThis', 'basketball', 'Curry', 'Dubnation', 'NBAonTNT']
 
6.093 % of the tweets in this topic cluster are highly positive,
while 1.245 % of the tweets are highly negative
 
9735 tweets are scored as positive, while 2735 tweets are negative
31.803 % positive tweets, 59.262 % neutral, and 8.935 % negative
The average sentiment score is 0.117


In [15]:
topic_summary(4)

There are a total of 26623 tweets in topic #4
 
Top 25 words :  ['cool', 'threes', 'favorite', 'breaking', 'awesome', 'gotta', '3pt', 'sure', 'yeah', 'league', 'splash', 'want', 'way', 'make', 'nba', 'point', 'did', 'player', 'steph', 'know', 'greatest', 'record', 'best', 'shooter', 'time']
 
Top 25 tags :  ['DubNation', 'StephenCurry', 'NBA75', 'NBA', 'NBAAllStar', 'StephCurry', 'Warriors', 'nba', 'NFT', 'NFTs', 'NBATwitter', 'stephcurry', 'Curry', 'GOAT', 'BAYC', 'dubnation', '1', 'NBATopShotThis', 'basketball', 'NFTCommunity', '2974', '30', 'warriors', 'MVP', 'CurryWatch']
 
16.418 % of the tweets in this topic cluster are highly positive,
while 0.545 % of the tweets are highly negative
 
11350 tweets are scored as positive, while 1766 tweets are negative
42.632 % positive tweets, 50.734 % neutral, and 6.633 % negative
The average sentiment score is 0.241


### Didn't work out that well from topic perspective
#### Try analyze from the sentiment perspective

In [16]:
def senti_summary(label):
    data = df[df['senti_label'] == label]
    
    c = Counter()
    for i in data['tags']:
        c.update(Counter(i))
    
    top = c.most_common(25)
    top_tags = []
    
    for combo in top:
        top_tags.append(combo[0])
    
    tags_df = pd.DataFrame(top,columns = ['Tag','count'])
    
    c1 = Counter()
    for i in data['words']:
        c1.update(Counter(i))
        
    top1 = c1.most_common(25) 
    top_words = []
    for combo in top1:
        top_words.append(combo[0])
    
    overall = df.shape[0]
    total = data.shape[0]
    topic_group = data.groupby('Topic').count()['senti_label']
    
    print('Tweets labeled as',label,'account for',np.round(100*total/overall,3),'% of the total tweets about Stephen Curry')
    print(' ')
    print('For the tweets in this sentiment label: ')
    print(f'Top 25 words : ',top_words)
    print(' ')
    print(f'Top 25 tags : ',top_tags)
    print(' ')
    print('the topic label distribution looks like: ', topic_group)


In [17]:
senti_summary('POSITIVE')

Tweets labeled as POSITIVE account for 7.185 % of the total tweets about Stephen Curry
 
For the tweets in this sentiment label: 
Top 25 words :  ['shooter', 'best', 'greatest', 'time', 'ever', 'lol', 'Greatest', 'win', 'Congrats', 'player', 'great', 'Best', 'Steph', 'Congratulations', 'NBA', 'Curry', 'history', 'one', 'record', 'Great', 'GREATEST', 'point', 'awesome', 'Lol', 'SHOOTER']
 
Top 25 tags :  ['DubNation', 'NBA75', 'NBA', 'StephCurry', 'StephenCurry', 'Warriors', 'nba', 'NBAAllStar', 'stephcurry', 'GOAT', 'Dubnation', 'NFT', 'MVP', 'goldenstatewarriors', 'Curry', '2974', 'BAYC', 'dubnation', 'basketball', 'NBATwitter', 'NFTs', 'warriors', '30', 'CurryWatch', 'KlayDay']
 
the topic label distribution looks like:  Topic
0    1562
1    1865
2    1107
3    1446
4    4371
Name: senti_label, dtype: int64


In [18]:
senti_summary('NEGATIVE')

Tweets labeled as NEGATIVE account for 0.827 % of the total tweets about Stephen Curry
 
For the tweets in this sentiment label: 
Top 25 words :  ['hate', 'insane', 'worst', 'stupid', 'man', 'bad', 'game', 'like', 'Steph', 'terrible', 'bro', 'curry', 'Curry', 'na', 'record', 'go', 'horrible', 'one', 'sick', 'gon', 'tonight', 'u', 'get', 'ever', 'got']
 
Top 25 tags :  ['DubNation', 'NBA', 'Warriors', 'dubnation', 'NBA75', 'NBAAllStar', 'GSW', 'WEAREKONG', 'stephcurry', '49ers', '30', 'RKL', '1', 'StephenCurry', 'Faith', 'Christmas2021', 'Christian', 'Christians', 'Christianity', 'fake', 'real', 'worldly', 'Aintitmane', 'WNBATwitter', 'KyrieIrving']
 
the topic label distribution looks like:  Topic
0    226
1    381
2    222
3    217
4    145
Name: senti_label, dtype: int64


### PyLDAvis

In [19]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [20]:
vis = pyLDAvis.sklearn.prepare(lda_model, data_matrix, vectorizer)
vis

  and should_run_async(code)
