In [1]:
import pandas as pd

import re
from emoji import UNICODE_EMOJI
from textblob import TextBlob
import altair as alt
import numpy as np
from collections import Counter
import string

import nltk
nltk.download('vader_lexicon')
nltk.download('brown')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package brown to /home/jovyan/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### The data cleaning/manipulation technique/functions

In [2]:
def extract_tags(text):
    return re.findall("#([a-zA-Z0-9_]{1,50})", text)
    
def extract_emoji(text):
    return [ch for ch in text if ch in UNICODE_EMOJI['en']]

def clean_tweet(txt): 
    temp = re.sub("@[A-Za-z0-9_]+","", txt)
    temp1 = re.sub("#[A-Za-z0-9_]+","", temp)
    temp2 = re.sub(r"http\S+", "", temp1)
    
    result=''.join(i for i in temp2.lower() if (i.isalpha() or i==' '))
    return result

def word_list(tweet):
    
    lst = word_tokenize(tweet)
    lst1 = []
    stops = list(stopwords.words('english'))
    for w in lst:
        if w not in stops:
            lst1.append(w)
    
    return lst1

def sentiment(tweet):
    blob = TextBlob(tweet)
    
    return blob.sentiment.polarity


def get_date(date):
    
    return date[:10]

def get_hour(date):
    
    return date[11:13]
def get_10min(date):
    
    return date[14]+'0'

def get_min(date):
    
    return date[14:16]

def firm_pos(score):
    if score >= 0.7:
        return 1
    else: return 0
    
def pos(score):
    if (score >= 0.25) & (score < 0.7):
        return 1
    else: return 0
    
def neutral(score):
    if (score >= -0.25) & (score < 0.25):
        return 1
    else: return 0
    
def neg(score):
    if (score > -0.7) & (score < -0.25):
        return 1
    else: return 0
    
def firm_neg(score):
    if score <= -0.7:
        return 1
    else: return 0

### **Import data, check duplicate and missing/incomplete data. Remove if exists**

In [3]:
df = pd.read_csv('Project Data/Simone Biles.csv')
df['id'].duplicated(keep='last').sum()

0

In [4]:
df.isnull().sum()

id      0
date    2
text    2
dtype: int64

In [5]:
df = df.dropna().reset_index()
df.drop(columns=['index'],inplace=True)

### Apply the data cleaning/manipulation techniques on the data, we now have the used words, tags, emojis, sentiment score, and specific date/hour/min data. 

In [6]:
df['tags']= df.apply(lambda row: extract_tags(row['text']), axis=1)
df['emojis']= df.apply(lambda row: extract_emoji(row['text']), axis=1)
df['clean_text']= df.apply(lambda row: clean_tweet(row['text']), axis=1)
df['words']= df.apply(lambda row: word_list(row['clean_text']), axis=1)
df['sentiment_score']= df.apply(lambda row: sentiment(row['clean_text']), axis=1)
df['day']= df.apply(lambda row: get_date(row['date']), axis=1)
df['hour']= df.apply(lambda row: get_hour(row['date']), axis=1)
df['10min']= df.apply(lambda row: get_10min(row['date']), axis=1)
df['min']= df.apply(lambda row: get_min(row['date']), axis=1)
df['POS']= df.apply(lambda row: firm_pos(row['sentiment_score']), axis=1)
df['pos']= df.apply(lambda row: pos(row['sentiment_score']), axis=1)
df['neu']= df.apply(lambda row: neutral(row['sentiment_score']), axis=1)
df['neg']= df.apply(lambda row: neg(row['sentiment_score']), axis=1)
df['NEG']= df.apply(lambda row: firm_neg(row['sentiment_score']), axis=1)

df.head()

Unnamed: 0,id,date,text,tags,emojis,clean_text,words,sentiment_score,day,hour,10min,min,POS,pos,neu,neg,NEG
0,1418360000888647681,2021-07-22 23:59:10+00:00,the olympics r literally like tmrw??? anyways ...,[],[],the olympics r literally like tmrw anyways can...,"[olympics, r, literally, like, tmrw, anyways, ...",0.0,2021-07-22,23,50,59,0,0,1,0,0
1,1418359846253047810,2021-07-22 23:58:33+00:00,omg it’s the goat #SimoneBiles LOOK AT THE GOAT,[SimoneBiles],[],omg its the goat look at the goat,"[omg, goat, look, goat]",0.0,2021-07-22,23,50,58,0,0,1,0,0
2,1418359748500418573,2021-07-22 23:58:10+00:00,Simone Biles is a gymnast and an entertainer.I...,[],[],simone biles is a gymnast and an entertainerim...,"[simone, biles, gymnast, entertainerim, glad, ...",0.166667,2021-07-22,23,50,58,0,0,1,0,0
3,1418359695681728515,2021-07-22 23:57:57+00:00,Just wanted to see the emoji. #SimoneBiles,[SimoneBiles],[],just wanted to see the emoji,"[wanted, see, emoji]",0.0,2021-07-22,23,50,57,0,0,1,0,0
4,1418359661028331526,2021-07-22 23:57:49+00:00,"God, I’m not worthy enough to share a planet w...",[],[],god im not worthy enough to share a planet wit...,"[god, im, worthy, enough, share, planet, simon...",-0.083333,2021-07-22,23,50,57,0,0,1,0,0


## **See the overall flow of tweet & sentiment**

In [7]:
score = df.groupby(['day','hour']).agg([np.sum,np.size]).sentiment_score
score = score.reset_index()

score['date'] = score['day'] + ' ' + score['hour'] + ':00'
score[['6hr_sum','6hr_count']] = score.rolling(window=6,min_periods=1).sum()[['sum','size']]
score['6hr_avg'] = score['6hr_sum']/score['6hr_count']

score.head()

Unnamed: 0,day,hour,sum,size,date,6hr_sum,6hr_count,6hr_avg
0,2021-07-22,0,28.321041,106.0,2021-07-22 00:00,28.321041,106.0,0.26718
1,2021-07-22,1,27.016179,125.0,2021-07-22 01:00,55.33722,231.0,0.239555
2,2021-07-22,2,18.57741,88.0,2021-07-22 02:00,73.91463,319.0,0.231707
3,2021-07-22,3,16.880703,97.0,2021-07-22 03:00,90.795333,416.0,0.218258
4,2021-07-22,4,12.141448,57.0,2021-07-22 04:00,102.936781,473.0,0.217625


In [8]:
alt.Chart(score).mark_line().encode(
    x=alt.X('date:T',title='Date'),
    y=alt.Y('size:Q',title='Tweet Count (by hour)')
).properties(width=840,height=400,title='Simone Biles Olympics - Overall Tweet Flow')

### **Plot the sentiment flow**

In [9]:
alt.Chart(score).mark_line().encode(
    x=alt.X('date:T',title='Date'),
    y=alt.Y('6hr_avg:Q',title='Avg Sentiment')
).properties(width=840,height=500,title='Simone Biles Olympics - Overall Sentiment Flow On Twitter (6hour-rolling)')

In [10]:
flow = df.groupby(['day','hour']).mean()[['POS','pos','neu','neg','NEG']]
flow = flow.reset_index()
flow['date'] = flow['day'] + ' ' + flow['hour'] + ':00'
flow[['POSITIVE', 'positive', 'neutral', 'negative', 'NEGATIVE']] = flow.rolling(window=6,min_periods=1).mean()[['POS','pos','neu','neg','NEG']]

flow1 = pd.DataFrame()

dates = []
values = []
labels = []

for i in ['POSITIVE', 'positive', 'neutral', 'negative', 'NEGATIVE']:
    lst = []
    lst1 = []
    lst2 = list(flow.date.values)
    
    for j in range(len(flow)):
        lst.append(i)
        lst1.append(flow[i][j])
        
    dates += lst2
    labels += lst
    values += lst1
    
flow1['date'] = pd.Series(dates)
flow1['sentiment_label'] = pd.Series(labels)
flow1['percentage'] = pd.Series(values)
flow1.head(10)

Unnamed: 0,date,sentiment_label,percentage
0,2021-07-22 00:00,POSITIVE,0.141509
1,2021-07-22 01:00,POSITIVE,0.102755
2,2021-07-22 02:00,POSITIVE,0.087443
3,2021-07-22 03:00,POSITIVE,0.083623
4,2021-07-22 04:00,POSITIVE,0.084442
5,2021-07-22 05:00,POSITIVE,0.093096
6,2021-07-22 06:00,POSITIVE,0.076951
7,2021-07-22 07:00,POSITIVE,0.078096
8,2021-07-22 08:00,POSITIVE,0.078016
9,2021-07-22 09:00,POSITIVE,0.076953


In [11]:
alt.Chart(flow1).mark_line().encode(
    x='date:T',
    y='percentage:Q',                               
    color='sentiment_label:N'                               
).properties(width=840,height=480,title='sentiments flow (percentage of each 5 degree)')

## **Emoji/Tags**

In [12]:
# this return the top 50 most common items in the columns (emoji/tag/word)

def top_item(data,label):
    
    lst = []
    for i in data[label]:
        lst += i
        
    C = Counter(lst)
    top50 = C.most_common(50)
    count_df = pd.DataFrame(top50,columns = [label,'count'])
    
    return count_df

In [13]:
c= top_item(df,'tags')
c

# the top 50 most frequently used tags      within the 'simone biles' tweets during the 2021 olympics

Unnamed: 0,tags,count
0,SimoneBiles,26354
1,Olympics,9137
2,Tokyo2020,5795
3,TokyoOlympics,3054
4,Simone,2835
5,MentalHealthMatters,2327
6,GOAT,2214
7,mentalhealth,2030
8,TeamUSA,1900
9,OlympicGames,1841


In [14]:
c1=c[1:26]

alt.Chart(c1).mark_bar().encode(
    x=alt.X('tags',sort=['count'],title='Tags',axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count',title='Total Usage')
).properties(width=900,height=400,title={
      "text": ["Most popular tags - Simone Biles Olympics"],
      "subtitle":["The top 25 most popular tags used within the tweets about Simone Biles during Olympics"]  
    }).configure_axis(
    labelFontSize=15,
    titleFontSize=20
).configure_title(
    anchor='start',
    fontSize = 24,
    subtitleFontSize = 15
)

### **Emoji**

In [15]:
count = top_item(df, 'emojis')
count

Unnamed: 0,emojis,count
0,❤,19096
1,👏,7312
2,🏾,3903
3,🐐,3764
4,😂,3712
5,🙏,3616
6,🏽,2802
7,🤍,2753
8,🏻,2526
9,🏼,2307


In [16]:
c1=count[:25]

alt.Chart(c1).mark_bar().encode(
    x=alt.X('emojis',sort=['count'],title='Emojis',axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count',title='Total Usage')
).properties(width=900,height=400,title={
      "text": ["Most popular emojis - Simone Biles Olympics"],
      "subtitle":["The top 25 most popular emojis used within the tweets about Simone Biles during Olympics"]  
    }).configure_axis(
    labelFontSize=15,
    titleFontSize=20
).configure_title(
    anchor='start',
    fontSize = 24,
    subtitleFontSize = 15
)

## **Specifically target the period after the withdrawl, see how Twitter react**

In [17]:
df['Date'] = pd.to_datetime(df['date'])
mask = (df['Date'] > '2021-07-27 00:00') & (df['Date'] < '2021-07-28 23:59')
drawl = df.loc[mask].sort_values('Date')
drawl = drawl.reset_index()
drawl.drop(columns=['index','Date'],inplace=True)

drawl.head()

Unnamed: 0,id,date,text,tags,emojis,clean_text,words,sentiment_score,day,hour,10min,min,POS,pos,neu,neg,NEG
0,1420059487524765703,2021-07-27 16:32:19+00:00,@NYCMayor @Simone_Biles Omg what a hero! Remem...,[],[],omg what a hero remember the days we used to...,"[omg, hero, remember, days, used, praise, real...",-0.1,2021-07-27,16,30,32,0,0,1,0,0
1,1420059488002920459,2021-07-27 16:32:20+00:00,The conversation around Simone Biles is exactl...,[],[],the conversation around simone biles is exactl...,"[conversation, around, simone, biles, exactly,...",0.525,2021-07-27,16,30,32,0,1,0,0,0
2,1420059487931617281,2021-07-27 16:32:20+00:00,"@Simone_Biles @lourdesgnavarro ❤️ you, Simone ...",[],"[❤, ❤]",you simone and so proud of you,"[simone, proud]",0.8,2021-07-27,16,30,32,1,0,0,0,0
3,1420059491706433536,2021-07-27 16:32:20+00:00,@Simone_Biles @tonyposnanski You rock so hard!,[],[],you rock so hard,"[rock, hard]",-0.291667,2021-07-27,16,30,32,0,0,0,1,0
4,1420059491484110848,2021-07-27 16:32:20+00:00,Logging in to say: good for Simone Biles.\nShe...,[],[],logging in to say good for simone bilesshe has...,"[logging, say, good, simone, bilesshe, nothing...",0.075,2021-07-27,16,30,32,0,0,1,0,0


In [18]:
c = top_item(drawl,'tags')
c

Unnamed: 0,tags,count
0,SimoneBiles,9193
1,Olympics,2427
2,MentalHealthMatters,1225
3,Tokyo2020,1079
4,GOAT,926
5,Simone,876
6,mentalhealth,746
7,TokyoOlympics,583
8,TeamUSA,506
9,OlympicGames,499


In [19]:
c1=c[1:26]

alt.Chart(c1).mark_bar().encode(
    x=alt.X('tags',sort=['count'],title='Tags',axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('count',title='Total Usage')
).properties(width=900,height=400,title={
      "text": ["Most frequent tags - Simone Biles Withdrawl"],
      "subtitle":["The top 25 most frequently used tags within the tweets about Simone Biles after withdraw from Olympics"]  
    }).configure_axis(
    labelFontSize=15,
    titleFontSize=20
).configure_title(
    anchor='start',
    fontSize = 24,
    subtitleFontSize = 15
)