# C-More

### 1. Process text for sentiment analysis

In [1]:
import json
import pandas as pd

import nltk
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

import spacy

#### 1.1. Load json data into a dataframe

In [2]:
list_json = []

with open('tweet_json_1day.txt') as file:
    for line in file:
        data = json.loads(line)
        list_json.append(data)

df = pd.DataFrame(list_json, columns = ['id', 'text', 'lang', 'created_at', 'public_metrics'])

#### 1.2. Select only tweets in English

In [3]:
df_en = df[df['lang'] == 'en'].copy()

In [4]:
df_en.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5377 entries, 0 to 7151
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              5377 non-null   object
 1   text            5377 non-null   object
 2   lang            5377 non-null   object
 3   created_at      5377 non-null   object
 4   public_metrics  5377 non-null   object
dtypes: object(5)
memory usage: 252.0+ KB


In [5]:
columns_to_remove = ['lang', 'created_at', 'public_metrics']

df_en.drop(columns_to_remove, axis=1, inplace=True)

In [6]:
df_en.head()

Unnamed: 0,id,text
0,1539397722595377152,@trashevrythng @hardevrythng @McDonalds and th...
1,1539397657571074049,@trashevrythng @hardevrythng @McDonalds respec...
2,1539397645625458688,@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Ma...
4,1539397571013115904,@auauwra too bad im sending 8 mcdonalds large ...
5,1539397434715082754,@_idkjia Last time I went to McDonalds. lol ...


#### 1.3. Process text with NLTK

We are now processing our text for sentiment analysis. Our first approach will be to tokenize it and remove stop words.

In [7]:
# tokenization

tweet_tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
# redule_len=True replaces repeated character sequences of length 3 or greater with sequences of length 3
# examples: waaaaayyyy --> waaayyy
# strip_handles=True removes Twitter handles (@xxxx...)

In [8]:
# stop words

stopwords = set(nltk.corpus.stopwords.words('english'))

In [9]:
# remove stop words

def remove_stop(tokens):
    return [token for token in tokens if token not in stopwords]

In [10]:
pipeline = [tweet_tokenizer.tokenize, remove_stop] # this will be our default pipeline
# tokenizes text and removes stop words

def process_text(text, pipeline):
    tokens = text
    for transform in pipeline:
        tokens = transform(tokens)
    return tokens

In [11]:
df_nltk = df_en.copy()

In [12]:
%%time

df_nltk['tokens'] = df_nltk['text'].apply(process_text, pipeline=pipeline)

Wall time: 771 ms


In [13]:
df_nltk

Unnamed: 0,id,text,tokens
0,1539397722595377152,@trashevrythng @hardevrythng @McDonalds and th...,[breakfast]
1,1539397657571074049,@trashevrythng @hardevrythng @McDonalds respec...,"[respect, chicken, nuggets, crispy, chicken, s..."
2,1539397645625458688,@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Ma...,"[All, right, I'm, tapping, save, sanity, ., Su..."
4,1539397571013115904,@auauwra too bad im sending 8 mcdonalds large ...,"[bad, im, sending, 8, mcdonalds, large, fries,..."
5,1539397434715082754,@_idkjia Last time I went to McDonalds. lol ...,"[Last, time, I, went, McDonalds, ., lol, https..."
...,...,...,...
7146,1539035600451514375,if belos comes back we should just give him a ...,"[belos, comes, back, give, mcdonalds, sprite]"
7147,1539035573263863808,we were at mcdonalds and suddenly 4 school bus...,"[mcdonalds, suddenly, 4, school, buses, filled..."
7149,1539035538300207104,IM SORRY I TOLD U TO DRINK MCDONALDS SPRITE ht...,"[IM, SORRY, I, TOLD, U, TO, DRINK, MCDONALDS, ..."
7150,1539035499242864641,Tough times @McDonalds inflation is shrinking ...,"[Tough, times, inflation, shrinking, large, fr..."


Instead of using simple tokens, we can stem or lemmatize them. We will lemmatize them in our next approach. To do this, we need to have the part-of-speech tags for each token.

In [14]:
# select longest tweet for testing purposes

test_text = sorted(df_en['text'], key=lambda x: len(x), reverse=True)[0]

test_text

'@bungoman @moothought @__justplaying @heavenbent11 @IgorBrigadir @huggingpuppy @flybottlemist @panchromaticity @temujin9 @Duderichy @jicapal @goblinodds @_brentbaum @scrmshw @OccultBoyscout @bogmeat @Knipps @OneEyedAlpaca @SoupOfToday @Acre108 @ObserverSuns @ZacharyHundley @_holyweather @anonynaut @magicianbrain @mimi10v3 @karnagraha @er1enney0ung @Lithros @parafactual @KrikkitMotel @__frye @eggprophet @pareinoia @TeddyRaccovelt @dancinghorse16 @storebrandguy @NLRG_ @irafeierabend @bloobsandnoods @RootOfUnity @VesselOfSpirit @MaskOfFace @ObjectOfObjects @CurlOfGradient @FingerOfHand @CauseOfProblem @ModelOfTheory @ReneeSolana @quotidiania Good morning to all you wonderful people. I just tried a new coffee from McDonalds and it is wonderful, just like you'

In [15]:
pos_tag(tweet_tokenizer.tokenize(test_text))

[('Good', 'JJ'),
 ('morning', 'NN'),
 ('to', 'TO'),
 ('all', 'DT'),
 ('you', 'PRP'),
 ('wonderful', 'JJ'),
 ('people', 'NNS'),
 ('.', '.'),
 ('I', 'PRP'),
 ('just', 'RB'),
 ('tried', 'VBD'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('coffee', 'NN'),
 ('from', 'IN'),
 ('McDonalds', 'NNP'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('is', 'VBZ'),
 ('wonderful', 'JJ'),
 (',', ','),
 ('just', 'RB'),
 ('like', 'IN'),
 ('you', 'PRP')]

Since our lemmatizer, `WordNetLemmatizer`, uses WordNet tags, we need to convert NLTK's default tags (Penn Treebank part-of-speech tags) to WordNet tags.

In [16]:
# check tags (example for PRP)

nltk.help.upenn_tagset('PRP')

PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us


In [17]:
def normalize(text, tokenizer=TweetTokenizer(reduce_len=True, strip_handles=True), lemmatizer=WordNetLemmatizer(), stop_words=stopwords):
    '''
    Normalizes review by removing stopwords and lemmatizing tokens.
    '''
    
    def lemmatize(token, tag):
        '''
        Converts Penn Treebank part-of-speech tags (the default tag set in nltk.pos_tag)
        to WordNet tags - defaults to wn.Noun if the first letter of the Penn Treebank pos tag
        is neither 'N', 'V', 'R' or 'J'.
        Returns lemmatized token.
        '''        
        wordnet_tag = {
            'N': wn.NOUN, 
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        
        return lemmatizer.lemmatize(token, wordnet_tag)
    
    return [lemmatize(token, tag)  
            for (token, tag) in pos_tag(tokenizer.tokenize(text)) 
            if token not in stop_words]

In [18]:
normalize(test_text)

['Good',
 'morning',
 'wonderful',
 'people',
 '.',
 'I',
 'try',
 'new',
 'coffee',
 'McDonalds',
 'wonderful',
 ',',
 'like']

In [19]:
%%time

df_nltk['lemmas'] = df_nltk['text'].apply(normalize)

Wall time: 7.65 s


In [20]:
df_nltk

Unnamed: 0,id,text,tokens,lemmas
0,1539397722595377152,@trashevrythng @hardevrythng @McDonalds and th...,[breakfast],[breakfast]
1,1539397657571074049,@trashevrythng @hardevrythng @McDonalds respec...,"[respect, chicken, nuggets, crispy, chicken, s...","[respect, chicken, nugget, crispy, chicken, sa..."
2,1539397645625458688,@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Ma...,"[All, right, I'm, tapping, save, sanity, ., Su...","[All, right, I'm, tap, save, sanity, ., Summin..."
4,1539397571013115904,@auauwra too bad im sending 8 mcdonalds large ...,"[bad, im, sending, 8, mcdonalds, large, fries,...","[bad, im, send, 8, mcdonalds, large, fry, u, rn]"
5,1539397434715082754,@_idkjia Last time I went to McDonalds. lol ...,"[Last, time, I, went, McDonalds, ., lol, https...","[Last, time, I, go, McDonalds, ., lol, https:/..."
...,...,...,...,...
7146,1539035600451514375,if belos comes back we should just give him a ...,"[belos, comes, back, give, mcdonalds, sprite]","[belos, come, back, give, mcdonalds, sprite]"
7147,1539035573263863808,we were at mcdonalds and suddenly 4 school bus...,"[mcdonalds, suddenly, 4, school, buses, filled...","[mcdonalds, suddenly, 4, school, bus, fill, te..."
7149,1539035538300207104,IM SORRY I TOLD U TO DRINK MCDONALDS SPRITE ht...,"[IM, SORRY, I, TOLD, U, TO, DRINK, MCDONALDS, ...","[IM, SORRY, I, TOLD, U, TO, DRINK, MCDONALDS, ..."
7150,1539035499242864641,Tough times @McDonalds inflation is shrinking ...,"[Tough, times, inflation, shrinking, large, fr...","[Tough, time, inflation, shrink, large, fry, h..."


#### 1.4. Process text with spaCy

In [21]:
nlp = spacy.load('en_core_web_sm')

In [22]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x18ed507c4c0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x18ed507cc40>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x18ecd960270>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x18ed5210c80>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x18ed5210940>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x18ecd960190>)]

These are the default components of spaCy's pipeline.

We can easily disable the components we are not interested in:

In [48]:
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x18ee292b7c0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x18ee0068ac0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x18edffc9800>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x18edff8fac0>)]

We will use the default components for now:

In [49]:
nlp = spacy.load('en_core_web_sm')

In [50]:
doc = nlp(test_text)

In [51]:
# tokens

for token in doc:
    print(token.text, end="|")

Good|morning|to|all|you|wonderful|people|.|I|just|tried|a|new|coffee|from|McDonalds|and|it|is|wonderful|,|just|like|you|

SpaCy's default tokenizer does not recognize and exclude the Twitter handles. We can try to customise this at a later stage. For now, let's focus on the information we can easily get with spaCy.

In [52]:
test_text = "Good morning to all you wonderful people. I just tried a new coffee from McDonalds and it is wonderful, just like you"
doc = nlp(test_text)

In [53]:
# tokens

for token in doc:
    print(token.text, end="|")

Good|morning|to|all|you|wonderful|people|.|I|just|tried|a|new|coffee|from|McDonalds|and|it|is|wonderful|,|just|like|you|

In [54]:
# lemmas

for token in doc:
    print(token.lemma_, end="|")

good|morning|to|all|you|wonderful|people|.|I|just|try|a|new|coffee|from|McDonalds|and|it|be|wonderful|,|just|like|you|

In [55]:
# is the token part of a “stop list”? 

for token in doc:
    print(token.text, token.is_stop, end="|")

Good False|morning False|to True|all True|you True|wonderful False|people False|. False|I True|just True|tried False|a True|new False|coffee False|from True|McDonalds False|and True|it True|is True|wonderful False|, False|just True|like False|you True|

In [56]:
# does the token consist of alphabetic characters?

for token in doc:
    print(token.text, token.is_alpha, end="|")

Good True|morning True|to True|all True|you True|wonderful True|people True|. False|I True|just True|tried True|a True|new True|coffee True|from True|McDonalds True|and True|it True|is True|wonderful True|, False|just True|like True|you True|

In [57]:
# is the token punctuation? 

for token in doc:
    print(token.text, token.is_punct, end="|")

Good False|morning False|to False|all False|you False|wonderful False|people False|. True|I False|just False|tried False|a False|new False|coffee False|from False|McDonalds False|and False|it False|is False|wonderful False|, True|just False|like False|you False|

In [58]:
# part-of-speech

for token in doc:
    print(token.text, token.pos_, end="|")

Good ADJ|morning NOUN|to ADP|all PRON|you PRON|wonderful ADJ|people NOUN|. PUNCT|I PRON|just ADV|tried VERB|a DET|new ADJ|coffee NOUN|from ADP|McDonalds PROPN|and CCONJ|it PRON|is AUX|wonderful ADJ|, PUNCT|just ADV|like ADP|you PRON|

In [59]:
# syntactic dependency relation

for token in doc:
    print(token.text, token.dep_, end="|")

Good amod|morning npadvmod|to ROOT|all predet|you nmod|wonderful amod|people pobj|. punct|I nsubj|just advmod|tried ROOT|a det|new amod|coffee dobj|from prep|McDonalds pobj|and cc|it nsubj|is conj|wonderful acomp|, punct|just advmod|like prep|you pobj|

In [60]:
# named entity type

for token in doc:
    print(token.text, token.ent_type_, end="|")

Good |morning TIME|to |all |you |wonderful |people |. |I |just |tried |a |new |coffee |from |McDonalds ORG|and |it |is |wonderful |, |just |like |you |

In [61]:
# index of the token within the parent document

for token in doc:
    print(token.text, token.i, end="|")

Good 0|morning 1|to 2|all 3|you 4|wonderful 5|people 6|. 7|I 8|just 9|tried 10|a 11|new 12|coffee 13|from 14|McDonalds 15|and 16|it 17|is 18|wonderful 19|, 20|just 21|like 22|you 23|

For a complete list of token attributes, check https://spacy.io/api/token#attributes .

In [62]:
df_spacy = df_en.copy()

In [63]:
# tokenize text

def tokens(text):
    doc = nlp(text)
    return [tokens.text for tokens in doc]

In [64]:
%%time

# pipeline with all the components

df_spacy['tokens'] = df_spacy['text'].map(tokens)

Wall time: 40.1 s


In [67]:
# disable the parser and ner components

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x18ee006d460>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x18ee006d160>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x18edb1d7c00>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x18edb1d9d80>)]

In [68]:
%%time

# pipeline with disabled components

df_spacy['tokens'] = df_spacy['text'].map(tokens)

Wall time: 19.8 s


In [69]:
df_spacy

Unnamed: 0,id,text,tokens
0,1539397722595377152,@trashevrythng @hardevrythng @McDonalds and th...,"[@trashevrythng, @hardevrythng, @McDonalds, an..."
1,1539397657571074049,@trashevrythng @hardevrythng @McDonalds respec...,"[@trashevrythng, @hardevrythng, @McDonalds, re..."
2,1539397645625458688,@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Ma...,"[@PeePosh2, @Scottschlittenh, @sceley2011, @Jo..."
4,1539397571013115904,@auauwra too bad im sending 8 mcdonalds large ...,"[@auauwra, too, bad, i, m, sending, 8, mcdonal..."
5,1539397434715082754,@_idkjia Last time I went to McDonalds. lol ...,"[@_idkjia, Last, time, I, went, to, McDonalds,..."
...,...,...,...
7146,1539035600451514375,if belos comes back we should just give him a ...,"[if, belos, comes, back, we, should, just, giv..."
7147,1539035573263863808,we were at mcdonalds and suddenly 4 school bus...,"[we, were, at, mcdonalds, and, suddenly, 4, sc..."
7149,1539035538300207104,IM SORRY I TOLD U TO DRINK MCDONALDS SPRITE ht...,"[IM, SORRY, I, TOLD, U, TO, DRINK, MCDONALDS, ..."
7150,1539035499242864641,Tough times @McDonalds inflation is shrinking ...,"[Tough, times, @McDonalds, inflation, is, shri..."


In [70]:
# tokenize text and get lemmas

def tokens_and_lemmas(text):
    
    doc = nlp(text)
    
    tokens = [tokens.text for tokens in doc]
    
    lemmas = [tokens.lemma_ for tokens in doc]
    
    return tokens, lemmas

In [71]:
%%time

df_spacy[['tokens', 'lemmas']] = df_spacy.apply(lambda row: tokens_and_lemmas(row['text']), axis='columns', result_type='expand')

Wall time: 18.4 s


In [72]:
df_spacy

Unnamed: 0,id,text,tokens,lemmas
0,1539397722595377152,@trashevrythng @hardevrythng @McDonalds and th...,"[@trashevrythng, @hardevrythng, @McDonalds, an...","[@trashevrythng, @hardevrythng, @McDonalds, an..."
1,1539397657571074049,@trashevrythng @hardevrythng @McDonalds respec...,"[@trashevrythng, @hardevrythng, @McDonalds, re...","[@trashevrythng, @hardevrythng, @McDonalds, re..."
2,1539397645625458688,@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Ma...,"[@PeePosh2, @Scottschlittenh, @sceley2011, @Jo...","[@PeePosh2, @Scottschlittenh, @sceley2011, @Jo..."
4,1539397571013115904,@auauwra too bad im sending 8 mcdonalds large ...,"[@auauwra, too, bad, i, m, sending, 8, mcdonal...","[@auauwra, too, bad, I, m, send, 8, mcdonald, ..."
5,1539397434715082754,@_idkjia Last time I went to McDonalds. lol ...,"[@_idkjia, Last, time, I, went, to, McDonalds,...","[@_idkjia, last, time, I, go, to, McDonalds, ...."
...,...,...,...,...
7146,1539035600451514375,if belos comes back we should just give him a ...,"[if, belos, comes, back, we, should, just, giv...","[if, belos, come, back, we, should, just, give..."
7147,1539035573263863808,we were at mcdonalds and suddenly 4 school bus...,"[we, were, at, mcdonalds, and, suddenly, 4, sc...","[we, be, at, mcdonald, and, suddenly, 4, schoo..."
7149,1539035538300207104,IM SORRY I TOLD U TO DRINK MCDONALDS SPRITE ht...,"[IM, SORRY, I, TOLD, U, TO, DRINK, MCDONALDS, ...","[IM, sorry, I, tell, u, to, DRINK, MCDONALDS, ..."
7150,1539035499242864641,Tough times @McDonalds inflation is shrinking ...,"[Tough, times, @McDonalds, inflation, is, shri...","[tough, time, @McDonalds, inflation, be, shrin..."


We have tested 2 ways of processing our text: NLTK and spaCy.

We could now use the processed text to train a model for sentiment analysis if we had labelled data. Since we don't (at least for now), we will leave this supervised approach for later.

We will now focus on a rule-based (or lexicon-based) approach.

### 2. Sentiment analysis

In [75]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#### 2.1. VADER

VADER (Valence Aware Dictionary and sEntiment Reasoner) is a lexicon and rule-based sentiment analysis tool that is specifically **attuned to sentiments expressed in social media**, https://github.com/cjhutto/vaderSentiment#features-and-updates .

We'll append 3 columns to our dataset:

* scores with the polarity scores (negative, neutral, positive and compound)
* compound with the extracted compound score
* comp_label with the label derived from the compound score

In [74]:
df_vader = df_en.copy()

In [76]:
sid = SentimentIntensityAnalyzer()

In [77]:
df_vader['scores'] = df_vader['text'].map(lambda tweet: sid.polarity_scores(tweet))

df_vader['compound']  = df_vader['scores'].map(lambda score_dict: score_dict['compound'])

df_vader['comp_label'] = df_vader['compound'].map(lambda comp: 'pos' if comp >=0.05 else ('neg' if comp<=-0.05 else 'neu'))

In [78]:
df_vader

Unnamed: 0,id,text,scores,compound,comp_label
0,1539397722595377152,@trashevrythng @hardevrythng @McDonalds and th...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neu
1,1539397657571074049,@trashevrythng @hardevrythng @McDonalds respec...,"{'neg': 0.0, 'neu': 0.763, 'pos': 0.237, 'comp...",0.4767,pos
2,1539397645625458688,@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Ma...,"{'neg': 0.028, 'neu': 0.932, 'pos': 0.04, 'com...",0.2500,pos
4,1539397571013115904,@auauwra too bad im sending 8 mcdonalds large ...,"{'neg': 0.28, 'neu': 0.72, 'pos': 0.0, 'compou...",-0.5423,neg
5,1539397434715082754,@_idkjia Last time I went to McDonalds. lol ...,"{'neg': 0.0, 'neu': 0.714, 'pos': 0.286, 'comp...",0.4215,pos
...,...,...,...,...,...
7146,1539035600451514375,if belos comes back we should just give him a ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neu
7147,1539035573263863808,we were at mcdonalds and suddenly 4 school bus...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,neu
7149,1539035538300207104,IM SORRY I TOLD U TO DRINK MCDONALDS SPRITE ht...,"{'neg': 0.225, 'neu': 0.775, 'pos': 0.0, 'comp...",-0.2577,neg
7150,1539035499242864641,Tough times @McDonalds inflation is shrinking ...,"{'neg': 0.13, 'neu': 0.87, 'pos': 0.0, 'compou...",-0.1280,neg


In [87]:
print(df_vader[['text', 'comp_label']][0:20].values)

[['@trashevrythng @hardevrythng @McDonalds and the breakfast' 'neu']
 ['@trashevrythng @hardevrythng @McDonalds respect chicken nuggets and crispy chicken sandwich nigga'
  'pos']
 ["@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Maya @stargatesg201 @tucsongirl1 @BD97 @NotRichHerrera @koby_thiel @gabeintucson9 @doublez_e @EricDTownsend @ironmikeluke @Free_Lantz @Ben1White @AZJanCR @BearDown_Ray @chango2213 @MaxKIIMFM @jasonscheer @MatthewRandle @KG7MAJ @MPShrike @Chad_Mcdonalds @thestevenwoods @CumulusNetworks @David_AZJourno @TrajanWealth @RBIrich @KIIM995 @_boog1 @JustinESports @FRomeroAstros @ESPNTucson @PHNX_Wildcats @JoeHealey42 @WaffleHouse All right I'm tapping out to save my sanity. \n\nSumming up the first 2 hours:\n\nColorado Buffaloes 🐃 talk yawn.\nTopCat bracket filler.\nNo phone calls.\n\nYou are all gemstones for reading this thread.\n\nWe made it ... together 🤲 ❤️. https://t.co/F8jWl62vYB"
  'pos']
 ['@auauwra too bad im sending 8 mcdonalds large fries to u rn' 'neg']
 ['@_

On a quick inspection, Vader seems to be working fairly well with our tweets.

We defined a threshold of 0.05 for positive sentiment and -0.05 for negative sentiment. We can also change this to better fit our needs.

Our compound values can range from -1 (most extreme negative) and +1 (most extreme positive). With this is mind, we can explore our results a little bit further.

In [90]:
# number of positive, neutral and negative comments

df_vader['comp_label'].value_counts()

pos    2126
neu    1931
neg    1320
Name: comp_label, dtype: int64

In [93]:
# % of positive, neutral and negative comments

(df_vader['comp_label'].value_counts()/len(df_vader)).round(2)

pos    0.40
neu    0.36
neg    0.25
Name: comp_label, dtype: float64

In [108]:
# comments with compound >= 0.7 (extremely positive)

print(df_vader[df_vader['compound'] >= 0.7]['text'].values)

['Spicy Soda Solstice: McDonald’s Celebrates First Day Of Summer With FREE\xa0Sprite https://t.co/zUgY2wTmKx'
 'Spicy Soda Solstice: McDonald’s Celebrates First Day Of Summer With FREE Sprite https://t.co/otEM7TEYKL'
 '@MuuInuOfficial @McDonalds The McMuu Burger three all beef patties, special Muu sauce, triple cheese, onions and a hash brown.  The value meal comes with a Muu muu thickshake.  People will love this and can build a strong association with @MuuInuOfficial https://t.co/cGIvexAgLu'
 '@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Maya @stargatesg201 @tucsongirl1 @BD97 @NotRichHerrera @koby_thiel @gabeintucson9 @doublez_e @EricDTownsend @ironmikeluke @Free_Lantz @Ben1White @AZJanCR @BearDown_Ray @chango2213 @MaxKIIMFM @jasonscheer @MatthewRandle @KG7MAJ @MPShrike @Chad_Mcdonalds @thestevenwoods @CumulusNetworks @David_AZJourno @TrajanWealth @RBIrich @KIIM995 @_boog1 @JustinESports @FRomeroAstros @ESPNTucson @PHNX_Wildcats @JoeHealey42 @WaffleHouse Rich is offended by retirees i

In [109]:
# number of comments with compound >= 0.7 (extremely positive)

len(df_vader[df_vader['compound'] >= 0.7])

448

In [110]:
# comments with compound <= -0.7 (extremely negative)

print(df_vader[df_vader['compound'] <= -0.7]['text'].values)

['did i cry today? yes✅. do i know why i cried? yes✅. did my reason for crying have anything to do with not having a mcdonalds diet coke? no❌. am i going to blame it on the lack of having a mcdonalds diet coke? yes✅.'
 '@Winning4Him He wants to reduce cancer 50% in 25 years? Then let’s get rid of @McDonalds @tacobell and all the other places that serve cancer food.\n\nThen let’s get rid of instant meals laden with chemicals.\n\nAnd to hell with Impossible Meat that is all chemicals.'
 '@_NPOB1 @PeePosh2 @Scottschlittenh @sceley2011 @Joe_Maya @tucsongirl1 @BD97 @NotRichHerrera @koby_thiel @gabeintucson9 @doublez_e @EricDTownsend @ironmikeluke @Free_Lantz @Ben1White @AZJanCR @BearDown_Ray @chango2213 @MaxKIIMFM @jasonscheer @MatthewRandle @KG7MAJ @MPShrike @Chad_Mcdonalds @thestevenwoods @CumulusNetworks @David_AZJourno @TrajanWealth @RBIrich @KIIM995 @_boog1 @JustinESports @FRomeroAstros @ESPNTucson Hey dumb ass that’s what you get for hanging out with scummy people named Sam.'
 "@KingN

In [111]:
# number of comments with compound <= -0.7 (extremely negative)

len(df_vader[df_vader['compound'] <= -0.7])

236