# C-More

### 1. Process text for sentiment analysis

In [1]:
import json
import pandas as pd

import nltk
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

#### 1.1. Load json data into a dataframe

In [2]:
list_json = []

with open('tweet_json_1day.txt') as file:
    for line in file:
        data = json.loads(line)
        list_json.append(data)

df = pd.DataFrame(list_json, columns = ['id', 'text', 'lang', 'created_at', 'public_metrics'])

#### 1.2. Select only tweets in English

In [3]:
df_en = df[df['lang'] == 'en'].copy()

In [4]:
df_en.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5377 entries, 0 to 7151
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              5377 non-null   object
 1   text            5377 non-null   object
 2   lang            5377 non-null   object
 3   created_at      5377 non-null   object
 4   public_metrics  5377 non-null   object
dtypes: object(5)
memory usage: 252.0+ KB


In [5]:
columns_to_remove = ['lang', 'created_at', 'public_metrics']

df_en.drop(columns_to_remove, axis=1, inplace=True)

In [6]:
df_en.head()

Unnamed: 0,id,text
0,1539397722595377152,@trashevrythng @hardevrythng @McDonalds and th...
1,1539397657571074049,@trashevrythng @hardevrythng @McDonalds respec...
2,1539397645625458688,@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Ma...
4,1539397571013115904,@auauwra too bad im sending 8 mcdonalds large ...
5,1539397434715082754,@_idkjia Last time I went to McDonalds. lol ...


#### 1.3. Process text with NLTK

We are now processing our text for sentiment analysis. Our first approach will be to tokenize it and remove stop words.

In [7]:
# tokenization

tweet_tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
# redule_len=True replaces repeated character sequences of length 3 or greater with sequences of length 3
# examples: waaaaayyyy --> waaayyy
# strip_handles=True removes Twitter handles (@xxxx...)

In [8]:
# stop words

stopwords = set(nltk.corpus.stopwords.words('english'))

In [9]:
# remove stop words

def remove_stop(tokens):
    return [token for token in tokens if token not in stopwords]

In [10]:
pipeline = [tweet_tokenizer.tokenize, remove_stop] # this will be our default pipeline
# tokenizes text and removes stop words

def process_text(text, pipeline):
    tokens = text
    for transform in pipeline:
        tokens = transform(tokens)
    return tokens

In [11]:
df_nltk = df_en.copy()

In [13]:
%%time

df_nltk['tokens'] = df_nltk['text'].apply(process_text, pipeline=pipeline)

Wall time: 646 ms


In [14]:
df_nltk

Unnamed: 0,id,text,tokens
0,1539397722595377152,@trashevrythng @hardevrythng @McDonalds and th...,[breakfast]
1,1539397657571074049,@trashevrythng @hardevrythng @McDonalds respec...,"[respect, chicken, nuggets, crispy, chicken, s..."
2,1539397645625458688,@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Ma...,"[All, right, I'm, tapping, save, sanity, ., Su..."
4,1539397571013115904,@auauwra too bad im sending 8 mcdonalds large ...,"[bad, im, sending, 8, mcdonalds, large, fries,..."
5,1539397434715082754,@_idkjia Last time I went to McDonalds. lol ...,"[Last, time, I, went, McDonalds, ., lol, https..."
...,...,...,...
7146,1539035600451514375,if belos comes back we should just give him a ...,"[belos, comes, back, give, mcdonalds, sprite]"
7147,1539035573263863808,we were at mcdonalds and suddenly 4 school bus...,"[mcdonalds, suddenly, 4, school, buses, filled..."
7149,1539035538300207104,IM SORRY I TOLD U TO DRINK MCDONALDS SPRITE ht...,"[IM, SORRY, I, TOLD, U, TO, DRINK, MCDONALDS, ..."
7150,1539035499242864641,Tough times @McDonalds inflation is shrinking ...,"[Tough, times, inflation, shrinking, large, fr..."


Instead of using simple tokens, we can stem or lemmatize them. We will lemmatize them in our next approach. To do this, we need to have the part-of-speech tags for each token.

In [15]:
# select longest tweet for testing purposes

test_text = sorted(df_en['text'], key=lambda x: len(x), reverse=True)[0]

test_text

'@bungoman @moothought @__justplaying @heavenbent11 @IgorBrigadir @huggingpuppy @flybottlemist @panchromaticity @temujin9 @Duderichy @jicapal @goblinodds @_brentbaum @scrmshw @OccultBoyscout @bogmeat @Knipps @OneEyedAlpaca @SoupOfToday @Acre108 @ObserverSuns @ZacharyHundley @_holyweather @anonynaut @magicianbrain @mimi10v3 @karnagraha @er1enney0ung @Lithros @parafactual @KrikkitMotel @__frye @eggprophet @pareinoia @TeddyRaccovelt @dancinghorse16 @storebrandguy @NLRG_ @irafeierabend @bloobsandnoods @RootOfUnity @VesselOfSpirit @MaskOfFace @ObjectOfObjects @CurlOfGradient @FingerOfHand @CauseOfProblem @ModelOfTheory @ReneeSolana @quotidiania Good morning to all you wonderful people. I just tried a new coffee from McDonalds and it is wonderful, just like you'

In [16]:
pos_tag(tweet_tokenizer.tokenize(test_text))

[('Good', 'JJ'),
 ('morning', 'NN'),
 ('to', 'TO'),
 ('all', 'DT'),
 ('you', 'PRP'),
 ('wonderful', 'JJ'),
 ('people', 'NNS'),
 ('.', '.'),
 ('I', 'PRP'),
 ('just', 'RB'),
 ('tried', 'VBD'),
 ('a', 'DT'),
 ('new', 'JJ'),
 ('coffee', 'NN'),
 ('from', 'IN'),
 ('McDonalds', 'NNP'),
 ('and', 'CC'),
 ('it', 'PRP'),
 ('is', 'VBZ'),
 ('wonderful', 'JJ'),
 (',', ','),
 ('just', 'RB'),
 ('like', 'IN'),
 ('you', 'PRP')]

Since our lemmatizer, `WordNetLemmatizer`, uses WordNet tags, we need to convert NLTK's default tags (Penn Treebank part-of-speech tags) to WordNet tags.

In [17]:
# check tags (example for PRP)

nltk.help.upenn_tagset('PRP')

PRP: pronoun, personal
    hers herself him himself hisself it itself me myself one oneself ours
    ourselves ownself self she thee theirs them themselves they thou thy us


In [18]:
def normalize(text, tokenizer=TweetTokenizer(reduce_len=True, strip_handles=True), lemmatizer=WordNetLemmatizer(), stop_words=stopwords):
    '''
    Normalizes review by removing stopwords and lemmatizing tokens.
    '''
    
    def lemmatize(token, tag):
        '''
        Converts Penn Treebank part-of-speech tags (the default tag set in nltk.pos_tag)
        to WordNet tags - defaults to wn.Noun if the first letter of the Penn Treebank pos tag
        is neither 'N', 'V', 'R' or 'J'.
        Returns lemmatized token.
        '''        
        wordnet_tag = {
            'N': wn.NOUN, 
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        
        return lemmatizer.lemmatize(token, wordnet_tag)
    
    return [lemmatize(token, tag)  
            for (token, tag) in pos_tag(tokenizer.tokenize(text)) 
            if token not in stop_words]

In [19]:
normalize(test_text)

['Good',
 'morning',
 'wonderful',
 'people',
 '.',
 'I',
 'try',
 'new',
 'coffee',
 'McDonalds',
 'wonderful',
 ',',
 'like']

In [20]:
%%time

df_nltk['lemmas'] = df_nltk['text'].apply(normalize)

Wall time: 7.8 s


In [21]:
df_nltk

Unnamed: 0,id,text,tokens,lemmas
0,1539397722595377152,@trashevrythng @hardevrythng @McDonalds and th...,[breakfast],[breakfast]
1,1539397657571074049,@trashevrythng @hardevrythng @McDonalds respec...,"[respect, chicken, nuggets, crispy, chicken, s...","[respect, chicken, nugget, crispy, chicken, sa..."
2,1539397645625458688,@PeePosh2 @Scottschlittenh @sceley2011 @Joe_Ma...,"[All, right, I'm, tapping, save, sanity, ., Su...","[All, right, I'm, tap, save, sanity, ., Summin..."
4,1539397571013115904,@auauwra too bad im sending 8 mcdonalds large ...,"[bad, im, sending, 8, mcdonalds, large, fries,...","[bad, im, send, 8, mcdonalds, large, fry, u, rn]"
5,1539397434715082754,@_idkjia Last time I went to McDonalds. lol ...,"[Last, time, I, went, McDonalds, ., lol, https...","[Last, time, I, go, McDonalds, ., lol, https:/..."
...,...,...,...,...
7146,1539035600451514375,if belos comes back we should just give him a ...,"[belos, comes, back, give, mcdonalds, sprite]","[belos, come, back, give, mcdonalds, sprite]"
7147,1539035573263863808,we were at mcdonalds and suddenly 4 school bus...,"[mcdonalds, suddenly, 4, school, buses, filled...","[mcdonalds, suddenly, 4, school, bus, fill, te..."
7149,1539035538300207104,IM SORRY I TOLD U TO DRINK MCDONALDS SPRITE ht...,"[IM, SORRY, I, TOLD, U, TO, DRINK, MCDONALDS, ...","[IM, SORRY, I, TOLD, U, TO, DRINK, MCDONALDS, ..."
7150,1539035499242864641,Tough times @McDonalds inflation is shrinking ...,"[Tough, times, inflation, shrinking, large, fr...","[Tough, time, inflation, shrink, large, fry, h..."
