In [None]:
# Notebook setup: third-party imports, NLTK data-path configuration, and the
# project helper module. Statement order matters here: the NLTK search path is
# rewritten before anything is downloaded and before `fds_utils` is imported.
import pandas as pd
import nltk
# Empty NLTK's list of data search locations so that only the directory
# appended below is consulted.
nltk.data.path.clear()
# Set this to your local NLTK data directory (left empty in the shared notebook).
nltk.data.path.append(r'')
from nltk import pos_tag, word_tokenize
# NOTE(review): presumably the multilingual WordNet data needed for lemmatization — confirm.
nltk.download('omw-1.4')

import ssl
import fds_utils as utils
import importlib
# Reload the helper module so edits to it are picked up without restarting the
# kernel; binding the result to `_` keeps the cell from echoing the module.
_ = importlib.reload(utils)
import multiprocessing as mp

In [2]:
# Raw tweet sample. NOTE: unpickling executes arbitrary code, so only load
# pickle files that come from a trusted source.
TWEETS_SAMPLE_PATH = "data/tweets_sample.pkl"
tweets = pd.read_pickle(TWEETS_SAMPLE_PATH)

## Filtering

__What is the proportion of non-English tweets? Would it make sense to disregard those tweets?__<br>
The 'und' (undefined) language tag is used when a tweet consists only of hashtags / links / emojis; we treat such tweets as English.
Selecting all tweets whose language tag is neither 'und' nor 'en' (English) shows that non-English tweets account for about 3.3 % of the sample. Because this share is so small, it seems reasonable to disregard non-English tweets. This also simplifies the sentiment analysis, since training can then be based on English text only.

In [3]:
# (rows, columns) of the sample as loaded, before any filtering.
tweets.shape

(657307, 5)

In [4]:
# Rows whose language tag is English ('en') or undefined ('und').
keep_lang = tweets['lang'].isin(['und', 'en'])
# Share of tweets that are neither English nor "undefined", rounded to 4 decimals.
n_other_lang = int((~keep_lang).sum())
print(round(n_other_lang / len(tweets), 4))
# Keep only the English / undefined tweets and renumber the index from 0.
tweets = tweets[keep_lang].reset_index(drop=True)

0.0332


Keep only tweets geolocated in the USA (filter out every other country).

In [5]:
# Tweet counts for the 20 most frequent country labels.
country_counts = tweets['country'].value_counts()
print(country_counts.head(20))

country
United States     583985
Canada             16893
United Kingdom      8494
Australia           2572
India               1536
Mexico              1385
México              1226
Singapore            892
Ireland              821
Germany              751
France               654
Japan                622
South Africa         618
Vietnam              530
Estados Unidos       492
Deutschland          468
New Zealand          466
Armenia              465
Spain                454
Thailand             419
Name: count, dtype: int64


In [6]:
# Keep only rows whose country label is exactly 'United States'.
# NOTE(review): the country counts printed above contain localized labels
# (e.g. 'Estados Unidos'); those rows are dropped by this exact match —
# confirm whether they should be mapped to 'United States' first.
is_usa = tweets['country'] == 'United States'
tweets = tweets.loc[is_usa].reset_index(drop=True)
print(f"Number of USA tweets: {tweets.shape[0]}")

Number of USA tweets: 583985


## Tokenizing
The following chapters are partly based on this [tutorial](https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk#step-1-%E2%80%94-installing-nltk-and-downloading-the-data).

In [7]:
%%time
# Seems to be the 'best' tokenizer for tweets data ()
tweet_tokenizer = nltk.tokenize.TweetTokenizer()
with mp.Pool(mp.cpu_count()) as pool:
    tweets['tokens'] = pool.map(tweet_tokenizer.tokenize, tweets['text'])

CPU times: total: 2.38 s
Wall time: 8.07 s


In [8]:
# Show full cell contents (no column-width truncation) for this and all later displays.
pd.options.display.max_colwidth = None
tweets.head(5)

Unnamed: 0,date,text,lang,country,city,tokens
0,2016-08-12,@theblaze @realDonaldTrump https://t.co/TY9DlZ584c,und,United States,Frontenac,"[@theblaze, @realDonaldTrump, https://t.co/TY9DlZ584c]"
1,2016-08-12,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN COLLUSION TOGETHER \n\n#NOJUSTICE \n\n@realDonaldTrump \n#TrumpPence \n\nhttps://t.co/5GMNZq40V3,en,United States,Baton Rouge,"[@BarackObama, @FBI, @LORETTALYNCH, ALL, IN, COLLUSION, TOGETHER, #NOJUSTICE, @realDonaldTrump, #TrumpPence, https://t.co/5GMNZq40V3]"
2,2016-08-12,@theblaze @realDonaldTrump https://t.co/n050DBSpv0,und,United States,Frontenac,"[@theblaze, @realDonaldTrump, https://t.co/n050DBSpv0]"
3,2016-08-12,"#CNN #newday clear #Trump deliberately throwing this race,in 2007 he knew that #ISIS and destabilization of Mideast started w/Iraq invasion",en,United States,Baltimore,"[#CNN, #newday, clear, #Trump, deliberately, throwing, this, race, ,, in, 2007, he, knew, that, #ISIS, and, destabilization, of, Mideast, started, w, /, Iraq, invasion]"
4,2016-08-12,"@realDonaldTrump, you wouldn't recognize a lie if it came from your own mouth, and they do continually. #NeverTrump https://t.co/pKSQM8yikm",en,United States,Palm Springs,"[@realDonaldTrump, ,, you, wouldn't, recognize, a, lie, if, it, came, from, your, own, mouth, ,, and, they, do, continually, ., #NeverTrump, https://t.co/pKSQM8yikm]"


## Normalizing
From the tutorial: "Words have different forms—for instance, ran, runs, and running are various forms of the same verb, run. Depending on the requirement of your analysis, all of these versions may need to be converted to the same form, run. Normalization in NLP is the process of converting a word to its canonical form." <br>
Two common techniques are stemming and lemmatization. We use lemmatization because it normalizes words more accurately: it "normalizes a word with the context of vocabulary and morphological analysis of words in text".

In [None]:
# NLTK resources fetched before normalization.
# NOTE(review): presumably all four are consumed by utils.normalize — confirm.
NLTK_RESOURCES = ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger')
for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)


In [10]:

# Run the project's normalization helper over every token list in parallel.
# NOTE(review): in the displayed result, multi-dot tokens such as '..' and
# '...' survive normalization — confirm whether utils.normalize should drop them.
n_workers = mp.cpu_count()
with mp.Pool(processes=n_workers) as pool:
    normalized_lists = pool.map(utils.normalize, tweets['tokens'])
tweets['tokens_normal'] = normalized_lists

In [11]:
# Persist the filtered, tokenized and normalized tweets for later steps.
NORMALIZED_TWEETS_PATH = "data/tweets_sample_normalized.pkl"
tweets.to_pickle(NORMALIZED_TWEETS_PATH)

In [12]:
# Show the frame with the new `tokens_normal` column; pandas abbreviates the
# display to the first and last rows.
tweets

Unnamed: 0,date,text,lang,country,city,tokens,tokens_normal
0,2016-08-12,@theblaze @realDonaldTrump https://t.co/TY9DlZ584c,und,United States,Frontenac,"[@theblaze, @realDonaldTrump, https://t.co/TY9DlZ584c]","[@theblaze, @realdonaldtrump]"
1,2016-08-12,@BarackObama \n@FBI\n@LORETTALYNCH \nALL IN COLLUSION TOGETHER \n\n#NOJUSTICE \n\n@realDonaldTrump \n#TrumpPence \n\nhttps://t.co/5GMNZq40V3,en,United States,Baton Rouge,"[@BarackObama, @FBI, @LORETTALYNCH, ALL, IN, COLLUSION, TOGETHER, #NOJUSTICE, @realDonaldTrump, #TrumpPence, https://t.co/5GMNZq40V3]","[@barackobama, @fbi, @lorettalynch, collusion, together, #nojustice, @realdonaldtrump, #trumppence]"
2,2016-08-12,@theblaze @realDonaldTrump https://t.co/n050DBSpv0,und,United States,Frontenac,"[@theblaze, @realDonaldTrump, https://t.co/n050DBSpv0]","[@theblaze, @realdonaldtrump]"
3,2016-08-12,"#CNN #newday clear #Trump deliberately throwing this race,in 2007 he knew that #ISIS and destabilization of Mideast started w/Iraq invasion",en,United States,Baltimore,"[#CNN, #newday, clear, #Trump, deliberately, throwing, this, race, ,, in, 2007, he, knew, that, #ISIS, and, destabilization, of, Mideast, started, w, /, Iraq, invasion]","[#cnn, #newday, clear, #trump, deliberately, throw, race, 2007, know, #isis, destabilization, mideast, start, w, iraq, invasion]"
4,2016-08-12,"@realDonaldTrump, you wouldn't recognize a lie if it came from your own mouth, and they do continually. #NeverTrump https://t.co/pKSQM8yikm",en,United States,Palm Springs,"[@realDonaldTrump, ,, you, wouldn't, recognize, a, lie, if, it, came, from, your, own, mouth, ,, and, they, do, continually, ., #NeverTrump, https://t.co/pKSQM8yikm]","[@realdonaldtrump, recognize, lie, come, mouth, continually, #nevertrump]"
...,...,...,...,...,...,...,...
583980,2016-09-12,@CNBC @SquawkAlley @realDonaldTrump Kudlow is on @Morning_Joe tomorrow..What will he say this time? Just another #SwordSwallower for Trump.,en,United States,Florida,"[@CNBC, @SquawkAlley, @realDonaldTrump, Kudlow, is, on, @Morning_Joe, tomorrow, .., What, will, he, say, this, time, ?, Just, another, #SwordSwallower, for, Trump, .]","[@cnbc, @squawkalley, @realdonaldtrump, kudlow, @morning_joe, tomorrow, .., say, time, another, #swordswallower, trump]"
583981,2016-09-12,"TRUMP U, TAXES ,WEIRD MEDICAL REPORT WITH A WHACKED OUT DOCTOR ??? PAY4PLAY FLORIDA AND??MELANIA ILLEGAL ENTERING... https://t.co/2qDahl7A9n",en,United States,San Diego,"[TRUMP, U, ,, TAXES, ,, WEIRD, MEDICAL, REPORT, WITH, A, WHACKED, OUT, DOCTOR, ?, ?, ?, PAY, 4PLAY, FLORIDA, AND, ?, ?, MELANIA, ILLEGAL, ENTERING, ..., https://t.co/2qDahl7A9n]","[trump, u, taxes, weird, medical, report, whacked, doctor, pay, 4play, florida, melania, illegal, entering, ...]"
583982,2016-09-12,"@CarolCNN if MSM were honest watch any utube video of @realDonaldTrump rally to witness full force bigotry,misogyny,xenophobia,&amp; hatred",en,United States,Coral Gables,"[@CarolCNN, if, MSM, were, honest, watch, any, utube, video, of, @realDonaldTrump, rally, to, witness, full, force, bigotry, ,, misogyny, ,, xenophobia, ,, &, hatred]","[@carolcnn, msm, honest, watch, utube, video, @realdonaldtrump, rally, witness, full, force, bigotry, misogyny, xenophobia, hatred]"
583983,2016-09-12,It's interesting that Hillary Clinton's crowds are small &amp; yet her polls are polling good ? https://t.co/lrq2SOOwSO,en,United States,Fairbanks,"[It's, interesting, that, Hillary, Clinton's, crowds, are, small, &, yet, her, polls, are, polling, good, ?, https://t.co/lrq2SOOwSO]","[interest, hillary, clinton's, crowd, small, yet, poll, poll, good]"
