# Simple text analytics

We start with a compact version of the code we used to look at **D**ocument **F**requencies (DF). As always, we start with the necessary housekeeping, importing the necessary toolboxes and Twitter API credentials.

In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import itertools
import collections
import tweepy as tw

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

import re

# stopwords
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))


# My access keys are in ./credentials.py as variables:
# CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, and ACCESS_SECRET 
#
# Importing them in this manner will allow me to 
# use them as variables, below.
#
from credentials import * 

# authenticating with Twitter

authenticateMe = tw.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
authenticateMe.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
api = tw.API(authenticateMe, wait_on_rate_limit=True)

[nltk_data] Downloading package stopwords to /Users/lirakliotis-
[nltk_data]     old/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Define the scope of our search

The variable ``searchFor`` is the search argument we are passing to Twitter via the interface we've established above. We can include standard Twitter nomenclature here, e.g., ``-filter:retweets``, to narrow our search as needed. 

In [2]:
# Parameters
# what to search for?
# what language to conduct the search in?
# since when (date)?
# until when (date)?
# how many tweets to pull in?
# how many terms to plot?
#
################################################################################

SEARCH_TERM        = "hong_kong"
LANGUAGE           = "en"
SINCE_DATE         = '2016-01-01' # eventually need datetime()
UNTIL_DATE         = '2020-01-01' # formatted yyyy-mm-dd
TWEETS_TO_BRING_IN = 100         # KEEP THIS UNDER 3000 PLEASE!
TOP_N_TERMS        = 30

################################################################################

# the code below can take a few seconds to run, depending
# on the number of tweets we specify as argument in 
# method .items(). Run this cell sparingly to save time.
#
searchFor = SEARCH_TERM + " -filter:retweets"

tweets = tw.Cursor(api.search,
                   q=searchFor,
                   lang=LANGUAGE,
                   since=SINCE_DATE,
                   until=UNTIL_DATE).items(TWEETS_TO_BRING_IN)

textFromTweets = [tweet.text for tweet in tweets]

# Plain vanilla function to remove URI/URLs from tweets
# using the re package for regular expressions
#
def remove_url(txt):
    """Remove URL by replacing it with an empty
    substring within a given string txt 

    Parameters
    ----------
    txt : string
        the string to remove URLs from.

    Returns
    -------
    The same txt string without any URLs.
    """

    return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())

## Main processing

Cleanup URLS, convert to lower case, tokenize, put everything in a list, remove stopwords, count, pass to dataframe, and visualize.

In [3]:
# Remove URLs
#
tweetsWithoutURL = [remove_url(tweet) for tweet in textFromTweets]

# Convert to lower case
#
tweetsTokens = [tweet.lower().split() for tweet in tweetsWithoutURL]

# .chain(*arg) is a Python iterator;
# see https://docs.python.org/2/library/itertools.html#itertools.chain
# Here, we direct the iterator's output to a list() constructor
#
allWordsFromTweets = list(itertools.chain(*tweetsTokens))

# remove stopwords
#
allWordsFromTweets = [word 
                       for word 
                       in allWordsFromTweets
                       if not word in stopWords]
#Adding The Similar Words Together
#
Hong_Kong = ["hong"],["kong"],["hongkong"]
Vice_President = ["mikepence"],["pence"],["vp"]
President ["Donald"], ["Trump"], ["realdonaldtrump"], ["POTUS"]
                    
for term in AllWordsInTweets if term in Hong_Kong wordcount(Hong_Kong) += 1
for AllWordsInTweets in range Vice_President
    wordcount(Vice_President) += 1
for AllWordsInTweets in range President
    wordcount(President) += 1

                      
# Count the occurences of each word now that trivial terms (stopwords)
# have been removed
#
wordCount = collections.Counter(allWordsFromTweets)
                         

# Do a bit of reporting
#
print('      Tweets brought in: ' + str(TWEETS_TO_BRING_IN))
print('Non-trivial terms found: ' + str(len(allWordsFromTweets)))

# from python list to pandas dataframe
# (Remember: TOP_N_TERMS is defined in the beginning of the notebook)
#
tweetsDF = pd.DataFrame(wordCount.most_common(TOP_N_TERMS),
                        columns=['words', 'count'])

# plotting, finally
#
fig, ax = plt.subplots(figsize=(8, 8))

# Plot horizontal bar graph
#
tweetsDF.sort_values(by='count').plot.barh(x='words',
                                           y='count',
                                           ax=ax,
                                           color="pink")

ax.set_title("The " + str(TOP_N_TERMS) + " most common terms in tweets about " + SEARCH_TERM)

plt.show()

SyntaxError: invalid syntax (<ipython-input-3-e836dbbda746>, line 27)