<h1> Tweets Analysis </h1>
<h2> Text Pre-processing </h2>

In [145]:
import json
 
with open('python.json', 'r') as f:
    line = f.readline() # read only the first tweet/line
    tweet = json.loads(line) # load it as Python dict
    print(json.dumps(tweet, indent=4)) # pretty-print

{
    "quote_count": 0, 
    "contributors": null, 
    "truncated": true, 
    "text": "\uff20null Happy Birthday to You/alan\n#\u76f8\u4e92\u53a8\u5b78\u6821 #\u76f8\u4e92\u30d5\u30a9\u30ed\u30fc #\u30d5\u30a9\u30ed\u30d0100% #\u3044\u3044\u306d\u3057\u305f\u4eba\u5168\u54e1\u30d5\u30a9\u30ed\u30fc\u3059\u308b #RT\u3057\u305f\u4eba\u5168\u54e1\u30d5\u30a9\u30ed\u30fc\u3059\u308b #\u76f8\u4e92\u5e0c\u671b #\u62e1\u6563\u5e0c\u671b #\u3044\u3044\u306d\u304f\u3060\u3055\u3044 #\u30d5\u30a9\u30ed\u30d0\u2026 https://t.co/jmJd2Zbzoz", 
    "is_quote_status": false, 
    "in_reply_to_status_id": null, 
    "reply_count": 0, 
    "id": 947991891240505344, 
    "favorite_count": 0, 
    "entities": {
        "user_mentions": [
            {
                "id": 3562471, 
                "indices": [
                    0, 
                    5
                ], 
                "id_str": "3562471", 
                "screen_name": "null", 
                "name": "not quite nothing"
      

<b> Key attributes: </b>
<ul>
<li>text: the text of the tweet itself</li>
<li> created_at: the date of creation </li>
<li> favorite_count, retweet_count: the number of favourites and retweets </li>
<li> favorited, retweeted: boolean stating whether the authenticated user (you) have favourited or retweeted this tweet </li>
<li> lang: acronym for the language (e.g. “en” for english)</li>
<li> id: the tweet identifier </li>
<li> place, coordinates, geo: geo-location information if available</li>
<li> user: the author’s full profile</li>
<li> entities: list of entities like URLs, @-mentions, hashtags and symbols</li>
<li> in_reply_to_user_id: user identifier if the tweet is a reply to a specific user</li>
<li> in_reply_to_status_id: status identifier id the tweet is a reply to a specific status </li>
</ul>

<b> Tokenization </b>

Tweet tokenization is a bit differnt than a general-purpose English tokeniser : @-mentions, emoticons, URLs and #hash-tags are not recognised as single tokens. The following code will propose a pre-processing chain that will consider these aspects of the language.

In [146]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
 
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via', 'the',u'\u2019',u'\u2026','The',u'de',u'\xe9']

import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s.encode('ascii','ignore'))
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [147]:
tweet = "RT @marcobonzanini: just an example! :D http://example.com #NLP"
print(preprocess(tweet))

['RT', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']


<h3> Terms Counting </h3>

In [162]:
import operator 
import json
from collections import Counter
from collections import defaultdict
 
fname = 'ces18.json'
with open(fname, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        # Create a list with all the terms
        #terms_all = [term for term in preprocess(tweet['text'],True) if term not in stop and  term.startswith(('#'))]
        terms_all = [term for term in preprocess(tweet['text'],True) if term not in stop]
        # Update the counter
        count_all.update(terms_all)
    # Print the first 5 most frequent words
    print(count_all.most_common(5))

[('know', 4840), ('@blackmirror', 4839), ('goes', 4828), ('https://t.co/1ntdxuorld', 4815), ('#ces2018', 2617)]


<b> Terms Co-Occurrences </b>

In [163]:
#Co-occurence matrix

from collections import defaultdict
# remember to include the other import from the previous post
 
com = defaultdict(lambda : defaultdict(int))
 
# f is the file pointer to the JSON data set
fname = 'ces18.json'
n_docs = 0
with open(fname, 'r') as f:
    for line in f: 
          tweet = json.loads(line)
          n_docs = n_docs + 1
          terms_only = [term for term in preprocess(tweet['text']) 
                  if term not in stop] 
                  #and not term.startswith(('#', '@'))]
 
    # Build co-occurrence matrix
    for i in range(len(terms_only)-1):            
        for j in range(i+1, len(terms_only)):
            w1, w2 = sorted([terms_only[i], terms_only[j]])                
            if w1 != w2:
                com[w1][w2] += 1

In [174]:
search_word = '#samsung' # pass a term as a command-line argument
count_search = Counter()
fname = 'ces18.json'
with open(fname, 'r') as f:
    for line in f:
        tweet = json.loads(line)
        terms_only = [term for term in preprocess(tweet['text']) 
                      if term not in stop
                      and term.startswith(('#', '@'))]
        if search_word in terms_only:
            count_search.update(terms_only)
print("Co-occurrence for %s:" % search_word)
print(count_search.most_common(20))

Co-occurrence for #samsung:
[('#samsung', 2), ('#notebook9Pen', 1), ('#samsungflip', 1), ('@ThatsITLA', 1), ('#CES', 1), ('#SamsungxCES2018', 1), ('#CES2018', 1), ('#samsunggear', 1), ('#ces2018', 1)]
