In [1]:
import json
import csv

import re
from emoji import UNICODE_EMOJI

from datetime import datetime
from datetime import timezone
import calendar

import nltk
from nltk.util import ngrams
from nltk import word_tokenize

In [2]:
def count_emoji(s):
    count = 0
    for emoji in UNICODE_EMOJI:
        count += s.count(emoji)
    return count

In [3]:
with open('../data/richardson-hw01-tweets.json') as f:
    tweets = json.load(f)
    
data = []
    
w_bigrams = []
w_trigrams = []
w_quadgrams = []

c_trigrams = []
c_quadgrams = []

for tweet in tweets:
    tweet_txt = tweet['text']
    num_emoji = count_emoji(tweet_txt)
    
    # remove urls
    tweet_txt = re.sub('https?:\/{2}[\d\w]+\.([\d\w]+)*(\/[^\s]*)*', '', tweet_txt)
    
    # remove punctuation - including emoji
    tweet_txt = re.sub(r'[^\w\s]', '', tweet_txt)
    
    # tokenize into words
    tweet_tokens = word_tokenize(tweet_txt)
    
    # remove stop words
    tkn_minus_stop_words = set(tweet_tokens).difference(set(nltk.corpus.stopwords.words('english')))
    
    # lemmatize existing words
    wnl = nltk.WordNetLemmatizer()
    tokens = [wnl.lemmatize(t) for t in tkn_minus_stop_words]
    
    # calculate and aggregate word n-grams
    wbg = list(ngrams(tokens, 2))
    wtg = list(ngrams(tokens, 3))
    wqg = list(ngrams(tokens, 4))
    
    w_bigrams = w_bigrams + wbg
    w_trigrams = w_trigrams + wtg
    w_quadgrams = w_quadgrams + wqg
    
    # calculate and aggregate character n-grams
    # use tokens to eliminate spaces
    ctg = []
    cqg = []
    for token in tokens:
        chrs = [c for c in token]
        ctg = ctg + list(ngrams(chrs, 3))
        cqg = cqg + list(ngrams(chrs, 4))
        
    c_trigrams = c_trigrams + ctg
    c_quadgrams = c_quadgrams + cqg
    
    # calculate date information
    posted_on = tweet['created_at']
    d1 = datetime.strptime(posted_on, '%a %b %d %H:%M:%S %z %Y')
    d2 = datetime.now(timezone.utc)
    posted_len_minutes = abs((d1-d2).days * 24 * 60)
    
    # parse domain names of urls
    domains = []
    for u in tweet['entities']['urls']:
        domains = domains + re.findall('^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)', u['expanded_url'])
    
    data.append({
        'id': tweet['id'],
        'username': tweet['user']['screen_name'],
        'user_followers': tweet['user']['followers_count'],
        'user_verified': tweet['user']['verified'],
        'w_bigrams': wbg,
        'w_trigrams': wtg,
        'w_quadgrams': wqg,
        'c_trigrams': ctg,
        'c_quadgrams': cqg,
        'has_hashtag': len(tweet['entities']['hashtags']) > 0,
        'num_hashtags': len(tweet['entities']['hashtags']),
        'has_emoji': num_emoji > 0,
        'num_emoji': num_emoji,
        'has_url': len(tweet['entities']['urls']) > 0,
        'url_domains': list(set(domains)),
        'mentions_user': len(tweet['entities']['user_mentions']) > 0,
        'day_of_week': calendar.day_name[d1.weekday()],
        'hour_posted': d1.hour,
        'posted_len_minutes': posted_len_minutes,
        'retweets': tweet['retweet_count'],
        'favorites': tweet['favorite_count']
    })
    
# print(json.dumps(data, indent=1))



In [4]:
w_bigrams_uniq = set(w_bigrams)
w_trigrams_uniq = set(w_trigrams)
w_quadgrams_uniq = set(w_quadgrams)

c_trigrams_uniq = set(c_trigrams)
c_quadgrams_uniq = set(c_quadgrams)

print('Word ngrams:')
print('Bigrams:', len(w_bigrams), len(w_bigrams_uniq))
print('Trigrams:', len(w_trigrams), len(w_trigrams_uniq))
print('Quadgrams:', len(w_quadgrams), len(w_quadgrams_uniq))
print()
print('Character ngrams:')
print('Trigrams:', len(c_trigrams), len(c_trigrams_uniq))
print('Quadgrams:', len(c_quadgrams), len(c_quadgrams_uniq))
print()
print('Total ngram features:', len(w_bigrams) + len(w_trigrams) + len(w_quadgrams) + len(c_trigrams) + len(c_quadgrams))
print('Total unique ngram features:', len(w_bigrams_uniq) + len(w_trigrams_uniq) + len(w_quadgrams_uniq) + len(c_trigrams_uniq) + len(c_quadgrams_uniq))

Word ngrams:
Bigrams: 12493 732
Trigrams: 11492 686
Quadgrams: 10491 632

Character ngrams:
Trigrams: 48087 1484
Quadgrams: 35711 1558

Total ngram features: 118274
Total unique ngram features: 5092


In [5]:
# Build csv file

header = ['id', 'username', 'user_followers', 'user_verified', 'has_hashtag', 'num_hashtags', 
          'has_emoji', 'num_emoji', 'has_url', 'mentions_user', 'day_of_week', 
          'hour_posted', 'posted_len_minutes', 'retweets', 'favorites']

header = header + [x[0] + '*' + x[1] for x in list(w_bigrams_uniq)]
header = header + [x[0] + '*' + x[1] + '*' + x[2] for x in list(w_trigrams_uniq)]
header = header + [x[0] + '*' + x[1] + '*' + x[2] + '*' + x[3] for x in list(w_quadgrams_uniq)]
header = header + [x[0] + '*' + x[1] + '*' + x[2] for x in list(c_trigrams_uniq)]
header = header + [x[0] + '*' + x[1] + '*' + x[2] + '*' + x[3] for x in list(c_quadgrams_uniq)]

print('Total features:', len(header))
print('Total unique ngram features:', len(header[15:]))

Total features: 5107
Total unique ngram features: 5092


In [6]:
csv_data = [header]

for item in data:
    row = []
    row.append(item['id'])
    row.append(item['username'])
    row.append(item['user_followers'])
    row.append(item['user_verified'])
    row.append(item['has_hashtag'])
    row.append(item['num_hashtags'])
    row.append(item['has_emoji'])
    row.append(item['num_emoji'])
    row.append(item['has_url'])
    # row.append(item['url_domains'])
    row.append(item['mentions_user'])
    row.append(item['day_of_week'])
    row.append(item['hour_posted'])
    row.append(item['posted_len_minutes'])
    row.append(item['retweets'])
    row.append(item['favorites'])
    
    for x in header[15:]:
        temp = x.split('*')
        contains_ngram = False
        
        if len(temp) == 2:
            for y in item['w_bigrams']:
                if y[0] == temp[0] and y[1] == temp[1]:
                    contains_ngram = True
                    break
        if len(temp) == 3:
            if len(temp[0]) == 1:
                for y in item['c_trigrams']:
                    if y[0] == temp[0] and y[1] == temp[1] and y[2] == temp[2]:
                        contains_ngram = True
                        break
            else:
                for y in item['w_trigrams']:
                    if y[0] == temp[0] and y[1] == temp[1] and y[2] == temp[2]:
                        contains_ngram = True
                        break
        if len(temp) == 4:
            if len(temp[0]) == 1:
                for y in item['c_quadgrams']:
                    if y[0] == temp[0] and y[1] == temp[1] and y[2] == temp[2] and y[3] == temp[3]:
                        contains_ngram = True
                        break
            else:
                for y in item['w_quadgrams']:
                    if y[0] == temp[0] and y[1] == temp[1] and y[2] == temp[2] and y[3] == temp[3]:
                        contains_ngram = True
                        break
        
        row.append(int(contains_ngram))
        
    csv_data.append(row)
    
print('Total records:', len(csv_data))
print('Total features:', len(csv_data[0]))

Total records: 1002
Total features: 5107


In [7]:
with open('../data/richardson-hw03.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(csv_data)
    f.close()