## Preprocessing

In [1]:
from preprocessing.pan21 import read_dataset

# Relative directory handed to read_dataset() in the next cell
# (per the path itself: PAN21, train split, "en").
path = './data/pan21/train/en'

In [2]:
# Load the dataset. The two cells below show it is keyed by label (0 and 1);
# label 0 holds 100 users and its first user holds 200 tweets.
data = read_dataset(path)

In [3]:
data.keys()    # the top-level keys of `data` are the labels

dict_keys([0, 1])

In [4]:
# Size check on label 0 and on its first user only; other labels/users are not inspected here.
len(data[0]), len(data[0][0])    # (num_users_in_a_label, num_tweets_per_user)

(100, 200)

## Stats

In [5]:
from eda.get_stats import *

In [6]:
# Placeholder-token counts, one {label: count} dict per token (see printout).
# The helpers are called in the same order as the names on the left.
hashtags, urls, users, rt = (
    counter(data)
    for counter in (count_hashtags, count_urls, count_users, count_rt)
)

# print out: one "<token>: <counts>" line per placeholder
print('\n'.join(
    ' '.join(map(str, row))
    for row in (
        ('#HASHTAG#:', hashtags),
        ('#URL#:', urls),
        ('#USER#:', users),
        ('##RT##:', rt),
    )
))

#HASHTAG#: {0: 3757, 1: 3392}
#URL#: {0: 8571, 1: 6768}
#USER#: {0: 9723, 1: 11571}
##RT##: {0: 7633, 1: 6090}


In [7]:
# Character-level statistics; each helper returns a {label: value} dict
# (see printout). Helpers run in the order listed.
(
    uppercase_chars,
    min_chars,
    avg_min_chars,
    max_chars,
    avg_max_chars,
    chars,
) = (
    stat(data)
    for stat in (
        count_uppercase_chars,
        count_min_chars,
        count_avg_min_chars,
        count_max_chars,
        count_avg_max_chars,
        count_chars,
    )
)

# print out: one "<description>: <values>" line per statistic
print('\n'.join(
    ' '.join(map(str, row))
    for row in (
        ('number of upper-case characters:', uppercase_chars),
        ('min min characters:', min_chars),
        ('avg min characters:', avg_min_chars),
        ('max max characters:', max_chars),
        ('avg max characters:', avg_max_chars),
        ('number of characters:', chars),
    )
))

number of upper-case characters: {0: 71025, 1: 75867}
min min characters: {0: 4, 1: 5}
avg min characters: {0: 10.28, 1: 10.55}
max max characters: {0: 143, 1: 148}
avg max characters: {0: 125.08, 1: 128.2}
number of characters: {0: 1109779, 1: 1134313}


In [8]:
# Word-level statistics; each helper returns a {label: value} dict
# (see printout). Helpers run in the order listed.
(
    uppercase_words,
    min_words,
    avg_min_words,
    max_words,
    avg_max_words,
    words,
) = (
    stat(data)
    for stat in (
        count_uppercase_words,
        count_min_words,
        count_avg_min_words,
        count_max_words,
        count_avg_max_words,
        count_words,
    )
)

# print out: one "<description>: <values>" line per statistic
print('\n'.join(
    ' '.join(map(str, row))
    for row in (
        ('number of upper-case words:', uppercase_words),
        ('min min words:', min_words),
        ('avg min words:', avg_min_words),
        ('max max words:', max_words),
        ('avg max words:', avg_max_words),
        ('number of words:', words),
    )
))

number of upper-case words: {0: 43584, 1: 44012}
min min words: {0: 1, 1: 1}
avg min words: {0: 2.53, 1: 2.68}
max max words: {0: 31, 1: 32}
avg max words: {0: 24.85, 1: 25.58}
number of words: {0: 207317, 1: 213744}


In [9]:
# Word statistics via the *_alt helpers, which include the #HASHTAG#, #URL#
# and #USER# placeholders.
# The result is not the same as count_uppercase_words() plus the counts of
# #HASHTAG#, #URL# and #USER#: a placeholder is not always followed by a
# space, e.g. "#USER#_Daily".
# NOTE: this cell rebinds min_words, avg_min_words, max_words, avg_max_words
# and words, replacing the values set by the non-alt word-stats cell.
min_words, avg_min_words, max_words, avg_max_words, words = (
    stat(data)
    for stat in (
        count_min_words_alt,
        count_avg_min_words_alt,
        count_max_words_alt,
        count_avg_max_words_alt,
        count_words_alt,
    )
)

# print out: one "<description>: <values>" line per statistic
print('\n'.join(
    ' '.join(map(str, row))
    for row in (
        ('min min words:', min_words),
        ('avg min words:', avg_min_words),
        ('max max words:', max_words),
        ('avg max words:', avg_max_words),
        ('number of words:', words),
    )
))

min min words: {0: 1, 1: 2}
avg min words: {0: 3.29, 1: 3.5}
max max words: {0: 32, 1: 32}
avg max words: {0: 25.37, 1: 26.07}
number of words: {0: 228644, 1: 234867}


In [10]:
# Stop-word counts; the printout shows one count per label.
stop_words = count_stopwords(data)

# print out
print('number of stop-words:', stop_words)

# Result noted by the author for a "just split" variant (presumably plain
# str.split() tokenisation -- confirm against eda.get_stats):
#   number of stop-words: {0: 95998, 1: 101886}

number of stop-words: {0: 95998, 1: 101886}


In [11]:
# Emoji counts; the printout shows one count per label.
emojis = count_emojis(data)

# print out
print('number of emojis:', emojis)

number of emojis: {0: 8792, 1: 7446}


In [12]:
# Sentiment tallies, shaped {sentiment: {label: count}} per the printout.
# In the saved output the three counts for each label add up to 20000
# (= 100 users x 200 tweets), so apparently every tweet gets exactly one
# sentiment -- confirm against get_sentiments().
sentiments = get_sentiments(data)

# print out
print('sentiment:', sentiments)

sentiment: {'positive': {0: 6385, 1: 6163}, 'negative': {0: 3679, 1: 4482}, 'neutral': {0: 9936, 1: 9355}}


In [13]:
# Named-entity tallies from two models, shaped {entity_type: {label: count}}
# per the printout. The model names look like spaCy packages -- confirm in
# eda.get_stats. Slow cell: the progress bars of the saved run add up to
# roughly 11-12 minutes.
ner1, ner2 = (
    get_named_entities(data, corpora=model_name)
    for model_name in ('en_core_web_sm', 'xx_ent_wiki_sm')
)

# print out: the two results separated by one blank line
print('ner1', ner1, end='\n\n')
print('ner2', ner2)

100%|█████████████████████████████████████████| 100/100 [03:26<00:00,  2.07s/it]
100%|█████████████████████████████████████████| 100/100 [04:52<00:00,  2.92s/it]


{'ORDINAL', 'CARDINAL', 'EVENT', 'NORP', 'FAC', 'MONEY', 'LANGUAGE', 'LAW', 'PRODUCT', 'TIME', 'DATE', 'QUANTITY', 'PERCENT', 'WORK_OF_ART'}


100%|█████████████████████████████████████████| 100/100 [01:41<00:00,  1.02s/it]
100%|█████████████████████████████████████████| 100/100 [01:41<00:00,  1.02s/it]


set()
ner1 {'PERSON': {0: 3809, 1: 3695}, 'PER': {0: 0, 1: 0}, 'ORG': {0: 4586, 1: 4226}, 'GPE': {0: 1966, 1: 1839}, 'LOC': {0: 106, 1: 98}, 'MISC': {0: 30469, 1: 27709}}

ner2 {'PERSON': {0: 0, 1: 0}, 'PER': {0: 4672, 1: 4743}, 'ORG': {0: 2611, 1: 2387}, 'GPE': {0: 0, 1: 0}, 'LOC': {0: 2437, 1: 2132}, 'MISC': {0: 6458, 1: 6534}}


In [14]:
## Not every #...# placeholder is followed by a space
'''e.g.
		<document><![CDATA[There was a 5sos meet up in Belgium! #HASHTAG# #HASHTAG# #USER# 💗 💗 #URL# (#USER#_Daily ) -E]]></document>
'''

def count_max_words_alt_(data):
    """Probe for placeholders glued to the following text (e.g. ``#USER#_Daily``).

    Despite its name, this helper does not compute a maximum word count and
    does not strip ``##RT##``. For each label it walks the users in order and
    adds up the ``#URL#`` / ``#HASHTAG#`` / ``#USER#`` placeholders found in
    their tweets, until it reaches the first user who has at least one
    placeholder immediately followed by a word character. For that user it
    prints the first tweet with the label, then the per-tweet list of glued
    matches, and stops processing the label.

    Args:
        data: mapping ``{label: [user, ...]}`` where every user is a list of
            tweet strings.

    Returns:
        dict ``{label: count}``: the placeholder total over the users seen
        *before* the first user with a glued placeholder. A label whose first
        user already has one therefore maps to 0.
    """
    # Imported locally so the probe does not depend on ``re`` leaking into the
    # notebook namespace through ``from eda.get_stats import *``.
    import re

    glued_placeholder = re.compile(r'#(URL|HASHTAG|USER)#\w')
    any_placeholder = re.compile(r'#(URL|HASHTAG|USER)#')

    out = {}
    for label, users in data.items():
        out[label] = 0
        for user in users:
            # Scan each tweet once; the result serves both the check and the printout.
            glued_per_tweet = [glued_placeholder.findall(tweet) for tweet in user]
            if any(glued_per_tweet):
                print(user[0], label)
                print(glued_per_tweet)
                # Stop at the first example: later users of this label are
                # neither inspected nor counted.
                break
            out[label] += sum(len(any_placeholder.findall(tweet)) for tweet in user)
    return out

# Run the probe on the full dataset; as the cell's last expression, the
# returned {label: count} dict is displayed after the printed examples.
count_max_words_alt_(data)

Who wants this acc 0
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['USER'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
#USER# #USER# Climate change is STILL ahead. WAKE UP! 1
[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [],

{0: 1271, 1: 0}