### **LIBRARIES**

In [2]:
import os
import re            # re.sub
import pandas as pd  # pd.DataFrame

import bow
import utils

### **DATA, GLOBAL CONSTANTS**

In [3]:
# Directory holding the input data (a relative path, so it is resolved
# against the notebook's working directory).
DATA_DIR = "data/"
# Training-text CSV located inside DATA_DIR.
DATA_FILE = os.path.join(DATA_DIR, "train_text.csv")

### **FUNCTIONS**

In [4]:
def list_to_string(list):
    """
    ARGUMENT: iterable of string tokens
    RETURN: one string with the tokens separated by single spaces

    Empty strings at the very start of the input are skipped, so the result
    never begins with a space. Empty strings after the first non-empty token
    are kept and therefore show up as consecutive spaces.
    An empty input (or one made only of empty strings) yields ''.
    """
    # NOTE: the parameter name shadows the builtin `list`; it is kept so that
    # existing callers (including keyword callers) continue to work, which
    # means the builtin cannot be used inside this function.
    remaining = iter(list)
    for head in remaining:
        if head != '':
            # str.join assembles the result in a single pass instead of
            # re-copying the growing string once per token.
            return ' '.join([head, *remaining])
    return ''

In [5]:
def hashtag(tokens_list):
    """
    ARGUMENT: a string (despite the parameter name) to scan
    RETURN: list of every '#' followed by one or more word characters,
            in order of appearance, duplicates included
    """
    # The pattern has no capture groups, so each match's group(0) is exactly
    # the text of that occurrence.
    return [found.group(0) for found in re.finditer(r'#\w+', tokens_list)]

def user(tokens_list):
    """
    ARGUMENT: a string (despite the parameter name) to scan
    RETURN: list of every '@' followed by one or more word characters,
            in order of appearance, duplicates included
    """
    # The pattern has no capture groups, so each match's group(0) is exactly
    # the text of that occurrence.
    return [found.group(0) for found in re.finditer(r'@\w+', tokens_list)]

In [6]:
def count_tokens(tokens):
    """
    ARGUMENT: iterable of tokens
    RETURN: dictionary {token: number of occurrences}, with keys in
            first-seen order
    """
    occurrences = {}
    for token in tokens:
        if token in occurrences:
            occurrences[token] += 1
        else:
            occurrences[token] = 1
    return occurrences

In [7]:
def reduced_dict(count_dict, tokens_hashtag, tokens_user, N, N_special=3):
    """
    ARGUMENTS:
        count_dict      dictionary of counts {token: count}
        tokens_hashtag  collection of hashtag tokens (membership-tested)
        tokens_user     collection of user-mention tokens (membership-tested)
        N               minimum count for an ordinary token to be kept
        N_special       minimum count for a token found in tokens_hashtag or
                        tokens_user to be kept (default 3, the value that
                        used to be hard-coded)
    RETURN: (reduced dictionary of counts, list of the kept tokens)

    A token is kept when count >= N, or when it is a hashtag/user token and
    count >= N_special. Both results follow the iteration order of count_dict.
    """
    reduced_token_count_dict = dict()
    token_list = []

    for w, count in count_dict.items():
        is_special = w in tokens_hashtag or w in tokens_user
        # Hashtags and mentions get their own threshold; any token also
        # qualifies through the general threshold N.
        if count >= N or (is_special and count >= N_special):
            reduced_token_count_dict[w] = count
            token_list.append(w)

    return reduced_token_count_dict, token_list

In [8]:
# prints dict items in descending count order

def print_token_count(count_dict, N):
    """
    ARGUMENTS: dictionary of counts {x: count}, N
    Prints one 'key:count' line for every entry whose count is >= N,
    in descending count order (ties keep the dictionary's own order).
    N is a count threshold, not a number of rows.
    """
    # Read counts from the argument itself rather than from a notebook-level
    # variable, so the function works on any dictionary and on a fresh kernel.
    ranked = sorted(count_dict.items(), key=lambda item: item[1], reverse=True)
    for w, count in ranked:
        if count >= N:
            print(f'{w}:{count}  ', end='\n')

### **RUN!**

In [9]:
# Load the training file through the project helper and preview its first
# 1000 characters to check what was read.
doc = utils.file_to_string(DATA_FILE)
print(doc[:1000])

"text"
"Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all"
"Forest fire near La Ronge Sask. Canada"
"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"
"13,000 people receive #wildfires evacuation orders in California "
"Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school"
"#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires"
"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas"
"I'm on top of the hill and I can see a fire in the woods..."
"There's an emergency evacuation happening now in the building across the street"
"I'm afraid that the tornado is coming to our area..."
"Three people died from the heat wave so far"
"Haha South Tampa is getting flooded hah- WAIT A SECOND I LIVE IN SOUTH TAMPA WHAT AM I GONNA DO WHAT AM I GONNA DO FVCK #flooding

In [10]:
# Turn the raw text into a list of cleaned tokens with the project's
# bag-of-words helper, then preview the first 20 of them.
tokens = bow.clean_doc_text(doc)
print(tokens[:20])

['text', 'deed', 'reason', '#earthquak', 'may', 'allah', 'forgiv', 'us', 'forest', 'fire', 'near', 'la', 'rong', 'sask', 'canada', 'resid', 'ask', 'shelter', 'place', 'notifi']


In [11]:
# Re-join the tokens into one string so the regex helpers can pick out the
# distinct hashtags and user mentions.
all_tokens = list_to_string(tokens)
tokens_user = set(user(all_tokens))
tokens_hashtag = set(hashtag(all_tokens))

# Preview of the joined text, then the number of distinct hashtags and the
# number of distinct mentions, each separated by a blank line.
print(all_tokens[:20], '', len(tokens_hashtag), '', len(tokens_user), sep='\n')

text deed reason #ea

1889

2317


In [13]:
# Output CSV that receives the kept tokens.
# NOTE(review): the "6" in the file name presumably mirrors the count
# threshold passed to reduced_dict below; confirm and keep them in sync.
DATA_TOKEN = os.path.join(DATA_DIR, "tokensfinal/train_text_token6.csv")
# Alternative output location:
#DATA_TOKEN = os.path.join(DATA_DIR, "tokens_dataset_small/train_text_token.csv")

# tokens[0] is 'text' (the CSV header, per the preview above), so it is
# left out of the counts.
token_count_dict = count_tokens(tokens[1:])

# Keep tokens with count >= 6; reduced_dict applies its own lower threshold
# to hashtags and user mentions.
reduced_token_count_dict, token_list = reduced_dict(token_count_dict, tokens_hashtag, tokens_user, 6)
# Write the kept tokens to DATA_TOKEN via the project helper.
utils.list_to_csv(token_list, DATA_TOKEN)
# Number of tokens kept.
print(len(token_list))

# Preview of the first 20 kept tokens.
print(token_list[0:20])

2301
['reason', '#earthquak', 'may', 'us', 'forest', 'fire', 'near', 'la', 'canada', 'resid', 'ask', 'shelter', 'place', 'offic', 'evacu', 'order', 'expect', 'peopl', '#wildfir', 'california']


In [14]:
# A threshold of 0 lets every entry through, so this lists the whole reduced
# vocabulary from most to least frequent.
print_token_count(reduced_token_count_dict, N=0)

like:407  
fire:355  
get:311  
amp:301  
bomb:228  
new:224  
via:220  
one:205  
go:198  
peopl:197  
burn:178  
kill:175  
video:170  
time:161  
crash:160  
emerg:158  
us:157  
flood:156  
attack:153  
build:152  
bodi:150  
year:148  
disast:148  
look:144  
say:142  
polic:139  
fatal:138  
home:137  
day:136  
would:136  
famili:130  
love:129  
make:129  
still:129  
evacu:128  
see:128  
train:128  
come:125  
storm:123  
know:122  
back:121  
watch:118  
want:117  
suicid:117  
news:117  
california:116  
live:115  
bag:115  
scream:114  
collaps:114  
derail:114  
got:112  
car:112  
man:110  
death:110  
first:107  
take:105  
think:105  
world:105  
caus:104  
today:101  
need:101  
work:100  
drown:100  
two:99  
rt:99  
wreck:99  
let:98  
war:97  
dead:96  
deton:96  
destroy:95  
accid:94  
plan:93  
feel:93  
nuclear:92  
hijack:92  
fuck:91  
full:91  
fear:90  
obliter:90  
good:89  
may:88  
murder:88  
weapon:88  
way:86  
last:86  
help:86  
even:86  
surviv:86 