### **LIBRARIES**

In [1]:
import os            # os.path.join
import re            # re.sub
import pandas as pd  # pd.DataFrame

import bow
import utils

#from itertools import islice   # islice(iterable, n)

[nltk_data] Downloading package stopwords to /home/ronkow/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### **DATA, GLOBAL CONSTANTS**

In [9]:
# Directory that holds the input data (a relative path).
DATA_DIR = "data/"
# Path of the training text CSV; it is loaded with utils.file_to_string in the RUN section.
DATA_FILE = os.path.join(DATA_DIR, "train_text.csv")

### **FUNCTIONS**

In [10]:
def count_tokens(tokens):
    """
    Tally how many times each token occurs.

    ARGUMENT: tokens - iterable of hashable tokens (e.g. a list of strings)
    RETURN: dictionary {token: count}; keys appear in order of first occurrence
    """
    tally = {}
    for token in tokens:
        if token in tally:
            tally[token] += 1
        else:
            # First time this token is seen.
            tally[token] = 1
    return tally

In [11]:
def reduced_dict(count_dict, N):
    """
    Keep only the entries whose count is at least N.

    ARGUMENTS: count_dict - dictionary of counts {x: count}
               N - minimum count (inclusive) for an entry to be kept
    RETURN: tuple (kept_counts, kept_keys)
            kept_counts - dictionary {x: count} restricted to count >= N
            kept_keys - list of the kept keys, in the same order as in count_dict
    """
    kept_counts = {key: count for key, count in count_dict.items() if count >= N}
    # Iterating a dict yields its keys in insertion order.
    kept_keys = list(kept_counts)
    return kept_counts, kept_keys

In [12]:
# prints dict items in descending count order

def print_token_count(count_dict, N):
    """
    Print every entry of count_dict whose count is at least N, highest count first.

    ARGUMENTS: count_dict - dictionary of counts {x: count}
               N - minimum count (inclusive) for an entry to be printed
    RETURN: None; one 'key:count  ' line is written to stdout per printed entry
    """
    # Counts are read from the count_dict argument (not from a notebook-level
    # variable), so the function works on any dictionary and on a fresh kernel.
    for w in sorted(count_dict, key=count_dict.get, reverse=True):
        count = count_dict[w]
        if count < N:
            # Keys are in descending count order, so no later entry can reach N.
            break
        print(f'{w}:{count}  ', end='\n')

### **RUN!**

In [13]:
# Load the training text file. utils.file_to_string presumably returns the whole
# file as a single string (the slice and printed preview are consistent with
# that) - TODO confirm in utils.py.
doc = utils.file_to_string(DATA_FILE)
# Preview only the first 1000 elements of doc instead of dumping everything.
print(doc[0:1000])

"text"
"Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all"
"Forest fire near La Ronge Sask. Canada"
"All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected"
"13,000 people receive #wildfires evacuation orders in California "
"Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school"
"#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires"
"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas"
"I'm on top of the hill and I can see a fire in the woods..."
"There's an emergency evacuation happening now in the building across the street"
"I'm afraid that the tornado is coming to our area..."
"Three people died from the heat wave so far"
"Haha South Tampa is getting flooded hah- WAIT A SECOND I LIVE IN SOUTH TAMPA WHAT AM I GONNA DO WHAT AM I GONNA DO FVCK #flooding

In [14]:
# Convert the raw text into a token sequence. Judging by the printed sample,
# bow.clean_doc_text lowercases, drops stopwords and stems the words
# - TODO confirm in bow.py.
tokens = bow.clean_doc_text(doc)
# Show the first 10 tokens as a sanity check.
print(tokens[0:10])

['text', 'deed', 'reason', '#earthquak', 'may', 'allah', 'forgiv', 'us', 'forest', 'fire']


In [15]:
# Output file paths for the filtered token lists, all under DATA_DIR/tokens/.
# The number at the end of each variable name is the minimum token count:
# DATA_TOKEN4 is the file for all tokens with counts >= 4, and so on.

DATA_TOKEN4 = os.path.join(DATA_DIR, "tokens/train_text_token4.csv")
DATA_TOKEN6 = os.path.join(DATA_DIR, "tokens/train_text_token6.csv")
DATA_TOKEN8 = os.path.join(DATA_DIR, "tokens/train_text_token8.csv")
DATA_TOKEN9 = os.path.join(DATA_DIR, "tokens/train_text_token9.csv")
DATA_TOKEN13 = os.path.join(DATA_DIR, "tokens/train_text_token13.csv")
DATA_TOKEN14 = os.path.join(DATA_DIR, "tokens/train_text_token14.csv")
DATA_TOKEN100 = os.path.join(DATA_DIR, "tokens/train_text_token100.csv")



# Count every token except the first one: tokens[0] is 'text', which comes from
# the header line of the CSV rather than from a data row.
token_count_dict = count_tokens(tokens[1:])

# Each section below keeps the tokens whose count is >= the threshold, passes
# the kept token list to utils.list_to_csv together with the matching
# DATA_TOKEN<N> path (presumably writing the list to that file - TODO confirm
# in utils.py), and prints how many tokens were kept.
# NOTE(review): the seven sections differ only in the threshold; they could be
# driven by a loop if no other cell relies on the per-threshold variable names.

# TOKENS WITH COUNT >= 4: 2944 TOKENS
reduced_token_count_dict4, token_list4 = reduced_dict(token_count_dict, 4)
utils.list_to_csv(token_list4, DATA_TOKEN4)
print(len(token_list4))

# TOKENS WITH COUNT >= 6: 2070 TOKENS
reduced_token_count_dict6, token_list6 = reduced_dict(token_count_dict, 6)
utils.list_to_csv(token_list6, DATA_TOKEN6)
print(len(token_list6))

# TOKENS WITH COUNT >= 8: 1603 TOKENS
reduced_token_count_dict8, token_list8 = reduced_dict(token_count_dict, 8)
utils.list_to_csv(token_list8, DATA_TOKEN8)
print(len(token_list8))

# TOKENS WITH COUNT >= 9: 1442 TOKENS
reduced_token_count_dict9, token_list9 = reduced_dict(token_count_dict, 9)
utils.list_to_csv(token_list9, DATA_TOKEN9)
print(len(token_list9))

# TOKENS WITH COUNT >= 13: 1053 TOKENS
reduced_token_count_dict13, token_list13 = reduced_dict(token_count_dict, 13)
utils.list_to_csv(token_list13, DATA_TOKEN13)
print(len(token_list13))

# TOKENS WITH COUNT >= 14: 987 TOKENS
reduced_token_count_dict14, token_list14 = reduced_dict(token_count_dict, 14)
utils.list_to_csv(token_list14, DATA_TOKEN14)
print(len(token_list14))

# TOKENS WITH COUNT >= 100: 64 TOKENS
reduced_token_count_dict100, token_list100 = reduced_dict(token_count_dict, 100)
utils.list_to_csv(token_list100, DATA_TOKEN100)
print(len(token_list100))

2944
2070
1603
1442
1053
987
64
