### **LIBRARIES**

In [186]:
import os            # os.path.join
import re            # re.sub
import pandas as pd  # pd.DataFrame

#from itertools import islice   # islice(iterable, n)

In [187]:
import nltk

from nltk.tokenize import word_tokenize  # NOTE(review): not used in the cells visible here
from nltk.stem.porter import PorterStemmer

# Shared stemmer instance; clean_doc_text uses it to stem every token
porter = PorterStemmer()

# Make sure the stop-word corpus is available locally before it is read
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/ronkow/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### **DATA, GLOBAL CONSTANTS**

In [188]:
# Punctuation characters that clean_doc_text deletes from the text.
# Deliberately NOT listed: # (kept for #hashtag), @ (kept for @user) and
# ' (so that can't is not converted to cant; clean_doc_text replaces it with a space instead).
# The backslash is written as \\ so the literal contains no invalid escape sequence;
# the resulting string is unchanged.
PUNC = '''!?.,;:$&*^~_"`(){}[]/\\|<>=%+-'''

STOP_WORDS = set(stopwords.words('english')) # NLTK English stop words as a set
# NOTE(review): clean_doc_text replaces apostrophes with spaces before the stop-word
# filter runs, so "i'm" reaches the filter as "i" and "m" and this entry never matches there.
ADD_WORDS = {"i'm"}
STOP_WORDS = STOP_WORDS.union(ADD_WORDS)

DATA_DIR = "data/"
DATA_FILE = os.path.join(DATA_DIR, "train_text.csv")

In [189]:
print(STOP_WORDS)  # inspect the final stop-word set (NLTK English list plus ADD_WORDS)

{'themselves', 'she', 'then', "doesn't", 'because', 'for', 'while', 'yourself', "shouldn't", 'not', 'or', "it's", 'having', 'other', 'after', 'again', 'from', 'which', "hadn't", 'ma', 'couldn', 'very', 'her', 'herself', 'does', 't', 'the', 'their', 're', 'ourselves', 'further', 'been', 'yourselves', 'of', 'don', "isn't", 'now', "couldn't", 'ours', 'this', "needn't", 'whom', 'any', 'during', 'but', 's', 'them', 'and', 'is', "aren't", 'what', 'most', 'only', 'doesn', 'it', 'myself', 'ain', "won't", 'weren', 'we', 'theirs', 'same', 'that', "weren't", 'won', 'in', 'you', 'his', 'i', 've', "mightn't", 'itself', 'just', 'few', 'be', 'if', 'under', 'hadn', 'him', 'wasn', 'below', 'there', "hasn't", 'nor', 'were', 'here', 'are', 'some', 'no', 'before', 'he', "you're", 'its', 'have', 'at', 'mustn', 'didn', "that'll", 'off', 'should', 'wouldn', 'where', 'll', 'haven', 'into', 'as', 'our', 'your', "don't", 'until', 'how', "haven't", 'once', 'shan', 'mightn', "didn't", "mustn't", 'shouldn', 'had',

### **FUNCTIONS**

In [190]:
def load_doc(filepath, encoding="utf-8"):
    """
    Read a whole text file into memory.

    ARGUMENTS:
        filepath: path of the file to read
        encoding: text encoding used to decode the file (default "utf-8")
    RETURN: string of text from file
    RAISES: OSError if the file cannot be opened,
            UnicodeDecodeError if the bytes are not valid in `encoding`
    """
    # An explicit encoding keeps the result independent of the platform's locale default
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [191]:
def clean_doc_text(doc):
    """
    Turn raw text into a list of normalised tokens.

    ARGUMENT: text (string)
    RETURN: list of tokens (lower-cased, stop words and pure numbers
            removed, stemmed, at least 2 characters long)
    """
    # Pad ellipses and apostrophes with spaces so the words around them stay
    # separate: abc...xyz -> abc xyz, can't -> can t
    text = doc.replace('...', ' ... ').replace("'", ' ')

    # Delete every character listed in PUNC
    text = text.translate(str.maketrans('', '', PUNC))

    # Split on whitespace and lower-case each piece
    words = (raw.lower() for raw in text.split())

    kept = [
        word for word in words
        if word not in STOP_WORDS      # exclude stop words
        and not word.isdigit()         # exclude pure numbers, keep mixed tokens such as abc12
    ]

    # Stem first, then keep only stems with length >= 2
    stems = (porter.stem(word) for word in kept)
    return [stem for stem in stems if len(stem) >= 2]

In [192]:
def count_tokens(tokens):
    """
    Tally how many times each token occurs.

    ARGUMENT: list of tokens
    RETURN: dictionary {token:count}, keys in order of first appearance
    """
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1   # first time this token is seen
    return counts

In [193]:
def reduced_dict(count_dict, N):
    """
    Keep only the entries whose count reaches a threshold.

    ARGUMENTS: dictionary of counts {x:count}, N (minimum count to keep)
    RETURN: tuple (reduced dictionary with count >= N, list of its keys),
            both in the iteration order of count_dict
    """
    kept = {key: count for key, count in count_dict.items() if count >= N}
    return kept, list(kept)

In [194]:
# prints dict items in descending count order

def print_token_count(count_dict, N):
    """
    Print every entry whose count is at least N, highest count first.

    ARGUMENTS: dictionary of counts {x: count}, N (minimum count to print)
    RETURN: None; prints one "key:count  " line per qualifying entry
    """
    for w in sorted(count_dict, key=count_dict.get, reverse=True):
        count = count_dict[w]   # always read from the dictionary passed in
        if count < N:
            break               # keys are in descending count order, so none of the rest qualify
        print(f'{w}:{count}  ', end='\n')

In [195]:
# used to save tokens list as csv

def list_to_csv(list, filepath):
    """
    Write a list to a single-column csv file.

    ARGUMENTS:
        list: values to save, one per row (the parameter name shadows the
              builtin inside this function; kept so existing callers still work)
        filepath: destination csv path; missing parent directories are created
    RETURN: None (the csv is written to filepath with the header
            "text_tokens" and no index column)
    """
    # Create the destination folder if needed so the write does not fail on a missing directory
    parent = os.path.dirname(filepath)
    if parent:
        os.makedirs(parent, exist_ok=True)

    df = pd.DataFrame(list, columns=["text_tokens"])
    df.to_csv(filepath, index=False)

### **RUN!**

In [196]:
# The number at the end of each variable name is the minimum token count (threshold)
# DATA_TOKEN4 is the output path for all tokens with counts >= 4

DATA_TOKEN4 = os.path.join(DATA_DIR, "tokens/train_text_token4.csv")
DATA_TOKEN6 = os.path.join(DATA_DIR, "tokens/train_text_token6.csv")
DATA_TOKEN8 = os.path.join(DATA_DIR, "tokens/train_text_token8.csv")
DATA_TOKEN9 = os.path.join(DATA_DIR, "tokens/train_text_token9.csv")
DATA_TOKEN13 = os.path.join(DATA_DIR, "tokens/train_text_token13.csv")
DATA_TOKEN14 = os.path.join(DATA_DIR, "tokens/train_text_token14.csv")
DATA_TOKEN100 = os.path.join(DATA_DIR, "tokens/train_text_token100.csv")

# Load the raw training text, tokenise it, and count every token.
# tokens[1:] skips the first token -- presumably the csv column header; TODO confirm
# (if cleaning already removed the header, this drops a real token instead).
doc = load_doc(DATA_FILE)
tokens = clean_doc_text(doc)
token_count_dict = count_tokens(tokens[1:])

# Each section below filters the counts at one threshold, saves the surviving
# tokens to the matching csv, and prints how many tokens were kept.

# TOKENS WITH COUNT >= 4: 2944 TOKENS
reduced_token_count_dict4, token_list4 = reduced_dict(token_count_dict, 4)
list_to_csv(token_list4, DATA_TOKEN4)
print(len(token_list4))

# TOKENS WITH COUNT >= 6: 2069 TOKENS
reduced_token_count_dict6, token_list6 = reduced_dict(token_count_dict, 6)
list_to_csv(token_list6, DATA_TOKEN6)
print(len(token_list6))

# TOKENS WITH COUNT >= 8: 1602 TOKENS
reduced_token_count_dict8, token_list8 = reduced_dict(token_count_dict, 8)
list_to_csv(token_list8, DATA_TOKEN8)
print(len(token_list8))

# TOKENS WITH COUNT >= 9: 1441 TOKENS
reduced_token_count_dict9, token_list9 = reduced_dict(token_count_dict, 9)
list_to_csv(token_list9, DATA_TOKEN9)
print(len(token_list9))

# TOKENS WITH COUNT >= 13: 1053 TOKENS
reduced_token_count_dict13, token_list13 = reduced_dict(token_count_dict, 13)
list_to_csv(token_list13, DATA_TOKEN13)
print(len(token_list13))

# TOKENS WITH COUNT >= 14: 986 TOKENS
reduced_token_count_dict14, token_list14 = reduced_dict(token_count_dict, 14)
list_to_csv(token_list14, DATA_TOKEN14)
print(len(token_list14))

# TOKENS WITH COUNT >= 100: 64 TOKENS
reduced_token_count_dict100, token_list100 = reduced_dict(token_count_dict, 100)
list_to_csv(token_list100, DATA_TOKEN100)
print(len(token_list100))

2944
2069
1602
1441
1053
986
64
