In [39]:
import pandas as pd
import numpy as np
import os
import string
import datetime as dt
import pathlib

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import string
import re
from pattern.text.en import singularize

In [3]:
# Work from inside the columnist's folder: the cells below call os.getcwd()
# and os.listdir() and expect to find the article files there.
# The guard keeps this cell safe to re-run -- once already inside the folder,
# a second relative chdir would raise FileNotFoundError.
if pathlib.Path.cwd().name != "Mike Hulett":
    os.chdir("./Mike Hulett/")

In [37]:
def read_text_file(file_path):
    """Return the full contents of a UTF-8 encoded text file as one string.

    Args:
        file_path: Path of the file to open in text mode.

    Returns:
        The decoded contents of the file.
    """
    # An alternative open() using encoding='cp1252' was left commented out
    # here originally; UTF-8 is the encoding actually in use.
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
		
def print_punctuation(input_string=None):
    """Return the ASCII punctuation marks of a text, in order of appearance.

    Curly double quotes (“ and ”) are counted as the straight double quote.
    Every other character (letters, digits, whitespace, newlines and any
    symbol not in ``string.punctuation``) is dropped.  Despite the name,
    nothing is printed; the marks are returned.

    Args:
        input_string: Text to scan.  ``None`` (the default) or an empty
            string yields an empty result instead of raising.

    Returns:
        A string made only of the punctuation characters that were found.
    """
    # The default argument is None; guard it so a bare call does not crash.
    if not input_string:
        return ""

    curly_to_straight = {'”': "\"", '“': "\""}
    marks = []
    for char in input_string:
        char = curly_to_straight.get(char, char)
        if char in string.punctuation:
            marks.append(char)
    return "".join(marks)

def text_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Strip every run of digit characters from ``text``.

    Only the digits are deleted; whitespace around them is left in place.
    """
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters outside that ASCII set (for example curly quotes) are kept.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans({mark: None for mark in string.punctuation})
    return text.translate(deletion_table)

def remove_whitespace(text):
    """Collapse each run of whitespace to one space and trim both ends."""
    words = text.split()
    return " ".join(words)

def count_sentences(text):
    """Return how many sentences ``sent_tokenize`` finds in ``text``."""
    sentences = sent_tokenize(text)
    return len(sentences)

def count_words(text):
    """Return the number of tokens ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return len(tokens)

def count_distinct_words(text):
    """Return the number of different tokens ``word_tokenize`` finds in ``text``.

    Tokens are compared exactly as produced, so differently-cased spellings
    count as different words.
    """
    unique_tokens = {token for token in word_tokenize(text)}
    return len(unique_tokens)

def build_vocab(existing_list, new_text, distinct=True):
    """Return the vocabulary extended with the tokens of ``new_text``.

    Args:
        existing_list: Words collected so far.  It is not modified.
        new_text: Text that is split with ``word_tokenize`` and added.
        distinct: When true, every word appears once in the result, in
            order of first appearance.  When false, every occurrence is
            kept (existing words first, then the new tokens).

    Returns:
        A new list of words.
    """
    # Work on a copy: extending the argument in place would leave the
    # caller's list holding duplicates even when a distinct result was
    # requested.
    combined = list(existing_list)
    combined.extend(word_tokenize(new_text))

    if not distinct:
        return combined

    # De-duplicate while keeping first-seen order so the result is the same
    # on every run (the iteration order of a set of strings is not).
    seen = set()
    unique_words = []
    for word in combined:
        if word not in seen:
            seen.add(word)
            unique_words.append(word)
    return unique_words

def remove_stopwords(text):
    """Tokenize ``text`` and drop the NLTK English stopwords.

    Returns:
        A list of the remaining tokens in their original order (a list,
        not a joined string).
    """
    english_stopwords = set(stopwords.words("english"))
    kept_tokens = []
    for token in word_tokenize(text):
        if token in english_stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

def _normalize_text(text):
    """Run the cleaning steps shared by both public cleaners.

    Curly quotes are first turned into their straight forms so that the
    punctuation removal (which only knows ASCII punctuation) deletes them;
    the text is then lower-cased, stripped of digits and punctuation, and
    has its whitespace collapsed.
    """
    text = text.replace('”', "\"")
    text = text.replace('“', "\"")
    text = text.replace('’', "\'")
    text = text_lowercase(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    return remove_whitespace(text)


def clean_string(text):
    """Return ``text`` normalized, with stopwords removed.

    Args:
        text: Raw article text.

    Returns:
        The tokens left by ``remove_stopwords`` joined by single spaces.
    """
    tokens = remove_stopwords(_normalize_text(text))
    return ' '.join([str(token) for token in tokens])


def clean_incl_stopwords(text):
    """Return ``text`` normalized exactly like ``clean_string`` but keeping stopwords.

    Args:
        text: Raw article text.

    Returns:
        The tokens from ``word_tokenize`` joined by single spaces.
    """
    tokens = word_tokenize(_normalize_text(text))
    return ' '.join([str(token) for token in tokens])

def vocab_freq(word_list,
               _singularize=False,
               top_n=None):
    """Build a word-frequency table sorted from most to least frequent.

    Args:
        word_list: Iterable of words; repeated entries are what gets counted.
        _singularize: When true, every word is passed through ``singularize``
            before counting, so singular and plural forms share one row.
        top_n: When given (and non-zero), only the first ``top_n`` rows are
            returned; otherwise the whole table is returned.

    Returns:
        A DataFrame indexed by ``word`` with a single ``freq`` column.  Words
        with equal counts keep the order in which they were first seen.
    """
    # Imported here so the function needs no extra notebook-level import.
    from collections import Counter

    if _singularize:
        words = [singularize(plural) for plural in word_list]
    else:
        words = word_list

    # One counting pass instead of a list.count() scan per distinct word;
    # most_common() already yields (word, count) pairs in descending order
    # with a reproducible order for ties.
    ranked = Counter(words).most_common()
    freq_table = pd.DataFrame(ranked, columns=['word', 'freq']).set_index('word')

    if top_n:
        return freq_table.head(top_n)
    return freq_table

def stack_string(text, max_length=25):
    """Wrap ``text`` by inserting a newline after every ``max_length`` characters.

    A final newline is always appended, so a text whose length is an exact
    multiple of ``max_length`` ends with two newlines.

    Args:
        text: The string to wrap.
        max_length: Number of characters per line.

    Returns:
        The wrapped string.
    """
    pieces = []
    run_length = 0
    for char in text:
        pieces.append(char)
        run_length += 1
        if run_length == max_length:
            # Line is full: break it and start counting the next one.
            pieces.append('\n')
            run_length = 0
    pieces.append('\n')
    return ''.join(pieces)

In [5]:
wd = os.getcwd()
# Sorted so the rows come out in file-name (YYYYMMDD, i.e. chronological)
# order no matter how the operating system happens to list the folder.
file_list = sorted(os.listdir())
# The working folder is named after the columnist; basename() works with
# whichever path separator the platform uses.
columnist = os.path.basename(wd)

# file names are in YYYYMMDD.txt format
DATE_FORMAT = '%Y%m%d'

# Per-article statistics: one list entry per .txt file.
# (Named so that the builtins `dict` and `format` are not shadowed.)
story_stats = {
    "columnist": [],
    "file_date": [],
    "story_punctuation": [],
    "word_count": [],
    "distinct_word_count": [],
    "sentence_count": []
}

# Corpus-level vocabulary: a single row for this columnist.
vocab_summary = {
    "columnist": [],
    "distinct_words": [],
    "distinct_words_excl_stop": [],
    "all_nonstop_words": []
}

vocab_w_stop = []
vocab_no_stop = []
all_words = []

# iterate through all files
for file_name in file_list:
    # Only text files are articles; anything else in the folder is skipped.
    if not file_name.endswith(".txt"):
        continue

    file_path = os.path.join(wd, file_name)
    # The first eight characters of the name are the publication date.
    file_date = dt.datetime.strptime(file_name[:8], DATE_FORMAT)

    raw_text = read_text_file(file_path)
    text_no_stop = clean_string(raw_text)
    text_with_stop = clean_incl_stopwords(raw_text)

    # Running vocabularies: distinct words without / with stopwords, plus
    # every non-stopword occurrence (duplicates kept) for frequency counts.
    vocab_no_stop = build_vocab(vocab_no_stop, text_no_stop)
    vocab_w_stop = build_vocab(vocab_w_stop, text_with_stop)
    all_words = build_vocab(all_words, text_no_stop, distinct=False)

    # Counts are taken on the raw text, not the cleaned versions.
    story_stats["columnist"].append(columnist)
    story_stats["file_date"].append(file_date)
    story_stats["story_punctuation"].append(print_punctuation(raw_text))
    story_stats["word_count"].append(count_words(raw_text))
    story_stats["distinct_word_count"].append(count_distinct_words(raw_text))
    story_stats["sentence_count"].append(count_sentences(raw_text))

vocab_summary["columnist"].append(columnist)
vocab_summary["distinct_words"].append(vocab_w_stop)
vocab_summary["distinct_words_excl_stop"].append(vocab_no_stop)
vocab_summary["all_nonstop_words"].append(all_words)

df = pd.DataFrame(story_stats)
df_vocab = pd.DataFrame(vocab_summary)


In [6]:
# df_vocab holds a single row; its all_nonstop_words cell is the list of
# every non-stopword occurrence (duplicates kept) across the articles.
hl = df_vocab["all_nonstop_words"].iloc[0]

In [7]:
# Ten most frequent non-stopwords, with plural forms folded into singulars.
v = vocab_freq(word_list=hl, top_n=10, _singularize=True)
v

Unnamed: 0_level_0,freq
word,Unnamed: 1_level_1
biden,225
trump,176
american,147
president,136
democrat,122
america,101
white,83
person,81
would,79
state,76


In [8]:
# Show the per-article statistics table (one row per article file).
df

Unnamed: 0,columnist,file_date,story_punctuation,word_count,distinct_word_count,sentence_count
0,Mike Hulett,2020-04-11,",."","".,-,.,.,-.,,',-.,-.,,-..,.,,.,,""."",.""."",,...",631,374,28
1,Mike Hulett,2020-04-18,",:"",,.,..""!..,,,.:"",,."".,.:""(),."".:"",,--?""?:"",...",607,321,29
2,Mike Hulett,2020-05-09,";,..!,./.,,.%..:,,,,,,;.,,,..,....!.,....,,.,....",585,314,33
3,Mike Hulett,2020-05-23,"-.,,....,-.,,-.,.-.-."",""-?.-"""".-"""",-.."".""-,""""....",616,361,30
4,Mike Hulett,2020-06-20,".,??..,:,,,-?:"",,.""??,?,.""""."""".,,.,,.,.,,-..:,...",591,303,31
...,...,...,...,...,...,...
62,Mike Hulett,2022-09-24,","",.""!..:""-.."",,,.:"":?-?"","",,,,-..,,."","""",-""-""...",599,309,24
63,Mike Hulett,2022-10-08,"""?"",.,.,-.--..-"","","","".-:"".."",!,:""..,,...."":""?...",587,337,20
64,Mike Hulett,2022-10-22,",.,.-,,,""."",,.,.,,%,"""".%.""-,.""."";."".""."","""".,-?...",572,320,18
65,Mike Hulett,2022-11-05,",:,-,.,.,,,-.""-"",.,.-."""".!,.,--,-,,--,.-,,..-....",583,349,27


In [38]:
# Print each article's punctuation sequence wrapped at 50 characters per line.
# Iterating over the column's values avoids looking rows up by the label i,
# which only works while df still has its default 0..n-1 index.
for punctuation in df['story_punctuation']:
    print(stack_string(punctuation, 50))

,.",".,-,.,.,-.,,',-.,-.,,-..,.,,.,,".",.".",,-.,-
..,?,,.!-,,?,..,.,.,","..,,.,..:",,."?..,,."":":..
..";!

,:",,.,.."!..,,,.:",,.".,.:"(),.".:",,--?"?:",,,?"
,.,.,."".,.","..-".".,-...,..,-:"...-."!,-""..

;,..!,./.,,.%..:,,,,,,;.,,,..,....!.,....,,.,..,,,
.-..,,,.....,,.

-.,,....,-.,,-.,.-.-.","-?.-"".-"",-.."."-,"".,:"?
":".",.?-?:.""..,.,.,,,,!.-?:--,".",,.,.-,-,.,.().
..,.

.,??..,:,,,-?:",,."??,?,.""."".,,.,,.,.,,-..:,,.(%
)-..,,.,-,(),.(,.)..(%),,,:(%)."".,.:,,--+.,-."".

...,.,,."",,.,,,,.,.,.,-,.,.;....,..,.""."",..,.,,
,.,-.,".",,.,.,,..,.-,-,.:,,?,,""""?.,,?

.,/...,...,.-.,-.??.,.,.?,,.:.......,..?..:",,."--
,..-,..",",."".,:??,--?..

.,,..,.......",",,..,..,..""..",",,,,...,....,-,,-
"."".,.":.","-,,.,,"."-,.:,.-,",",-,.

--....,.:,,,,,,,.,.,,"".,.,,.(--),,,,,,,,,,....,.-
,?!,.?"!"!-"".,.,"""-.",,.,.,.,".":,,.",",..",""."
.!

:,"""".--,,,.--,."".,,",,,.":".",..,."";-.,.?.,--,
.,-."",".":"..",,.,-,,,,"",!-.-,,,,"".-,-".",-.

.,-.,.,,.".""-.",.,,.,,.'.,.,,..,.,%,,.,-.,.,-.

In [46]:
# Summary statistics over all articles.  Averages are floored to whole words;
# the spreads come from np.std called with its default arguments and are
# rounded to one decimal place.
word_counts = df["word_count"]
distinct_counts = df["distinct_word_count"]

avg_word_count = np.floor(np.average(word_counts)).astype(np.int32)
std_word_count = round(np.std(word_counts), 1)

avg_distinct_word_count = np.floor(np.average(distinct_counts)).astype(np.int32)
std_distinct_word_count = round(np.std(distinct_counts), 1)

print('Avg total word count: ', avg_word_count, 'StDev total word count:', std_word_count)
print('Avg distinct word count:', avg_distinct_word_count, 'StDev distinct word count:', std_distinct_word_count)

Avg total word count:  590 StDev total word count: 15.5
Avg distinct word count: 329 StDev distinct word count: 19.0
