In [1]:
import pandas as pd
import numpy as np
import os
import string
import datetime as dt
import pathlib

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import string
import re
from pattern.text.en import singularize

In [2]:
# Work from the columnist's folder (the later cells list and read its files).
# Guarded so that re-running this cell is a no-op: a second bare chdir would
# look for "Mike Hulett" inside itself and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != "Mike Hulett":
    os.chdir("./Mike Hulett/")

In [12]:
def read_text_file(file_path):
    """Return the full contents of the file at ``file_path``, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents
		
def print_punctuation(input_string=None):
    """Return the punctuation characters of ``input_string`` in order of appearance.

    Despite the name, nothing is printed; the punctuation is returned as a
    single string. Curly double quotes are folded to the ASCII double quote
    first, because ``string.punctuation`` only contains ASCII characters.

    Parameters
    ----------
    input_string : str or None
        Text to scan. ``None`` (the default) or an empty string yields ``""``.

    Returns
    -------
    str
        Every character of the input that is in ``string.punctuation``,
        concatenated in the original order.
    """
    # The default argument is None; treat "no text" as "no punctuation"
    # instead of failing on len(None).
    if not input_string:
        return ""

    marks = []
    for char in input_string:
        if char in ('”', '“'):
            char = "\""
        if char in string.punctuation:
            marks.append(char)
    # Join once at the end rather than growing a string character by character.
    return "".join(marks)

def text_lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Return ``text`` with every run of digits deleted (nothing is inserted in its place)."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` removed.

    Only ASCII punctuation is affected; typographic quotes and dashes are
    left as they are.
    """
    # Mapping a code point to None tells str.translate to delete it.
    deletions = {ord(mark): None for mark in string.punctuation}
    return text.translate(deletions)

def remove_whitespace(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    pieces = text.split()
    return " ".join(pieces)

def count_sentences(text):
    """Return the number of sentences NLTK's ``sent_tokenize`` finds in ``text``."""
    sentences = sent_tokenize(text)
    return len(sentences)

def count_words(text):
    """Return the number of tokens NLTK's ``word_tokenize`` produces for ``text``.

    The count is over whatever ``word_tokenize`` returns, so any punctuation
    tokens it emits are included.
    """
    tokens = word_tokenize(text)
    return len(tokens)

def count_distinct_words(text):
    """Return the number of distinct tokens ``word_tokenize`` produces for ``text``.

    Tokens are compared exactly as returned, so the count is case-sensitive.
    """
    unique_tokens = {token for token in word_tokenize(text)}
    return len(unique_tokens)

def build_vocab(existing_list, new_text, distinct=True):
    """Return the vocabulary ``existing_list`` extended with the tokens of ``new_text``.

    Parameters
    ----------
    existing_list : list of str
        Words collected so far. It is not modified; a new list is returned.
    new_text : str
        Text to tokenize with ``word_tokenize`` and add to the vocabulary.
    distinct : bool, default True
        If true, duplicates are dropped and each word appears once, in
        first-seen order. If false, every occurrence is kept.

    Returns
    -------
    list of str
        The combined word list.
    """
    # Work on a copy: extending the caller's list in place would leave it
    # holding duplicates that the returned (distinct) list does not have.
    combined = list(existing_list) + word_tokenize(new_text)

    if not distinct:
        return combined

    # De-duplicate while keeping first-seen order so that results do not
    # depend on set iteration order.
    seen = set()
    unique_words = []
    for word in combined:
        if word not in seen:
            seen.add(word)
            unique_words.append(word)
    return unique_words

def remove_stopwords(text):
    """Tokenize ``text`` and return the tokens that are not English stopwords.

    The stopword list is NLTK's ``stopwords.words("english")``; membership is
    tested on the tokens exactly as ``word_tokenize`` returns them.

    Returns a list of tokens (not a string).
    """
    english_stops = set(stopwords.words("english"))
    kept = []
    for token in word_tokenize(text):
        if token in english_stops:
            continue
        kept.append(token)
    return kept

def clean_string(text):
    """Normalise ``text`` and return it as a space-joined string without stopwords.

    Steps, in order: typographic quotes/apostrophes become ASCII, hyphens
    become spaces, then lower-casing, digit removal, punctuation removal,
    whitespace collapsing and finally stopword removal.
    """
    # Fold curly quotes and apostrophes to their ASCII equivalents.
    for fancy, plain in (('”', "\""), ('“', "\""), ('’', "\'")):
        text = text.replace(fancy, plain)

    # Split hyphenated words into separate words.
    text = text.replace('-', " ")

    for step in (text_lowercase, remove_numbers, remove_punctuation, remove_whitespace):
        text = step(text)

    # remove_stopwords returns a list of tokens; stitch it back into one string.
    kept_words = remove_stopwords(text)
    return ' '.join(str(word) for word in kept_words)

def clean_incl_stopwords(text):
    """Normalise ``text`` like ``clean_string`` does, but keep stopwords.

    Steps, in order: typographic quotes/apostrophes become ASCII, hyphens
    become spaces, then lower-casing, digit removal, punctuation removal and
    whitespace collapsing. The result is tokenized with ``word_tokenize`` and
    the tokens are joined back together with single spaces.
    """
    # Fold curly quotes and apostrophes to their ASCII equivalents.
    for fancy, plain in (('”', "\""), ('“', "\""), ('’', "\'")):
        text = text.replace(fancy, plain)

    # Split hyphenated words into separate words.
    text = text.replace('-', " ")

    for step in (text_lowercase, remove_numbers, remove_punctuation, remove_whitespace):
        text = step(text)

    tokens = word_tokenize(text)
    return ' '.join(str(token) for token in tokens)

def vocab_freq(word_list,
               _singularize=False,
               top_n=None):
    """Build a word-frequency table from a list of words.

    Parameters
    ----------
    word_list : list of str
        Words to count; every occurrence contributes to the frequency.
    _singularize : bool, default False
        If true, each word is passed through ``singularize`` before counting,
        so the counts are per singularized form.
    top_n : int or None, default None
        If truthy, only the first ``top_n`` rows are returned. ``None`` (or 0)
        returns the whole table.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``word`` with a single ``freq`` column, sorted by ``freq``
        in descending order. Words with equal counts appear in the order they
        were first seen in ``word_list``.
    """
    from collections import Counter

    if _singularize:
        words = [singularize(plural) for plural in word_list]
    else:
        words = list(word_list)

    # One pass over the words instead of a list.count() scan per distinct word.
    # most_common() orders by count (descending) and keeps first-seen order
    # among ties, so the table is identical from run to run.
    ranked = Counter(words).most_common()

    df = pd.DataFrame(ranked, columns=['word', 'freq']).set_index('word')

    if top_n:
        return df.head(top_n)
    return df

def stack_string(text, max_length=25):
    """Hard-wrap ``text`` by inserting a newline after every ``max_length`` characters.

    A final newline is always appended, so text whose length is an exact
    multiple of ``max_length`` ends with two newlines.
    """
    pieces = []
    run_length = 0
    for character in text:
        pieces.append(character)
        run_length += 1
        if run_length == max_length:
            # Line is full: break it and start counting the next one.
            pieces.append('\n')
            run_length = 0
    pieces.append('\n')
    return ''.join(pieces)

def count_quoted_words(text):
    """Return the number of words that appear inside double-quoted passages of ``text``.

    Curly double quotes are treated as ASCII double quotes. A quotation is a
    non-empty run of characters between an opening and a closing double
    quote. The contents of all quotations are joined with spaces, normalised
    with ``clean_incl_stopwords`` and counted with ``count_words``.

    Text after an opening quote that is never closed is not counted. This
    matches ``stats_quoted_words``, which only records a quotation when its
    closing quote is seen.
    """
    text = text.replace('”', "\"")
    text = text.replace('“', "\"")

    # Each match is the inside of one closed quotation; "[^"]+" requires at
    # least one character, so an empty pair of quotes never forms a quotation.
    quotations = re.findall(r'"([^"]+)"', text)

    quoted_words = clean_incl_stopwords(' '.join(quotations))
    return count_words(quoted_words)

def stats_quoted_words(text):
    """Summarise the double-quoted passages of ``text`` in a one-row DataFrame.

    Curly double quotes are treated as ASCII double quotes. A quotation is a
    non-empty run of characters between an opening and a closing double
    quote; text after an unclosed opening quote is ignored. Each quotation is
    normalised with ``clean_incl_stopwords`` and its words are counted with
    ``count_words``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns:

        - ``quote_word_count``: total words across all quotations
        - ``quote_count``: number of quotations (including any that contain
          no words after cleaning)
        - ``count_short``: quotations of 1 to 3 words
        - ``count_long``: quotations of more than 3 words
        - ``avg_len_short`` / ``avg_len_long``: mean word count of the short /
          long quotations rounded to one decimal, or 0 when there are none
    """
    text = text.replace('”', "\"")
    text = text.replace('“', "\"")

    # Tokenize each closed quotation once and keep only its word count.
    quote_word_len_list = [
        count_words(clean_incl_stopwords(quotation))
        for quotation in re.findall(r'"([^"]+)"', text)
    ]
    short_quotes = [n for n in quote_word_len_list if 0 < n <= 3]
    long_quotes = [n for n in quote_word_len_list if n > 3]

    def _mean_or_zero(lengths):
        # Average word count rounded to one decimal; 0 for an empty group.
        if len(lengths) > 0:
            return round(sum(lengths) / len(lengths), 1)
        return 0

    stats = {
        'quote_word_count': [sum(quote_word_len_list)],
        'quote_count': [len(quote_word_len_list)],
        'count_short': [len(short_quotes)],
        'count_long': [len(long_quotes)],
        'avg_len_short': [_mean_or_zero(short_quotes)],
        'avg_len_long': [_mean_or_zero(long_quotes)],
    }

    return pd.DataFrame(stats)

In [13]:
# Build one row of statistics per story file in the working directory (df)
# and one row of pooled vocabulary for the columnist (df_vocab).
wd = os.getcwd()
# os.listdir() returns names in arbitrary order; sort so the rows of df come
# out in file-name (i.e. date) order on every run.
file_list = sorted(os.listdir())
# The working directory's folder name is used as the columnist's name.
# os.path.basename handles the host OS's path separator, unlike a hard-coded '\\'.
columnist = os.path.basename(wd)

# file names are in YYYYMMDD.txt format
format = '%Y%m%d'

# NOTE(review): the names `dict` and `format` shadow Python builtins for the
# rest of the session. They are kept because other cells may refer to them;
# consider renaming once that has been checked.
dict = {
    "columnist":[],
    "file_date":[],
    "story_punctuation":[],
    "word_count":[],
    "distinct_word_count":[],
    'quote_word_count':[],
    "sentence_count":[],
    'quote_count':[],
    'count_short_quote':[],
    'count_long_quote':[],
    'avg_len_short_quote':[],
    'avg_len_long_quote':[]
}

dict_vocab = {
    "columnist":[],
    "distinct_words":[],
    "distinct_words_excl_stop":[],
    "all_nonstop_words":[]
}

vocab_w_stop = []
vocab_no_stop = []
all_words = []

# iterate through all files
for file in file_list:
    # Only text files are stories; skip everything else.
    if not file.endswith(".txt"):
        continue

    # The first eight characters of the file name are the story date.
    ymd = file[:8]
    file_path = os.path.join(wd, file)
    file_date = dt.datetime.strptime(ymd, format)

    s = read_text_file(file_path)
    cleaned_string = clean_string(s)
    clean_with_stopwords = clean_incl_stopwords(s)
    p = print_punctuation(s)

    # Running vocabularies across all stories: distinct words without and
    # with stopwords, plus every non-stopword occurrence (for frequencies).
    vocab_no_stop = build_vocab(vocab_no_stop, cleaned_string)
    vocab_w_stop = build_vocab(vocab_w_stop, clean_with_stopwords)
    all_words = build_vocab(all_words, cleaned_string, distinct=False)

    # One-row DataFrame of quotation statistics for this story.
    dx = stats_quoted_words(s)

    dict["columnist"].append(columnist)
    dict["file_date"].append(file_date)
    dict["story_punctuation"].append(p)
    dict["word_count"].append(count_words(s))
    dict["distinct_word_count"].append(count_distinct_words(s))
    dict["sentence_count"].append(count_sentences(s))
    dict["quote_word_count"].append(dx.quote_word_count[0])
    dict["quote_count"].append(dx.quote_count[0])
    dict["count_short_quote"].append(dx.count_short[0])
    dict["count_long_quote"].append(dx.count_long[0])
    dict['avg_len_short_quote'].append(dx.avg_len_short[0])
    dict["avg_len_long_quote"].append(dx.avg_len_long[0])

# A single vocabulary row: each cell holds a whole word list.
dict_vocab["columnist"].append(columnist)
dict_vocab["distinct_words"].append(vocab_w_stop)
dict_vocab["distinct_words_excl_stop"].append(vocab_no_stop)
dict_vocab["all_nonstop_words"].append(all_words)

df = pd.DataFrame(dict)
df_vocab = pd.DataFrame(dict_vocab)


In [14]:
# df_vocab has a single row; its all_nonstop_words cell is the pooled list of
# every non-stopword occurrence across the stories.
hl = df_vocab['all_nonstop_words'].iloc[0]
# Ten most frequent words, counted on their singularized forms.
v = vocab_freq(hl, top_n=10, _singularize=True)
v

Unnamed: 0_level_0,freq
word,Unnamed: 1_level_1
biden,232
trump,205
american,155
president,137
democrat,127
america,102
white,84
person,81
state,80
would,79


In [8]:
# Print each story's punctuation sequence, wrapped at 50 characters per line.
# Iterating over the column avoids the label-based df[col][i] lookup, which
# only works while df still has its default 0..n-1 index.
for story_punctuation in df['story_punctuation']:
    print(stack_string(story_punctuation, 50))

,.",".,-,.,.,-.,,',-.,-.,,-..,.,,.,,".",.".",,-.,-
..,?,,.!-,,?,..,.,.,","..,,.,..:",,."?..,,."":":..
..";!

,:",,.,.."!..,,,.:",,.".,.:"(),.".:",,--?"?:",,,?"
,.,.,."".,.","..-".".,-...,..,-:"...-."!,-""..

;,..!,./.,,.%..:,,,,,,;.,,,..,....!.,....,,.,..,,,
.-..,,,.....,,.

-.,,....,-.,,-.,.-.-.","-?.-"".-"",-.."."-,"".,:"?
":".",.?-?:.""..,.,.,,,,!.-?:--,".",,.,.-,-,.,.().
..,.

.,??..,:,,,-?:",,."??,?,.""."".,,.,,.,.,,-..:,,.(%
)-..,,.,-,(),.(,.)..(%),,,:(%)."".,.:,,--+.,-."".

...,.,,."",,.,,,,.,.,.,-,.,.;....,..,.""."",..,.,,
,.,-.,".",,.,.,,..,.-,-,.:,,?,,""""?.,,?

.,/...,...,.-.,-.??.,.,.?,,.:.......,..?..:",,."--
,..-,..",",."".,:??,--?..

.,,..,.......",",,..,..,..""..",",,,,...,....,-,,-
"."".,.":.","-,,.,,"."-,.:,.-,",",-,.

--....,.:,,,,,,,.,.,,"".,.,,.(--),,,,,,,,,,....,.-
,?!,.?"!"!-"".,.,"""-.",,.,.,.,".":,,.",",..",""."
.!

:,"""".--,,,.--,."".,,",,,.":".",..,."";-.,.?.,--,
.,-."",".":"..",,.,-,,,,"",!-.-,,,,"".-,-".",-.

.,-.,.,,.".""-.",.,,.,,.'.,.,,..,.,%,,.,-.,.,-.