In [13]:
# Set up
import os
import string
import csv

from pathlib import Path
from collections import Counter
from string import digits

current_directory = os.getcwd()

In [63]:
# Clean file, Return word list
def cleansed_text(file_name):
    # Open file
    text_file = open(f"{current_directory}/{file_name}")
    # Read text
    text = text_file.read().lower()
    # Remove digits
    remove_digits = str.maketrans('', '', digits)
    text = text.translate(remove_digits)
    # Remove punctuations
    text = text.translate(text.maketrans('', '', string.punctuation))
    # Remove newline
    text = text.replace("/n", " ")
    # Tokenize
    text = text.split(" ")
    # Close file
    text_file.close()
    
    return text


In [18]:
# Type token Ratio

def get_ttr(file_name):
    text_list = cleansed_text(file_name)
    unique_words = []
    
    # create a list of unique words
    for word in text_list:
        if word not in unique_words:
            unique_words.append(word)

    # calculate TTR
    ttr = len(unique_words) / len(text_list)
    
    return ttr

# Test
# print(get_ttr("Amish and Mennonite_amish-apple-dumplings.txt"))

0.7422680412371134


In [19]:
# Average word length

def avg_word_len(file_name):
    text_list = cleansed_text(file_name)
    
    count = 0
    for word in text_list:
        count += len(word)
    
    avg_len = count / len(text_list)
    return avg_len

# Test
# print(avg_word_len("Amish and Mennonite_amish-apple-dumplings.txt"))

4.65979381443299


In [37]:
# Total number of words

def total_num_words(file_name):
    text_list = cleansed_text(file_name)
    total_number_words = len(text_list)
    return total_number_words

# Test
# print(total_num_words("Amish and Mennonite_amish-apple-dumplings.txt"))

97


In [60]:
# Top 10 most frequently used words (excluding function words)

# List of function words
def get_function_words():
    # Open and read file
    fhand = open(f"{current_directory}/reference_data/function_words.txt")
    function_words_list = []
    
    for line in fhand:
        function_words_list.append(line.strip())
        
    # Close file
    fhand.close()
        
    return function_words_list

function_words = get_function_words()
    

# Check if word is a function
def is_function_word(word):
    if word in function_words:
        return True
    else:
        return False

# Return top 10 most frequently used words
def most_freq_words(file_name):
    text_list = cleansed_text(file_name)
    
    # Exclude function words
    text_nofw = [word for word in text_list if not is_function_word(word)]
    
    # Count word frequencies
    word_counter = Counter(text_nofw)
    
    # Retrieve the top 10 most frequent words
    top10_dict = word_counter.most_common(11)
    
    # Return the top 10 words
    top10 = []
    for word, count in top10_dict:
        if word != '':
            top10.append(word)
        
    return top10

# Test
# print(most_freq_words("Amish and Mennonite_amish-baked-oatmeal.txt"))

{'boy', 'parent', 'generation', 'aunt', 'daddy', 'fiancé', 'cousin’s wife', 'granddaughter', 'girl', 'first cousin once removed', 'boyfriend', 'relationship', 'niece', 'son-in-law', 'wife', 'friends', 'great-grandmother', 'mom', 'family', 'great-uncle', 'peer', 'grandpa', 'brother-in-law', 'grandfather', 'baby boy', 'pet', 'warm', 'grandmother', 'gathering', 'teen', 'great-aunt', 'cousin', 'partner', 'fiance', 'daughter', 'mommy', 'house', 'dad', 'cat', 'close', 'husband', 'daughter-in-law', 'holiday', 'pa', 'younger', 'holidays', 'love', 'young', 'friend', 'brother', 'father', 'household', 'old', 'girl twins', 'great-grandfather', 'twin boys', 'dog', 'kin', 'father-in-law', 'mother', 'child', 'sibling', 'older', 'grandson', 'teacher', 'sister', 'cousin’s husband', 'son', 'girlfriend', 'nephew', 'uncle', 'sister-in-law'}


In [39]:
# Top 10 bigrams

def top_bigrams(file_name):
    words = cleansed_text(file_name)
    
    # Retrieve all bigrams
    bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]

    # Frequency of each bigram
    bigrams_freq = Counter(bigrams)

    # Sort bigrams by in descending order
    sorted_bigrams = sorted(bigrams_freq.items(), key=lambda x: x[1], reverse=True)
    
    # Remove unecessary trailing whitespace
    cleaned_bigrams = {}
    for words, count in sorted_bigrams:
        if '' in words:
            continue
        else:
            cleaned_bigrams[words] = count
        
    # Return the top 10 most frequent bigrams
    top10_bigrams = []
    num = 0
    for word1, word2 in cleaned_bigrams:
        num += 1
        top10_bigrams.append((word1, word2))
        if num == 10:
            break

    return top10_bigrams

# Test
# print(top_bigrams("Amish and Mennonite_amish-baked-oatmeal.txt"))

[('just', 'as'), ('as', 'written'), ('i', 'used'), ('coconut', 'oil'), ('was', 'wonderful'), ('brown', 'sugar'), ('don’t', 'like'), ('too', 'sweet'), ('sweet', 'and'), ('blueberries', 'and')]


In [40]:
# Top 10 trigrams
            
def top_trigrams(file_name):
    words = cleansed_text(file_name)
    
    # Retrieve all trigrams
    trigrams = [(words[i], words[i+1], words[i+2]) for i in range(len(words)-2)]

    # Frequency of each bigram
    trigrams_freq = Counter(trigrams)

    # Sort bigrams by in descending order
    sorted_trigrams = sorted(trigrams_freq.items(), key=lambda x: x[1], reverse=True)
    
    # Remove unecessary trailing whitespace
    cleaned_trigrams = {}
    for words, count in sorted_trigrams:
        if '' in words:
            continue
        else:
            cleaned_trigrams[words] = count
        
    # Return the top 10 most frequent bigrams
    top10_trigrams = []
    num = 0
    for word1, word2, word3 in cleaned_trigrams:
        num += 1
        top10_trigrams.append((word1, word2, word3))
        if num == 10:
            break

    return top10_trigrams

# Test
# print(top_trigrams("Amish and Mennonite_amish-baked-oatmeal.txt"))

[('just', 'as', 'written'), ('too', 'sweet', 'and'), ('i', 'made', 'it'), ('made', 'it', 'just'), ('it', 'just', 'as'), ('as', 'written', 'and'), ('written', 'and', 'was'), ('and', 'was', 'wonderful'), ('did', 'some', 'experimentations'), ('i', 'used', 'coconut')]


In [27]:
# Output txt file with all texts combined
                
def combined_texts():
    directory = f"{current_directory}/text"
    
    result_str = ""
    for filename in os.listdir(directory):
        file = os.path.join(directory, filename)
        
        if ".DS_Store" in file:
            continue
        elif os.path.isfile(file):
            # Open and read file
            text_file = open(file, "r")
            text = text_file.read()
            # Close text file
            text_file.close()
            # Add text to result string
            result_str = " ".join([result_str, text])
    
    # Change directory
    cur_directory = os.chdir(current_directory)
    
    with open(f'all_comments.txt', 'w') as f:
        f.write(result_str)
        f.close()

# combined_texts()

In [41]:
# Return a list of words or ngrams to a string

def list_to_str(lst):
    s = ''
    for i in range(len(lst)):  # for each tuple

        # Create a string from a tuple (e.g., "the united nations")
        if type(lst[i]) == tuple:
            s += ' '.join(lst[i])
        else:
            s += lst[i]

        # Insert a '|' character if i is not the last index.
        if i < len(lst)-1:           
            s += ', '
            
    return "{}".format(s)


In [45]:
# Input into one final dictionary

def one_dict():
    print(current_directory)
    filename = "all_comments.txt"
    print(filename)
        
    # --- DATA ---
    # TTR
    ttr = get_ttr(filename)

    # Average word length
    avg_wordlength = avg_word_len(filename)

    # Total number of words
    word_count = total_num_words(filename)

    # Top 10 most frequent words
    top10_words = most_freq_words(filename)
    top10_words = list_to_str(top10_words)

    # Top 10 bigrams
    top10_bigrams = top_bigrams(filename)
    top10_bigrams = list_to_str(top10_bigrams)

    # Top 10 trigrams
    top10_trigrams = top_trigrams(filename)
    top10_trigrams = list_to_str(top10_trigrams)

    row_data = {"TTR": f"{ttr}",
                "Wd Length": f"{avg_wordlength}",
                "Word Count": f"{word_count}",
                "Top 10 words": f"{top10_words}",
                "Top 10 bigrams": f"{top10_bigrams}",
                "Top 10 trigrams": f"{top10_trigrams}"}
    
    return row_data

print(one_dict())

/Users/seyounglee/Desktop/coding_for_humanists/f23-labs-seyoungleee/Final Project
all_comments.txt
{'TTR': '0.033479034844986516', 'Wd Length': '4.1601809665650284', 'Word Count': '907792', 'Top 10 words': 'recipe, make, made, added, like, good, time, just, use, add', 'Top 10 bigrams': 'it was, in the, this recipe, of the, i used, the recipe, to make, a little, to the, and i', 'Top 10 trigrams': 'and it was, next time i, easy to make, to make it, followed the recipe, i made it, a lot of, this is a, i used a, it was a'}


In [71]:
# List of family words
def get_family_words():
    # Open file
    fhand = open(f"{current_directory}/reference_data/family_words.txt")
    family_words = [line.lower() for line in fhand.read().splitlines()]
    family_set = set(family_words)

    return family_set

family_words = get_family_words()

# Clean text
def cleansed_text2(text):
    # Remove digits
    remove_digits = str.maketrans('', '', digits)
    text = text.translate(remove_digits)
    # Remove punctuations
    text = text.translate(text.maketrans('', '', string.punctuation))
    # Remove newline
    text = text.replace("/n", " ")
    # Tokenize
    text = text.split(" ")
    
    return text

# Find family words
def find_family():
    fhand = open(f"{current_directory}/all_comments.txt")
    text = fhand.read().lower()
    
    # Clean text
    text_list = cleansed_text2(text)
    # Exclude function words
    text_nofw = [word for word in text_list if not is_function_word(word)]
    
    # Frequency of family words
    all_count = 0
    family_count = dict()
    for word in text_nofw:
        if word in family_words:
            all_count += 1
            
            if word in family_count:
                family_count[word] += 1
            else:
                family_count[word] = 1
                
    print(f"All family word count: {all_count}")
    print(f"Total word count: {len(set(text_nofw))}")
    return family_count


print(find_family())

All family word count: 6308
Total word count: 30147
{'partner': 7, 'husband': 805, 'wife': 152, 'love': 1359, 'child': 47, 'family': 1225, 'daughter': 131, 'mom': 246, 'girlfriend': 8, 'close': 210, 'house': 199, 'friend': 139, 'holidays': 37, 'holiday': 60, 'mother': 185, 'friends': 204, 'old': 289, 'warm': 214, 'girl': 31, 'son': 112, 'boyfriend': 65, 'sister': 38, 'boy': 20, 'older': 15, 'uncle': 13, 'young': 33, 'father': 25, 'household': 22, 'grandmother': 165, 'cat': 2, 'aunt': 32, 'teen': 5, 'dad': 42, 'granddaughter': 9, 'grandfather': 8, 'grandpa': 6, 'grandson': 11, 'teacher': 5, 'cousin': 6, 'generation': 7, 'niece': 3, 'gathering': 19, 'fiance': 24, 'younger': 6, 'pa': 22, 'brother': 18, 'nephew': 6, 'daddy': 4, 'dog': 6, 'fiancé': 5, 'kin': 1, 'pet': 5}


In [32]:
# Input data into final csv file

def final_csv():
    all_data = one_dict()
    
    # Change current directory
    cur_directory = os.chdir(f"{current_directory}/data/{subgroup}")
    
    fields = ["TTR",
              "Wd Length", "Word Count",
              "Top 10 words", "Top 10 bigrams", "Top 10 trigrams"]
    
    with open(f"all_comments_stats.csv", 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fields)
        
        writer.writeheader()
        writer.writerows(all_data)


all_newest20.txt
all_oldest20.txt
