In [1]:
import os
import bz2
import pickle as pkl

In [2]:
os.chdir(os.path.join(r"C:\Users\Raya\OneDrive\Documents\3-CSAI\CSAI-Y3-S2\Thesis\csai-thesis"))

In [15]:
# Constants

# Word ids
wordids_compressed = os.path.join("models", "wiki_wordids.txt.bz2")
wordids_decompressed = os.path.join("models", "wordids.txt")

# Vocabularies
vocab = os.path.join('data', 'vocab.pkl')
vocab_controlled = os.path.join('data', 'vocab_controlled.pkl')

# Words to inspect
selected_words = os.path.join("data", "200_words.txt")
controlled_selected_words = os.path.join("data", "200_words_controlled.txt")

# Celex dictionary
celex = os.path.join('data', 'celex_dict.pkl')

# Output files
id2word_file = os.path.join('data', 'id2word_dict.pkl')
words_inspection_results = os.path.join('data', 'words_inspection_results.txt')

In [14]:
# # Save a decompressed version of the wordids file
# with bz2.open(wordids_compressed, 'rb') as f_in, open(wordids_decompressed, 'wb') as f_out:
#     f_out.write(f_in.read())

In [6]:
# Creating an id2word dictionary from the compressed file (directly)
id2word_dict = {}

# Preview the file contents
with bz2.open(wordids_compressed, 'rt', encoding='utf-8') as f:
    contents = f.read()
    print("The contents of the wordids file look like this:")
    print(contents[:100])

# Extract words and corresponding IDs
with bz2.open(wordids_compressed, 'rt', encoding='utf-8') as f:    
    first_line = True
    for line in f:
        if first_line:
            first_line = False
            continue  # Skip the first line
        id, word, _ = line.split() # Split the line into components
        id2word_dict[id] = word

print(f"The id2word dictionary looks like this:")
for i, (id, word) in enumerate(list(id2word_dict.items())[:10]):
    print(f"{id} : {word}")

The contents of the wordids file look like this:
5545283
16004	aa	25827
12579	aaa	12305
63127	aaaa	999
80933	aaas	1594
54701	aab	1435
20022	aac	2796

The id2word dictionary looks like this:
16004 : aa
12579 : aaa
63127 : aaaa
80933 : aaas
54701 : aab
20022 : aac
30535 : aachen
96694 : aacsb
84104 : aacta
54859 : aad


In [23]:
# # Save the id2word dictionary for later use
# with open(id2word_file, 'wb') as f:
#     pkl.dump(id2word_dict, f)

with open(id2word_file, 'rb') as f:
    id2word_dict_check = pkl.load(f)

print(f"id2word_dict (saved) contains {len(id2word_dict_check)} items. The first 10 are:")
for i, (id, word) in enumerate(list(id2word_dict_check.items())[:10]):
    print(f"{id} : {word}")

id2word_dict (saved) contains 100000 items. The first 10 are:
16004 : aa
12579 : aaa
63127 : aaaa
80933 : aaas
54701 : aab
20022 : aac
30535 : aachen
96694 : aacsb
84104 : aacta
54859 : aad


# Inspect words

In [25]:
# Load the words
with open(selected_words, 'r') as f:
    words_to_inspect = [word.strip() for word in f.readlines()]

total_words = len(words_to_inspect)
print(total_words)
print(words_to_inspect)

100
['chess', 'metro', 'noting', 'manila', 'santiago', 'branding', 'trade', 'miranda', 'obvious', 'alicia', 'afraid', 'biggest', 'saint', 'racer', 'bachelor', 'constituent', 'troubles', 'additionally', 'oblast', 'observing', 'manages', 'events', 'skill', 'kerry', 'channel', 'rest', 'seconds', 'investors', 'thomas', 'hermann', 'holidays', 'eye', 'kathy', 'flesh', 'prairie', 'threatened', 'brandon', 'type', 'korean', 'andrew', 'georges', 'engage', 'evaluated', 'subjects', 'bruce', 'debate', 'senator', 'testimony', 'occurring', 'tri', 'tough', 'stretched', 'bangkok', 'majority', 'acquire', 'cliffs', 'covers', 'marching', 'reported', 'improve', 'delhi', 'walking', 'covid', 'circle', 'fearing', 'copy', 'deposits', 'pennsylvania', 'recommendations', 'corpus', 'une', 'butterfly', 'accept', 'resistance', 'scorers', 'stuff', 'grove', 'whose', 'myself', 'expeditionary', 'singer', 'stream', 'arkansas', 'jul', 'definition', 'constellation', 'towards', 'dale', 'westminster', 'meters', 'faculty', 's

## Using nltk

In [7]:
import nltk
# Ensure the 'words' corpus is downloaded
nltk.download('words')
from nltk.corpus import words

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Raya\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [26]:
# Access the word list from the nltk 'words' corpus
word_list = words.words('en')
invalid_words_nltk = [] # List to store invalid words found in the selected sample

for word in words_to_inspect:
    if word not in word_list:
        invalid_words_nltk.append(word)

# Calculate ratio of invalid words to total words
ratio = len(invalid_words_nltk) / len(words_to_inspect)

print(f"Ratio of invalid over valid English words (based on nltk.corpus.words): {len(invalid_words_nltk)}/{total_words}={ratio}\n")
print(invalid_words_nltk)

# NOTE: many valid words are classified as invalid. We need a larger word list or we will resort to manual inspection.

Ratio of invalid over valid English words (based on nltk.corpus.words): 46/100=0.46

['metro', 'noting', 'santiago', 'branding', 'miranda', 'alicia', 'troubles', 'oblast', 'manages', 'events', 'seconds', 'investors', 'thomas', 'hermann', 'holidays', 'kathy', 'threatened', 'brandon', 'korean', 'andrew', 'georges', 'evaluated', 'subjects', 'bruce', 'occurring', 'stretched', 'cliffs', 'covers', 'marching', 'reported', 'delhi', 'fearing', 'deposits', 'pennsylvania', 'recommendations', 'une', 'scorers', 'arkansas', 'jul', 'westminster', 'meters', 'studies', 'traces', 'rome', 'dennis', 'claire']


## Using pyenchant

In [28]:
import enchant

english_dict = enchant.Dict("en_US")
invalid_words_enchant = []

for word in words_to_inspect:
    if not english_dict.check(word):
        invalid_words_enchant.append(word)

print(f"Ratio of invalid over valid English words (based on pyenchant): {len(invalid_words_enchant)}/{total_words}={ratio}\n")
print(invalid_words_enchant)

# NOTE: We see that most of the words classified as invalid are names
# The other words are: 'tri' and 'une'

Ratio of invalid over valid English words (based on pyenchant): 25/100=0.46

['santiago', 'miranda', 'alicia', 'kerry', 'thomas', 'hermann', 'kathy', 'brandon', 'korean', 'andrew', 'georges', 'bruce', 'tri', 'bangkok', 'delhi', 'covid', 'pennsylvania', 'une', 'arkansas', 'jul', 'westminster', 'rome', 'dennis', 'claire', 'walt']


## Using CELEX - for morphological status

In [None]:
# Import the list of selected word types for inspection
with open(selected_words) as f:
    words = [word.strip() for word in f.readlines()]

print(f"{len(words)} selected word types: {words}")


# Load the CELEX-based dictionary with word information
with open(celex, 'rb') as f:
    celex_dict = pkl.load(f)
print(f"Loaded CELEX dictionary with {len(celex_dict)} entries") 

# Extract all monomorphemic and non-monomorphemic words from the CELEX dictionary
celex_monomorph = []
celex_nonmonomorph = []
for word in celex_dict:
    # Check if the word is a monomorphemic ('morphstatus':'M') 
    if celex_dict[word]['morphstatus'] == 'M':
        celex_monomorph.append(celex_dict[word]['worddia'])
    else:
        celex_nonmonomorph.append(celex_dict[word]['worddia'])
print(f"It contains {len(celex_monomorph)} monomorphemic and {len(celex_nonmonomorph)} non-monomorphemic words")

# Define lists to classify the selected word types
selected_monomorph_freq = []
selected_monomorph_rd = []
selected_nonmonomorph_freq = []
selected_nonmonomorph_rd = []
oov = [] # not in celex

for i, word in enumerate(words):
    if word in celex_monomorph or word.capitalize() in celex_monomorph:
        if i < 101:
            selected_monomorph_freq.append(word)
        else:
            selected_monomorph_rd.append(word)
    elif word in celex_nonmonomorph or word.capitalize() in celex_monomorph:
        if i < 101:
            selected_nonmonomorph_freq.append(word)
        else:
            selected_nonmonomorph_rd.append(word)
    else:
        oov.append(word)

print(f"{len(selected_monomorph_freq)} of selected frequent words are monomorphemic")
print(f"{len(selected_monomorph_rd)} of randomly selected words are monomorphemic")

print(f"{len(selected_nonmonomorph_freq)} of selected frequent words are non-monomorphemic")
print(f"{len(selected_nonmonomorph_rd)} of randomly selected words are non-monomorphemic")

print(f"{len(oov)} word not found in the CELEX database: {oov}")