In [24]:
import os
import bz2
import pickle as pkl

In [2]:
# Move to the project root so that the relative "data/..." paths resolve.
# The location can be overridden with the CSAI_THESIS_ROOT environment variable;
# when it is unset, the original hard-coded path is used.
PROJECT_ROOT = os.environ.get(
    "CSAI_THESIS_ROOT",
    r"C:\Users\Raya\OneDrive\Documents\3-CSAI\CSAI-Y3-S2\Thesis\csai-thesis",
)
os.chdir(PROJECT_ROOT)

In [54]:
# Constants: every file used by this notebook lives under the same relative directory.
data_dir = "data"

# bz2-compressed word-id table and the path for its decompressed copy
wordids_compressed = os.path.join(data_dir, "wiki_wordids.txt.bz2")
wordids_decompressed = os.path.join(data_dir, "wordids.txt")

# List of selected word types, one per line
selected_words = os.path.join(data_dir, "200_words.txt")

# Pickled CELEX-based dictionary
celex = os.path.join(data_dir, "wfdict.pkl")

# Output report of the word inspection
words_inspection = os.path.join(data_dir, "words_inspection.txt")

In [7]:
# Save a decompressed version of the wordids file.
# Both files are opened in binary mode: bz2.open() raises ValueError when an
# `encoding` argument is passed together with a binary mode, and the bytes are
# copied verbatim anyway, so no decoding is needed.
CHUNK_SIZE = 1024 * 1024  # copy 1 MiB at a time instead of holding the whole file in memory
with bz2.open(wordids_compressed, 'rb') as f_in, open(wordids_decompressed, 'wb') as f_out:
    # iter(callable, sentinel) keeps reading until read() returns b'' (end of file)
    for chunk in iter(lambda: f_in.read(CHUNK_SIZE), b''):
        f_out.write(chunk)

In [53]:
# Directly creating an id2word dictionary from the compressed file.
# Keys are the ids exactly as they appear in the file (strings), values are the words.
id2word_dict = {}

# Preview the file contents: read only the first 100 characters rather than
# decompressing the whole file just to slice it.
with bz2.open(wordids_compressed, 'rt', encoding='utf-8') as f:
    contents = f.read(100)
    print("The contents of the wordids file look like this:")
    print(contents)

# Extract words and corresponding IDs
with bz2.open(wordids_compressed, 'rt', encoding='utf-8') as f:
    next(f, None)  # Skip the first line (it holds a single number, not an id/word entry)
    for line in f:
        if not line.strip():
            continue  # Ignore blank lines, which would otherwise break the unpacking below
        # Each remaining line has three whitespace-separated fields; the third is unused here.
        # `word_id` is used instead of `id` so the builtin id() is not shadowed.
        word_id, word, _ = line.split()
        id2word_dict[word_id] = word

print("The id2word dictionary looks like this:")
# Show the first 10 entries without copying the whole dictionary into a list
for shown, (word_id, word) in enumerate(id2word_dict.items()):
    if shown >= 10:
        break
    print(f"{word_id} : {word}")

The contents of the wordids file look like this:
5545283
16004	aa	25827
12579	aaa	12305
63127	aaaa	999
80933	aaas	1594
54701	aab	1435
20022	aac	2796

The id2word dictionary looks like this:
16004 : aa
12579 : aaa
63127 : aaaa
80933 : aaas
54701 : aab
20022 : aac
30535 : aachen
96694 : aacsb
84104 : aacta
54859 : aad


# Inspect words

## Using nltk

In [None]:
import nltk
# Ensure the 'words' corpus is downloaded
nltk.download('words')
from nltk.corpus import words

# Access the word list
word_list = words.words('en')
# Set copy of the word list: membership tests against the list would scan it
# from the start for every selected word.
valid_words = set(word_list)
invalid_words = []
total_words = 0

# Collect every selected word that is absent from the nltk word list
with open(selected_words, 'r') as f:
    for line in f:
        word = line.strip()  # Remove newline and whitespace
        if not word:
            continue  # Blank lines are not words; do not count them
        total_words += 1
        if word not in valid_words:
            invalid_words.append(word)

if total_words == 0:
    print("No words to process.")

# Write the report: a summary line followed by one invalid word per line
with open(words_inspection, 'w') as f:
    if total_words > 0:
        ratio = len(invalid_words) / total_words
        # The value is invalid / total, so the label says "over all selected words"
        f.write(f"Ratio of invalid English words over all selected words (based on nltk.corpus.words): {len(invalid_words)}/{total_words}={ratio}\n")
    else:
        print("No valid data to calculate ratio.\n")
    for word in invalid_words:
        f.write(f"{word}\n")

## Using CELEX - for morphological status

In [None]:
# Import the list of selected word types for inspection
with open(selected_words) as f:
    words = [line.strip() for line in f]

print(f"{len(words)} selected word types: {words}")


# Load the CELEX-based dictionary with word information
# NOTE: pickle.load can execute arbitrary code; only load a pickle file you trust.
with open(celex, 'rb') as f:
    celex_dict = pkl.load(f)
print(f"Loaded CELEX dictionary with {len(celex_dict)} entries")

# Extract all monomorphemic and non-monomorphemic words from the CELEX dictionary
celex_monomorph = []
celex_nonmonomorph = []
for entry in celex_dict.values():
    # Check if the word is a monomorphemic ('morphstatus':'M')
    if entry['morphstatus'] == 'M':
        celex_monomorph.append(entry['worddia'])
    else:
        celex_nonmonomorph.append(entry['worddia'])
print(f"It contains {len(celex_monomorph)} monomorphemic and {len(celex_nonmonomorph)} non-monomorphemic words")

# Set copies for fast membership tests; the lists above are kept for the counts.
monomorph_lookup = set(celex_monomorph)
nonmonomorph_lookup = set(celex_nonmonomorph)

# Words at positions below this index are classified as "frequent", the rest as "random".
# NOTE(review): 101 puts positions 0..100 (101 words) in the frequent group — confirm
# this matches the layout of the selected-words file.
FREQ_CUTOFF = 101

# Define lists to classify the selected word types
selected_monomorph_freq = []
selected_monomorph_rd = []
selected_nonmonomorph_freq = []
selected_nonmonomorph_rd = []
oov = [] # not in celex

for i, word in enumerate(words):
    # Each word is looked up as written and with its first letter capitalised.
    candidates = (word, word.capitalize())
    is_frequent = i < FREQ_CUTOFF
    if any(form in monomorph_lookup for form in candidates):
        if is_frequent:
            selected_monomorph_freq.append(word)
        else:
            selected_monomorph_rd.append(word)
    # The capitalised form is checked against the NON-monomorphemic words here;
    # re-checking the monomorphemic ones could never match after the branch above.
    elif any(form in nonmonomorph_lookup for form in candidates):
        if is_frequent:
            selected_nonmonomorph_freq.append(word)
        else:
            selected_nonmonomorph_rd.append(word)
    else:
        oov.append(word)

print(f"{len(selected_monomorph_freq)} of selected frequent words are monomorphemic")
print(f"{len(selected_monomorph_rd)} of randomly selected words are monomorphemic")

print(f"{len(selected_nonmonomorph_freq)} of selected frequent words are non-monomorphemic")
print(f"{len(selected_nonmonomorph_rd)} of randomly selected words are non-monomorphemic")

print(f"{len(oov)} word not found in the CELEX database: {oov}")