In [1]:
import os
import bz2
import random
import pickle as pkl

In [2]:
os.chdir(os.path.join(r"C:\Users\Raya\OneDrive\Documents\3-CSAI\CSAI-Y3-S2\Thesis\csai-thesis"))

In [3]:
# Constants

random.seed(10)

# Word ids
wordids_compressed = os.path.join("models", "wiki_wordids.txt.bz2")
wordids_decompressed = os.path.join("models", "wordids.txt")

# Vocabularies
vocab_file = os.path.join('data', 'vocab.pkl')
vocab_controlled_file = os.path.join('data', 'vocab_controlled.pkl')

# Words to inspect
sorted_ids_by_freq_file = os.path.join("data", "sorted_ids.txt")
selected_words_file = os.path.join("data", "200_words.txt")
controlled_selected_words_file = os.path.join("data", "200_words_controlled.txt")

# Celex dictionary
celex_file = os.path.join('data', 'celex_dict.pkl')

# Output files
id2word_file = os.path.join('models', 'id2word_dict.pkl')
words_inspection_results_file = os.path.join('data', 'words_inspection_results.txt')
ids_by_wordlength_file = os.path.join('data', 'ids_by_wordlength.pkl')
ids_by_wordlength_controlled_file = os.path.join('data', 'ids_by_wordlength_controlled.pkl')

In [4]:
# # Save a decompressed version of the wordids file
# with bz2.open(wordids_compressed, 'rb') as f_in, open(wordids_decompressed, 'wb') as f_out:
#     f_out.write(f_in.read())

In [5]:
# Creating an id2word dictionary from the compressed file (directly)
id2word_dict = {}

# Preview the file contents
with bz2.open(wordids_compressed, 'rt', encoding='utf-8') as f:
    contents = f.read()
    print("The contents of the wordids file look like this:")
    print(contents[:100])

# Extract words and corresponding IDs
with bz2.open(wordids_compressed, 'rt', encoding='utf-8') as f:    
    first_line = True
    for line in f:
        if first_line:
            first_line = False
            continue  # Skip the first line
        id, word, _ = line.split() # Split the line into components
        id2word_dict[id] = word

print(f"The id2word dictionary looks like this:")
for i, (id, word) in enumerate(list(id2word_dict.items())[:10]):
    print(f"{id} : {word}")

The contents of the wordids file look like this:
5545283
7801	aa	25827
1740	aaron	55355
9301	ab	31240
7014	abandon	26896
1912	abandoned	124225
9654	a
The id2word dictionary looks like this:
7801 : aa
1740 : aaron
9301 : ab
7014 : abandon
1912 : abandoned
9654 : abbey
9848 : abbot
9958 : abbott
6070 : abbreviated
1913 : abc


In [6]:
# # Save the id2word dictionary for later use
# with open(id2word_file, 'wb') as f:
#     pkl.dump(id2word_dict, f)

with open(id2word_file, 'rb') as f:
    id2word_dict_check = pkl.load(f)

print(f"id2word_dict (saved) contains {len(id2word_dict_check)} items. The first 10 are:")
for i, (id, word) in enumerate(list(id2word_dict_check.items())[:10]):
    print(f"{id} : {word}")

id2word_dict (saved) contains 100000 items. The first 10 are:
16004 : aa
12579 : aaa
63127 : aaaa
80933 : aaas
54701 : aab
20022 : aac
30535 : aachen
96694 : aacsb
84104 : aacta
54859 : aad


# Select words for inspection

In [7]:
with open(sorted_ids_by_freq_file, 'r') as f:
    sorted_ids = [line.split(',')[0] for line in f.readlines()]
print(f"Total number of ids: {len(sorted_ids)}")
print(f"10 most frequent ids: {sorted_ids[:10]}")

Total number of ids: 10000
10 most frequent ids: ['2554', '3207', '2986', '2364', '3005', '7749', '187', '3734', '819', '1777']


In [8]:
# Create a dictionary containing the 5000 most frequent words keyed by their IDs
most_freq_ids_5000 = []
counter = 0
for id in sorted_ids:
    if counter == 5000:
        break
    # Add the id-word pair to the vocabulary if it is part of the 10,000 filtered words collected in id2word_dict
    if id in id2word_dict:
        most_freq_ids_5000.append(id)
        counter += 1
vocab = {id:id2word_dict[id] for id in most_freq_ids_5000}

print(f"Vocabulary (raw) size: {len(vocab)}")

# Store the vocabulary in a pickle file for easy access
with open(vocab_file, 'wb') as f:
    pkl.dump(vocab, f)


### Repeat, controlling for morphological status of words in the vocabulary ###
# ---------- Load the CELEX dictionary ----------
with open(celex_file, 'rb') as f:
    celex_dict = pkl.load(f)

# Extract all wordforms which are monomorphemic lemmas from the CELEX dictionary
celex_monomorph_lemmas = []
for item in celex_dict:
    if celex_dict[item]['morphstatus'] == 'M' and celex_dict[item]['lemma'] == celex_dict[item]['worddia']:
        celex_monomorph_lemmas.append(celex_dict[item]['worddia'])
print(f"Monomorphemic lemmas in CELEX: {len(celex_monomorph_lemmas)}")


# ---------- Create a vocabulary with the 5000 most frequent monomorphemic wordforms which are lemmas ----------
# Create a dictionary containing the 5000 most frequent monomorphemic words keyed by their IDs
most_freq_monomorph_ids_10000 = [] # NOTE: changed to 10,000 from 5,000 because too few words are monomorphemic
counter = 0
for id in sorted_ids:
    if counter == 5000:
        break
    # Check if the word is monomorphemic according to CELEX, ensuring that it is also in the 10,000 selected words
    if id in id2word_dict and id2word_dict[id] in celex_monomorph_lemmas:
        most_freq_monomorph_ids_10000.append(id)
        counter += 1
controlled_vocab = {id:id2word_dict[id] for id in most_freq_monomorph_ids_10000}
print(f"Vocabulary (controlled) size: {len(controlled_vocab)}")

# Store the vocabulary in a pickle file for easy access
with open(vocab_controlled_file, 'wb') as f:
    pkl.dump(controlled_vocab, f)

# ---------- Select 200 word types for inspection from each vocabulary ----------
def select_word_types_for_inspection(sorted_word_ids, vocab, output_file):
    """
    Select 200 word types for inspection and write them to a file.
    
    Args:
        sorted_word_ids (list): List of word IDs to select from, sorted by frequency (from most frequent to least frequent).
        id2word_dict (dict): Dictionary mapping IDs to words.
        output_file (str): Path to the output file.
    """
    with open(output_file, 'w') as f:
        # Extract the 100 most frequent word IDs
        most_freq_ids_100 = sorted_word_ids[:100]
        for word_id in most_freq_ids_100:
            word = id2word_dict[word_id]
            f.write(f"{word}\n")
        
        # Extract 100 other IDs randomly
        random_word_ids = list(set(vocab) - set(most_freq_ids_100))
        for _ in range(100):
            random_id = random.choice(random_word_ids)
            word = id2word_dict[random_id]
            f.write(f"{word}\n")

# Usage for non-filtered vocabulary
select_word_types_for_inspection(most_freq_ids_5000, vocab, selected_words_file)

# Usage for controlled vocabulary
select_word_types_for_inspection(most_freq_monomorph_ids_10000, controlled_vocab, controlled_selected_words_file)


Vocabulary (raw) size: 5000
Monomorphemic lemmas in CELEX: 12546
Vocabulary (controlled) size: 1902


In [9]:
# Sanity check

with open(vocab_file, 'rb') as f:
    vocab = pkl.load(f, encoding='utf-8')
print(len(vocab))

with open(vocab_controlled_file, 'rb') as f:
    vocab_controlled = pkl.load(f, encoding='utf-8')
print(len(vocab_controlled))

5000
1902


# Inspect words

In [10]:
# Load the words
with open(selected_words_file, 'r') as f:
    words_to_inspect = [word.strip() for word in f.readlines()]

total_words = len(words_to_inspect)
print(total_words)
print(words_to_inspect)

200
['league', 'album', 'species', 'football', 'station', 'village', 'church', 'song', 'party', 'cup', 'population', 'town', 'game', 'women', 'river', 'election', 'player', 'band', 'games', 'la', 'division', 'championship', 'award', 'park', 'building', 'round', 'president', 'road', 'art', 'french', 'jpg', 'german', 'line', 'goals', 'book', 'railway', 'london', 'island', 'director', 'men', 'council', 'chart', 'track', 'elected', 'india', 'census', 'education', 'professional', 'championships', 'television', 'king', 'awards', 'coach', 'tournament', 'law', 'museum', 'al', 'region', 'william', 'research', 'australia', 'community', 'you', 'px', 'army', 'genus', 'minister', 'file', 'show', 'play', 'street', 'record', 'center', 'historic', 'australian', 'tv', 'lake', 'km', 'head', 'site', 'municipality', 'white', 'politician', 'father', 'san', 'indian', 'married', 'france', 'medal', 'union', 'episode', 'court', 'black', 'works', 'western', 'debut', 'radio', 'england', 'department', 'air', 'typ

## Using nltk

In [11]:
import nltk
# Ensure the 'words' corpus is downloaded
# nltk.download('words')
from nltk.corpus import words

In [12]:
# Access the word list from the nltk 'words' corpus
word_list = words.words('en')
invalid_words_nltk = [] # List to store invalid words found in the selected sample

for word in words_to_inspect:
    if word not in word_list:
        invalid_words_nltk.append(word)

# Calculate ratio of invalid words to total words
ratio = len(invalid_words_nltk) / len(words_to_inspect)

print(f"Ratio of invalid over valid English words (based on nltk.corpus.words): {len(invalid_words_nltk)}/{total_words}={ratio}\n")
print(invalid_words_nltk)

# NOTE: many valid words are classified as invalid. We need a larger word list or we will resort to manual inspection.

Ratio of invalid over valid English words (based on nltk.corpus.words): 52/200=0.26

['women', 'games', 'french', 'jpg', 'goals', 'london', 'elected', 'india', 'championships', 'awards', 'william', 'australia', 'px', 'australian', 'tv', 'km', 'indian', 'france', 'england', 'ian', 'plays', 'nominations', 'interviews', 'liverpool', 'worldwide', 'christ', 'fans', 'feet', 'publications', 'ncaa', 'indonesia', 'berkeley', 'renamed', 'passes', 'scholars', 'courts', 'alberto', 'nazi', 'ranking', 'lisa', 'orders', 'visited', 'ernest', 'apps', 'bengal', 'farms', 'vienna', 'pirates', 'madrid', 'bible', 'dragons', 'bennett']


## Using pyenchant

In [13]:
import enchant

english_dict = enchant.Dict("en_US")
invalid_words_enchant = []

for word in words_to_inspect:
    if not english_dict.check(word):
        invalid_words_enchant.append(word)

print(f"Ratio of invalid over valid English words (based on pyenchant): {len(invalid_words_enchant)}/{total_words}={ratio}\n")
print(invalid_words_enchant)

# NOTE: We see that most of the words classified as invalid are names and abbreviations
# The other words are: 'tri' and 'une'

Ratio of invalid over valid English words (based on pyenchant): 29/200=0.26

['jpg', 'london', 'india', 'william', 'australia', 'px', 'australian', 'tv', 'san', 'indian', 'france', 'england', 'ian', 'betty', 'liverpool', 'christ', 'jeff', 'sur', 'ncaa', 'indonesia', 'berkeley', 'alberto', 'nazi', 'lisa', 'ernest', 'bengal', 'vienna', 'madrid', 'bennett']


## Using CELEX - for morphological status

In [14]:
# Load the CELEX-based dictionary with word information
with open(celex_file, 'rb') as f:
    celex_dict = pkl.load(f)
print(f"Loaded CELEX dictionary with {len(celex_dict)} entries") 

# Extract all monomorphemic and non-monomorphemic words from the CELEX dictionary
celex_monomorph = []
celex_nonmonomorph = []
for word in celex_dict:
    # Check if the word is a monomorphemic ('morphstatus':'M') 
    if celex_dict[word]['morphstatus'] == 'M':
        celex_monomorph.append(celex_dict[word]['worddia'])
    else:
        celex_nonmonomorph.append(celex_dict[word]['worddia'])
print(f"It contains {len(celex_monomorph)} monomorphemic and {len(celex_nonmonomorph)} non-monomorphemic words")


for selected_words_f in [selected_words_file, controlled_selected_words_file]:
    print(f"--- File: {selected_words_f} ---")
    
    # Load the selected words for inspection
    with open(selected_words_f) as f:
        words = [word.strip() for word in f.readlines()]
    print(f"{len(words)} selected word types: {words}")


    # Define lists to classify the selected word types
    selected_monomorph_freq = []
    selected_monomorph_rd = []
    selected_nonmonomorph_freq = []
    selected_nonmonomorph_rd = []
    oov = [] # not in celex

    for i, word in enumerate(words):
        if word in celex_monomorph or word.capitalize() in celex_monomorph:
            if i < 101:
                selected_monomorph_freq.append(word)
            else:
                selected_monomorph_rd.append(word)
        elif word in celex_nonmonomorph or word.capitalize() in celex_monomorph:
            if i < 101:
                selected_nonmonomorph_freq.append(word)
            else:
                selected_nonmonomorph_rd.append(word)
        else:
            oov.append(word)

    print(f"{len(selected_monomorph_freq)} of selected frequent words are monomorphemic")
    print(f"{len(selected_monomorph_rd)} of randomly selected words are monomorphemic")

    print(f"{len(selected_nonmonomorph_freq)} of selected frequent words are non-monomorphemic")
    print(f"{len(selected_nonmonomorph_rd)} of randomly selected words are non-monomorphemic")

    print(f"{len(oov)} word not found in the CELEX database: {oov}")

Loaded CELEX dictionary with 160595 entries
It contains 28641 monomorphemic and 131954 non-monomorphemic words
--- File: data\200_words.txt ---
200 selected word types: ['league', 'album', 'species', 'football', 'station', 'village', 'church', 'song', 'party', 'cup', 'population', 'town', 'game', 'women', 'river', 'election', 'player', 'band', 'games', 'la', 'division', 'championship', 'award', 'park', 'building', 'round', 'president', 'road', 'art', 'french', 'jpg', 'german', 'line', 'goals', 'book', 'railway', 'london', 'island', 'director', 'men', 'council', 'chart', 'track', 'elected', 'india', 'census', 'education', 'professional', 'championships', 'television', 'king', 'awards', 'coach', 'tournament', 'law', 'museum', 'al', 'region', 'william', 'research', 'australia', 'community', 'you', 'px', 'army', 'genus', 'minister', 'file', 'show', 'play', 'street', 'record', 'center', 'historic', 'australian', 'tv', 'lake', 'km', 'head', 'site', 'municipality', 'white', 'politician', 'fat

# Group IDs by the length of the corresponding words

In [20]:
for vocab_f, wordlength_file in zip([vocab_file, vocab_controlled_file], [ids_by_wordlength_file, ids_by_wordlength_controlled_file]):
    print(vocab_f)
    with open(vocab_f, 'rb') as f:
        vocab = pkl.load(f)
    print(len(vocab))
    ids_by_wordlength_dict = {wl: [id for id in vocab if len(vocab[id]) == wl] for wl in range(3,8)}
    for wl in ids_by_wordlength_dict:
        print(f"Length {wl} ({len(ids_by_wordlength_dict[wl])}): {ids_by_wordlength_dict[wl]}")
    
    with open(wordlength_file, 'wb') as f:
        pkl.dump(ids_by_wordlength_dict, f)


data\vocab.pkl
5000
Length 3 (249): ['1777', '86', '3515', '706', '653', '3832', '3703', '1255', '2595', '1226', '2852', '1644', '3796', '763', '6442', '2013', '14', '1570', '994', '7012', '3540', '137', '2078', '309', '467', '3353', '2557', '572', '3158', '6920', '2256', '1458', '4954', '3052', '1721', '1505', '1757', '2457', '715', '2399', '1677', '3075', '4496', '7363', '2444', '1213', '3511', '1512', '802', '1558', '407', '6512', '2401', '885', '454', '3134', '9110', '666', '3366', '3272', '8002', '710', '5179', '641', '5231', '8180', '1151', '3659', '1003', '2039', '6494', '2062', '2518', '2480', '5075', '7815', '4971', '867', '5486', '3936', '5326', '8885', '5168', '2417', '2204', '9982', '1030', '1437', '5888', '7957', '7512', '2373', '699', '5470', '3533', '3906', '6862', '1670', '3332', '9666', '5785', '5544', '1241', '6610', '4913', '5636', '9138', '7695', '2096', '3372', '7584', '3221', '1658', '9097', '2918', '1543', '1826', '3044', '2733', '412', '2261', '1004', '9292', '6