# Systematicity in English monomorphemic words by word class

### Sean Trott

Do certain word classes have more sub-morphemic systematicity than others?

## Load model and dataset

In [1]:
import os 
import gensim
import numpy as np
import pandas as pd
import re

# Variables
MODEL_PATH = os.environ['WORD2VEC_PATH']
ROOT_PATH = 'data/raw/roots_celex_monosyllabic.txt'

Using TensorFlow backend.


In [2]:
model = gensim.models.KeyedVectors.load_word2vec_format(MODEL_PATH, binary=True)

In [3]:
entries = open(ROOT_PATH, "r").read().split("\n")

In [4]:
words = [entry.split("\\")[0] for entry in entries if entry != "" and entry.islower()]
words[0]

'a'

## Filter by words that appear in model

In [5]:
critical_words = list(set([w for w in words if w in model.vocab]))

In [6]:
len(critical_words)

2082

## Obtain form and meaning similarity metrics

Here, we import the class `SystematicityUtilities` from a [custom library](https://github.com/seantrott/nlp_utilities). By default, this class uses *Levenshtein distance* as its metric for *form similarity*, and *cosine distance* as its metric for *meaning similarity*. The `compare_form_and_meaning` method used below compares every word pair along form and meaning dimensions.

In [7]:
from nlp_utilities.compling import SystematicityUtilities
systematicity_utils = SystematicityUtilities(model)
comparisons = systematicity_utils.compare_form_and_meaning(critical_words)

In [8]:
import pandas as pd

In [9]:
comparisons_df = pd.DataFrame.from_dict(comparisons)

In [10]:
print("{length} comparisons total".format(length=len(comparisons_df)))

2166321 comparisons total


In [11]:
comparisons_df.sample(4)

Unnamed: 0,form,meaning,w1,w2
1773363,3,0.132985,spray,bran
1013995,5,-0.002741,lo,space
90076,5,0.116098,troop,age
1362426,5,0.140935,right,twice


## Global correlation

In [12]:
from scipy.stats import linregress

In [13]:
true_regression = linregress(comparisons_df['form'], comparisons_df['meaning'])
true_regression.rvalue

-0.040672612879521758

In other words, words with more **form differences** (e.g. a higher Levenshtein distance) will have *less* similar meanings (e.g. cosine similarity).

## Compare global correlation to permuted distributions

In [14]:
import numpy as np

In [15]:
permuted_results = []
for permute in range(100):
    permuted_meaning = np.random.permutation(comparisons_df['meaning'])
    random_regression = linregress(comparisons_df['form'], permuted_meaning)
    permuted_results.append(random_regression)

In [16]:
permuted_cors = [reg.rvalue for reg in permuted_results]

Now we can compare the *true correlation* with the distribution of correlations obtained by shuffling our dataset.

In [22]:
greater = [cor for cor in permuted_cors if cor <= true_regression.rvalue]
p_global = len(greater) / len(permuted_cors)
p_global

0.0

## Systematicity coefficients for each word

In [48]:
from multiprocessing import Process, Queue, Pool

In [64]:
def run_comparison(queue, word, wordlist):
    copy = wordlist[:]
    copy.remove(word)
    comparisons_minus_word = systematicity_utils.compare_form_and_meaning(copy)
    new_regression = linregress(comparisons_minus_word['form'], comparisons_minus_word['meaning'])
    queue.put([word, new_regression])

In [67]:
pool = Pool(3)
results = pool.map(time.sleep, [4, 6, 8])

Traceback (most recent call last):
  File "/Users/seantrott/anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-67-37677103f248>", line 1, in <module>
    pool = Pool(3)
  File "/Users/seantrott/anaconda/lib/python3.5/multiprocessing/context.py", line 118, in Pool
  File "/Users/seantrott/anaconda/lib/python3.5/multiprocessing/pool.py", line 150, in __init__
  File "/Users/seantrott/anaconda/lib/python3.5/multiprocessing/pool.py", line 243, in _setup_queues
  File "/Users/seantrott/anaconda/lib/python3.5/multiprocessing/context.py", line 111, in SimpleQueue
  File "/Users/seantrott/anaconda/lib/python3.5/multiprocessing/queues.py", line 323, in __init__
  File "/Users/seantrott/anaconda/lib/python3.5/multiprocessing/connection.py", line 512, in Pipe
OSError: [Errno 24] Too many open files

During handling of the above exception, another exception occurred:

Traceback 

ERROR: Internal Python error in the inspect module.
Below is the traceback from this internal error.


Unfortunately, your original traceback can not be constructed.



OSError: [Errno 24] Too many open files

In [None]:

queues, processes = [], []
for word in critical_words[0:1]:
    q = Queue()
    p = Process(target=run_comparison, args=(q, word, critical_words))
    p.start()
    processes.append(p)
    queues.append(q)
    for pro in processes:
        if pro.is_alive:
            pro.join()

In [None]:
for word in critical_words:
    q = Queue()
    p = Process(target=run_comparison, args=(q, word, critical_words))
    p.start()
    processes.append(p)
    queues.append(q)