# Clusterability pilot

From Barend's email:

''
Take the ~1290 homonyms, polysemes, and monosemes.

initialize a dictionary clusterability = {}

Go over SUBTL (if you don't have it, I can upload it to the server) and extract 100 random instances of sentences per word.

For every word w in the 1290 test items

-- for every sentence in the 100 sentences, extract the sentence vector by summing over all the content word vectors according to word2vec (or glove -- i forget which one was better)

-- initialize scores = []

-- for k in {2,5,10,20,50}, 

---- run k-means clustering on the 100 sentence vectors

---- calculate the average silhouette for each vector and add this value to scores https://en.wikipedia.org/wiki/Silhouette_(clustering)

-- set clusterability[w] to be the max of scores

Compare the clusterability scores for the three groups of words: are homonyms more clusterable than polysemes and monosemes etc. (You can use a t-test for this)
''

In [1]:
import os
import numpy as np
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import progressbar
from scipy import stats
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import codecs
from collections import defaultdict
import random
# Tokenizer that keeps maximal runs of word characters (regex \w+),
# so punctuation and whitespace never end up in a token.
tokenizer = RegexpTokenizer(r'\w+')
# NLTK's English stopword list, held as a set for fast membership tests.
stopWords = set(stopwords.words('english'))

In [2]:
# Subtitle corpus text file; later cells read it line by line as latin-1 text.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
SUBTITLE_CORPUS = '/ais/clspace5/u/sasa/HOMONOMY_POLYSEMY/Subtlex/Subtlex.US.txt'

In [3]:
# Gensim Continuous Skipgram
# English Wikipedia Dump of February 2017 Gigaword 5th Edition
# not lemmatized
# Loaded from the word2vec *text* format (binary=False). Later cells use it as
# a word -> vector lookup (`VECTORS[w]`) with membership tests (`w in VECTORS`).
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
VECTORS = KeyedVectors.load_word2vec_format('/u/sasa/sasa/model.txt', binary=False)

In [4]:
def _read_first_column(path):
    '''
    Returns the first tab-separated field of every line of `path`
    except the first line.

    path - path of a tab-separated text file
    '''
    # `with` closes the file handle as soon as the list has been built
    with open(path) as f_words:
        # [1:] skips the first line (presumably the TSV header row)
        return [line.split('\t')[0] for line in f_words.readlines()[1:]]


# Target word lists, one per ambiguity type
HOMO = _read_first_column('/u/sasa/homonymy_polysemy/Sasa_SOS.HomonymHybrid.tsv')
POLY = _read_first_column('/u/sasa/homonymy_polysemy/Sasa_SOS.Polysemes.tsv')
MONO = _read_first_column('/u/sasa/homonymy_polysemy/Sasa_SOS.Monosemes.tsv')
# total number of target words (displayed as the cell output)
len(HOMO + POLY + MONO)

1287

In [36]:
def context_clusterability(words, context_dict, random_state=None):
    '''
    Returns a dict mapping between target words in `words`
    and a clusterability score.

    The score of a target word is the highest mean silhouette score
    (euclidean metric) reached when its context vectors are clustered
    with k-means for k = 2, 4, 6, ..., 50, where k is always kept below
    the number of context vectors.

    words - iterable of target words (strings)
    context_dict - maps between target word t and a sequence of t's
        contexts. All contexts of one target word must be of the same
        kind, decided by the first element:
          * strings: context words, each embedded by its own vector;
          * lists/tuples/sets of strings: tokenized sentences, each
            embedded as the sum of the vectors of its distinct words,
            ignoring stopwords and the target word itself.
        NOTE(review): raw (untokenized) sentence strings are treated as
        context words, exactly as before; tokenize sentences first.
    random_state - optional seed passed on to KMeans so that runs are
        reproducible (default None keeps the previous unseeded behavior)

    Target words for which no score can be computed (fewer than 3 usable
    context vectors, or clusterings that collapse into a single cluster)
    are left out of the returned dict.

    Raises TypeError if the contexts are neither strings nor
    lists/tuples/sets of strings.
    '''
    # materialise once: `words` may be a one-shot iterable or a dict view
    words = list(words)
    clusterability = {}

    # size the bar by the words actually processed, so it reaches 100%
    bar = progressbar.ProgressBar(maxval=max(len(words), 1))
    bar.start()
    for i, tw in enumerate(words):
        bar.update(i)

        context_vectors = _context_vectors(tw, context_dict[tw])

        # k = 2, 4, ..., 50, but always fewer clusters than vectors
        # (the silhouette score is undefined otherwise)
        ks = range(2, min(52, len(context_vectors)), 2)
        if not ks:
            # too few context vectors to cluster at all
            continue

        data = np.asarray(context_vectors)
        scores = []
        for k in ks:
            kmeans = KMeans(init='k-means++', n_clusters=k, random_state=random_state)
            kmeans.fit(data)
            if len(set(kmeans.labels_)) < 2:
                # degenerate clustering (e.g. all vectors identical):
                # silhouette_score needs at least two distinct labels
                continue
            scores.append(silhouette_score(data, kmeans.labels_, metric='euclidean'))

        if scores:
            clusterability[tw] = max(scores)

    bar.finish()

    return clusterability


def _context_vectors(tw, contexts):
    '''
    Returns the list of vectors (looked up in VECTORS) that represent
    the contexts of target word `tw`; see `context_clusterability`
    for the accepted kinds of contexts.
    '''
    contexts = list(contexts)
    if not contexts:
        return []

    first = contexts[0]
    if isinstance(first, str):
        # contexts are individual context words; skip out-of-vocabulary ones
        return [VECTORS[cw] for cw in contexts if cw in VECTORS]

    if isinstance(first, (list, tuple, set, frozenset)):
        # contexts are tokenized sentences: one summed vector per sentence
        # remove stopwords *and* target word
        excluded = stopWords | {tw}
        sentence_vectors = []
        for sentence in contexts:
            word_vectors = [VECTORS[cw] for cw in set(sentence) - excluded if cw in VECTORS]
            if not word_vectors:
                # nothing left to embed (only stopwords, the target word or
                # out-of-vocabulary words), so there is no sentence vector
                continue
            sentence_vectors.append(np.sum(word_vectors, axis=0))
        return sentence_vectors

    raise TypeError('contexts should be strings or lists of strings')

In [12]:
# find most strongly associated context words for each w in MONO/POLY/HOMO using pmi

# (target_word, context_word) frequencies: the number of corpus lines in which
# both words occur (a line counts at most once per pair)
f_xy = defaultdict(lambda: defaultdict(int))

target_words = set(HOMO + POLY + MONO)

print('Reading sublex file [1/3]...')
with open(SUBTITLE_CORPUS, "r", encoding='latin-1') as f:
    # count the lines lazily (only needed to size the progress bar) instead of
    # loading the whole corpus into a list
    bar = progressbar.ProgressBar(maxval=sum(1 for _line in f))

print('Reading sublex file [2/3]...')
bar.start()
with open(SUBTITLE_CORPUS, "r", encoding='latin-1') as f:
    for i,line in enumerate(f):
        sentence = tokenizer.tokenize(line.lower())
        # distinct words of this line, built once and reused for every target word
        sentence_words = set(sentence)
        for tw in target_words & sentence_words:
            for cw in sentence_words:
                if cw != tw:
                    f_xy[tw][cw] += 1
        bar.update(i)
bar.finish()

print('Reading sublex file [3/3]...')
# all word frequencies (token counts), accumulated line by line so that the
# corpus is never held in memory as one string; tokens never span a line
# break, so the counts equal those of tokenizing the whole file at once
f_x = Counter()
with open(SUBTITLE_CORPUS, "r", encoding='latin-1') as f:
    for line in f:
        f_x.update(tokenizer.tokenize(line.lower()))

print('Done!')

# total number of words (needed in PMI computation)
N = sum(f_x.values())
N

Reading sublex file [1/3]...
Reading sublex file [2/3]...


100% |########################################################################|


Reading sublex file [3/3]...
Done!


50619248

### Compute ranked (by PMI) context words per target
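Note that *not* having a lower threshold on the frequency of co-occurrence (between target word and context word) produces pretty bad context word rankings. In the cell below that co-occurrence threshold is commented out; very rare context words are filtered out by their relative corpus frequency instead.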

In [24]:
# For every target word: its context words paired with their PMI,
# ordered from the most to the least strongly associated.
context_words_pmi = {}
for tw in HOMO + POLY + MONO:
    tw_count = f_x[tw]
    # TODO: what should min freq threshold be? (a cutoff on the co-occurrence
    # count, f_xy[tw][cw] < 3, is left disabled)
    pmi_by_cw = {
        cw: np.log(N * f / (tw_count * f_x[cw]))
        for cw, f in f_xy[tw].items()
        # keep only context words whose relative corpus frequency is at least 2e-6
        if f_x[cw] / N >= 2e-6
    }
    # sort words by pmi, highest first
    context_words_pmi[tw] = sorted(pmi_by_cw.items(), key=lambda t: t[1], reverse=True)


Some randomly selected target words and their top context words. Note: not all words (e.g. "sash") have enough context words.

In [25]:
# Fixed seed so the same example words are drawn on every run
random.seed(2018)
# three examples per word type, drawn in the order MONO, POLY, HOMO
examples = []
for word_list in (MONO, POLY, HOMO):
    examples.extend(random.sample(word_list, 3))

for tw in examples:
    print(tw.upper())
    # non-stopwords among the 20 highest-pmi context words
    top_words = [t[0] for t in context_words_pmi[tw][:20] if t[0] not in stopWords]
    print(', '.join(top_words))


APARTMENT
vacant, rented, renting, lease, samuels, tenant, coronary, bugged, crappy, penthouse, loft, tenants, complex, landlady, decorated, trend, drapes, berger, doorman, shitty
LINKS
cuff, hmmm, lunar, mainframe, defenses, andrews, tile, website, neural, mediterranean, rommel, curl, weaker, conditioned, und, edison, platinum, patti, ranking, supplied
BEARDED
edwardes, furry, clam, jumpin, haste, shaved, skiing, shortage, panther, dancers, seals, strap, overlook, evidently, oyster, singers, suited, portland, enters, bikini
BURN
slash, crisp, amulet, fires, perish, brighter, bridges, shred, villages, rubber, plantation, initiate, flame, crops, scars, circuits, stake, flames, palms, marks
TWIRL
baton, hawaiian, obnoxious, whooping, salsa, spine, checkbook, vacant, knob, knack, tame, viola, alarms, ½, shadows, massage, umbrella, wink, arch, rum
SNOW
cone, sleigh, dashing, lotus, globe, melted, eileen, shovel, frosty, skiing, footprints, avalanche, barefoot, driven, duel, sled, snowing, 

175407

### test with different kinds of contexts
1. randomly selected sentence vectors
2. randomly selected context words
3. top-k context words by pmi (k=10,20,30,50,100)

In [None]:
# randomly selected sentences
# NOTE(review): unfinished scratch cell, never executed (`In [None]`). It looks
# like the body of a per-target-word loop pasted without its enclosing `for`:
# the indentation is inconsistent and `continue` appears outside any loop, so
# the cell cannot run as-is. `CONTEXT_DIR` is not defined anywhere in the
# visible notebook, and `tw` is whatever an earlier cell left behind.
# NOTE(review): np.random.choice samples with replacement by default, so the
# 100 drawn lines may contain duplicates -- confirm this is intended.
with open(os.path.join(CONTEXT_DIR, '{}.txt'.format(tw))) as f_context:
            contexts = np.random.choice(f_context.readlines(), size=100)
        
        sentence_vectors = []
        for sentence in contexts:
            # remove stopwords *and* target word
            sentence = set(sentence.strip().split()) - (stopWords | set([tw]))
            if not sentence:
                # sentence contained only stopwords and target word, thus empty
                continue
            sentence_vectors.append(np.sum([VECTORS[cw] for cw in sentence if cw in VECTORS], axis=0))
        # filter out "empty" sentence vectors
        # (when no word of a sentence is in VECTORS, np.sum of the empty list is a
        # scalar whose shape () is falsy)
        sentence_vectors = [sv for sv in sentence_vectors if sv.shape]
        # fewer than 50 usable sentence vectors: skip this target word
        if len(sentence_vectors) < 50:
            continue


In [26]:
from scipy.stats import describe

# Non-stopword context words of every target word (pmi values dropped)
contexts = []
for ranked in context_words_pmi.values():
    contexts.append([pair[0] for pair in ranked if pair[0] not in stopWords])

# How many context words does each target word have?
num_contexts = [len(context_words) for context_words in contexts]
print(describe(num_contexts))
# number of target words with fewer than 50 / fewer than 20 context words
for threshold in (50, 20):
    print(sum(1 for n in num_contexts if n < threshold))

DescribeResult(nobs=1287, minmax=(141, 5528), mean=1258.1802641802642, variance=1140357.993918126, skewness=1.3523214304347755, kurtosis=1.308854857119539)
0
0


In [37]:
# top-k context words
# Ranked (by pmi) context words of every target word, restricted to
# non-stopwords that have an embedding. This filtering does not depend on k,
# so it is done once here rather than once per value of k.
ranked_context_words = {
    tw: [t[0] for t in ranked if t[0] not in stopWords and t[0] in VECTORS]
    for tw, ranked in context_words_pmi.items()
}

for k in [10, 20, 30, 50, 100]:
    # target word -> its k highest-pmi context words
    context_dict = {}
    for tw, cws in ranked_context_words.items():
        if len(cws) < k:
            print("{} has {}<k={} context words; it will not be included in dataset".format(tw, len(cws), k))
            continue
        context_dict[tw] = cws[:k]

    clusterability = context_clusterability(context_dict.keys(), context_dict)
    print("top-{} context words by pmi score".format(k))
    print_results(clusterability)


100% |########################################################################|


top-10 context words by pmi score
mono
0.080 (0.039)
poly
0.082 (0.043)
homo
0.083 (0.041)
Ttest_indResult(statistic=-0.5604315933798681, pvalue=0.5753317317476934)
Ttest_indResult(statistic=-1.2244162038759583, pvalue=0.22113212246744604)
Ttest_indResult(statistic=-0.6199053597944446, pvalue=0.535485073361547)


100% |########################################################################|


top-20 context words by pmi score
mono
0.070 (0.027)
poly
0.073 (0.032)
homo
0.073 (0.033)
Ttest_indResult(statistic=-1.3623398144447478, pvalue=0.17344901131194018)
Ttest_indResult(statistic=-1.4427505061451524, pvalue=0.1494565722203723)
Ttest_indResult(statistic=-0.0973409297017433, pvalue=0.9224784395276681)


100% |########################################################################|


top-30 context words by pmi score
mono
0.066 (0.024)
poly
0.068 (0.024)
homo
0.068 (0.028)
Ttest_indResult(statistic=-1.0610628938910691, pvalue=0.288960596497583)
Ttest_indResult(statistic=-1.0939405115084297, pvalue=0.2742888047376381)
Ttest_indResult(statistic=-0.11072209004498942, pvalue=0.9118626824774975)


100% |########################################################################|


top-50 context words by pmi score
mono
0.062 (0.018)
poly
0.063 (0.018)
homo
0.064 (0.024)
Ttest_indResult(statistic=-0.6966368027790235, pvalue=0.4862192432148398)
Ttest_indResult(statistic=-1.469305021150172, pvalue=0.14211748439214836)
Ttest_indResult(statistic=-0.868392875933672, pvalue=0.38542268884806197)


 98% |####################################################################### |

top-100 context words by pmi score
mono
0.058 (0.013)
poly
0.059 (0.013)
homo
0.060 (0.016)
Ttest_indResult(statistic=-0.40949156783689905, pvalue=0.682281398764693)
Ttest_indResult(statistic=-1.2679884497021294, pvalue=0.2051468471089382)
Ttest_indResult(statistic=-0.9107806129136616, pvalue=0.3626673751544677)


100% |########################################################################|


In [28]:
def print_results(clusterability):
    '''
    Prints a summary of the clusterability scores per word type,
    followed by pairwise t-tests between the types.

    clusterability - dict mapping target words to their score; words
        of MONO/POLY/HOMO that are missing from it are ignored

    For each of 'mono', 'poly' and 'homo' (in that order) the type name
    and "mean (standard deviation)" of its scores are printed. Then the
    results of scipy's independent two-sample t-test are printed for
    mono vs. poly, mono vs. homo and poly vs. homo. Returns None.
    '''
    groups = (('mono', MONO), ('poly', POLY), ('homo', HOMO))

    scores = {}
    for label, members in groups:
        # scores of the words of this type that were actually scored
        scores[label] = [clusterability[w] for w in members if w in clusterability]
        print(label)
        print("{:.3f} ({:.3f})".format(np.mean(scores[label]), np.std(scores[label])))

    for first, second in (('mono', 'poly'), ('mono', 'homo'), ('poly', 'homo')):
        print(stats.ttest_ind(scores[first], scores[second]))