In [14]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from collections import Counter
import heapq
import nltk
from nltk.corpus import stopwords
import itertools
import gensim
%matplotlib inline

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Evgeny\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
english_stopwords = set(stopwords.words('english'))

In [9]:
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes')
)

In [15]:
pured_documents = []
for i, doc in enumerate(dataset.data):
    tokens = gensim.utils.lemmatize(doc)
    document = []
    for token in tokens:
        word = token.split('/')[0]
        if word not in english_stopwords:
            document.append(word)
    pured_documents.append(document)    
    if i % 500 == 0:
        print 'Processed: ', i, 'documents from', len(dataset.data)

Processed:  0 documents from 11314
Processed:  500 documents from 11314
Processed:  1000 documents from 11314
Processed:  1500 documents from 11314
Processed:  2000 documents from 11314
Processed:  2500 documents from 11314
Processed:  3000 documents from 11314
Processed:  3500 documents from 11314
Processed:  4000 documents from 11314
Processed:  4500 documents from 11314
Processed:  5000 documents from 11314
Processed:  5500 documents from 11314
Processed:  6000 documents from 11314
Processed:  6500 documents from 11314
Processed:  7000 documents from 11314
Processed:  7500 documents from 11314
Processed:  8000 documents from 11314
Processed:  8500 documents from 11314
Processed:  9000 documents from 11314
Processed:  9500 documents from 11314
Processed:  10000 documents from 11314
Processed:  10500 documents from 11314
Processed:  11000 documents from 11314


In [16]:
text = nltk.Text(list(itertools.chain.from_iterable(pured_documents)))

In [17]:
class NltkHatPlayer(object):
    def __init__(self):
        pass
    
    def guess(self, words):
        candidates = Counter()
        for word in words:
            for w in self.explain(word)[:5]:
                candidates[w] += 1
        for word in words:
            if word in candidates:
                del candidates[word]
        return [x for x, _ in candidates.most_common(len(candidates))]
    
    def explain(self, word):
        return text._word_context_index.similar_words(word)
    
    def guess_result(self, position):
        pass
    
    def explanation_result(self, position):
        pass

In [18]:
from gensim.models import Word2Vec

In [19]:
model = Word2Vec(pured_documents, size=100, window=5, min_count=5, workers=4)

In [20]:
class GensimHatPlayer(object):
    def __init__(self):
        pass
    
    def guess(self, words):
        return [x for x, _ in model.most_similar(positive=words)]
    
    def explain(self, word):
        return [x for x, _ in model.most_similar(positive=[word])]
    
    def guess_result(self, position):
        pass
    
    def explanation_result(self, position):
        pass

In [21]:
words_universe = model.vocab.keys()
words_universe_set = set(model.vocab.keys())

In [22]:
def calc_score(player1, player2, word):
    explanation = [x for x in player1.explain(word) if x in words_universe_set][:10]
    exp_score = 0.
    if word in explanation:
        exp_score -= 1.0
    guess_score = 0.
    if len(explanation) > 0:
        for pref in xrange(10):
            guess = player2.guess(explanation[:pref + 1])[:10]
            try:
                pos = guess.index(word)
                player1.explanation_result(pos)
                player2.guess_result(pos)
                guess_score += 0.9 ** pos
                exp_score += 0.9 ** pos
            except ValueError:
                player1.explanation_result(None)
                player2.guess_result(None)
    return exp_score, guess_score


def play(player1, player2, rounds, seed=42):
    random_gen = np.random.RandomState(seed)
    player1_exp_score = 0.
    player1_guess_score = 0.
    player2_exp_score = 0.
    player2_guess_score = 0.
    for _ in xrange(rounds):
        word = random_gen.choice(words_universe)
        player1_exp, player2_guess = calc_score(player1, player2, word)
        player2_exp, player1_guess = calc_score(player2, player1, word)
        player1_explanation = player1.explain(word)
        player2_explanation = player1.explain(word)
        
        player1_exp_score += player1_exp
        player2_exp_score += player2_exp
        player1_guess_score += player1_guess
        player2_guess_score += player2_guess
        
    return (
        (player1_exp_score, player1_guess_score, player1_guess_score + player1_exp_score),
        (player2_exp_score, player2_guess_score, player2_guess_score + player2_exp_score)
    )

In [26]:
%%time
play(GensimHatPlayer(), GensimHatPlayer(), 1000)

Wall time: 58.3 s


((7784.868339009995, 7784.868339009995, 15569.73667801999),
 (7784.868339009995, 7784.868339009995, 15569.73667801999))

In [30]:
%%time
play(NltkHatPlayer(), GensimHatPlayer(), 1000)

Wall time: 49.3 s


((114.30978901300006, 67.031032143, 181.34082115600006),
 (67.031032143, 114.30978901300006, 181.34082115600006))

In [31]:
%%time
play(NltkHatPlayer(), NltkHatPlayer(), 1000)

KeyboardInterrupt: 

In [41]:
def thug_concat(a, b):
        
    output = []
    for ai, bi in zip(a, b):
        output.append(ai)
        output.append(bi)
        
    c = []
    if len(a) < len(b):
        lena = len(a)
        lenb = len(b)
        c = b[lena - lenb:]
    elif len(a) > len(b):
        lena = len(a)
        lenb = len(b)
        c = a[lenb - lena:]
    
    output.extend(c)
    return output

In [43]:
class ThugHatPlayer(object):
    def __init__(self):
        pass
    
    def guess(self, words):
        candidates = Counter()
        for word in words:
            for w in self.explain(word)[:5]:
                candidates[w] += 1
        for word in words:
            if word in candidates:
                del candidates[word]
        nltk_guess = [x for x, _ in candidates.most_common(len(candidates))][:5]
        gensim_guess = [x for x, _ in model.most_similar(positive=words)][:5]
        return thug_concat(nltk_guess, gensim_guess)
        
    
    def explain(self, word):
        nltk_explain = text._word_context_index.similar_words(word)[:5]
        gensim_explain = [x for x, _ in model.most_similar(positive=[word])][:5]
        return thug_concat(nltk_explain, gensim_explain)
        
    
    def guess_result(self, position):
        pass
    
    def explanation_result(self, position):
        pass

In [44]:
%%time
play(ThugHatPlayer(), NltkHatPlayer(), 1000)

Wall time: 9min 7s


((1563.3990292989993, 1367.2124078339982, 2930.6114371329977),
 (1367.2124078339982, 1563.3990292989993, 2930.6114371329977))

In [45]:
%%time
play(ThugHatPlayer(), GensimHatPlayer(), 1000)

Wall time: 2min 53s


((2112.2453685770006, 6788.433692173004, 8900.679060750004),
 (6788.433692173004, 2112.2453685770006, 8900.679060750004))