In [1]:
import re
import requests
import warnings

import numpy as np
import pandas as pd
import fasttext
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display

In [2]:
%%time

# Preprocessed 20-newsgroups corpus used as the training text (path is relative
# to the notebook's working directory).
file_path = '20-newsgroups/all_texts.preprocessed.txt'

# Train three unsupervised fastText models on the same corpus. They differ only
# in architecture (skipgram vs. cbow) and in embedding size (`dim`).
model_skipgram = fasttext.train_unsupervised(file_path, model='skipgram', dim=5)
model_cbow = fasttext.train_unsupervised(file_path, model='cbow', dim=16)
model_skipgram2 = fasttext.train_unsupervised(file_path, model='skipgram', dim=10)

CPU times: user 12min 14s, sys: 3.94 s, total: 12min 18s
Wall time: 1min 13s


In [3]:
# Count the lines of the training corpus.
!wc -l {file_path}

 1719263 20-newsgroups/all_texts.preprocessed.txt


In [4]:
# Peek at the first lines of the training corpus.
!head {file_path}


newsgroup: sci . electronics
document_id: 52434
from: et@teal . csn . org  ( eric h .  taylor ) 
subject: re: help_with_tracking_device

in article <00969fba . e640ff10@aesop . rutgers . edu> mcdonald@aesop . rutgers . edu writes:
>[ .  .  . ]
>there are a variety of water-proof housings i could use but the real meat
>of the problem is the electronics .  .  . hence this posting .   what kind of


In [5]:
# Number of entries in the CBOW model's word list.
len(model_cbow.words)

76430

In [6]:
# Look up the CBOW vector of a single word (the model was trained with dim=16).
model_cbow['song']

array([-1.0904677 ,  3.1783462 ,  2.2694519 ,  0.26722917, -0.48688525,
        1.192406  ,  1.7286314 , -4.153593  , -2.9458418 ,  1.201558  ,
        0.41035125, -0.10245584,  1.1308031 ,  1.7286075 , -1.0084642 ,
        0.3910451 ], dtype=float32)

In [7]:
!mkdir models
model_skipgram.save_model('models/skipgram.model')
model_skipgram2.save_model('models/skipgram2.model')
model_cbow.save_model('models/cbow.model')

mkdir: models: File exists


In [8]:
# List the saved model files together with their sizes.
!ls -lh models

total 579968
-rw-r--r--  1 aguschin  staff   133M Sep  8 10:33 cbow.model
-rw-r--r--  1 aguschin  staff    42M Sep  8 10:33 skipgram.model
-rw-r--r--  1 aguschin  staff    83M Sep  8 10:33 skipgram2.model


## Некоторые размышления по поводу выбора игры

Что фиксировано:
- игра "отгадывается" словом

Что можно варьировать:
- загадывается
  - словом
  - вектором
- критерий успеха (задача игры)
  - угадать загаданное слово
  - сойтись к одному слову
- обратная связь
  - есть
  - нет


Варианты игр, которые можно проверить и использовать
- угадать загаданное слово (**проверено, можно сделать**)

  Ведущий вытягивает из шляпы слово W для команды i. Команда составляет набор слов-подсказок, которые она будет сообщать другим участникам. Игра длится до 10 итераций: на каждой итерации j команда добавляет новую подсказку (новое слово), а другие игроки (все остальные команды) пытаются отгадать загаданное слово W, называя по 5 слов. Как только слово W окажется в этом топ-5, отгадавшая команда получает очки (чем раньше угадала, тем больше, например 10 - j). Загадывающая команда получает очки за каждую отгадавшую команду (например, столько же очков).
  
 
- сойтись к одному слову

  Ведущий вытягивает из шляпы слово W для команды i. Команда i и напарник (команда k) одновременно произносят слово. Все по тем же правилам, что и http://robotmindmeld.com
  
  Игра продолжается 10 раундов, участники получают, например, (10 - j) очков. Проводим такие игры с разным набором команд многократно.

## proof of concept: Угадать загаданное слово
Текущие особенности poc:
- проверена только работа fasttext и только с помощью функции get_sentence_vector
- модели недотренированы
- тексты "грязные"
- слово "угадано", если загаданное слово составляет его часть (word in guess)
- 

In [17]:
class AbstractPlayer:
    """Interface of a game participant.

    Every method raises ``NotImplementedError``; concrete players override
    all three of them.
    """

    def __init__(self):
        """Concrete players define their own constructor."""
        raise NotImplementedError

    def explain(self, word, n_words):
        """Produce hint words for ``word``; ``n_words`` is the requested count."""
        raise NotImplementedError

    def guess(self, words, n_words):
        """Produce candidate words for the hints ``words``; ``n_words`` is the requested count."""
        raise NotImplementedError


class LocalFasttextPlayer(AbstractPlayer):
    """Player that answers from an in-process fastText model.

    Both explaining and guessing are nearest-neighbour searches: the input
    text is embedded with ``model.get_sentence_vector`` and the vocabulary
    words with the highest cosine similarity to that vector are returned.
    """

    def __init__(self, model):
        """Cache the vocabulary and its vectors.

        Parameters
        ----------
        model
            Trained fastText model; must provide ``get_words()``,
            ``get_sentence_vector()`` and word lookup via ``model[word]``.
        """
        self.model = model
        self.words = model.get_words()
        # One row per vocabulary word, in the same order as ``self.words``.
        self.matrix = np.vstack([model[word] for word in self.words])

    def find_words_for_vector(self, vector, n_closest):
        """Return the ``n_closest`` vocabulary words most similar to ``vector``.

        Words are ordered by decreasing cosine similarity.
        """
        sims = cosine_similarity(vector.reshape(1, -1), self.matrix).ravel()
        # Index with the cached vocabulary: it is the list ``self.matrix`` was
        # built from, and it avoids re-fetching the word list on every query.
        word_sims = pd.Series(sims, index=self.words).sort_values(ascending=False)
        return list(word_sims.head(n_closest).index)

    def find_words_for_sentence(self, sentence, n_closest):
        """Embed ``sentence`` with the model and return its closest words."""
        vector = self.model.get_sentence_vector(sentence)
        return self.find_words_for_vector(vector, n_closest)

    def explain(self, word, n_words):
        """Return ``n_words`` hints for ``word``: its nearest vocabulary words."""
        return self.find_words_for_sentence(word, n_words)

    def guess(self, words, n_words):
        """Return ``n_words`` guesses for the space-joined hint list ``words``."""
        return self.find_words_for_sentence(' '.join(words), n_words)


# class LocalFasttextPlayer(AbstractPlayer):
#     def __init__(self, model):
#         self.model = model

#     def find_words_for_sentence(self, sentence, n_closest):
#         neighbours = self.model.get_nearest_neighbors(sentence)
#         words = [word for similariry, word in neighbours][:n_closest]
#         return words
    
#     def explain(self, word, n_words):
#         return self.find_words_for_sentence(word, n_words)
    
#     def guess(self, words, n_words):
#         return self.find_words_for_sentence(' '.join(words), n_words)


class RemotePlayer(AbstractPlayer):
    """Player that delegates to an HTTP service.

    The service is queried with GET requests on ``<url>/explain`` and
    ``<url>/guess`` and is expected to answer with a JSON word list.
    Any failed request produces a warning and an empty word list.
    """

    def __init__(self, url, timeout=10):
        """Remember the service location.

        Parameters
        ----------
        url : str
            Base URL of the player service, without a trailing slash.
        timeout : float, optional
            Seconds to wait for the service before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.
        """
        self.url = url
        self.timeout = timeout

    def _request_word_list(self, endpoint, params):
        """GET ``endpoint`` with ``params`` and return the decoded word list.

        Returns an empty list (after emitting a warning) when the request
        cannot be completed or the status code is not 200.
        """
        try:
            response = requests.get(self.url + endpoint, params=params, timeout=self.timeout)
        except requests.RequestException as error:
            # Unreachable or timed-out service: handled like a non-200 answer
            # so one broken player does not abort the whole game.
            warnings.warn(f'request failed: {error}')
            return []
        if response.status_code != 200:
            warnings.warn(f'request failed: {response.status_code}')
            return []
        return response.json()

    def explain(self, word, n_words):
        """Ask the service for ``n_words`` hints describing ``word``."""
        return self._request_word_list('/explain', {'word': word, 'n_words': n_words})

    def guess(self, words, n_words):
        """Ask the service for ``n_words`` guesses given the hint list ``words``."""
        return self._request_word_list('/guess', {'words': words, 'n_words': n_words})

In [43]:
# Smoke test of the HTTP player. The commented-out line targets a hosted
# instance; the active one targets a server on localhost port 5000.
# NOTE(review): this cell needs that server to be running — confirm before "Run All".
# remote_player = RemotePlayer('https://obscure-everglades-02893.herokuapp.com')
remote_player = RemotePlayer('http://127.0.0.1:5000')
print(remote_player.explain('zen', 10))
print(remote_player.guess(['zen', 'desk', 'word'], 5))

['sin;', '>[i', 'lover', 'mad', 'rant', 'scorn', '>honestly', '*laugh*', 'forever', 'smack']
['toolong', 'converst', 'cslipper', 'grafpoint', 'distorteddata']


In [18]:
# Wrap each trained fastText model in a local player.
skipgram = LocalFasttextPlayer(model_skipgram)
cbow = LocalFasttextPlayer(model_cbow)
skipgram2 = LocalFasttextPlayer(model_skipgram2)

In [19]:
# Hints the skipgram player gives for 'word', dropping any hint that contains the word itself.
guessing_by = [hint for hint in skipgram.explain('word', 5) if 'word' not in hint]
print('GUESSING BY: ', guessing_by)

GUESSING BY:  ['>agnostic', '>expressing', '"expert"', '#one', 'responces']


In [23]:
from collections import namedtuple

class Game:
    """Host of the "guess the hidden word" game.

    In every game one player explains a hidden word with a growing list of
    hint words while the remaining players try to guess it.

    Players are objects with ``name`` and ``api`` attributes, where ``api``
    provides ``explain(word, n_words)`` and ``guess(words, n_words)``.

    Parameters
    ----------
    players : sequence
        At least two players.
    words : sequence of str
        Pool of hidden words; every game consumes one of them.
    criteria : {'hard', 'soft'}
        ``'soft'``: a guess counts when the hidden word is a substring of any
        guessed word, and the hint list is cut to size after cleaning.
        ``'hard'``: a guess counts only on an exact match, and the raw hint
        list is cut to size before cleaning, so rejected hints are not
        replaced.
    n_rounds : int
        How many times every player takes the explaining role.
    n_explain_words : int
        Number of hints requested from the explaining player.
    n_guessing_words : int
        Number of guesses requested from every guessing player per round.
    """

    def __init__(self, players, words, criteria, n_rounds, n_explain_words, n_guessing_words):
        assert len(players) >= 2
        assert criteria in ('hard', 'soft')
        self.players = players
        self.words = words
        self.criteria = criteria
        self.n_rounds = n_rounds
        self.n_explain_words = n_explain_words
        self.n_guessing_words = n_guessing_words

    def remove_repeated_words(self, words):
        """Return ``words`` without duplicates, keeping first occurrences in order."""
        return list(dict.fromkeys(words))

    def create_word_list(self, player, word, n_words):
        """Ask ``player`` to explain ``word`` and return the cleaned hint list.

        Hints are stripped of non-word characters; hints that end up empty,
        contain the hidden word, or repeat an earlier hint are dropped.
        At most ``n_words`` hints are returned.
        """
        # Fetch the hints first: slicing before this call (as the code used to
        # do for 'hard') raised UnboundLocalError.
        explain_words = player.explain(word, n_words)
        if self.criteria == 'hard':
            explain_words = explain_words[:n_words]
        explain_words = [re.sub(r'[^\w]', '', c) for c in explain_words]
        # A hint made only of punctuation is empty after stripping and carries
        # no information, so it is discarded together with give-away hints.
        explain_words = [c for c in explain_words if c and word not in c]
        explain_words = self.remove_repeated_words(explain_words)
        if self.criteria == 'soft':
            explain_words = explain_words[:n_words]
        return explain_words

    def play(self, explaining_player, guessing_players, word, criteria=None, verbose=False):
        """Play one game and return its protocol as a DataFrame.

        Round ``i`` reveals the first ``i`` hints. A player who guesses the
        word stops playing in the following rounds; the game ends when the
        hints run out or nobody is left guessing.

        Parameters
        ----------
        explaining_player, guessing_players
            The explainer and the list of players trying to guess.
        word : str
            The hidden word.
        criteria : {'hard', 'soft'}, optional
            Success criterion for guesses; defaults to the game's criterion.
        verbose : bool
            Print the dialogue between host and players.

        Returns
        -------
        pandas.DataFrame
            One row per round with the hints shown and every player's guesses.
        """
        if criteria is None:
            criteria = self.criteria

        if verbose:
            print(f'HOST to EXPLAINING PLAYER: the word is "{word}"')

        # Use the game's own settings rather than module-level constants.
        guessing_by = self.create_word_list(explaining_player.api, word, self.n_explain_words)
        if verbose:
            print(f'PLAYER 1 to HOST: my wordlist is {guessing_by}')

        still_guessing = list(guessing_players)
        rounds = []
        for i in range(1, len(guessing_by) + 1):
            if not still_guessing:
                break
            if verbose:
                print(f'\n===ROUND {i}===\n')
            sentence = guessing_by[:i]
            if verbose:
                print(f"HOST: {sentence}")
            game_round = {f'Explanation for "{word}" ({explaining_player.name})': sentence}
            # Iterate over a snapshot: successful players are removed from
            # ``still_guessing`` but the current round is finished for everyone.
            for player in list(still_guessing):
                guessed_words = player.api.guess(sentence, self.n_guessing_words)
                if criteria == 'soft':
                    guessed = any(word in c for c in guessed_words)
                else:
                    guessed = word in guessed_words
                if guessed:
                    still_guessing = [p for p in still_guessing if p != player]
                if verbose:
                    print(f'GUESSING PLAYER ({player.name}) to HOST: {guessed_words}')
                    print(f'HOST: {guessed}')
                game_round[f'Guess ({player.name})'] = guessed_words
            rounds.append(game_round)
        return pd.DataFrame(rounds)

    def run(self, verbose=False):
        """Play ``n_rounds`` rounds and display the protocol of every game.

        In each round every player explains one word to all the others.
        Words are drawn without repetition from a shuffled copy of the pool.

        Raises
        ------
        IndexError
            If the pool holds fewer words than there are games to play.
        """
        n_games = self.n_rounds * len(self.players)
        if n_games > len(self.words):
            # Fail before the first game instead of in the middle of the run.
            raise IndexError(
                f'not enough words: {n_games} games requested, '
                f'{len(self.words)} words available'
            )
        # Shuffle a copy so the caller's word list keeps its order.
        words = list(self.words)
        np.random.shuffle(words)
        igame = 0
        for _ in range(self.n_rounds):
            for explaining_player in self.players:
                guessing_players = [p for p in self.players if p != explaining_player]
                word = words[igame]
                igame += 1
                df = self.play(explaining_player, guessing_players, word,
                               criteria=self.criteria, verbose=verbose)
                display(df)
            
        
# Lightweight record pairing a display name with a player implementation.
player = namedtuple('Player', ['name', 'api'])

# Game settings.
N_EXPLAIN_WORDS = 10   # hints requested from the explaining player
N_GUESSING_WORDS = 5   # guesses requested from each guessing player per round
N_ROUNDS = 2           # times every player takes the explaining role
CRITERIA = 'soft'      # 'soft' or 'hard' (the two values Game accepts)

PLAYERS = [player('skipgram', skipgram), player('skipgram2', skipgram2), player('cbow', cbow)]
# Pool of hidden words the game draws from.
WORDS = [
    'play', 'master', 'word', 'cocoa', 'coffee',
    'september', 'jungle', 'spell', 'python',
    'cat', 'joy', 'sadness', 'small', 'stick'
]
# WORDS = ['september'] * 10

# NOTE(review): no random seed is set before the game shuffles the words, so
# the word order differs between runs.
game = Game(PLAYERS, WORDS, CRITERIA, N_ROUNDS, N_EXPLAIN_WORDS, N_GUESSING_WORDS)
game.run()

Unnamed: 0,"Explanation for ""jungle"" (skipgram)",Guess (cbow),Guess (skipgram2)
0,[athravan],"[gambarian, mengele, bulgaricus, hamishmar, bu...","[culminated, tribes:, horrors:, tzeghagrons, ""..."
1,"[athravan, wiles]","[hues, gibes, sermons, diatribes, arromdians]","[culminated, keplerian, gasimov, libyan, bulga..."
2,"[athravan, wiles, consul]","[scholten, praxeas, dwellings, telempathy, mit...","[faisal, summus, fauci, -garison, barzilai]"
3,"[athravan, wiles, consul, ibn]","[mithraist:, repacholi, dwellings, larish, sch...","[summus, fauci, faisal, ameliorating, -garison]"
4,"[athravan, wiles, consul, ibn, constantine]","[colonialist, celebrations, konstantinople, me...","[fauci, catechumens, summus, faisal, constantine]"
5,"[athravan, wiles, consul, ibn, constantine, al...","[colonialist, aristotle, meningococcus, gestur...","[fauci, summus, faisal, catechumens, ameliorat..."
6,"[athravan, wiles, consul, ibn, constantine, al...","[colonialist, meningococcus, larish, aristotle...","[ameliorating, spirochete, summus, keplerian, ..."
7,"[athravan, wiles, consul, ibn, constantine, al...","[colonialist, meningococcus, history:, anania,...","[ameliorating, summus, fauci, keplerian, catec..."
8,"[athravan, wiles, consul, ibn, constantine, al...","[larish, aristotle, najarian, meningococcus, c...","[ameliorating, libyan, -garison, spirochete, |..."
9,"[athravan, wiles, consul, ibn, constantine, al...","[larish, aristotle, colonialist, najarian, men...","[ameliorating, -garison, spirochete, summus, v..."


Unnamed: 0,"Explanation for ""coffee"" (skipgram2)",Guess (cbow),Guess (skipgram)
0,[bend],"[coffee, swear, leave"", glaze, sheaffer]","[octopuses, all-round, goalies, stitches, stroke]"
1,"[bend, punk]",,"[desoto, blech, schatzki, whirlwind, bht]"
2,"[bend, punk, whiskey]",,"[wierdos, ,, likes:, whirlwind, #$^&]"
3,"[bend, punk, whiskey, lever]",,"[doen, livid, [no, ayoob, dupe]"
4,"[bend, punk, whiskey, lever, grille]",,"[des"", paragraph:, shroud, nuthin, loach]"
5,"[bend, punk, whiskey, lever, grille, ring]",,"[namelplate, ÿ, nuthin, des"", loach]"
6,"[bend, punk, whiskey, lever, grille, ring, whe...",,"[livid, marktomorrow, dupe, fdr, doen]"
7,"[bend, punk, whiskey, lever, grille, ring, whe...",,"[dupe, livid, doen, aarghhhh, -rush]"
8,"[bend, punk, whiskey, lever, grille, ring, whe...",,"[livid, fdr, dupe, newsflash:, doen]"
9,"[bend, punk, whiskey, lever, grille, ring, whe...",,"[livid, newsflash:, fdr, shroud, dupe]"


Unnamed: 0,"Explanation for ""cocoa"" (cbow)",Guess (skipgram),Guess (skipgram2)
0,[smudge],"[>80%, siblings, intrested, refilling, relaying]","[crankcase, somwehat, disconnected, plugging, ..."
1,"[smudge, custer]","[bednets, tome, bode, knives:, longbow]","[stroll, bode, siphoned, brimstone, chains]"
2,"[smudge, custer, coy]","[gro, shards, entitled:, marks, >>nasa]","[blockhead, dspse, pallas, bullhorn, laddish]"
3,"[smudge, custer, coy, plante]","[mitterrand, quang, chump, musician, ginga]","[fromm, dacumos, glockenspiel, flintstone, ark..."
4,"[smudge, custer, coy, plante, coax]","[glaucoma, maytag, corner"", `man, denon]","[cobralinks, glockenspiel, sturm, ingram, pallas]"
5,"[smudge, custer, coy, plante, coax, sneer]","[denon, reviewette:, iacr, >#valuino, reprint]","[glockenspiel, sturm, |>on, dacumos, gates]"
6,"[smudge, custer, coy, plante, coax, sneer, bea...","[ginga, reviewette:, [koz84], mitterrand, >#va...","[simpleton, cobralinks, sturm, fromm, tinseltown]"
7,"[smudge, custer, coy, plante, coax, sneer, bea...","[quang, ginga, mitterrand, pufferfish, chump]","[dacumos, stalker, fromm, macphase, sturm]"
8,"[smudge, custer, coy, plante, coax, sneer, bea...","[ginga, reviewette:, >#valuino, weet, budejovice]","[sturm, gainesville, caelum, dacumos, macphase]"
9,"[smudge, custer, coy, plante, coax, sneer, bea...","[weet, >#valuino, ginga, reviewette:, medi]","[caelum, gainesville, sturm, dacumos, macphase]"


Unnamed: 0,"Explanation for ""play"" (skipgram)",Guess (cbow),Guess (skipgram2)
0,[suckers],"[swimmers, wohlers, leans, backers, dreamers]","[>go, convertible, >cubs, >gee, bandwagon]"
1,"[suckers, walks]","[strangers, fuhr-bashers, hitchhikers, left-wi...","[>cubs, bummin, >gee, comeback, suckers]"
2,"[suckers, walks, comeback]","[>shortstops, watchit, comeback, stare, tirade]","[>cubs, bummin, bandwagon, butterflies, kettle]"
3,"[suckers, walks, comeback, nmmthe]","[gummint, tradgedy, >shortstops, attire, coach...","[butterflies, >cubs, kettle, bummin, ago""]"
4,"[suckers, walks, comeback, nmmthe, hammer]","[bore, basho, goldwing, watchit, steere]","[butterflies, blowout, chernobyl, sushi, bummin]"
5,"[suckers, walks, comeback, nmmthe, hammer, duck]","[goldwing, watchit, tacky, diduck, tommorow]","[butterflies, chernobyl, blowout, goos, whispe..."


Unnamed: 0,"Explanation for ""small"" (skipgram2)",Guess (cbow),Guess (skipgram)
0,[casing],"[crumbling, bulging, showering, aligning, rear...","[dot-4, compartments, fewer, gasoline, waves]"
1,"[casing, localized]","[arranging, fortified, bordering, certified, t...","[targetted, crossbow, shareholders, dumber, st..."
2,"[casing, localized, supplying]","[empowering, protruding, dispensing, supplying...","[recycled, dab, permanant, targetted, composure]"
3,"[casing, localized, supplying, beets]","[transplantants, transplants, dispair, conduct...","[silver-solder, succesful, tap, targetted, cas..."
4,"[casing, localized, supplying, beets, drag]","[name-dropping, off-the-shelf, array-flapping,...","[magazine;, q3:, tradeoff:, passthrough, under..."
5,"[casing, localized, supplying, beets, drag, la...","[empowering, transparent, thermoplastic, protr...","[q3:, magazine;, aero-engine, *on, ripple]"
6,"[casing, localized, supplying, beets, drag, la...","[empowering, monitoring, sub-bands, off-the-sh...","[*on, part;, magazine;, tauri, dumbshow]"
7,"[casing, localized, supplying, beets, drag, la...","[monitoring, transparent, clustering, equidist...","[*on, tauri, magazine;, part;, warparound]"
8,"[casing, localized, supplying, beets, drag, la...","[transparency, equidistant, monitoring, transp...","[merkle, lectorium, snails, tanstaafl, magazine;]"
9,"[casing, localized, supplying, beets, drag, la...","[ground-em-to-fit, trample, monitoring, equidi...","[lectorium, merkle, vice-versa, oam, ratio:]"


Unnamed: 0,"Explanation for ""spell"" (cbow)",Guess (skipgram),Guess (skipgram2)
0,[gobbledygook],"[,, bbc, shag, likes:, wrung]","[soem, slogan, `out, ""doctor, ,]"
1,"[gobbledygook, guesser]","[n>crap, :-], -ed, ""], stil]","[catch-22, _v_, #$^&, foolin, ,]"
2,"[gobbledygook, guesser, parody]","[oldish, `no, ;-], banana, weasel]","[>cobb, gobbledygook, }>}so, gb:, >}so]"
3,"[gobbledygook, guesser, parody, clue]","[oldish, >>hello, `no, ;-], gr>]","[gobbledygook, >cobb, ""defcon, thougt, }>}so]"
4,"[gobbledygook, guesser, parody, clue, foyer]","[friend:, ;-], [sorry, >>mark, oldish]","[gobbledygook, _v_, _body, soem, }>}so]"
5,"[gobbledygook, guesser, parody, clue, foyer, yer]","[>>mark, friend:, [sorry, |interested, thatch]","[_v_, gobbledygook, wsidom, }>}so, soem]"
6,"[gobbledygook, guesser, parody, clue, foyer, y...","[thatch, |interested, </s>, >>mark, peter;]","[_v_, wsidom, donoghue, linares, slang->sling-..."
