# proof of concept: шляпа
Играем в шляпу (один загадывает, остальные отгадывают)

Текущие особенности реализации:
- проверена только работа fasttext и только с помощью одним подходом к подбору слов для загадывания и отгадывания
- модели недотренированы
- тексты "грязные"
- слово "угадано", если загаданное слово составляет его часть (word in guess) -- переменная CRITERIA

In [1]:
import re
import requests
import warnings
from collections import namedtuple, defaultdict, OrderedDict
import logging

import numpy as np
import pandas as pd
import fasttext
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display

In [2]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Create handlers
c_handler = logging.StreamHandler()
f_handler = logging.FileHandler('games.log')
c_handler.setLevel(logging.WARNING)
f_handler.setLevel(logging.DEBUG)

# Create formatters and add it to handlers
c_format = logging.Formatter('%(message)s')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)

## Обучим несколько моделей 

In [3]:
%%time

file_path = '20-newsgroups/all_texts.preprocessed.txt'

model_skipgram = fasttext.train_unsupervised(file_path, model='skipgram', dim=5)
model_cbow = fasttext.train_unsupervised(file_path, model='cbow', dim=16)
model_skipgram2 = fasttext.train_unsupervised(file_path, model='skipgram', dim=10)

CPU times: user 12min 42s, sys: 4.94 s, total: 12min 47s
Wall time: 1min 20s


In [4]:
!wc -l {file_path}

 1719263 20-newsgroups/all_texts.preprocessed.txt


In [5]:
!head {file_path}


newsgroup: sci . electronics
document_id: 52434
from: et@teal . csn . org  ( eric h .  taylor ) 
subject: re: help_with_tracking_device

in article <00969fba . e640ff10@aesop . rutgers . edu> mcdonald@aesop . rutgers . edu writes:
>[ .  .  . ]
>there are a variety of water-proof housings i could use but the real meat
>of the problem is the electronics .  .  . hence this posting .   what kind of


In [6]:
len(model_cbow.words)

76430

In [7]:
model_cbow['song']

array([-0.4938997 ,  3.4252524 ,  1.2626133 ,  1.3792135 , -0.6833985 ,
        2.418735  ,  0.6330744 , -3.1505802 , -2.1769068 ,  0.47993   ,
       -0.20191263,  0.10602476,  3.0737967 ,  2.5073853 , -0.30763522,
       -0.07266755], dtype=float32)

In [8]:
!mkdir models
model_skipgram.save_model('models/skipgram.model')
model_skipgram2.save_model('models/skipgram2.model')
model_cbow.save_model('models/cbow.model')

mkdir: models: File exists


In [9]:
!ls -lh models

total 579968
-rw-r--r--  1 aguschin  staff   133M Sep 16 09:26 cbow.model
-rw-r--r--  1 aguschin  staff    42M Sep 16 09:26 skipgram.model
-rw-r--r--  1 aguschin  staff    83M Sep 16 09:26 skipgram2.model


## Реализации классов для игроков

In [10]:
class AbstractPlayer:
    def __init__(self):
        raise NotImplementedError()

    def explain(self, word, n_words):
        raise NotImplementedError()
        
    def guess(self, words, n_words):
        raise NotImplementedError()


if hasattr(model_skipgram2, 'get_nearest_neighbors'):
    print('using latest fasttext version')
    
    class LocalFasttextPlayer(AbstractPlayer):
        def __init__(self, model):
            self.model = model

        def find_words_for_sentence(self, sentence, n_closest):
            neighbours = self.model.get_nearest_neighbors(sentence)
            words = [word for similariry, word in neighbours][:n_closest]
            return words

        def explain(self, word, n_words):
            return self.find_words_for_sentence(word, n_words)

        def guess(self, words, n_words):
            return self.find_words_for_sentence(' '.join(words), n_words)
else:
    print('using older fasttext version')
    class LocalFasttextPlayer(AbstractPlayer):
        def __init__(self, model):
            self.model = model
            self.words = model.get_words()
            self.matrix = np.concatenate([model[word].reshape(1, -1) for word in self.words], axis=0)

        def find_words_for_vector(self, vector, n_closest):
            sims = cosine_similarity(vector.reshape(1, -1), self.matrix).ravel()
            word_sims = pd.Series(sims, index=self.model.get_words()).sort_values(ascending=False)
            return list(word_sims.head(n_closest).index)

        def find_words_for_sentence(self, sentence, n_closest):
            vector = self.model.get_sentence_vector(sentence)
            return self.find_words_for_vector(vector, n_closest)

        def explain(self, word, n_words):
            return self.find_words_for_sentence(word, n_words)

        def guess(self, words, n_words):
            return self.find_words_for_sentence(' '.join(words), n_words)


class RemotePlayer(AbstractPlayer):
    def __init__(self, url):
        self.url = url
        
    def explain(self, word, n_words):
        response = requests.get(self.url + '/explain', {'word': word, 'n_words': n_words})
        if response.status_code == 200:
            word_list = response.json()
        else:
            warnings.warn(f'request failed: {response.status_code}')
            word_list = []
        return word_list
    
    def guess(self, words, n_words):
        response = requests.get(self.url + '/guess', {'words': words, 'n_words': n_words})
        if response.status_code == 200:
            word_list = response.json()
        else:
            warnings.warn(f'request failed: {response.status_code}')
            word_list = []
        return word_list

using latest fasttext version


In [11]:
remote_player = RemotePlayer('https://obscure-everglades-02893.herokuapp.com')
# remote_player = RemotePlayer('http://127.0.0.1:5000')
print(remote_player.explain('zen', 10))
print(remote_player.guess(['zen', 'desk', 'word'], 5))

['zen', 'sin;', '>[i', 'lover', 'mad', 'rant', 'scorn', '>honestly', '*laugh*', 'forever']
['have:', '"boggs"', '>[lotsa', 'qualifier', '[now']


In [12]:
local_player = LocalFasttextPlayer(model_skipgram)
print(local_player.explain('zen', 10))
print(local_player.guess(['zen', 'desk', 'word'], 5))

['*laugh*', 'sin;', 'gun-toter', '>[i', '>honestly', 'lover', '"he', '"bastard"', '"stop', 'mad']
['umbdr522', 'converst', 'glxlink', 'calculus&mathematica', 'rpem']


## Игра

In [37]:
class Game:
    def __init__(
        self, players, words, criteria,
        n_rounds, n_explain_words, n_guessing_words,
        random_state=None
    ):
        assert len(players) >= 2
        assert criteria in ('hard', 'soft')
        self.players = players
        self.words = words
        self.criteria = criteria
        self.n_rounds = n_rounds
        self.n_explain_words = n_explain_words
        self.n_guessing_words = n_guessing_words
        
    def remove_repeated_words(self, words):
        unique_words = []
        for c in words:
            if not c in unique_words:
                unique_words.append(c)
        return unique_words
        
    def score_players(self, explainer_name, last_rounds):
        rewards = {
            player: self.n_explain_words + 1 - nround
            for player, nround in last_rounds.items()
        }
        rewards[explainer_name] = sum(rewards.values())
        return rewards

    def create_word_list(self, player, word, n_words):
        reported_words = player.explain(word, n_words)
        explain_words = reported_words[:]
        if self.criteria == 'hard':
            explain_words = explain_words[:n_words]
        explain_words = [re.sub(r'[^\w]', '', c) for c in explain_words]
        explain_words = [c for c in explain_words if word not in c]
        explain_words = self.remove_repeated_words(explain_words)
        if self.criteria == 'soft':
            explain_words = explain_words[:n_words]
        return reported_words, explain_words
    
    def check_criteria(self, word, guessed_words):
        if self.criteria == 'soft':
            guessed = any([word in c for c in guessed_words])
        else:
            guessed = word in guessed_words
        return guessed
    
    def play_round(self, explaining_player, guessing_players, word, sentence):
        game_round = OrderedDict()
        results = {}
        logger.info(f"HOST: {sentence}")
        game_round.update({f'Explanation for "{word}" ({explaining_player.name})': sentence})
        for player in guessing_players:
            guessed_words = player.api.guess(sentence, self.n_guessing_words)
            guessed = self.check_criteria(word, guessed_words)
            results[player.name] = guessed
            logger.info(f'GUESSING PLAYER ({player.name}) to HOST: {guessed_words}')
            logger.info(f'HOST: {guessed}')
            game_round.update({f'Guess ({player.name})': guessed_words})
        return game_round, results

    def play(self, explaining_player, guessing_players, word, criteria):

        logger.info(f'HOST to EXPLAINING PLAYER: the word is "{word}"')

        reported_words, guessing_by = self.create_word_list(explaining_player.api, word, self.n_explain_words)
        logger.info(f'EXPLAINING PLAYER to HOST: my wordlist is {reported_words}')
        logger.info(f'HOST TO EXPLAINING PLAYER: cleaning your word list. Now the list is {guessing_by}')

        df = []
        success_rounds = {}
        for iround in range(1, len(guessing_by) + 1):
            if len(guessing_players) == 0:
                break
            logger.info(f'\n===ROUND {iround}===\n')
            game_round, results_round = self.play_round(
                explaining_player=explaining_player,
                guessing_players=guessing_players,
                word=word,
                sentence=guessing_by[:iround],
            )
            for player in guessing_players[:]:
                if (player.name not in success_rounds) and results_round.get(player.name, False):
                    success_rounds[player.name] = iround
                    guessing_players = [p for p in guessing_players if p != player]
            df.append(game_round)
        
        df = pd.DataFrame(df)
        scores = self.score_players(explaining_player.name, success_rounds)
        
        return df, scores
    
    def get_words(self, complete):
        if not complete:
            words = self.words[:]
            np.random.seed(self.random_state)
            np.random.shuffle(words)
        else:
            words = []
            for word in self.words:
                words.extend([word] * len(self.players))
        return words
    
    def get_n_rounds(self, complete):
        if complete:
            return len(self.words)
        else:
            return self.n_rounds
        
    @staticmethod
    def set_console_logging_level(verbose):
        if verbose == 'print_logs':
            console_logging_level = logging.INFO
        else:
            console_logging_level = logging.WARNING
        c_handler.setLevel(console_logging_level)

    def run(self, verbose=False, complete=False):
        self.set_console_logging_level(verbose)
        
        self.run_words = self.get_words(complete=complete)
        self.run_rounds = self.get_n_rounds(complete=complete)
            
        igame = 0
        scores = []
        scores_status = defaultdict(int)
        for r in range(self.run_rounds):
            for explaining_player in self.players:
                guessing_players = [p for p in self.players if p != explaining_player]
                word = self.run_words[igame]
                df, score = self.play(
                    explaining_player, guessing_players, word,
                    criteria=self.criteria
                )
                scores.append(score)
                scores_status[(explaining_player.name, 'explaining')] += score.get(explaining_player.name, 0)
                for player in guessing_players:
                    scores_status[(player.name, 'guessing')] += score.get(player.name, 0)
                igame += 1
                logger.info(f'\n\nSCORES: {score}')
                if verbose:
                    display(df)

        self.scores = pd.DataFrame(scores).fillna(0)
        self.scores.index.name = 'game'
        
        self.scores_status = pd.Series(scores_status).unstack()        
        
#         logger.debug(game.scores)
#         logger.debug(game.scores.sum(axis=0))
#         logger.debug(game.scores_status)

    def report_results(self):
        print('=== Team scores in each game ===')
        display(self.scores)
        logger.debug(self.scores)
        print('=== Team scores, summary ===')
        self.summary = self.scores_status
        self.summary['total'] = self.scores.sum(axis=0)
        self.summary.sort_values('total', ascending=False)
        display(self.summary)
        logger.debug(self.summary)


player = namedtuple('Player', ['name', 'api'])

N_EXPLAIN_WORDS = 10
N_GUESSING_WORDS = 5
N_ROUNDS = 1
CRITERIA = 'soft'

PLAYERS = [
    # player('skipgram team', RemotePlayer('https://obscure-everglades-02893.herokuapp.com')),
    player('skipgram team', LocalFasttextPlayer(model_skipgram)),
    player('skipgram2 team', LocalFasttextPlayer(model_skipgram2)),
    player('cbow team', LocalFasttextPlayer(model_cbow))
]

WORDS = [
    'play', 'master', 'word', 'cocoa', 'coffee',
    'september', 'jungle', 'spell', 'python', 'world',
    'cat', 'joy', 'sadness', 'small', 'stick',
]
WORDS = ['september']

# for vocabulary_path in [
#     'vocabulary/verbs_top_50.txt',
#     'vocabulary/nouns_top_50.txt',
#     'vocabulary/adjectives_top_50.txt'
# ]:
#     print(vocabulary_path)
#     with open(vocabulary_path) as f:
#         WORDS = f.readlines()
#         WORDS = [word.strip() for word in WORDS]

game = Game(PLAYERS, WORDS, CRITERIA, N_ROUNDS, N_EXPLAIN_WORDS, N_GUESSING_WORDS)
game.run(verbose='print_logs', complete=True)

HOST to EXPLAINING PLAYER: the word is "september"
EXPLAINING PLAYER to HOST: my wordlist is ['1961', 'october', 'march', 'january', 'raceway', '1988-89', 'november', 'megalomania', 'mid-december', '15th']
HOST TO EXPLAINING PLAYER: cleaning your word list. Now the list is ['1961', 'october', 'march', 'january', 'raceway', '198889', 'november', 'megalomania', 'middecember', '15th']

===ROUND 1===

HOST: ['1961']
GUESSING PLAYER (skipgram2 team) to HOST: ['1968', '19th', 'september', '1988:', '1970']
HOST: True
GUESSING PLAYER (cbow team) to HOST: ['1961>', '1963', '1968-1974', '196', '194']
HOST: False

===ROUND 2===

HOST: ['1961', 'october']
GUESSING PLAYER (cbow team) to HOST: ['october', '1962', '1969', '19th', 'february']
HOST: False

===ROUND 3===

HOST: ['1961', 'october', 'march']
GUESSING PLAYER (cbow team) to HOST: ['february;', 'november', 'february', 'march', '193']
HOST: False

===ROUND 4===

HOST: ['1961', 'october', 'march', 'january']
GUESSING PLAYER (cbow team) to HOST

Unnamed: 0,"Explanation for ""september"" (skipgram team)",Guess (skipgram2 team),Guess (cbow team)
0,[1961],"[1968, 19th, september, 1988:, 1970]","[1961>, 1963, 1968-1974, 196, 194]"
1,"[1961, october]",,"[october, 1962, 1969, 19th, february]"
2,"[1961, october, march]",,"[february;, november, february, march, 193]"
3,"[1961, october, march, january]",,"[february;, february, november, january, march]"
4,"[1961, october, march, january, raceway]",,"[november, october, september, february;, febr..."


HOST to EXPLAINING PLAYER: the word is "september"
EXPLAINING PLAYER to HOST: my wordlist is ['anniversary', 'august', '19th', 'october', '1961', 'february', 'march', '24th', 'november', '20th']
HOST TO EXPLAINING PLAYER: cleaning your word list. Now the list is ['anniversary', 'august', '19th', 'october', '1961', 'february', 'march', '24th', 'november', '20th']

===ROUND 1===

HOST: ['anniversary']
GUESSING PLAYER (skipgram team) to HOST: ['north', '26%', '24th', '1968', 'february']
HOST: False
GUESSING PLAYER (cbow team) to HOST: ['february', '1969', '1962', 'trial', 'republique']
HOST: False

===ROUND 2===

HOST: ['anniversary', 'august']
GUESSING PLAYER (skipgram team) to HOST: ['1967', 'coastal', 'crescent', 'yardley', '1956']
HOST: False
GUESSING PLAYER (cbow team) to HOST: ['anniversary', 'august', 'february', '1969', '1962']
HOST: False

===ROUND 3===

HOST: ['anniversary', 'august', '19th']
GUESSING PLAYER (skipgram team) to HOST: ['yardley', 'erzeroum', 'crescent', '1967', '1

Unnamed: 0,"Explanation for ""september"" (skipgram2 team)",Guess (skipgram team),Guess (cbow team)
0,[anniversary],"[north, 26%, 24th, 1968, february]","[february, 1969, 1962, trial, republique]"
1,"[anniversary, august]","[1967, coastal, crescent, yardley, 1956]","[anniversary, august, february, 1969, 1962]"
2,"[anniversary, august, 19th]","[yardley, erzeroum, crescent, 1967, 1946]","[anniversary, august, 19th, february, 1969]"
3,"[anniversary, august, 19th, october]","[yardley, 1947, 1967, american-occupied, 1944]","[anniversary, february, october, august, 1969]"
4,"[anniversary, august, 19th, october, 1961]","[1876, yardley, 1947:, american-occupied, feir...","[anniversary, 1969, february, august, 1962]"
5,"[anniversary, august, 19th, october, 1961, feb...","[yardley, 1947, 1944, 1967, american-occupied]","[february, august, anniversary, november, febr..."
6,"[anniversary, august, 19th, october, 1961, feb...","[division, francisco, jemison, knox, amt]","[february, february;, november, august, march]"
7,"[anniversary, august, 19th, october, 1961, feb...","[trench, division, india-pakistan, smyrna, knox]","[february, august, february;, november, 1969]"
8,"[anniversary, august, 19th, october, 1961, feb...","[division, francisco, annals, knox, yardley]","[november, february, february;, august, october]"
9,"[anniversary, august, 19th, october, 1961, feb...","[division, francisco, india-pakistan, janus, t...","[february, november, february;, august, 1969]"


HOST to EXPLAINING PLAYER: the word is "september"
EXPLAINING PLAYER to HOST: my wordlist is ['december', 'february', 'november', 'october', 'february;', 'january', 'sept', 'mid-december', '1962', 'ex-assat']
HOST TO EXPLAINING PLAYER: cleaning your word list. Now the list is ['december', 'february', 'november', 'october', 'january', 'sept', 'middecember', '1962', 'exassat']

===ROUND 1===

HOST: ['december']
GUESSING PLAYER (skipgram team) to HOST: ['pennsylvania', 'massachusetts', 'lausanne', 'champaign', 'southwest']
HOST: False
GUESSING PLAYER (skipgram2 team) to HOST: ['november', 'dispatcher', 'nam', 'curzon', 'january']
HOST: False

===ROUND 2===

HOST: ['december', 'february']
GUESSING PLAYER (skipgram team) to HOST: ['rockefeller', 'caratzas', 'idaho', 'calif', 'n0nzo']
HOST: False
GUESSING PLAYER (skipgram2 team) to HOST: ['december', 'dispatcher', 'november', 'nam', 'february']
HOST: False

===ROUND 3===

HOST: ['december', 'february', 'november']
GUESSING PLAYER (skipgram t

Unnamed: 0,"Explanation for ""september"" (cbow team)",Guess (skipgram team),Guess (skipgram2 team)
0,[december],"[pennsylvania, massachusetts, lausanne, champa...","[november, dispatcher, nam, curzon, january]"
1,"[december, february]","[rockefeller, caratzas, idaho, calif, n0nzo]","[december, dispatcher, november, nam, february]"
2,"[december, february, november]","[caratzas, calif, n0nzo, idaho, billings]","[december, november, curzon, dispatcher, [1961]]"
3,"[december, february, november, october]","[[1962], caratzas, billings, antipolis, n0nzo]","[december, november, dispatcher, curzon, [1963]]"
4,"[december, february, november, october, january]","[[1962], billings, antipolis, annals, caratzas]","[november, december, dispatcher, curzon, october]"
5,"[december, february, november, october, januar...","[billings, swiers, n0nzo, curzon, [1962]]","[december, november, curzon, [1961], dispatcher]"
6,"[december, february, november, october, januar...","[n0nzo, idaho, swiers, ariv, calif]","[december, november, dispatcher, curzon, [1963]]"
7,"[december, february, november, october, januar...","[n0nzo, idaho, swiers, ariv, catalina]","[december, november, dispatcher, curzon, [1963]]"
8,"[december, february, november, october, januar...","[n0nzo, swiers, 2057, billings, [1962]]","[november, december, dispatcher, [1963], [1961]]"


In [38]:
WORDS = [
    'play', 'master', 'word', 'cocoa', 'coffee',
    'september', 'jungle', 'spell', 'python', 'world',
    'cat', 'joy', 'sadness', 'small', 'stick',
]

game = Game(PLAYERS, WORDS, CRITERIA, N_ROUNDS, N_EXPLAIN_WORDS, N_GUESSING_WORDS)
game.run(verbose=False, complete=True)
game.report_results()

=== Team scores in each game ===


Unnamed: 0_level_0,cbow team,skipgram team,skipgram2 team
game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,7.0,7.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


=== Team scores, summary ===


Unnamed: 0,explaining,guessing,total
cbow team,0,11,11.0
skipgram team,16,16,32.0
skipgram2 team,21,10,31.0
