Check if word exists #24

Open · wants to merge 9 commits into master
server/data.py (8 changes: 6 additions & 2 deletions)

@@ -30,7 +30,7 @@ def sent_2_words(sent: str) -> List[str]:
     return words


-def corpus_to_words(file_path: str, vocab_path: str = str(VOCAB_PATH)):
+def corpus_to_words(file_path: str):
     my_counter: Counter = Counter()
     with open(file_path, "r", encoding="utf-8") as fl:
         for sent in tqdm(fl, desc="Process file"):
@@ -40,7 +40,10 @@ def corpus_to_words(file_path: str, vocab_path: str = str(VOCAB_PATH)):
     min_count = max([10, max_cnt / 100])

     selected_words = [word for word, count in my_counter.items() if (min_count < count <= max_cnt)]
+    return selected_words
+

+def save_words(selected_words, vocab_path: str = str(VOCAB_PATH)):
     with open(vocab_path, "w", encoding="utf-8") as fl:
         for w in selected_words:
             fl.write(w + "\n")
@@ -179,7 +182,8 @@ def add_new_text(bucket_name=BUCKET_SPLIT_TEXTS, destination_bucket_name=BUCKET_
     to_copy = sorted(set(all_files) - set(copied_files))[0]
     print(to_copy)
     download_blob(bucket_name, to_copy, DATA_PATH / to_copy)
-    corpus_to_words(DATA_PATH / to_copy)
+    words = corpus_to_words(DATA_PATH / to_copy)
+    save_words(words)
     copy_blob(bucket_name, to_copy, destination_bucket_name, to_copy)


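The refactor separates extraction from persistence: corpus_to_words now returns the selected words, and the new save_words writes them out (to VOCAB_PATH by default). A minimal sketch of the resulting flow, assuming the module imports as server.data; the file paths are illustrative:

    # Build the frequency-filtered vocabulary, then persist it, one word per line.
    from server.data import corpus_to_words, save_words

    words = corpus_to_words("data/corpus.txt")
    save_words(words, vocab_path="data/vocab.txt")

This also makes corpus_to_words testable without writing to the filesystem.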
the_hat_game/data.py (32 changes: 32 additions & 0 deletions)

@@ -0,0 +1,32 @@
+import re
+from collections import Counter
+from typing import List
+
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.tokenize import word_tokenize
+from tqdm.auto import tqdm
+
+STOP_WORDS = stopwords.words("english")
+LEMMATIZER = WordNetLemmatizer()
+
+
+def sent_2_words(sent: str) -> List[str]:
+    sent = sent.lower()
+    sent = re.sub("[^a-z]+", " ", sent)
+    words = word_tokenize(sent)
+    words = [LEMMATIZER.lemmatize(word) for word in words if ((word not in STOP_WORDS) and len(word.strip()) > 3)]
+    return words
+
+
+def corpus_to_words(file_path: str):
+    my_counter: Counter = Counter()
+    with open(file_path, "r", encoding="utf-8") as fl:
+        for sent in tqdm(fl, desc="Process file"):
+            my_counter.update(sent_2_words(sent))
+
+    max_cnt = max(count for word, count in my_counter.items()) / 10
+    min_count = max([10, max_cnt / 100])
+
+    selected_words = [word for word, count in my_counter.items() if (min_count < count <= max_cnt)]
+    return selected_words
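The selection window is relative to the corpus: max_cnt is a tenth of the most frequent word's count, and min_count is a hundredth of max_cnt but never below 10, so both very common and very rare words are dropped. A toy check of that arithmetic (the counts are invented, not from a real corpus):

    # Toy reproduction of the selection window in corpus_to_words.
    from collections import Counter

    my_counter = Counter({"game": 50_000, "word": 4_000, "lemma": 60, "rare": 12})
    max_cnt = max(my_counter.values()) / 10      # 5000.0
    min_count = max([10, max_cnt / 100])         # 50.0
    selected = [w for w, c in my_counter.items() if min_count < c <= max_cnt]
    assert selected == ["word", "lemma"]         # "game" too common, "rare" too rare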
the_hat_game/game.py (39 changes: 36 additions & 3 deletions)

@@ -13,6 +13,7 @@
 import the_hat_game.nltk_setup  # noqa: F401
 from the_hat_game.loggers import c_handler, logger
 from the_hat_game.players import RemotePlayer
+from the_hat_game.data import corpus_to_words


 class Game:
@@ -24,8 +25,23 @@ def __init__(
         n_rounds,
         n_explain_words,
         n_guessing_words,
+        corpus_path=None,
+        vocab_path=None,
         random_state=None,
     ):
+        """Main class for the game.
+        params:
+        - players: list of AbstractPlayer - players in the game
+        - words: list of str - all words available for guessing
+        - criteria: 'hard' or 'soft' - game criteria
+        - n_rounds: int - number of rounds
+        - n_explain_words: int - number of words to explain
+        - n_guessing_words: int - number of words for guessing
+        - corpus_path: str - path to a corpus from which to build the vocabulary (for criteria='hard')
+        - vocab_path: str - path to a vocabulary file (for criteria='hard')
+        NOTE: at most one of corpus_path and vocab_path may be defined
+        NOTE: if no vocabulary is defined, nltk.wordnet is used to filter out non-existent words
+        """
         assert len(players) >= 2
         assert criteria in ("hard", "soft")
         self.players = players
@@ -36,6 +52,21 @@ def __init__(
         self.n_guessing_words = n_guessing_words
         self.random_state = random_state
         self.stemmer = SnowballStemmer("english")
+        if corpus_path is not None:
+            assert vocab_path is None, "corpus and vocabulary cannot be defined at the same time"
+            self.whitelist = corpus_to_words(corpus_path)
+        elif vocab_path is not None:
+            with open(vocab_path, encoding="utf-8") as f:
+                vocab_words = f.readlines()
+            self.whitelist = [word.strip() for word in vocab_words]
+        else:
+            self.whitelist = None
+        # add all words used for guessing to a non-empty whitelist
+        if self.whitelist is not None:
+            self.whitelist += [
+                word for word in self.words
+                if word not in self.whitelist
+            ]

     def remove_repeated_words(self, words):
         unique_words = []
@@ -54,9 +85,11 @@ def remove_same_rooted_words(self, word, word_list):
         cleared_word_list = [w for w in word_list if self.stemmer.stem(w) != root]
         return cleared_word_list

-    @staticmethod
-    def remove_non_existing_words(words):
-        existing_words = [w for w in words if len(wordnet.synsets(w)) > 0]
+    def remove_non_existing_words(self, words):
+        if self.whitelist is not None:
+            existing_words = [w for w in words if w in self.whitelist]
+        else:
+            existing_words = [w for w in words if len(wordnet.synsets(w)) > 0]
         return existing_words

     def create_word_list(self, player, word, n_words):
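With this change, remove_non_existing_words prefers the whitelist when one was built and falls back to WordNet lookups otherwise. A sketch of the new construction styles; the player objects and paths below are placeholders, not part of this PR:

    # Hypothetical setup: player_a/player_b stand in for real AbstractPlayer instances.
    from the_hat_game.game import Game

    game = Game(
        players=[player_a, player_b],
        words=["apple", "house"],
        criteria="hard",
        n_rounds=2,
        n_explain_words=5,
        n_guessing_words=5,
        vocab_path="vocab.txt",  # or corpus_path="corpus.txt", never both
    )

One design note: self.whitelist is a plain list, so each membership test in remove_non_existing_words is a linear scan; storing the whitelist as a set after construction would make those checks constant-time without changing behavior.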