In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from enum import Enum, auto

In [None]:
# Project-local module; the country table and the category definitions used below are read from it.
import preprocessing
df = preprocessing.df
categories = preprocessing.categories

In [None]:
# from importlib import reload
# reload(preprocessing)

In [None]:
# setkeys = pd.DataFrame(preprocessing.setkeys, columns=["key", "value"])
# cat_sizes = setkeys["key"].value_counts()
# c = cat_sizes.to_numpy()
# w = 1 / np.sqrt(c)
# dict(zip(cat_sizes.index, w / np.sum(w)))

### Show all alternative values

In [None]:
# Names of all columns ending in "_alt"; len() is applied to every entry of these columns below.
altcols = list(filter(lambda col: col.endswith("_alt"), df.columns))
print("\nAll countries with alternative values:")
# Keep only the rows where at least one "_alt" entry is non-empty.
display(df.loc[df[altcols].applymap(len).sum(axis=1).gt(0)])

In [None]:
class DifficultyLevel(Enum):
    """Difficulty tier of a game.

    ``str()`` and ``repr()`` both give the plain lowercase value, and
    ``float()`` gives the tier's position (easy=0.0, medium=1.0, hard=2.0).
    """

    EASY = "easy"
    MEDIUM = "medium"
    HARD = "hard"

    def __str__(self):
        return self.value

    def __repr__(self):
        return self.value

    def __float__(self):
        # The lookup table lives inside the method: a dict in the class body would become an Enum member.
        positions = {"easy": 0.0, "medium": 1.0, "hard": 2.0}
        return positions[self.value]

In [None]:
def save_games(games, name: str, difficulty_level=None):
    """Write games to a timestamped JSON file under ``../../data``.

    The file name is ``games-<timestamp>-<name>-<language>[-<difficulty>].json``,
    where the language comes from ``preprocessing.GAME_LANGUAGE``.

    Args:
        games: Iterable of game objects; each must provide ``to_json()``.
        name: Free-form label embedded in the file name.
        difficulty_level: Optional level; its lowercased ``str()`` is appended
            to the file name when given.
    """
    games = list(games)  # materialize once: serialized below and counted for the message
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    info = [timestamp, name, preprocessing.GAME_LANGUAGE.lower()]
    if difficulty_level is not None:
        info.append(str(difficulty_level).lower())
    path = f"../../data/games-{'-'.join(info)}.json"
    # Serialize before opening so the target file is not truncated if to_json() fails.
    payload = [game.to_json() for game in games]
    # The context manager closes (and flushes) the handle even if json.dump raises.
    with open(path, mode="w", encoding="utf-8") as file:
        json.dump(payload, file)
    print(f"{len(games)} games saved to {path}")

In [None]:
from generator import Constraint, GameGenerator

# Some categories are pretty boring to appear multiple times, so each gets an "at most 1" constraint.
constraints = [
    Constraint.category_at_most(category_key, 1)
    for category_key in ("capital_ending_letter", "capital_starting_letter", "ending_letter")
]
# Per-category weights handed to the generator.
category_probs = dict(
    continent=4,
    starting_letter=3,
    ending_letter=1.5,
    capital_starting_letter=2,
    capital_ending_letter=0.5,
    flag_colors=3,
    landlocked=4,
    island=4,
)

In [None]:
# Build the game generator from the constraints and category weights, then draw 5000 games.
generator = preprocessing.get_generator(
    constraints,
    category_probs,
    field_size=3,
    seed=None,
    selection_mode="shuffle_setkeys",
    uniform=False,
    shuffle=True,
)
games = [*generator.sample_games(n=5000)]
# save_games(games, "new-structure-test")

# Compute country difficulties

In [None]:
# Weights of the linear combinations used at each aggregation level; keys are column names.
COUNTRY_DIFFICULTY_WEIGHTS = dict(
    sqrt_name_length=0.25,
    log_population=-1,
    log_gdp=-2,
    log_gdp_per_capita=-0.5,
)
SOLUTION_DIFFICULTY_WEIGHTS = dict(
    median=3,
    std=0.5,
    size_score=2,
    offset=1.5,  # here: category difficulty
)
CELL_DIFFICULTY_WEIGHTS = dict(
    row_col_difficulty=2,
    content_difficulty=3,
)
GAME_DIFFICULTY_WEIGHTS = dict(
    avg_cell_difficulty=1,
    num_unique=0.33,
)

In [None]:
def normalize_zero_centered(x: pd.Series, scale=1):
    """Rescale ``x`` linearly so its minimum maps to ``-scale`` and its maximum to ``+scale``."""
    lowest = x.min()
    value_range = x.max() - lowest
    unit_interval_doubled = 2 * (x - lowest) / value_range
    return scale * (unit_interval_doubled - 1)

def normalize(x: pd.Series, scale=1):
    """Rescale ``x`` linearly so its minimum maps to ``0`` and its maximum to ``scale``."""
    lowest = x.min()
    value_range = x.max() - lowest
    shifted = x - lowest
    return scale * shifted / value_range

def linear_combination(df, weights):
    """Return the weighted sum of the columns of ``df`` named in ``weights``.

    ``weights`` maps column name -> weight. With no weights the result is ``0``.
    """
    total = 0
    for column, weight in weights.items():
        total = total + weight * df[column]
    return total

def normalized_combination(df, weights, scale=1):
    """Weighted sum of the columns named in ``weights``, rescaled to ``[0, scale]``."""
    combined = linear_combination(df, weights)
    return normalize(combined, scale=scale)

# def normalize_levels(x: pd.Series, scale=10):
#     return np.ceil(normalize(x, scale))

# def sigmoid(x: pd.Series):
#     return 1 / (1 + np.exp(-x))

# def normalize_levels_logistic(x: pd.Series, scale=10):
#     return normalize_levels(sigmoid(normalize_zero_centered(x)), scale)

In [None]:
# Per-country features, each rescaled to [0, 10] before being weighted.
stats = pd.DataFrame({
    "sqrt_name_length": normalize(np.sqrt(df["name"].map(len)), scale=10),
    **{
        f"log_{column}": normalize(np.log(df[column]), scale=10)
        for column in ("population", "gdp", "gdp_per_capita")
    },
})

df["difficulty"] = normalized_combination(stats, COUNTRY_DIFFICULTY_WEIGHTS, scale=10)

# VATICAN is an outlier. As reference, set as difficult as MONACO.
df.loc[df["name"].eq("Vatican"), "difficulty"] = df.loc[df["name"].eq("Monaco"), "difficulty"].iloc[0]

# df1 = df.nsmallest(50, "difficulty").reset_index()
# df1
# df1[df1["landlocked"]][["iso", "difficulty", "gdp", "population"]]
# df.nlargest(20, "difficulty")[["name", "gdp", "population", "difficulty"]]
# The 50 rows with the lowest difficulty, sorted ascending.
df.sort_values("difficulty")[["name", "gdp", "gdp_per_capita", "population", "difficulty"]].head(50)

In [None]:
# (Re)bind the shared preprocessing objects used by the difficulty cells below.
categories, setkeys, cells = (
    preprocessing.categories,
    preprocessing.setkeys,
    preprocessing.cells,
)

In [None]:
# One boolean column per (key, value) set key: True where the country's iso code is in that set.
items = df.loc[:, ["iso", "name", "difficulty"]].copy()
for key, value in setkeys:
    cat = categories[key]
    items[(key, value)] = items["iso"].apply(lambda iso, members=cat.sets.loc[value]: iso in members)


def aggregate_difficulty(difficulties: pd.Series, offsets=None):
    """Collapse each collection of difficulties into a single score in ``[0, 10]``.

    Every element of ``difficulties`` is a sized collection of numbers. Size,
    size score, mean, min, max, median and standard deviation are computed per
    element, then combined using ``SOLUTION_DIFFICULTY_WEIGHTS`` and rescaled.

    Args:
        difficulties: Series whose elements are collections of difficulties.
        offsets: Optional per-element value stored in the "offset" column;
            zeros when omitted.

    Returns:
        float64 Series of scores, indexed like the reset (positional) index.
    """
    offsets = np.zeros_like(difficulties) if offsets is None else offsets
    table = difficulties.copy().rename("difficulties").reset_index()
    per_entry = table["difficulties"]

    table["offset"] = offsets
    table["size"] = per_entry.apply(len)
    # Negated square root: smaller collections get the larger size score.
    table["size_score"] = normalize(-np.sqrt(table["size"]), scale=10)
    summary_functions = (
        ("mean", np.mean),
        ("min", min),
        ("max", max),
        ("median", np.median),
        ("std", np.std),
    )
    for column, reducer in summary_functions:
        table[column] = per_entry.apply(reducer)

    table = table.drop(columns=["difficulties"])
    return normalized_combination(table, SOLUTION_DIFFICULTY_WEIGHTS, scale=10).astype("float64")


# One entry per (key, value) set key: the difficulties of all countries flagged as members in `items`.
info = pd.Series({(key, value): items[items[(key, value)]].difficulty.agg(list) for key, value in setkeys})
info = info.reset_index()
info.columns=["key", "value", "difficulties"]
# Category-level difficulty, rescaled to [0, 10], is passed as the "offset" term of the set-key score.
cat_difficulty = normalize(info.key.apply(lambda key: categories[key].difficulty), scale=10)
info["difficulty"] = aggregate_difficulty(info["difficulties"], offsets=cat_difficulty)

# Attach the member collection of each set key, then index by (key, value) so other frames can join on it.
info["countries"] = info.apply(lambda row: categories[row["key"]].sets.loc[row["value"]], axis=1)
info.set_index(["key", "value"], inplace=True)
# Preview: the 20 set keys with the lowest difficulty.
info.nsmallest(20, "difficulty").reset_index()

In [None]:
# Look up the set-key difficulty of each cell's row key and column key.
cell_info = (
    preprocessing.cell_info.copy()
    .reset_index(drop=True)
    .join(info["difficulty"].rename("row_difficulty"), on=["row_cat", "row_val"])
    .join(info["difficulty"].rename("col_difficulty"), on=["col_cat", "col_val"])
)
cell_info["row_col_difficulty"] = normalize(
    cell_info["row_difficulty"].add(cell_info["col_difficulty"]), scale=10
)
cell_info["row_col_difficulty_harmonic"] = normalize(
    cell_info["row_difficulty"].add(1).mul(cell_info["col_difficulty"].add(1)), scale=10
)
# NOTE: not the last expression of the cell, so this preview is computed but not rendered.
cell_info.nsmallest(20, "row_col_difficulty")

# Difficulties of the countries contained in each cell, aggregated into one content score per cell.
content_difficulties = cell_info["contents"].apply(
    lambda isos: items.loc[items["iso"].isin(isos), "difficulty"].agg(list)
)
cell_info["content_difficulty"] = aggregate_difficulty(content_difficulties)
cell_info["difficulty"] = normalized_combination(cell_info, CELL_DIFFICULTY_WEIGHTS, scale=10)
# cell_info.nsmallest(20, "content_difficulty")

# info = aggregate_difficulty(cell_info["content_difficulties"])
# info.nsmallest(20, "difficulty").reset_index()

# cell_info = cell_info.join(info["difficulty"], on=["row_cat", "row_val"])

In [None]:
# Preview: the 20 cells with the highest content difficulty.
cell_info.nlargest(20, "content_difficulty")

In [None]:
import itertools

# One row per game: the game object plus its row and column set keys as (category key, value) pairs.
game_info = pd.DataFrame([
    {
        "game": game,
        **{
            axis: [(cat.key, value) for cat, value in getattr(game, axis)]
            for axis in ("rows", "cols")
        },
    }
    for game in games
])

def get_cell_indices(cell_data):
    """Yield the ``cell_info`` index label of every row/column pairing of a game.

    ``cell_data`` provides "rows" and "cols", each a sequence of
    (category, value) pairs. For each pairing the larger pair (by tuple
    comparison) is matched against ``row_cat``/``row_val`` and the smaller
    against ``col_cat``/``col_val``; the first matching label is yielded.
    """
    for (cat1, val1), (cat2, val2) in itertools.product(cell_data["rows"], cell_data["cols"]):
        first, second = (cat1, val1), (cat2, val2)
        row_cat, row_val = max(first, second)
        col_cat, col_val = min(first, second)
        row_match = (cell_info["row_cat"] == row_cat) & (cell_info["row_val"] == row_val)
        col_match = (cell_info["col_cat"] == col_cat) & (cell_info["col_val"] == col_val)
        yield cell_info.index[row_match & col_match].tolist()[0]

# Resolve every game's cells to cell_info labels, then summarise the difficulties of those cells.
game_info["cell_indices"] = game_info.apply(lambda record: [*get_cell_indices(record)], axis=1)
game_info["cell_difficulties"] = game_info["cell_indices"].apply(
    lambda labels: cell_info["difficulty"].loc[labels].tolist()
)
game_info["max_cell_difficulty"] = game_info["cell_difficulties"].apply(max)
game_info["avg_cell_difficulty"] = game_info["cell_difficulties"].apply(np.mean)
# Number of cells that have exactly one solution.
game_info["num_unique"] = game_info["game"].apply(
    lambda game: sum(1 for solutions in sum(game.solutions, []) if len(solutions) == 1)
)

game_info.nsmallest(20, "max_cell_difficulty")

In [None]:
# Show the 20 games with the highest maximum cell difficulty, including their solutions.
for game in game_info.nlargest(20, "max_cell_difficulty")["game"]:
    print(game.data)
    display(game.to_dataframe(solution=True))

In [None]:
# Distribution of each game's hardest cell across all generated games.
plt.hist(game_info["max_cell_difficulty"], bins=20, rwidth=.9)
plt.title("Max cell difficulty")
plt.show()

In [None]:
game_info["difficulty"] = normalized_combination(game_info, GAME_DIFFICULTY_WEIGHTS, scale=10)
# difficulty_order: "This game is harder than x% of all games."
game_info["difficulty_order"] = pd.qcut(game_info["avg_cell_difficulty"], q=100, labels=False)
# Easy: no very hard cell, at most two single-solution cells, and within the lower 40 percentile ranks.
ix_easy = (game_info["max_cell_difficulty"] < 6) & (game_info["num_unique"] <= 2) & (game_info["difficulty_order"] <= 40)
# The remaining games are split into medium / hard at the median of their percentile rank.
hard_bound = game_info[~ix_easy]["difficulty_order"].median()
ix_medium =  ~ix_easy & (game_info["difficulty_order"] <= hard_bound)
ix_hard = ~ix_easy & ~ix_medium

# game_info["class"] = pd.qcut(game_info["difficulty"], q=3, labels=False)
# Start with an object column: the values are DifficultyLevel members, and writing them into an
# integer column (the former `= 0` initialisation) is an incompatible-dtype assignment in pandas.
game_info["level"] = pd.Series(index=game_info.index, dtype="object")
game_info.loc[ix_easy, "level"] = DifficultyLevel.EASY
game_info.loc[ix_medium, "level"] = DifficultyLevel.MEDIUM
game_info.loc[ix_hard, "level"] = DifficultyLevel.HARD

# Copy the metrics onto the game objects; game_info row i belongs to games[i].
for i, game in enumerate(games):
    game.data["max_cell_difficulty"] = game_info.loc[i, "max_cell_difficulty"]
    game.data["avg_cell_difficulty"] = game_info.loc[i, "avg_cell_difficulty"]
    game.data["difficulty_level"] = str(game_info.loc[i, "level"])

In [None]:
# Average vs. maximum cell difficulty per game, coloured by difficulty level.
# NOTE(review): the colour values are DifficultyLevel members; presumably they are turned into
# numbers through DifficultyLevel.__float__ — confirm.
scatter = plt.scatter(x=game_info["avg_cell_difficulty"], y=game_info["max_cell_difficulty"], c=game_info["level"])
plt.xlabel("Average cell difficulty")
plt.ylabel("Maximum cell difficulty")
plt.title("Distribution of game difficulty")
plt.xlim([0,10])
plt.ylim([0,10])
ax = plt.gca()
# The legend labels are hard-coded; this assumes all three levels occur, in this order — TODO confirm.
legend = ax.legend(scatter.legend_elements()[0], ["Easy", "Medium", "Hard"], loc="lower right", title="Difficulty Level")
ax.add_artist(legend)
plt.show()

In [None]:
# Number of games per difficulty level.
game_info["level"].value_counts()

In [None]:
# Distribution of the content difficulty over all cells.
plt.hist(cell_info["content_difficulty"], bins=20, rwidth=.9)
plt.title("Cell content difficulty")
plt.show()

In [None]:
# Persist all games; no difficulty_level is passed, so the file name carries no level suffix.
save_games(games, "with-difficulty")

In [None]:
# Row-key vs. column-key difficulty of every cell, coloured by the cell's content difficulty.
# plt.scatter(x=cell_info["row_col_difficulty"], y=cell_info["content_difficulty"])
plt.scatter(x=cell_info["row_difficulty"], y=cell_info["col_difficulty"], c=cell_info["content_difficulty"])
# plt.scatter(x=cell_info["row_col_difficulty"], y=cell_info["row_col_difficulty_harmonic"])
plt.show()

In [None]:
# NOTE: this rebinds cell_info to a fresh copy, discarding the difficulty columns computed earlier.
cell_info = preprocessing.cell_info.copy()

# pd.merge(cell_info, info[["key", "value", "difficulty"]].add_prefix("row_"), left_on=["row_cat", "row_val"], right_on=["row_key", "row_value"])
# join() returns a whole DataFrame, which cannot be assigned to the single column "row_difficulty".
# Join the renamed difficulty series instead, looked up in `info` by (row_cat, row_val).
cell_info = cell_info.join(info["difficulty"].rename("row_difficulty"), on=["row_cat", "row_val"])
cell_info
# for orient in ["row", "col"]:
#     cell_info[f"{orient}_difficulty"] = 

In [None]:
# Calculate difficulty of continents

# Country table with the continent exposed under the generic column name "category".
data = df[["iso", "name", "continent", "difficulty"]].rename(columns={"continent": "category"})

# Step 1: Calculate average difficulty per category
# Step 2: Calculate standard deviation per category
# Step 3: Adjust for category size (you can adjust the value to your liking)
# Stored under its own name: rebinding `categories` here would replace the category mapping from
# `preprocessing` that earlier cells index by key, so those cells could not be re-run afterwards.
difficulty_by_category = data.groupby('category')['difficulty']
continent_stats = pd.DataFrame({"avg": difficulty_by_category.mean(),
                                "median": difficulty_by_category.median(),
                                "std": difficulty_by_category.std(),
                                "size": data['category'].value_counts()})

# Idea: entropy (europe is easy because oceania is hard)

# Step 4: Combine
# Step 5: Normalize
continent_stats["difficulty"] = normalize(2 * continent_stats["avg"] + continent_stats["std"] + np.sqrt(continent_stats["size"]), scale=10)

continent_stats.sort_values("difficulty")

In [None]:
def entropy(x: pd.Series) -> float:
    """Shannon entropy (in bits) of the value distribution of ``x``.

    Probabilities are the value counts divided by ``len(x)``.
    """
    counts = x.value_counts()
    probabilities = counts / len(x)
    terms = -probabilities * np.log2(probabilities)
    return np.sum(terms)

def entropies(x: pd.Series, y: pd.Series):
    """Placeholder: not implemented yet, always returns ``None``."""
    return None
#     return pd.concat([x, y], axis=1).groupby(x.name)[y.name].agg(entropy)


# entropy(pd.Series(np.array([1,2,3,4])))
# entropy(pd.Series(np.array([1,1,1,1])))
# entropies(df["continent"], df["difficulty"])

In [None]:
import matplotlib.pyplot as plt

# plt.scatter(x=np.log(df["gdp"]) - np.log(df["population"]), y=df["gdp_per_capita"])
# plt.scatter(x=np.log(df["population"]), y=np.log(df["gdp"]), c=df["difficulty"])
# plt.scatter(x=np.log(df["population"]), y=np.log(df["gdp"]), c=df["difficulty"])
# Every plotting call in this cell is commented out; only show() remains active.
plt.show()

In [None]:
def get_difficulty(game):
    """Collect per-game metrics used to score a game's difficulty.

    Args:
        game: Object with ``rows`` and ``cols`` (lists of (category, value)
            pairs) and ``solutions`` / ``alt_solutions`` (grids whose cells
            are lists of iso codes). Categories provide ``sets`` and
            ``difficulty``.

    Returns:
        dict with cell sizes and their median, per-cell minimum and median
        country difficulty (looked up in the global ``df`` by iso code), the
        number of single-solution cells, the largest number of cells one
        country appears in (overall, and as the unique solution), the category
        sizes and their median, the number of distinct solution countries, and
        the mean category difficulty.
    """
    game_cats = [cat for cat, _ in game.rows + game.cols]
    cat_sizes = [len(cat.sets) for cat in game_cats]

    # Flatten the solution grids into flat lists of cells.
    solutions = [cell for row in game.solutions for cell in row]
    alt_solutions = [cell for row in game.alt_solutions for cell in row]
    all_solutions = sorted({country for cell in solutions + alt_solutions for country in cell})
    cell_sizes = [len(cell) for cell in solutions]
    unique_cells = [cell for cell in solutions if len(cell) == 1]
    cat_difficulty = sum(cat.difficulty for cat in game_cats)

    return {
        "cell_sizes": cell_sizes,
        "cell_sizes_median": pd.Series(cell_sizes).median(),
        "cell_min_difficulties": [df[df["iso"].isin(cell)]["difficulty"].min() for cell in solutions],
        "cell_median_difficulties": [df[df["iso"].isin(cell)]["difficulty"].median() for cell in solutions],
        "num_unique": len(unique_cells),
        # default=0 keeps a game without any solution country from raising on an empty max().
        "max_num_cells": max(
            (sum(1 for cell in solutions if c in cell) for c in all_solutions), default=0
        ),
        "max_num_cells_unique_solution": max(
            (sum(1 for cell in unique_cells if c in cell) for c in all_solutions), default=0
        ),
        "cat_sizes": cat_sizes,
        "all_solutions": len(all_solutions),
        # Median of the category sizes (previously computed from cell_sizes by mistake).
        "cat_sizes_median": pd.Series(cat_sizes).median(),
        "cat_difficulty": cat_difficulty / len(game_cats)
    }

# Per-game feature table: one row per generated game with the metrics from get_difficulty().
# It has to be built here: the set-key table previously bound to `info` has no
# "cell_min_difficulties" column, so the assignment below would raise a KeyError on a fresh run.
info = pd.DataFrame([{"game": game, **get_difficulty(game)} for game in games])
# info["score"] = -info["num_unique"] + info["cat_difficulty"] + info["cell_sizes_median"] + info["cat_sizes_median"]

# Hardest cell of each game, measured by the easiest country that solves it.
info["cell_max_min_difficulties"] = info["cell_min_difficulties"].apply(max)


# info.head(30)

# idea: countries with small population / size are harder to guess
# log(sum(populations of all solution)) median of all cells

In [None]:
# Display the `info` table as currently bound.
info

In [None]:
# Distribution of the per-country difficulty score.
plt.hist(df["difficulty"], bins=20, rwidth=.9)
plt.title("Country difficulties")
plt.show()

In [None]:
# `log_name_length` was not defined anywhere in the notebook, so this cell failed on a fresh kernel.
# NOTE(review): reconstructed from the variable name as the log of the country name length — confirm.
log_name_length = np.log(df["name"].apply(len))
plt.hist(log_name_length, bins=15, rwidth=.9)
plt.title("Country name length")
plt.show()

In [None]:
# Requires `info` to carry the per-game "cell_max_min_difficulties" column.
plt.hist(info["cell_max_min_difficulties"], bins=20, rwidth=.9)
plt.title("Games: Hardest cell solution difficulty")
plt.show()

In [None]:
# One bin per integer count from 0 to 8 (bin edges 0..9).
plt.hist(info["num_unique"], bins=range(10), rwidth=.9)
plt.title("Games: Number of unique-solution cells")
plt.show()

In [None]:
# Hand-weighted game difficulty, rescaled to [0, 10].
info["difficulty"] = 5 * info["cell_max_min_difficulties"] + 2 * info["num_unique"] + 5 * info["cat_difficulty"] + .5 * info["cell_sizes_median"] + .5 * info["cat_sizes_median"]
info["difficulty"] = 10 * ((info["difficulty"] - info["difficulty"].min()) / (info["difficulty"].max() - info["difficulty"].min()))

# Compute the 20 easiest rows once (it used to be recomputed for every game in the comprehension).
# `info` row i belongs to games[i]; the games keep their original order.
easiest_indices = set(info.nsmallest(20, "difficulty").index)
easiest = [game for i, game in enumerate(games) if i in easiest_indices]

for game in easiest:
    display(game.to_dataframe(solution=True))

# The context manager closes the output file even if serialization fails.
with open("../../data/games_easy.json", mode="w", encoding="utf-8") as file:
    json.dump([game.to_json() for game in easiest], file)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Two side-by-side histograms that share the same binning and tick layout.
plt.figure(figsize=(12,3))
for position, column, title in (
    (121, "max_num_cells", "Countries appear in up to x cells"),
    (122, "max_num_cells_unique_solution", "Countries appear in up to x cells as unique solution"),
):
    plt.subplot(position)
    plt.hist(info[column], rwidth=.9, bins=range(10))
    # Ticks sit at the bin centres and are labelled with the integer counts.
    plt.xticks(np.arange(.5,10,1), labels=range(10))
    plt.xlim([-.5,9.5])
    # plt.xticklabels()
    plt.title(title)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Distribution of the "cat_difficulty" column of `info`.
plt.hist(info["cat_difficulty"], rwidth=.9)
plt.title("Category difficulty sum")
plt.show()

In [None]:
# Inspect the to_json() output of the first generated game.
games[0].to_json()

In [None]:
# save()