In [None]:
import preprocessing

In [None]:
# from importlib import reload
# reload(preprocessing)

In [None]:
# Country table exposed by the preprocessing module. This binds the module's own
# object rather than a copy, so the `difficulty` column assigned to `df` further
# down is written onto preprocessing.df as well.
df = preprocessing.df

# Compute country difficulties

In [None]:
def normalize(x: "pd.Series", scale=1):
    """Min-max rescale ``x`` linearly onto the interval ``[0, scale]``.

    Parameters
    ----------
    x : pd.Series
        Numeric values to rescale.
    scale : number, default 1
        Upper end of the output range: the minimum of ``x`` maps to 0 and the
        maximum maps to ``scale``.

    Returns
    -------
    pd.Series
        The rescaled values, aligned with ``x``. When every value of ``x`` is
        identical the result is all zeros instead of the NaNs that a 0/0
        division would produce.
    """
    # The annotation is written as a string on purpose: annotations are evaluated
    # when the function is defined, and pandas is only imported in a later cell,
    # so a bare `pd.Series` here fails on a fresh kernel.
    lowest = x.min()
    value_range = x.max() - lowest
    if value_range == 0:
        # Constant input: there is no spread to rescale, so pin everything to 0
        # (multiplying keeps the index and any NaN entries intact).
        return (x - lowest) * 0.0
    return scale * (x - lowest) / value_range

In [None]:
import numpy as np

# Three signals per country, each log-transformed and rescaled to the 0-10 range.
name_lengths = df["name"].apply(len)
log_name_length = normalize(name_lengths.apply(np.log), scale=10)
log_population = normalize(np.log(df["population"]), scale=10)
log_gdp = normalize(np.log(df["gdp"]), scale=10)

# Larger population and GDP lower the difficulty (GDP weighted twice as much);
# a longer name raises it slightly. The combined signal is rescaled and inverted
# so that the final score again lies between 0 and 10.
combined_signal = 1 * log_population + 2 * log_gdp - .1 * log_name_length
df["difficulty"] = 10 - normalize(combined_signal, scale=10)

# Scratch: inspect the landlocked countries among the 50 lowest-difficulty ones.
# df1 = df.nsmallest(50, "difficulty").reset_index()
# df1
# df1[df1["landlocked"]][["iso", "difficulty", "gdp", "population"]]

In [None]:
# The 20 countries with the highest difficulty score, with the inputs that drive it.
hardest_columns = ["name", "gdp", "population", "difficulty"]
df.nlargest(20, "difficulty")[hardest_columns]

In [None]:
import pandas as pd

# Per-country difficulty, with the continent acting as the grouping category.
data = df[["iso", "name", "continent", "difficulty"]].rename(columns={"continent": "category"})

# Summary statistics of the country difficulties within each category,
# plus the number of countries the category contains.
difficulty_by_category = data.groupby("category")["difficulty"]
categories = pd.DataFrame({
    "avg": difficulty_by_category.mean(),
    "median": difficulty_by_category.median(),
    "std": difficulty_by_category.std(),
    "size": data["category"].value_counts(),
})

# Idea: entropy (europe is easy because oceania is hard)

# Combine mean (weighted twice), spread and the square root of the category size
# into one score, then rescale it to the 0-10 range. The weights are tunable.
category_score = 2 * categories["avg"] + categories["std"] + np.sqrt(categories["size"])
categories["difficulty"] = normalize(category_score, scale=10)

categories.sort_values("difficulty")

In [None]:
def entropy(x: pd.Series) -> float:
    """Return the Shannon entropy, in bits, of the values observed in ``x``.

    Each distinct value's count is divided by ``len(x)`` to obtain its relative
    frequency, and the frequencies are combined as ``sum(p * -log2(p))``.
    """
    probabilities = x.value_counts() / len(x)
    information = -np.log2(probabilities)
    return np.sum(probabilities * information)

def entropies(x: pd.Series, y: pd.Series):
    """Placeholder that is not implemented yet; it always returns ``None``.

    The disabled draft below would group ``y`` by the values of ``x`` and apply
    ``entropy`` to each group.
    """
    # Draft implementation, currently disabled:
    #     return pd.concat([x, y], axis=1).groupby(x.name)[y.name].agg(entropy)
    return None


# entropy(pd.Series(np.array([1,2,3,4])))
# entropy(pd.Series(np.array([1,1,1,1])))
# entropies(df["continent"], df["difficulty"])

In [None]:
import matplotlib.pyplot as plt

# Difficulty score plotted against log-GDP, one point per row of df.
fig, ax = plt.subplots()
ax.scatter(x=np.log(df["gdp"]), y=df["difficulty"])
plt.show()

In [None]:
# import importlib
# importlib.reload(preprocessing)

In [None]:
# Rebinds `categories` to the object exported by the preprocessing module. The
# per-continent summary frame that was built under the same name in an earlier
# cell is no longer reachable through this name afterwards.
categories = preprocessing.categories

In [None]:
from game import *

# A continent category must always be part of the game.
required_constraints = [Constraint.category_at_least("continent", 1)]

# Some categories are pretty boring when they appear multiple times,
# so each of them is capped at a single occurrence.
capped_categories = ["capital_ending_letter", "capital_starting_letter", "ending_letter"]
constraints = required_constraints + [
    Constraint.category_at_most(category, 1) for category in capped_categories
]

# Generate a pool of candidate games that are scored in the following cells.
num_games = 1000
games = [preprocessing.create_game(constraints, shuffle=True) for _ in range(num_games)]

In [None]:
def get_difficulty(game):
    """Collect difficulty-related metrics for one generated game.

    Parameters
    ----------
    game
        Object exposing ``rows`` and ``cols`` (lists of ``(category, value)``
        pairs whose categories provide ``sets`` and ``difficulty``) as well as
        ``solutions`` and ``alt_solutions`` (lists of rows, each row a list of
        cells, each cell a collection of values matched against ``df["iso"]``).

    Returns
    -------
    dict
        ``cell_sizes`` / ``cell_sizes_median``: number of solutions per cell.
        ``cell_min_difficulties`` / ``cell_median_difficulties``: per cell, the
        minimum / median ``difficulty`` of its solutions in ``df``.
        ``num_unique``: number of cells with exactly one solution.
        ``max_num_cells``: largest number of cells a single solution appears in.
        ``max_num_cells_unique_solution``: same, counting only one-solution cells.
        ``cat_sizes`` / ``cat_sizes_median``: ``len(cat.sets)`` per category.
        ``all_solutions``: number of distinct values over solutions and
        alternative solutions.
        ``cat_difficulty``: mean ``difficulty`` of the game's categories.

    Notes
    -----
    Reads the notebook-level ``df`` (columns ``iso`` and ``difficulty``).
    """
    from itertools import chain

    game_cats = [cat for cat, _value in game.rows + game.cols]
    cat_sizes = [len(cat.sets) for cat in game_cats]

    # Flatten each grid (a list of rows) into one flat list of cells; chaining
    # avoids the quadratic copying of repeated list concatenation.
    solutions = list(chain.from_iterable(game.solutions))
    alt_solutions = list(chain.from_iterable(game.alt_solutions))
    all_solutions = sorted(set(chain.from_iterable(solutions + alt_solutions)))

    cell_sizes = [len(cell) for cell in solutions]
    num_unique = sum(1 for size in cell_sizes if size == 1)

    def cell_difficulties(cell):
        # Difficulty values of the countries listed in one cell.
        return df.loc[df["iso"].isin(cell), "difficulty"]

    # `default=0` keeps a game without any solutions from raising in max().
    max_num_cells = max(
        (sum(1 for cell in solutions if c in cell) for c in all_solutions),
        default=0,
    )
    max_num_cells_unique_solution = max(
        (sum(1 for cell in solutions if len(cell) == 1 and c in cell) for c in all_solutions),
        default=0,
    )

    # Mean category difficulty; NaN (like an empty median) when there are no categories.
    if game_cats:
        cat_difficulty = sum(cat.difficulty for cat in game_cats) / len(game_cats)
    else:
        cat_difficulty = float("nan")

    return {
        "cell_sizes": cell_sizes,
        "cell_sizes_median": pd.Series(cell_sizes, dtype="float64").median(),
        "cell_min_difficulties": [cell_difficulties(cell).min() for cell in solutions],
        "cell_median_difficulties": [cell_difficulties(cell).median() for cell in solutions],
        "num_unique": num_unique,
        "max_num_cells": max_num_cells,
        "max_num_cells_unique_solution": max_num_cells_unique_solution,
        "cat_sizes": cat_sizes,
        "all_solutions": len(all_solutions),
        # Median of the category sizes (previously computed from cell_sizes by mistake).
        "cat_sizes_median": pd.Series(cat_sizes, dtype="float64").median(),
        "cat_difficulty": cat_difficulty,
    }

# One row per generated game: the game object itself plus all of its metrics.
records = [{"game": game, **get_difficulty(game)} for game in games]
info = pd.DataFrame(records)
# info["score"] = -info["num_unique"] + info["cat_difficulty"] + info["cell_sizes_median"] + info["cat_sizes_median"]

# For each game, the largest of its per-cell minimum difficulties.
info["cell_max_min_difficulties"] = info["cell_min_difficulties"].map(max)


# info.head(30)

# Idea: countries with a small population / size are harder to guess, e.g.
# log(sum(populations of all solutions)), median over all cells.

In [None]:
# Show the per-game metrics table (one row per entry of `games`).
info

In [None]:
# Distribution of the per-country difficulty scores.
fig, ax = plt.subplots()
ax.hist(df["difficulty"], bins=20, rwidth=.9)
ax.set_title("Country difficulties")
plt.show()

In [None]:
# Distribution of the name-length signal. The plotted values are the rescaled
# log name lengths (0-10), not raw character counts.
fig, ax = plt.subplots()
ax.hist(log_name_length, bins=15, rwidth=.9)
ax.set_title("Country name length")
plt.show()

In [None]:
# Per game: the difficulty of the hardest cell, where a cell is rated by its
# lowest-difficulty solution.
fig, ax = plt.subplots()
ax.hist(info["cell_max_min_difficulties"], bins=20, rwidth=.9)
ax.set_title("Games: Hardest cell solution difficulty")
plt.show()

In [None]:
# Per game: how many cells have exactly one solution.
fig, ax = plt.subplots()
ax.hist(info["num_unique"], bins=range(10), rwidth=.9)
ax.set_title("Games: Number of unique-solution cells")
plt.show()

In [None]:
# `json` is needed here, before the cell further down that imports it, so import
# it locally to keep this cell runnable on a fresh kernel.
import json

# Weighted combination of the per-game metrics, rescaled to the 0-10 range.
raw_difficulty = (
    5 * info["cell_max_min_difficulties"]
    + 2 * info["num_unique"]
    + 5 * info["cat_difficulty"]
    + .5 * info["cell_sizes_median"]
    + .5 * info["cat_sizes_median"]
)
info["difficulty"] = 10 * ((raw_difficulty - raw_difficulty.min()) / (raw_difficulty.max() - raw_difficulty.min()))

# Look up the 20 lowest-difficulty games once (instead of once per game) and
# keep them in their original order within `games`.
easiest_index = set(info.nsmallest(20, "difficulty").index)
easiest = [game for i, game in enumerate(games) if i in easiest_index]

for game in easiest:
    display(game.to_dataframe(solution=True))

# Serialize first so a failing to_json() cannot leave a truncated file behind,
# then write through a context manager so the file handle is always closed.
easiest_payload = [game.to_json() for game in easiest]
with open("../../data/games_easy.json", mode="w", encoding="utf-8") as easy_file:
    json.dump(easiest_payload, easy_file)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Two side-by-side histograms that share the same binning and axis setup.
panels = [
    (121, "max_num_cells", "Countries appear in up to x cells"),
    (122, "max_num_cells_unique_solution", "Countries appear in up to x cells as unique solution"),
]

plt.figure(figsize=(12, 3))
for position, column, panel_title in panels:
    plt.subplot(position)
    plt.hist(info[column], rwidth=.9, bins=range(10))
    # Bins are one unit wide, so ticks at the .5 offsets sit at the bar centres
    # and carry the integer labels.
    plt.xticks(np.arange(.5, 10, 1), labels=range(10))
    plt.xlim([-.5, 9.5])
    plt.title(panel_title)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# `cat_difficulty` holds the summed category difficulty divided by the number of
# categories in the game, i.e. a mean, so the title says "mean" rather than "sum".
plt.hist(info["cat_difficulty"], rwidth=.9)
plt.title("Category difficulty mean")
plt.show()

In [None]:
# Inspect what `to_json()` returns for the first generated game; this is the same
# per-game value that gets passed to json.dump when games are exported.
games[0].to_json()

In [None]:
import json

def save():
    """Write all games in ``games`` to ``../../data/games1.json`` as one JSON list.

    Each game is converted with its ``to_json()`` method. The conversion happens
    before the file is opened, so a failure there does not truncate an existing
    file, and the file handle is closed even if writing raises.
    """
    payload = [game.to_json() for game in games]
    with open("../../data/games1.json", mode="w", encoding="utf-8") as games_file:
        json.dump(payload, games_file)

In [None]:
# save()