In [None]:
import pandas as pd
from generator import Constraint
from utils import *

# Ensure we're running in the right directory.
# `chdir_this_file` comes from the star import of `utils`; presumably it switches
# the working directory to this notebook's folder — TODO confirm.
# The relative data path used further down ("../../public/data/countries/...")
# is resolved against the working directory, so this must run first.
chdir_this_file()

# Board and cell-size settings; these are passed to the Preprocessor as
# field_size / min_cell_size / max_cell_size.
# Presumably the side length of the square game grid — TODO confirm.
FIELD_SIZE = 3
# Presumably the minimum number of valid solutions per cell — TODO confirm.
MIN_CELL_SIZE = 1
# Presumably None means "no upper limit" — TODO confirm in Preprocessor.
MAX_CELL_SIZE = None
def CONSTRAINTS(countries):
    """Build the list of generator constraints for one country table.

    Defined with ``def`` instead of binding a lambda to a name (PEP 8), so the
    callable shows up under its own name in tracebacks and can be documented.
    The upper-case name is kept because callers refer to ``CONSTRAINTS``.

    Args:
        countries: Object exposing ``iso.tolist()``; the pipeline passes the
            countries table it loaded with ``pd.read_json``.

    Returns:
        A new list containing three ``Constraint.category_at_most`` entries
        followed by every item produced by ``Constraint.solutions_at_most``
        for the table's ISO codes.
    """
    return [
        # Some categories are pretty boring to appear multiple times
        Constraint.category_at_most("capital_ending_letter", 1),
        Constraint.category_at_most("capital_starting_letter", 1),
        Constraint.category_at_most("ending_letter", 1),
        # Limit the number of cells a country can appear in
        *Constraint.solutions_at_most(countries.iso.tolist(), 3),
    ]
# Per-category values handed to the generator. They do not sum to 1, so they are
# presumably relative weights rather than probabilities — confirm in `generator`.
CATEGORY_PROBS = {
    # Region
    "continent": 4,
    # Letters of the country name / capital name
    "starting_letter": 3,
    "ending_letter": 1.5,
    "capital_starting_letter": 2,
    "capital_ending_letter": 0.5,
    # Flag
    "flag_colors": 3,
    # Geography
    "landlocked": 2,
    "island": 2,
    # Population and area rankings
    "top_20_population": 2.5,
    "bottom_20_population": 2,
    "top_20_area": 2.5,
    "bottom_20_area": 2,
    # Elevation thresholds
    "elevation_sup5k": 2.5,
    "elevation_sub1k": 2,
    # Capital
    "capital_not_largest": 2,
}

In [None]:
from preprocessing import Preprocessor
from difficulty import DifficultyEstimator, DifficultyLevel

# Language codes to build a pipeline for; each one selects a
# countries-<code>.json data file.
LANGUAGES = ["de", "en"]
# Number of games requested from the generator per language.
NUM_GAMES = 5_000
# Name passed to Preprocessor.save_games when the games are written out.
RUN_NAME = "capital-not-largest"

class GenerationPipeline:
    """Generate, rate and save games for a single language.

    Construction loads the language's country table and wires up the
    ``Preprocessor``, the game generator and the ``DifficultyEstimator``.
    ``generate()`` performs the actual sampling and stores its results on the
    instance as ``games`` and ``difficulty_info``.
    """

    def __init__(self, language):
        """Prepare the pipeline for ``language`` (a code such as ``"en"``).

        The parameter was previously misspelled ``languge``, which made the body
        read a module-level ``language`` variable instead of its own argument;
        it now uses the value actually passed in.

        Args:
            language: Language code; its lower-cased form selects the
                ``countries-<code>.json`` file to load.
        """
        self.language = language
        # Relative path: resolved against the current working directory.
        countries_path = f"../../public/data/countries/countries-{self.language.lower()}.json"
        self.countries = pd.read_json(countries_path, encoding="utf8")
        self.preprocessor = Preprocessor(countries=self.countries,
                                         language=self.language,
                                         field_size=FIELD_SIZE,
                                         min_cell_size=MIN_CELL_SIZE,
                                         max_cell_size=MAX_CELL_SIZE)
        self.constraints = CONSTRAINTS(self.countries)
        # seed=None: no fixed seed is passed, so runs are presumably not
        # reproducible — confirm in the generator.
        self.generator = self.preprocessor.get_generator(self.constraints, CATEGORY_PROBS,
                                                         seed=None, selection_mode="shuffle_setkeys",
                                                         uniform=False, shuffle=True)
        self.estimator = DifficultyEstimator(self.preprocessor)

    def generate(self):
        """Sample ``NUM_GAMES`` games, compute their difficulties and save them.

        Sets ``self.games`` (list of sampled games) and ``self.difficulty_info``
        (whatever ``compute_game_difficulties`` returns), then writes the games
        out through the preprocessor under ``RUN_NAME``.
        """
        # Generate games
        self.games = list(self.generator.sample_games(n=NUM_GAMES))
        # Difficulty computation
        self.difficulty_info = self.estimator.compute_game_difficulties(self.games)
        # Save games and categories to JSON file
        self.preprocessor.save_games(self.games, name=RUN_NAME)

# One pipeline per language, keyed by its language code.
pipelines = {}
for language in LANGUAGES:
    # Constructing the pipeline loads the country data and builds the generator;
    # generate() then samples NUM_GAMES games, rates them and saves them.
    pipelines[language] = GenerationPipeline(language)
    pipelines[language].generate()

In [None]:
# Short handles for the two pipelines, used by the analysis cells.
EN = pipelines["en"]
DE = pipelines["de"]
# Peek at the first generated English game.
EN.games[0].data

## Game statistics (cells adapted from `game_stats.ipynb`)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Overlay the per-game `sample_tries` distributions of both languages.
for label, pipeline in (("EN", EN), ("DE", DE)):
    tries = [game.sample_tries for game in pipeline.games]
    plt.hist(tries, label=label, bins=50, rwidth=.9, alpha=.75)
plt.title("Number of tries for game generation")
plt.legend()
plt.show()

In [None]:
# sample_tries = pd.Series([game.sample_tries for game in games]).value_counts()
# sample_tries = np.array([sample_tries.get(i, 0) for i in range(100)])
# p = (1 - sample_tries / np.roll(np.where(sample_tries != 0, sample_tries, np.nan), 1))[1:]
# plt.scatter(x=p, y=np.zeros_like(p))
# plt.show()

In [None]:
def _max_country_occurrences(game):
    """Return how often the most frequent entry occurs across a game's cells.

    ``game.solutions`` is a two-level nesting of lists (rows of cells, each cell
    a list of solutions). It is flattened in one pass; the former
    ``sum(sum(..., []), [])`` re-copied the growing list at every step.
    """
    flat_solutions = [
        solution
        for row in game.solutions
        for cell in row
        for solution in cell
    ]
    return pd.Series(flat_solutions).value_counts().max()


# Same statistic for both languages. The column name (including its spelling)
# is kept as-is because other cells read it.
for pipeline in (DE, EN):
    pipeline.difficulty_info["max_occurences"] = pipeline.difficulty_info["game"].apply(
        _max_country_occurrences
    )
DE.difficulty_info["max_occurences"].describe()

In [None]:
# Distribution of the per-game maximum for the English games.
en_max_occurrences = EN.difficulty_info["max_occurences"]
plt.hist(en_max_occurrences, bins=range(10), rwidth=.9)
plt.title("Maximum number of cells a country appears in")
plt.show()

In [None]:
# The difficulty table lives on each pipeline; no bare `difficulty_info` exists
# at notebook level. Bind the name to the English pipeline's table so that cells
# referring to `difficulty_info` resolve on a fresh kernel.
# NOTE(review): English is assumed here — switch to DE.difficulty_info if intended.
difficulty_info = EN.difficulty_info
difficulty_info

In [None]:
# Read the table from the English pipeline explicitly instead of relying on a
# notebook-level `difficulty_info` global.
plt.hist(EN.difficulty_info["max_cell_difficulty"], bins=20, rwidth=.9)
plt.title("Max cell difficulty")
plt.show()

In [None]:
# Show the 20 English games with the highest max cell difficulty, solutions
# included. The table is read from the pipeline explicitly instead of relying
# on a notebook-level `difficulty_info` global.
for game in EN.difficulty_info.nlargest(20, "max_cell_difficulty")["game"]:
    print(game.data)
    display(game.to_dataframe(solution=True))

In [None]:
# Read the table from the English pipeline explicitly instead of relying on a
# notebook-level `difficulty_info` global.
en_difficulty = EN.difficulty_info

scatter = plt.scatter(
    x=en_difficulty["avg_cell_difficulty"],
    y=en_difficulty["max_cell_difficulty"],
    c=en_difficulty["level"],
)
plt.xlabel("Average cell difficulty")
plt.ylabel("Maximum cell difficulty")
plt.title("Distribution of game difficulty")
plt.xlim([0,10])
plt.ylim([0,10])
ax = plt.gca()
# NOTE(review): the labels are matched to the colour handles by position, which
# assumes exactly three levels are present in the data — confirm.
legend = ax.legend(scatter.legend_elements()[0], ["Easy", "Medium", "Hard"], loc="lower right", title="Difficulty Level")
ax.add_artist(legend)
plt.show()

print(en_difficulty["level"].value_counts())

In [None]:
# The estimator is an attribute of each pipeline; there is no notebook-level
# `estimator`. NOTE(review): English is assumed here — use DE.estimator if intended.
cell_info = EN.estimator.cell_info
cell_info.nsmallest(20, "difficulty")

In [None]:
# Histogram of the per-cell solution difficulty.
solution_difficulty = cell_info["solution_difficulty"]
plt.hist(solution_difficulty, bins=20, rwidth=.9)
plt.title("Cell solution difficulty")
plt.show()

In [None]:
# Solution difficulty against the combined row/column difficulty.
plt.scatter(
    x=cell_info["row_col_difficulty"],
    y=cell_info["solution_difficulty"],
)
plt.xlabel("Row-col difficulty")
plt.ylabel("Solution difficulty")
plt.show()

# Row against column difficulty, coloured by solution difficulty.
plt.scatter(
    x=cell_info["row_difficulty"],
    y=cell_info["col_difficulty"],
    c=cell_info["solution_difficulty"],
)
plt.xlabel("Row difficulty")
plt.ylabel("Col difficulty")
plt.show()
# plt.scatter(x=cell_info["row_col_difficulty"], y=cell_info["row_col_difficulty_harmonic"])
# plt.show()

In [None]:
# difficulty_info.nlargest(10, "difficulty")

In [None]:


# plt.scatter(x=np.log(df["gdp"]) - np.log(df["population"]), y=df["gdp_per_capita"])
# plt.scatter(x=np.log(df["population"]), y=np.log(df["gdp"]), c=df["difficulty"])
# plt.scatter(x=np.log(df["population"]), y=np.log(df["gdp"]), c=df["difficulty"])
# plt.show()