In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sqlite3

# Open the word database read-only. sqlite3 only interprets a "file:...?mode=ro"
# string as a URI when uri=True is passed; otherwise the whole string is taken
# as a literal file name and the mode=ro query has no effect.
connection = sqlite3.connect("file:db/words.db?mode=ro", uri=True)

In [None]:
# Resolve the numeric id of the user whose statistics this notebook analyses.
username = 'stormalinblue'
user_row = connection.execute(
    'select id from users where users.user_name = ?', (username,)
).fetchone()
# fetchone() returns None when no row matches; fail with a readable message
# instead of "'NoneType' object is not subscriptable".
if user_row is None:
    raise LookupError(f'no user named {username!r} in the users table')
user_id = user_row[0]

In [None]:
import lib.game.model as gamemodel
from lib.common.util import utc_now_sec_timestamp

In [None]:
import pandas as pd
import numpy as np
import scipy

In [None]:
# Prior weights handed to the word-weight model below (and reused later as the
# parameters of the reference Beta curve).
prior_correct, prior_incorrect = 0.25, 0.75
prior_total = prior_correct + prior_incorrect

# Per-user weights as computed by the game model at the current UTC timestamp.
word_weight_table = gamemodel.word_weight_table(
    connection,
    user_id,
    utc_now_sec_timestamp(),
    prior_incorrect=prior_incorrect,
    prior_correct=prior_correct,
)

# One row per word / part-of-speech pairing, indexed by word_pos_id.
word_pos_definitions = pd.read_sql(
    sql='''
select
    word_pos.id as word_pos_id, words.word as word, parts_of_speech.name as pos
from
    word_parts_of_speech as word_pos
    join words on words.id = word_pos.word_id
    join parts_of_speech on parts_of_speech.id == word_pos.part_of_speech_id
order by words.word, parts_of_speech.name''',
    index_col=['word_pos_id'],
    con=connection,
)

# Place the definitions and the weights side by side, aligned on the index.
# NOTE(review): presumably word_weight_table is also indexed by word_pos_id — confirm.
word_table = pd.concat([word_pos_definitions, word_weight_table], axis='columns')
print('num word pos', len(word_table))

In [None]:
import matplotlib.pyplot as plt

# Plot the Beta CDFs of the word / part-of-speech pairs with the highest mean
# probability of a correct answer.
fig, ax = plt.subplots()

x_axis = np.linspace(0, 1, 100)

n_highest = 10
# Mean of Beta(correct, incorrect) is correct / (correct + incorrect).
# The column is stored on word_table because later cells read it as well.
word_table['mean_prob'] = (word_table['correct']) / (word_table['correct'] + word_table['incorrect'])
highest = word_table.sort_values('mean_prob', ascending=False).head(n_highest)
print(highest)
for word_pos_id, row in highest.iterrows():
    ax.plot(x_axis, scipy.stats.beta(row.correct, row.incorrect).cdf(x_axis), label=f'{row.word} ({row.pos})')

ax.set_ylim(0, 1)
ax.set_xlabel('Probability of correctness')
ax.set_ylabel('Cumulative probability')
ax.legend()
# The curves above come from .cdf(), so the title names them CDFs (it used to say PDFs).
ax.set_title(f'CDFs for beta distributions for the top {n_highest} words')

In [None]:
# Plot the Beta CDFs of the word / part-of-speech pairs with the lowest mean
# probability of a correct answer, next to the prior for reference.
# Relies on x_axis and the 'mean_prob' column produced by the top-N cell.
fig, ax = plt.subplots()
n_lowest = 10
lowest = word_table.sort_values('mean_prob', ascending=True).head(n_lowest)
print(lowest)
# Reference curve: the prior Beta(prior_correct, prior_incorrect).
ax.plot(x_axis, scipy.stats.beta(prior_correct, prior_incorrect).cdf(x_axis), label=f'Beta({prior_correct}, {prior_incorrect})')
for word_pos_id, row in lowest.iterrows():
    ax.plot(x_axis, scipy.stats.beta(row.correct, row.incorrect).cdf(x_axis), label=f'{row.word} ({row.pos})')

ax.legend()
ax.set_ylim(0, 1)
ax.set_xlabel('Probability of correctness')
ax.set_ylabel('Cumulative probability')
# The curves above come from .cdf(), so the title names them CDFs (it used to say PDFs).
ax.set_title(f'CDFs for beta distributions for the bottom {n_lowest} words')

In [None]:
# 2-D histogram of the 'incorrect' (x axis) and 'correct' (y axis) weights.
fig, ax = plt.subplots()

ax.set(
    xlabel='Incorrect',
    ylabel='Correct',
    title="Distribution of 'correct' and 'incorrect' weights for all words",
)
ax.grid(True)

mappable = ax.hist2d(word_table['incorrect'], word_table['correct'])
# hist2d returns (counts, xedges, yedges, image); the trailing image drives the colorbar.
fig.colorbar(mappable[-1], ax=ax)
ax.set_aspect(1)

In [None]:
# Histogram of the combined (correct + incorrect) weight of every row in word_table.
fig, ax = plt.subplots()

weight_per_word_pos = word_table['correct'] + word_table['incorrect']
ax.hist(weight_per_word_pos, bins=60, range=(0, 30))
ax.set(title="Histogram of weights caused by user over word-pos's", xlabel='Weights')
ax.set_ylabel('Num words')

In [None]:
# Histogram of the 'mean_prob' column (filled in by the top-N cell) over [0, 1].
fig, ax = plt.subplots()

ax.hist(word_table['mean_prob'], bins=20, range=(0, 1))
ax.set(title='Distribution of mean probabilities', xlabel='Mean probability of correctness')
ax.set_ylabel('Number of words')

In [None]:
# Summarise all rows with a single Beta distribution whose parameters are the
# per-row averages of the 'correct' and 'incorrect' weights, then plot its CDF
# with the mean, median and a set of percentiles marked.
overall_incorrect = (word_table['incorrect']).sum() / word_table.shape[0]
overall_correct = (word_table['correct']).sum() / word_table.shape[0]

fig, ax = plt.subplots()
x_axis = np.linspace(0, 1, 1000)
overall_distribution = scipy.stats.beta(overall_correct, overall_incorrect)
ax.plot(x_axis, overall_distribution.cdf(x_axis))

# Solid vertical line at each percentile, evaluated once for the whole list.
# 0.25 and 0.5 are in this list, so the median and the 25th percentile are no
# longer drawn a second time as separate dashed lines.
percentiles = [0.05, 0.25, 0.5, 0.75, 0.95]
pctile_values = overall_distribution.ppf(percentiles)
for percentile, pctile_value in zip(percentiles, pctile_values):
    ax.axvline(pctile_value)

# Label the median; its line comes from the 0.5 entry of the loop above.
median_prob = overall_distribution.median()
percentile_25 = overall_distribution.ppf(0.25)
ax.annotate(
    text=f'$m$ = {median_prob:.02f}',
    xy=(median_prob, 0.9),
    xycoords=('data', 'axes fraction'),
    xytext=(10, 0),
    textcoords=('offset points'))

# The mean is not one of the percentiles, so it keeps its own dashed line.
mean_prob = overall_distribution.mean()
ax.axvline(mean_prob, linestyle='--')
ax.annotate(
    text=f'$\\mu$ = {mean_prob:.02f}',
    xy=(mean_prob, 0.9),
    xycoords=('data', 'axes fraction'),
    ha='right',
    xytext=(-10, 0),
    textcoords=('offset points'))

# Show the distribution parameters at the left edge of the axes.
ax.annotate(
    text=f'$\\alpha$ = {overall_correct:.02f}, $\\beta$ = {overall_incorrect:.02f}',
    xy=(0, 0.9),
    xycoords=('data', 'axes fraction'))

ax.set_xlabel('Probability of correctness')
ax.set_ylabel('Cumulative probability')
ax.set_title('CDF of the overall beta distribution')

# Print every percentile that was drawn, in the order of `percentiles`.
print(*pctile_values)