# Leela Zerozeug

In [None]:
import sys
import os
import math
import sqlite3
from contextlib import closing

import numpy as np
import scipy.stats as stats

from PIL import Image
from IPython import display
import matplotlib.pyplot as pyplot
%matplotlib inline

%load_ext autoreload
%autoreload 2

from src import *

In [None]:
# Leela Zero Training Match Database
# (path of the SQLite file opened by the query cells in this notebook)
db_path = 'data/leela-zero.db'

# Significance Level (α) for Hypothesis Tests: test_fairness() rejects the null
# hypothesis when the computed tail probability falls below this threshold
alpha = 0.01

## Resignation Rate

While scrutinising the *Leela Zero* web pages, I noticed what seemed like an alarmingly high resignation rate. How large, exactly?

In [None]:
# Count all recorded games and those flagged as resigned (resign=1).
# sqlite3's own context manager only scopes a transaction and leaves the
# connection open, so closing() is used to release it when the block ends.
with closing(sqlite3.connect(db_path)) as sql:
    count_games = sql.execute("SELECT COUNT(*) FROM Game").fetchone()[0]
    count_resignations = sql.execute("SELECT COUNT(*) FROM Game WHERE resign=1").fetchone()[0]

if count_games == 0:
    # An empty Game table would otherwise raise ZeroDivisionError in the ratio below.
    print("The database contains no games; nothing to report.")
else:
    print("{0} of {1} games ({2:.4%}) ended with resignation.".format(count_resignations, count_games, (count_resignations / count_games)))

    # Pie chart: resigned games versus games that were played out and scored.
    pyplot.figure()
    pyplot.pie([count_resignations, count_games - count_resignations],
               labels = ['resigned', 'scored'], colors = ['xkcd:fawn', 'xkcd:kiwi'], explode = (0.01, 0),
               autopct = '%1.1f%%', shadow=False, startangle=70)
    pyplot.axis('equal')
    pyplot.show()

Well. I wonder why so many match games end prematurely. These games are played *without* Dirichlet noise.

## Fairness Tests

Here is a routine to test fairness using a *coin-toss* model in which the individual outcomes are modelled as *independent* Bernoulli trials.

In [None]:
def test_fairness(trials, p, successes, alpha, silent=False):
    mu = trials * p
    sigma = math.sqrt(trials * p * (1.0 - p))
    delta = abs(mu - successes)
    z_binomial = 2.0 * stats.binom.cdf(mu - delta, trials, p) if (delta > 0) else 1.0
    z_normal = 2.0 * (1.0 - stats.norm.cdf((delta - 0.5) / sigma)) if (delta > 0) else 1.0
    reject = (z_binomial < alpha)
    
    if not silent:
        print("Expected number of successes after {0} Bernoulli trials (p = {1:.1%}): {2}".format(trials, p, mu))
        print("Observed successes after {0} Bernoulli trials: {1}".format(trials, successes))
        print("Probability of observing ≤ {0} or ≥ {1} successes: {2:.4%}    (binomial distribution)".format((mu - delta), (mu + delta), z_binomial))
        print("Probability of observing ≤ {0} or ≥ {1} successes: {2:.4%}    (normal approximation)".format((mu - delta), (mu + delta), z_normal))
        if reject:
            print("At an α = {0} confidence level, we REJECT the null hypothesis that p = {1}.".format(alpha, p))
        else:
            print("At an α = {0} confidence level, we have insufficient evidence to reject the null hypothesis that p = {1}.".format(alpha, p))
            
    return reject

As a sanity check, let's apply the test to the scraped data in the simplest way imaginable. It is immediately apparent that the *nigiri* procedure is, at least, unbiased.

<span style="color: red;">(For all following tests, unless otherwise stated, it is assumed that *nigiri* is fair and that the stronger and weaker networks get equal opportunities to play as both colours within the scope of a match.)</span>

In [None]:
# Count all games, and those in which the match challenger played black.
# sqlite3's own context manager only scopes a transaction and leaves the
# connection open, so closing() is used to release it when the block ends.
with closing(sqlite3.connect(db_path)) as sql:
    count_games = sql.execute("SELECT COUNT(*) FROM Game").fetchone()[0]
    count_challenger_as_black = sql.execute("SELECT COUNT(*) FROM Game"
                                            " JOIN Match ON Game.match_id = Match.id"
                                            " WHERE Game.black = Match.challenger").fetchone()[0]

if count_games == 0:
    # An empty Game table would otherwise raise ZeroDivisionError in the ratio below.
    print("The database contains no games; nothing to test.")
else:
    print("In {0} of {1} games ({2:.4%}), the challenger took black.".format(count_challenger_as_black, count_games, (count_challenger_as_black / count_games)))
    print()

    # Null hypothesis: the challenger takes black with probability one half.
    test_fairness(count_games, 0.5, count_challenger_as_black, alpha)

    # Pie chart: games with the challenger as black versus as white.
    pyplot.figure()
    pyplot.pie([count_challenger_as_black, count_games - count_challenger_as_black],
               labels = ['black', 'white'], colors = ['lightcoral', 'lightskyblue'], explode = (0.01, 0),
               autopct = '%1.4f%%', shadow=False, startangle=70)
    pyplot.axis('equal')
    pyplot.show()

Now, let's test the assumption that *komi*, alone, makes the game of *Go* fair. Let's be naïve and consider only global totals, weighting game results equally, regardless of the match in which they were played or the network that played them.

In [None]:
# Total victories per colour over every recorded game (victor 1 is summed as
# black, victor 2 as white). sqlite3's own context manager only scopes a
# transaction and leaves the connection open, so closing() is used to release it.
with closing(sqlite3.connect(db_path)) as sql:
    (black_victories, white_victories) = sql.execute("SELECT SUM(case [victor] when 1 then 1 else 0 end) as [black], SUM(case [victor] when 2 then 1 else 0 end) as [white] FROM Game").fetchone()

# SUM() yields NULL (None) when the Game table is empty; treat that as zero wins
# so the arithmetic below does not fail with a TypeError.
black_victories = black_victories or 0
white_victories = white_victories or 0

print("Black won {0} games; white won {1}.".format(black_victories, white_victories))
print()

if (black_victories + white_victories) == 0:
    print("No decided games found; nothing to test.")
else:
    # Null hypothesis: a decided game is won by black with probability one half.
    test_fairness((black_victories + white_victories), 0.5, black_victories, alpha)

    # Pie chart: share of decided games won by each colour.
    pyplot.figure()
    pyplot.pie([black_victories, white_victories],
               labels = ['black', 'white'], colors = ['lightcoral', 'lightskyblue'], explode = (0.01, 0),
               autopct = '%1.4f%%', shadow=False, startangle=70)
    pyplot.axis('equal')
    pyplot.show()

That's interesting: white appears to be favoured and the margin appears to be statistically significant. Taken by itself, this result could be enough to conclude that *Leela Zero* prefers white, although it gives no indication of *why* this might be the case. The model used and the data-selection criteria are extremely naïve, however -- we need to go deeper!

Let's restrict the query to the thirty most recent, completed matches -- about 30% of the available data -- and also repeat the hypothesis test for each match, independently.

In [None]:
# Number of most recent completed matches to analyse.
history = 30

# Per-match series (in query order, most recent match first) and running totals.
match_count = 0
match_black_victories = []
match_white_victories = []
match_dates = []
match_rejects = []
total_black_victories = 0
total_white_victories = 0
black_rejects = 0
white_rejects = 0

# sqlite3's own context manager only scopes a transaction and leaves the
# connection open, so closing() is used to release it when the block ends.
with closing(sqlite3.connect(db_path)) as sql:
    cursor = sql.execute("SELECT Game.match_id, Match.start_date, SUM(case [victor] when 1 then 1 else 0 end) as [black], SUM(case [victor] when 2 then 1 else 0 end) as [white]"
                         " FROM Game"
                         " JOIN Match ON Game.match_id = Match.id"
                         " WHERE Match.result is not null"
                         " GROUP BY Game.match_id, Match.start_date"
                         " ORDER BY Match.start_date DESC"
                         " LIMIT ?", [history])

    for (match_id, start_date, black_victories, white_victories) in cursor:
        total_black_victories += black_victories
        total_white_victories += white_victories

        # Repeat the fairness test for this match alone and record which colour
        # any rejection favoured.
        reject = test_fairness((black_victories + white_victories), 0.5, black_victories, alpha, silent=True)
        if reject and (black_victories > white_victories):
            black_rejects += 1
        elif reject and (white_victories > black_victories):
            white_rejects += 1

        match_count += 1
        match_black_victories.append(black_victories)
        match_white_victories.append(white_victories)
        match_dates.append(start_date)
        # Height at which to draw a marker above the taller bar of a rejected
        # match; None leaves the match unmarked.
        match_rejects.append((max(black_victories, white_victories) + 10) if reject else None)

print("Black won {0} games; white won {1}.".format(total_black_victories, total_white_victories))
print()

if match_count == 0:
    # Nothing was returned by the query, so there is nothing to test or plot.
    print("No completed matches found; nothing to test.")
else:
    # Pooled test over all selected matches.
    test_fairness((total_black_victories + total_white_victories), 0.5, total_black_victories, alpha)

    pyplot.figure()
    pyplot.pie([total_black_victories, total_white_victories],
               labels = ['black', 'white'], colors = ['lightcoral', 'lightskyblue'], explode = (0.01, 0),
               autopct = '%1.4f%%', shadow=False, startangle=70)
    pyplot.axis('equal')
    pyplot.show()

    match_index = np.arange(match_count)

    # Grouped bars per match, with a red cross over each match whose own test
    # rejected the null hypothesis.
    pyplot.figure(figsize=(12, 6))
    pyplot.bar(match_index, match_black_victories, 0.4, color='lightcoral', label='black')
    pyplot.bar(match_index + 0.4, match_white_victories, 0.4, color='lightskyblue', label='white')
    pyplot.scatter(match_index + 0.2, match_rejects, marker='x', color='red')

    pyplot.suptitle('Black-White Win-Rates')
    # Report the number of matches actually plotted, which can be smaller than
    # `history` when fewer completed matches exist.
    pyplot.title('(last {0} matches)'.format(match_count))
    pyplot.ylabel('victories')
    pyplot.xticks(match_index + 0.2, match_dates, rotation=90)
    pyplot.legend(bbox_to_anchor=(1.11, 1.0), loc='upper right', borderaxespad=0)
    pyplot.show()

    print()
    print("Data from {0} of {1} recent matches proved sufficient to reject the null hypothesis at α = {2}.".format(sum(r is not None for r in match_rejects), len(match_rejects), alpha))
    print("{0} of these matches favoured black; {1} of these favoured white.".format(black_rejects, white_rejects))

## Fetch Web Data

Scrape http://zero.sjeng.org/ for data from Leela Zero's training matches and store that data in an SQLite database that can be queried offline.

In [None]:
# Populate the database file at db_path from the web.
# NOTE(review): `web` is not defined in this notebook; presumably it is provided
# by `from src import *` — confirm there whether this call replaces or updates an existing file.
web.fetch_database(db_path)