In [66]:
import collections
import itertools
import json
import re

import pandas as pd
import numpy as np

In [74]:
def analyze(base, section, step=2600,
            logdir='../logdirs/20190201-hs-allmatches-anysplit',
            data_dir='../data/hearthstone-idioms-20190201/all-matches-trees-anysplit'):
    """Summarize, per template, how often it was valid and how highly it ranked.

    Reads ``{logdir}/{base}/debug-{section}-step{step}.jsonl`` (one JSON object
    per line) and ``{data_dir}/{base}/templates.json``.

    For every history entry whose choices are strings, each valid choice named
    ``Template<digits>...`` contributes one match for that template id, plus
    two ranks:

    * ``'all'``: the position of the choice within ``entry['choices']``.
    * ``'templates only'``: the number of NON-template choices that precede it
      in ``entry['choices']``, i.e. its rank when all other templates are
      ignored. (The key name is kept because callers index columns by it.)

    Args:
        base: Name of the experiment sub-directory, used in both paths.
        section: Section name embedded in the debug file name
            (the notebook passes 'train' and 'val').
        step: Step number embedded in the debug file name.
        logdir: Directory that contains the per-experiment log directories.
        data_dir: Directory that contains the per-experiment template files.

    Returns:
        A ``(templates, df)`` tuple. ``templates`` is the parsed
        ``templates.json``. ``df`` is a DataFrame indexed by template id with
        columns 'Head', 'Matches' and 'Top-{k} rank, {type}' for k in (1, 2, 3)
        and type in ('all', 'templates only'). Templates that never appear
        among the valid choices have NaN in every numeric column.
    """
    report_path = '{}/{}/debug-{}-step{}.jsonl'.format(logdir, base, section, step)
    templates_path = '{}/{}/templates.json'.format(data_dir, base)

    # Context managers so the file handles are closed deterministically.
    with open(report_path) as report_file:
        report = [json.loads(line) for line in report_file]
    with open(templates_path) as templates_file:
        templates = json.load(templates_file)

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    template_re = re.compile(r'Template(\d+).*')

    # Count how often each template appears among the valid choices.
    template_match_counts = collections.defaultdict(int)
    template_choice_ranks = {
        'all': collections.defaultdict(list),
        'templates only': collections.defaultdict(list),
    }
    for item in report:
        for entry in item['history']:
            choices = entry['choices']
            # Skip entries with no choices, or whose choices are not strings.
            if not choices or not isinstance(choices[0], str):
                continue

            # NOTE(review): ranks assume `choices` is ordered best-first — confirm
            # against the code that writes the debug log.
            all_ranks = {}
            template_only_ranks = {}

            non_templates_seen = 0
            for i, choice in enumerate(choices):
                all_ranks[choice] = i
                template_only_ranks[choice] = non_templates_seen
                # Only non-template choices advance this counter, so other
                # templates never push a template's rank down.
                if not template_re.match(choice):
                    non_templates_seen += 1

            for choice in entry['valid_choices']:
                m = template_re.match(choice)
                if not m:
                    continue
                template_id = int(m.group(1))
                template_match_counts[template_id] += 1

                # Rank among all choices.
                template_choice_ranks['all'][template_id].append(all_ranks[choice])
                # Rank when other templates are excluded.
                template_choice_ranks['templates only'][template_id].append(
                    template_only_ranks[choice])

    ks = (1, 2, 3)
    # Fraction of a template's matches whose rank was strictly below k.
    top_k_freq = {
        type_name: {
            k: {i: np.sum(np.array(ranks) < k) / len(ranks)
                for i, ranks in ranks_of_type.items()}
            for k in ks
        } for type_name, ranks_of_type in template_choice_ranks.items()}

    return templates, pd.DataFrame({
        'Head': {t['id']: t['idiom'][0] for t in templates},
        'Matches': template_match_counts,
        **{
            'Top-{} rank, {}'.format(k, type_name): top_k_freq[type_name][k]
            for type_name in top_k_freq.keys()
            for k in ks
        }
    })

In [71]:
# Sweep every (filter, selection strategy, template count) configuration on the
# 'train' debug logs. For each one, report how many templates had at least one
# match and how many had a non-zero top-1 frequency in the 'templates only' ranking.
for filt, st, nt in itertools.product(
        ('none', 'contains-hole'),
        ('cov-xent', 'cov-examples'),
        ('10', '20', '40', '80')):
    name = 'filt-{}_st-{}_nt-{}'.format(filt, st, nt)
    _, df = analyze(name, 'train')
    # One print call; the trailing '' yields the blank separator line.
    print(
        name,
        'Templates containing matches:        {}'.format(sum(df['Matches'] > 0)),
        'Templates with non-zero rank 1 freq: {}'.format(sum(df['Top-1 rank, templates only'] > 0)),
        '',
        sep='\n')

filt-none_st-cov-xent_nt-10
Templates containing matches:        8
Templates with non-zero rank 1 freq: 1

filt-none_st-cov-xent_nt-20
Templates containing matches:        17
Templates with non-zero rank 1 freq: 0

filt-none_st-cov-xent_nt-40
Templates containing matches:        34
Templates with non-zero rank 1 freq: 3

filt-none_st-cov-xent_nt-80
Templates containing matches:        71
Templates with non-zero rank 1 freq: 2

filt-none_st-cov-examples_nt-10
Templates containing matches:        8
Templates with non-zero rank 1 freq: 0

filt-none_st-cov-examples_nt-20
Templates containing matches:        17
Templates with non-zero rank 1 freq: 2

filt-none_st-cov-examples_nt-40
Templates containing matches:        30
Templates with non-zero rank 1 freq: 5

filt-none_st-cov-examples_nt-80
Templates containing matches:        66
Templates with non-zero rank 1 freq: 3

filt-contains-hole_st-cov-xent_nt-10
Templates containing matches:        9
Templates with non-zero rank 1 freq: 3

filt-c

In [80]:
# Same sweep over every (filter, selection strategy, template count)
# configuration, this time reading the 'val' debug logs.
for filt, st, nt in itertools.product(
        ('none', 'contains-hole'),
        ('cov-xent', 'cov-examples'),
        ('10', '20', '40', '80')):
    name = 'filt-{}_st-{}_nt-{}'.format(filt, st, nt)
    _, df = analyze(name, 'val')
    # One print call; the trailing '' yields the blank separator line.
    print(
        name,
        'Templates containing matches:        {}'.format(sum(df['Matches'] > 0)),
        'Templates with non-zero rank 1 freq: {}'.format(sum(df['Top-1 rank, templates only'] > 0)),
        '',
        sep='\n')

filt-none_st-cov-xent_nt-10
Templates containing matches:        8
Templates with non-zero rank 1 freq: 1

filt-none_st-cov-xent_nt-20
Templates containing matches:        17
Templates with non-zero rank 1 freq: 0

filt-none_st-cov-xent_nt-40
Templates containing matches:        34
Templates with non-zero rank 1 freq: 2

filt-none_st-cov-xent_nt-80
Templates containing matches:        71
Templates with non-zero rank 1 freq: 2

filt-none_st-cov-examples_nt-10
Templates containing matches:        8
Templates with non-zero rank 1 freq: 0

filt-none_st-cov-examples_nt-20
Templates containing matches:        17
Templates with non-zero rank 1 freq: 2

filt-none_st-cov-examples_nt-40
Templates containing matches:        30
Templates with non-zero rank 1 freq: 3

filt-none_st-cov-examples_nt-80
Templates containing matches:        66
Templates with non-zero rank 1 freq: 3

filt-contains-hole_st-cov-xent_nt-10
Templates containing matches:        9
Templates with non-zero rank 1 freq: 3

filt-c

In [76]:
# Per-template match/rank table for one configuration, computed from the
# 'train' debug log; the parsed templates are discarded here.
_, df = analyze('filt-none_st-cov-examples_nt-40', 'train')
df

Unnamed: 0,Head,Matches,"Top-1 rank, all","Top-1 rank, templates only","Top-2 rank, all","Top-2 rank, templates only","Top-3 rank, all","Top-3 rank, templates only"
0,ClassDef,345.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Module,532.0,0.0,0.0,1.0,1.0,1.0,1.0
2,ClassDef-body,,,,,,,
3,FunctionDef,131.0,0.0,0.0,0.0,0.0,0.0,0.0
4,FunctionDef,533.0,0.0,0.0,0.0,0.0,0.0,0.0
5,FunctionDef,154.0,0.0,0.0,0.0,0.0,0.0,0.0
6,FunctionDef,550.0,0.0,0.0,0.0,0.0,0.0,0.0
7,FunctionDef,712.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Expr,538.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Call,253.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
# Same configuration on the 'val' debug log; `templates` is kept this time so
# individual idioms can be inspected in a later cell.
templates, df = analyze('filt-none_st-cov-examples_nt-40', 'val')
df

Unnamed: 0,Head,Matches,"Top-1 rank, all","Top-1 rank, templates only","Top-2 rank, all","Top-2 rank, templates only","Top-3 rank, all","Top-3 rank, templates only"
0,ClassDef,41.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Module,66.0,0.0,0.0,1.0,1.0,1.0,1.0
2,ClassDef-body,,,,,,,
3,FunctionDef,12.0,0.0,0.0,0.0,0.0,0.0,0.0
4,FunctionDef,67.0,0.0,0.0,0.0,0.0,0.0,0.0
5,FunctionDef,18.0,0.0,0.0,0.0,0.0,0.0,0.0
6,FunctionDef,74.0,0.0,0.0,0.0,0.0,0.0,0.0
7,FunctionDef,92.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Expr,68.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Call,32.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
# Show the idiom tree of the template at list position 37 of `templates`
# (bound by an earlier analyze(...) cell).
# NOTE(review): this indexes by list position, not by the template's 'id' field —
# confirm the two coincide.
templates[37]['idiom']

['Expr',
 [],
 None,
 [['Call',
   [],
   None,
   [['Call-func',
     [],
     None,
     [['Attribute',
       [],
       None,
       [['Attribute-value', [], 0, []], ['Attribute-attr', [], 1, []]]]]],
    ['Call-args', [], 2, []],
    ['Call-keywords', [], 3, []]]]]]

In [81]:
def analyze2(base, section, step=1000,
             logdir='../logdirs/20190201-hs-allmatches-anysplit-multimean',
             data_dir='../data/hearthstone-idioms-20190201/all-matches-trees-anysplit'):
    """Summarize, per template, how often it was valid and how highly it ranked.

    Reads ``{logdir}/{base}/debug-{section}-step{step}.jsonl`` (one JSON object
    per line) and ``{data_dir}/{base}/templates.json``. The defaults point at
    the 'multimean' log directory and step 1000.

    For every history entry whose choices are strings, each valid choice named
    ``Template<digits>...`` contributes one match for that template id, plus
    two ranks:

    * ``'all'``: the position of the choice within ``entry['choices']``.
    * ``'templates only'``: the number of NON-template choices that precede it
      in ``entry['choices']``, i.e. its rank when all other templates are
      ignored. (The key name is kept because callers index columns by it.)

    Args:
        base: Name of the experiment sub-directory, used in both paths.
        section: Section name embedded in the debug file name
            (the notebook passes 'val').
        step: Step number embedded in the debug file name.
        logdir: Directory that contains the per-experiment log directories.
        data_dir: Directory that contains the per-experiment template files.

    Returns:
        A ``(templates, df)`` tuple. ``templates`` is the parsed
        ``templates.json``. ``df`` is a DataFrame indexed by template id with
        columns 'Head', 'Matches' and 'Top-{k} rank, {type}' for k in (1, 2, 3)
        and type in ('all', 'templates only'). Templates that never appear
        among the valid choices have NaN in every numeric column.
    """
    report_path = '{}/{}/debug-{}-step{}.jsonl'.format(logdir, base, section, step)
    templates_path = '{}/{}/templates.json'.format(data_dir, base)

    # Context managers so the file handles are closed deterministically.
    with open(report_path) as report_file:
        report = [json.loads(line) for line in report_file]
    with open(templates_path) as templates_file:
        templates = json.load(templates_file)

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    template_re = re.compile(r'Template(\d+).*')

    # Count how often each template appears among the valid choices.
    template_match_counts = collections.defaultdict(int)
    template_choice_ranks = {
        'all': collections.defaultdict(list),
        'templates only': collections.defaultdict(list),
    }
    for item in report:
        for entry in item['history']:
            choices = entry['choices']
            # Skip entries with no choices, or whose choices are not strings.
            if not choices or not isinstance(choices[0], str):
                continue

            # NOTE(review): ranks assume `choices` is ordered best-first — confirm
            # against the code that writes the debug log.
            all_ranks = {}
            template_only_ranks = {}

            non_templates_seen = 0
            for i, choice in enumerate(choices):
                all_ranks[choice] = i
                template_only_ranks[choice] = non_templates_seen
                # Only non-template choices advance this counter, so other
                # templates never push a template's rank down.
                if not template_re.match(choice):
                    non_templates_seen += 1

            for choice in entry['valid_choices']:
                m = template_re.match(choice)
                if not m:
                    continue
                template_id = int(m.group(1))
                template_match_counts[template_id] += 1

                # Rank among all choices.
                template_choice_ranks['all'][template_id].append(all_ranks[choice])
                # Rank when other templates are excluded.
                template_choice_ranks['templates only'][template_id].append(
                    template_only_ranks[choice])

    ks = (1, 2, 3)
    # Fraction of a template's matches whose rank was strictly below k.
    top_k_freq = {
        type_name: {
            k: {i: np.sum(np.array(ranks) < k) / len(ranks)
                for i, ranks in ranks_of_type.items()}
            for k in ks
        } for type_name, ranks_of_type in template_choice_ranks.items()}

    return templates, pd.DataFrame({
        'Head': {t['id']: t['idiom'][0] for t in templates},
        'Matches': template_match_counts,
        **{
            'Top-{} rank, {}'.format(k, type_name): top_k_freq[type_name][k]
            for type_name in top_k_freq.keys()
            for k in ks
        }
    })

In [83]:
# Sweep every (filter, selection strategy, template count) configuration on the
# 'val' debug logs read by analyze2, printing the same two summary counts.
for filt, st, nt in itertools.product(
        ('none', 'contains-hole'),
        ('cov-xent', 'cov-examples'),
        ('10', '20', '40', '80')):
    name = 'filt-{}_st-{}_nt-{}'.format(filt, st, nt)
    _, df = analyze2(name, 'val')
    # One print call; the trailing '' yields the blank separator line.
    print(
        name,
        'Templates containing matches:        {}'.format(sum(df['Matches'] > 0)),
        'Templates with non-zero rank 1 freq: {}'.format(sum(df['Top-1 rank, templates only'] > 0)),
        '',
        sep='\n')

filt-none_st-cov-xent_nt-10
Templates containing matches:        8
Templates with non-zero rank 1 freq: 5

filt-none_st-cov-xent_nt-20
Templates containing matches:        17
Templates with non-zero rank 1 freq: 8

filt-none_st-cov-xent_nt-40
Templates containing matches:        34
Templates with non-zero rank 1 freq: 22

filt-none_st-cov-xent_nt-80
Templates containing matches:        71
Templates with non-zero rank 1 freq: 33

filt-none_st-cov-examples_nt-10
Templates containing matches:        8
Templates with non-zero rank 1 freq: 3

filt-none_st-cov-examples_nt-20
Templates containing matches:        17
Templates with non-zero rank 1 freq: 13

filt-none_st-cov-examples_nt-40
Templates containing matches:        30
Templates with non-zero rank 1 freq: 24

filt-none_st-cov-examples_nt-80
Templates containing matches:        66
Templates with non-zero rank 1 freq: 36

filt-contains-hole_st-cov-xent_nt-10
Templates containing matches:        9
Templates with non-zero rank 1 freq: 9

f

In [86]:
# Per-template match/rank table for one 'contains-hole' configuration, computed
# by analyze2 from its 'val' debug log; the parsed templates are discarded here.
_, df = analyze2('filt-contains-hole_st-cov-examples_nt-40', 'val')
df

Unnamed: 0,Head,Matches,"Top-1 rank, all","Top-1 rank, templates only","Top-2 rank, all","Top-2 rank, templates only","Top-3 rank, all","Top-3 rank, templates only"
0,Module,11.0,0.181818,0.363636,0.363636,1.0,1.0,1.0
1,ClassDef,41.0,1.0,1.0,1.0,1.0,1.0,1.0
2,Module,66.0,0.969697,1.0,1.0,1.0,1.0,1.0
3,ClassDef-body,,,,,,,
4,FunctionDef,12.0,0.0,0.0,0.0,1.0,0.0,1.0
5,FunctionDef,67.0,0.955224,0.955224,0.955224,1.0,1.0,1.0
6,FunctionDef,18.0,0.611111,0.722222,0.888889,1.0,1.0,1.0
7,FunctionDef,74.0,0.0,0.013514,0.0,1.0,0.121622,1.0
8,FunctionDef,92.0,0.021739,0.293478,0.423913,1.0,0.978261,1.0
9,Expr,68.0,0.764706,0.911765,0.926471,1.0,0.941176,1.0
