In [73]:
from numpy.ma.core import masked_less
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt

from diveslowlearnfast.train import StatsDB
from diveslowlearnfast.train.stats import get_value, get_tuple, get_column, get_dict, get_df

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
stats = StatsDB('stats.db')
run22_id, run30_id = stats.execute_query('SELECT DISTINCT(run_id) FROM stats', extractor=get_tuple)

run22_id, run30_id

('/home/s2871513/Projects/diveslowlearnfast/results/run22',
 '/home/s2871513/Projects/diveslowlearnfast/results/run30')

In [18]:
stats.execute_query("""SELECT
    COUNT(DISTINCT(video_id)) as video_counts,
    run_id
FROM stats
WHERE split = "train"
GROUP BY run_id""", extractor=get_df)

Unnamed: 0,video_counts,run_id
0,15027,/home/s2871513/Projects/diveslowlearnfast/resu...
1,15027,/home/s2871513/Projects/diveslowlearnfast/resu...


In [99]:
start_epoch = stats.execute_query('SELECT MIN(epoch) FROM stats WHERE run_id = ?', run30_id, extractor=get_value)
start_epoch

133

In [98]:
max_epoch = stats.execute_query('SELECT MAX(epoch) FROM stats WHERE run_id = ?', run30_id, extractor=get_value)
max_epoch

160

In [40]:
result_df = stats.get_below_median_samples(
    start_epoch - 10,
    run22_id,
    'train',
    extractor=get_df
)

result_df.head(3)

Unnamed: 0,video_id,gt,acc,median
0,_lmT4WlK7G0_00012,36,0.1,1.0
1,VNvb5oLOpLg_00296,37,0.2,1.0
2,k1F4LHeYhBs_00207,7,0.3,1.0


In [81]:
masked_video_ids = result_df['video_id']

In [129]:
def accuracy_per_video(run_id, epoch, video_ids, not_in=False):
    if not_in:
        not_in = 'NOT'
    else:
        not_in = ''
    return stats.execute_query(f"""SELECT video_id, (correct_n / n) as acc FROM(
            SELECT
                video_id,
                gt,
                epoch,
                split,
                CAST(SUM(CASE WHEN pred = gt THEN 1 ELSE 0 END) as REAL) as correct_n,
                CAST(COUNT(*) as REAL) as n
            FROM stats
            WHERE epoch > ?
            AND video_id {not_in} IN ({','.join(['?'] * len(video_ids))})
            AND run_id = ?
            GROUP BY video_id, gt
        ) ORDER BY acc
        """,
    epoch,
    *video_ids,
    run_id,
    extractor=get_df)

before_egl_df = accuracy_per_video(run22_id, start_epoch - 10, masked_video_ids)
after_egl_df = accuracy_per_video(run30_id, max_epoch - 10, masked_video_ids)

In [130]:
before_egl_df.head(3)

Unnamed: 0,video_id,acc
0,_lmT4WlK7G0_00012,0.1
1,VNvb5oLOpLg_00296,0.2
2,k1F4LHeYhBs_00207,0.3


In [102]:
combined_df = before_egl_df.merge(after_egl_df, on='video_id', how='inner')
combined_df.rename(columns={'acc_x': 'acc_before'}, inplace=True)
combined_df.rename(columns={'acc_y': 'acc_after'}, inplace=True)
combined_df.head(7)

Unnamed: 0,video_id,acc_before,acc_after
0,_lmT4WlK7G0_00012,0.1,0.4
1,VNvb5oLOpLg_00296,0.2,0.4
2,k1F4LHeYhBs_00207,0.3,0.333333
3,vlfy4cny75s_00070,0.3,0.6
4,zYHstCxnAPA_00328,0.3,0.3
5,D8YKHC5hmUs_00034,0.333333,1.0
6,LkluZoNfKu8_00103,0.4,0.3


In [103]:
combined_df['change'] = combined_df['acc_after'] - combined_df['acc_before']
combined_df

Unnamed: 0,video_id,acc_before,acc_after,change
0,_lmT4WlK7G0_00012,0.1,0.400000,0.300000
1,VNvb5oLOpLg_00296,0.2,0.400000,0.200000
2,k1F4LHeYhBs_00207,0.3,0.333333,0.033333
3,vlfy4cny75s_00070,0.3,0.600000,0.300000
4,zYHstCxnAPA_00328,0.3,0.300000,0.000000
...,...,...,...,...
7043,zbAC7t15q3k_00139,0.9,0.900000,0.000000
7044,zbAC7t15q3k_00142,0.9,1.000000,0.100000
7045,zbAC7t15q3k_00144,0.9,0.888889,-0.011111
7046,zbAC7t15q3k_00147,0.9,0.800000,-0.100000


In [104]:
combined_df['change'].mean() * 100

5.334373817631479

## Non masked examples
In this section we'll analyse the effect of EGL on non-masked videos

In [105]:
# `not_in=True` negates the video_ids, i.e. we now query for video_ids for which NO mask exists.
before_egl_df = accuracy_per_video(run22_id, start_epoch - 10, masked_video_ids, not_in=True)
after_egl_df = accuracy_per_video(run30_id, max_epoch - 10, masked_video_ids, not_in=True)
len(before_egl_df), len(after_egl_df)

(7979, 7979)

In [106]:
before_egl_df.head(3)

Unnamed: 0,video_id,acc
0,-mmq0PT-u8k_00006,1.0
1,-mmq0PT-u8k_00011,1.0
2,-mmq0PT-u8k_00012,1.0


In [107]:
after_egl_df.head(3)

Unnamed: 0,video_id,acc
0,D8YKHC5hmUs_00138,0.3
1,zbAC7t15q3k_00057,0.3
2,-mmq0PT-u8k_00115,0.5


In [108]:
combined_df = before_egl_df.merge(after_egl_df, on='video_id', how='inner')
combined_df.rename(columns={'acc_x': 'acc_before'}, inplace=True)
combined_df.rename(columns={'acc_y': 'acc_after'}, inplace=True)
combined_df.head(7)

Unnamed: 0,video_id,acc_before,acc_after
0,-mmq0PT-u8k_00006,1.0,1.0
1,-mmq0PT-u8k_00011,1.0,0.9
2,-mmq0PT-u8k_00012,1.0,1.0
3,-mmq0PT-u8k_00013,1.0,0.9
4,-mmq0PT-u8k_00014,1.0,0.888889
5,-mmq0PT-u8k_00015,1.0,0.9
6,-mmq0PT-u8k_00019,1.0,1.0


In [109]:
combined_df['change'] = combined_df['acc_after'] - combined_df['acc_before']
combined_df

Unnamed: 0,video_id,acc_before,acc_after,change
0,-mmq0PT-u8k_00006,1.0,1.000000,0.000000
1,-mmq0PT-u8k_00011,1.0,0.900000,-0.100000
2,-mmq0PT-u8k_00012,1.0,1.000000,0.000000
3,-mmq0PT-u8k_00013,1.0,0.900000,-0.100000
4,-mmq0PT-u8k_00014,1.0,0.888889,-0.111111
...,...,...,...,...
7974,zbAC7t15q3k_00145,1.0,0.900000,-0.100000
7975,zbAC7t15q3k_00148,1.0,0.888889,-0.111111
7976,zbAC7t15q3k_00150,1.0,1.000000,0.000000
7977,zbAC7t15q3k_00151,1.0,0.888889,-0.111111


In [110]:
combined_df['change'].mean() * 100

-5.341242985057999

Observing the before and after results for the __training__ set we can conclude that the EGL method had, on average, a positive effect on the videos individual performance.

## Analyses of the test results
In this section we'll get all the test video_ids and determine whether their performance improved by using the RRRLoss.

In [96]:
test_video_ids = stats.execute_query("""SELECT DISTINCT(video_id)
FROM stats
WHERE split = 'test'""", extractor=get_column(0))
len(test_video_ids)

1970

In [135]:
before_egl_df = accuracy_per_video(run22_id, start_epoch - 10, test_video_ids, not_in=True)
after_egl_df = accuracy_per_video(run30_id, max_epoch - 10, test_video_ids, not_in=True)

In [136]:
after_egl_df.head(3)

Unnamed: 0,video_id,acc
0,mGbuP7nT2ck_00014,0.222222
1,zYHstCxnAPA_00159,0.222222
2,9jZYYtzYqwE_00070,0.3


In [137]:
combined_df = before_egl_df.merge(after_egl_df, on='video_id', how='inner')
combined_df.rename(columns={'acc_x': 'acc_before'}, inplace=True)
combined_df.rename(columns={'acc_y': 'acc_after'}, inplace=True)
combined_df.head(7)

Unnamed: 0,video_id,acc_before,acc_after
0,_lmT4WlK7G0_00012,0.1,0.4
1,VNvb5oLOpLg_00296,0.2,0.4
2,k1F4LHeYhBs_00207,0.3,0.333333
3,vlfy4cny75s_00070,0.3,0.6
4,zYHstCxnAPA_00328,0.3,0.3
5,D8YKHC5hmUs_00034,0.333333,1.0
6,LkluZoNfKu8_00103,0.4,0.3


In [138]:
combined_df['change'] = combined_df['acc_after'] - combined_df['acc_before']
combined_df

Unnamed: 0,video_id,acc_before,acc_after,change
0,_lmT4WlK7G0_00012,0.1,0.400000,0.300000
1,VNvb5oLOpLg_00296,0.2,0.400000,0.200000
2,k1F4LHeYhBs_00207,0.3,0.333333,0.033333
3,vlfy4cny75s_00070,0.3,0.600000,0.300000
4,zYHstCxnAPA_00328,0.3,0.300000,0.000000
...,...,...,...,...
15022,zbAC7t15q3k_00145,1.0,0.900000,-0.100000
15023,zbAC7t15q3k_00148,1.0,0.888889,-0.111111
15024,zbAC7t15q3k_00150,1.0,1.000000,0.000000
15025,zbAC7t15q3k_00151,1.0,0.888889,-0.111111


In [143]:
combined_df['change'].mean() * 100

-0.33413929001870746

In [152]:
stats.execute_query(
f"""SELECT epoch, (correct_n / n) as acc FROM(
            SELECT
                video_id,
                gt,
                epoch,
                CAST(SUM(CASE WHEN pred = gt THEN 1 ELSE 0 END) as REAL) as correct_n,
                CAST(COUNT(*) as REAL) as n
            FROM stats
            WHERE split = 'test'
            GROUP BY epoch
        ) ORDER BY epoch DESC
        """,
    extractor=get_df
).head(3)

Unnamed: 0,epoch,acc
0,150,0.810152
1,140,0.71269
2,130,0.817766


In [159]:
combined_df[:500]['change'].mean() * 100, combined_df[:100]['change'].mean() * 100

(16.82666666666667, 19.17777777777778)