In [1]:
import pandas as pd
import numpy as np

from util.constant import *
%load_ext autoreload
%autoreload 2

In [2]:
from util.scan_database import ExtractController, scan_dir

In [3]:
# Directory that the loading cell below walks with scan_dir() looking for .rar/.zip archives.
data_path = 'data'

In [4]:
import sqlite3
from contextlib import closing
from util.db_action import get_all_table_names, get_all_information_from_table_as_pd_dataframe
from collections import defaultdict

# tot_data[archive_index][table_name] -> table contents as returned by
# get_all_information_from_table_as_pd_dataframe.
tot_data = defaultdict(dict)
inc = 0

for d in scan_dir(data_path):
    # Only archives are processed; every archive gets its own index, even when
    # it turns out to contain no .db file.
    if d.endswith('.rar') or d.endswith('.zip'):
        with ExtractController(d) as extracted_dir:
            for t in scan_dir(extracted_dir):
                if t.endswith('.db'):
                    # Close the connection before leaving the extraction
                    # context: sqlite3 connections are not closed automatically
                    # and would otherwise keep a handle on the extracted file.
                    # NOTE(review): assumes the helper returns fully
                    # materialised data that stays valid after close — confirm.
                    with closing(sqlite3.connect(t)) as con:
                        table_names = get_all_table_names(con)
                        for n in table_names:
                            df = get_all_information_from_table_as_pd_dataframe(con, n)
                            tot_data[inc][n] = df
        inc += 1

In [5]:
def get_one_table(n):
    """Collect the table called ``n`` from every entry of ``tot_data``.

    Args:
        n: Table name used as the key into each per-data-set dict.

    Returns:
        dict mapping each key of ``tot_data`` to that entry's ``n`` table.

    Raises:
        KeyError: if any entry of ``tot_data`` has no table named ``n``.
    """
    tables_by_id = {}
    for source_id, tables in tot_data.items():
        tables_by_id[source_id] = tables[n]
    return tables_by_id

# CONTENT_INFO

In [6]:
def count_cols_in_one_table(table_name, cols):
    """Count how often each distinct value occurs in the given columns of one table.

    Args:
        table_name: Name of the table, looked up through ``get_one_table``.
        cols: Iterable of column names to tally.

    Returns:
        A list with one element per entry of ``cols`` (same order). Each
        element maps a data-set id to a ``{value: count}`` dict for that column.
        An empty column yields an empty dict.
    """
    contents = get_one_table(table_name)

    def count_times(frame, col):
        # One pass over the column instead of one scan per distinct value.
        # dropna=False so missing values are counted under a single key rather
        # than showing up as zero-count entries (``== None`` / ``== nan`` is
        # never true element-wise).
        counts = frame[col].value_counts(dropna=False)
        return {value: int(count) for value, count in counts.items()}

    return [{i: count_times(t, c) for i, t in contents.items()} for c in cols]

In [7]:
# Number of occurrences of each operation value in the CONTENT_INFO table, per data set
count_cols_in_one_table(CONTENT_INFO, ['operation'])

[{0: {'Delete': 98, 'Insert': 350, 'Replace': 69, 'Save': 62},
  1: {'Delete': 32, 'Insert': 205, 'Replace': 36, 'Save': 7},
  2: {'Delete': 178, 'Insert': 485, 'Replace': 158, 'Save': 27},
  3: {'Delete': 140, 'Insert': 428, 'Replace': 151, 'Save': 15}}]

In [25]:
# Time interval between two consecutive content operations
def get_delta_time(n, col_name='time'):
    """Return the gaps between consecutive rows' timestamps.

    Args:
        n: Table whose rows are visited in order via ``iterrows()``.
        col_name: Column holding the timestamp strings understood by
            ``util.utility.string_to_datetime``.

    Returns:
        A float64 ``pd.Series`` with one entry per adjacent pair of rows
        (``len(n) - 1`` entries; empty for tables with fewer than two rows).
        Values are whatever ``timedelta_milliseconds`` returns for the
        difference — presumably milliseconds, going by the helper's name.
    """
    from util.utility import string_to_datetime, timedelta_milliseconds

    # Parse every timestamp exactly once, then difference neighbouring pairs.
    stamps = [string_to_datetime(row[col_name]) for _, row in n.iterrows()]
    dts = [
        timedelta_milliseconds(current - previous)
        for previous, current in zip(stamps, stamps[1:])
    ]
    # Explicit dtype keeps empty results numeric, so sum()/describe() behave
    # the same as for non-empty tables.
    return pd.Series(dts, dtype='float64')

# For every loaded data set, print its id, the sum of all gaps between
# consecutive CONTENT_INFO rows, and the summary statistics of those gaps.
# Units are whatever get_delta_time yields (presumably milliseconds — confirm).
contents = get_one_table(CONTENT_INFO)
for i, ii in contents.items():
    m = get_delta_time(ii)
    print("id: {}".format(i))
    print("total time:{}".format(m.sum()))
    print(m.describe())

id: 0
total time:5900000.0
count    5.780000e+02
mean     1.020761e+04
std      6.808618e+04
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+03
75%      4.000000e+03
max      1.447000e+06
dtype: float64
id: 1
total time:1911000.0
count       279.000000
mean       6849.462366
std       32588.654811
min           0.000000
25%           0.000000
50%        1000.000000
75%        4000.000000
max      383000.000000
dtype: float64
id: 2
total time:7200000.0
count    8.470000e+02
mean     8.500590e+03
std      5.349172e+04
min      0.000000e+00
25%      0.000000e+00
50%      1.000000e+03
75%      4.000000e+03
max      1.020000e+06
dtype: float64
id: 3
total time:5239000.0
count       733.00000
mean       7147.33970
std       34757.12152
min           0.00000
25%           0.00000
50%        1000.00000
75%        3000.00000
max      625000.00000
dtype: float64


# COMMAND TEXT

In [8]:
# Number of occurrences of each action value in the COMMAND_TEXT table, per data set
count_cols_in_one_table(COMMAND_TEXT, ['action'])

[{0: {'Copy': 11, 'Paste': 7, 'Save': 62},
  1: {'Copy': 5, 'Cut': 345, 'Paste': 8, 'Save': 7},
  2: {'Copy': 2, 'Cut': 6, 'Paste': 14, 'Save': 23},
  3: {'Copy': 1, 'Cut': 2, 'Paste': 3, 'Save': 15}}]

# BREAK_POINT

In [9]:
# Number of breakpoints per data set, taken as the number of distinct `id` values in BREAK_POINT
{i: len(j)  for i, j in count_cols_in_one_table(BREAK_POINT, ['id'])[0].items()}

{0: 0, 1: 0, 2: 0, 3: 52}

# DEBUG_INFO

In [10]:
# Statistics on why the debugger paused: occurrences of each `type` value in DEBUG_INFO, per data set
count_cols_in_one_table(DEBUG_INFO,['type'])

[{0: {'break': 16, 'run': 16},
  1: {},
  2: {'break': 18, 'exception_not_handled': 2, 'run': 17},
  3: {'break': 86, 'exception_not_handled': 2, 'run': 72}}]

In [13]:
def cal_time_delta(n):
    """Split a debug log into sequences and compute per-row time deltas.

    Rows are visited in order. A row whose ``type`` is ``'run'`` starts a new
    sequence and gets a zero delta; every other row gets the time elapsed since
    the previous row and belongs to the sequence currently open.

    Rows that appear before the first ``'run'`` have no sequence of their own,
    so they are grouped under sequence id 0 (the first ``'run'`` then becomes
    sequence 1); the very first such row gets a zero delta because there is
    nothing to measure against. When the log starts with ``'run'`` the
    numbering starts at 0 for that run.

    Args:
        n: Table with a ``type`` column and a ``timestamp`` column whose values
            are understood by ``util.utility.string_to_datetime``.

    Returns:
        ``pd.DataFrame`` with columns ``seq_id`` and ``delta_time``, one row per
        input row. ``delta_time`` holds whatever ``timedelta_milliseconds``
        returns — presumably milliseconds, going by the helper's name.
    """
    from util.utility import string_to_datetime, timedelta_milliseconds
    import datetime

    zero_time = timedelta_milliseconds(datetime.timedelta(microseconds=0))
    ts_list = []
    n_list = []
    seq_id = 0
    seen_any = False
    pre_time = None
    for _, row in n.iterrows():
        # Parse once per row; the value is reused as the next row's reference.
        current = string_to_datetime(row['timestamp'])
        if row['type'] == 'run':
            # Every run except one that opens the log starts a new sequence.
            if seen_any:
                seq_id += 1
            ts_list.append(zero_time)
        elif not seen_any:
            # Log starts mid-session: no earlier timestamp to subtract from.
            ts_list.append(zero_time)
        else:
            ts_list.append(timedelta_milliseconds(current - pre_time))
        n_list.append(seq_id)
        pre_time = current
        seen_any = True
    return pd.DataFrame({'seq_id': n_list, 'delta_time': ts_list})

In [20]:
# Duration of a single debug session
# Length (number of recorded events) of a single debug session
debug_info = get_one_table(DEBUG_INFO)
for i, ii in debug_info.items():
    m = cal_time_delta(ii)
    print("id: {}".format(i))
    # Summary statistics of the total delta_time per seq_id (the label says
    # "mean", but the whole describe() output is printed).
    print("mean miliseconds: {}".format(m.groupby('seq_id')['delta_time'].sum().describe()))
    # Summary statistics of the number of rows recorded per seq_id.
    print("run count: {}".format(m.groupby('seq_id')['delta_time'].count().describe()))
    print("")

id: 0
mean miliseconds: count        16.000000
mean      32500.000000
std       30988.169786
min        1000.000000
25%       10000.000000
50%       18000.000000
75%       44750.000000
max      116000.000000
Name: delta_time, dtype: float64
run count: count    16.0
mean      2.0
std       0.0
min       2.0
25%       2.0
50%       2.0
75%       2.0
max       2.0
Name: delta_time, dtype: float64

id: 1
mean miliseconds: count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: delta_time, dtype: float64
run count: count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: delta_time, dtype: float64

id: 2
mean miliseconds: count       17.000000
mean      3529.411765
std       2267.091944
min       1000.000000
25%       2000.000000
50%       3000.000000
75%       5000.000000
max      10000.000000
Name: delta_time, dtype: float64
run count: count    17.000000
mean      2.176471
std       