# Mining Specifications from Execution Traces

* The content of this notebook corresponds to the analysis done in section 4.3 in the paper.
* As mentioned in the paper, it took 3 days to mine sequences of 27 projects.
* In this notebook we use 1 project as example, but it still takes more than 10 minutes to complete (Run carefully).

## Loading data

In [2]:
## We use this project as example
import json

with open("callgraph_seq/p20.json") as dseq:
    seq_calls = json.load(dseq)

## Extracting and Preprocessing callers/callees sequences

In [3]:
## Parsing the sequence to callers and the sequence of calles within each caller
## The format is something like:
    ## -- caller
    ##      -- callee
    ##      -- callee
    ##      -- callee ...
def split_by_callers(calls_sequence):
    counter = 1
    caller = calls_sequence[0].split("##")[-1]
    splitted_sequences = []
    s_temp = []
    s_temp.append(calls_sequence[0])
    i = 1
    while True:
        if i == len(calls_sequence):
            splitted_sequences.append(s_temp)
            break
        c = calls_sequence[i]
        if c.startswith("#START#"):
            if counter == 0 and c.split("##")[-1] != caller:
                splitted_sequences.append(s_temp)
                s_temp = [c]
                if i == len(calls_sequence):
                    splitted_sequences.append(s_temp)
                    break
                else:
                    caller = c
            else:
                s_temp.append(c)
            counter += 1
        elif c.startswith("#END#"):
            counter -= 1
            s_temp.append(c)
        i = i + 1
    return splitted_sequences

splitted_seqs = split_by_callers(seq_calls)

In [4]:
## recursively extract sequences callees within each caller
## 
final_list = []
seq_calls = [seq_calls]
while True:
    temp_seq = []
    for sc in seq_calls:
        temp_seq.extend(split_by_callers(sc))
    final_list.extend(temp_seq)
    seq_calls = []
    for s in temp_seq:
        if len(s) not in [0, 1, 2]:
            seq_calls.append(s[1:-1])

    if len(seq_calls)==0:
        break

simple_final_list = []
for l in final_list:
    simple_final_list.append([c.split("##")[0] for c in l])

In [5]:
## number of sequences
len(simple_final_list)

25849

In [16]:
## Filtering sequences of calls that were interrupted with an exception or sudden termination
filtered_list = []
for l in simple_final_list:
    try:
        new_l = []
        i = 0
        while l[0].startswith("#END#"):
            l = l[1:]
        c_old = l[0].split("#")[2]
        for c in l[1:]:
            if c.split("#")[2] == c_old:
                new_l.append(c_old)
            c_old = c.split("#")[2]
        filtered_list.append(new_l)
    except Exception as e:
        print(e)
filtered_list = [f  for f in filtered_list if len(f)!=0]

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out o

In [29]:
## Number of valid sequences
len(filtered_list)

18662

In [28]:
def get_calls_set(calls):
    parsed_list = []
    if calls[0] == "#START#":
        sub_calls = calls[1:-1]
        func_name = sub_calls[0]
        func_calls = []
        sub_calls = calls[1:]
        stop = False
        f_calls = []
        count_start = 0
        count_end = 0
        for s in sub_calls:
            if count_start == 1 and s!= "#END#":
                func_calls.append(s)
            if s == "#START#":
                count_start += 1
                f_calls.append(s)
            elif s == "#END#":
                count_end += 1
                f_calls.append(s)
            else:
                f_calls.append(s)
            if count_start == count_end and count_start != 0:
                parsed_list.extend(get_calls_set(f_calls))
                count_start = 0
                count_end = 0
                f_calls = []

        parsed_list.append((func_name, func_calls))

    return parsed_list

In [30]:
## Considering sequences with only 3 element at least
filtered_list = [f for f in filtered_list if len(f) >= 3]

## Finally, mining sequence patterns

In [31]:
## Here we use the package Seq2Pat
## for more information check: https://github.com/fidelity/seq2pat
from sequential.seq2pat import Seq2Pat, Attribute

## !!! IMPORTANT: WE LIMITED THE LIST TO THE FIRST 100 ITEMS FOR DEMO PURPOSE OTHERWISE IT WOULD TAKE A LONG TIME TO RUN THIS CODE
seq2pat = Seq2Pat(sequences=filtered_list[:100])


patterns = seq2pat.get_patterns(min_frequency=3)

In [32]:
## A view of the list of patterns
patterns

[['shutil.which', 'thefuck.utils.which', 28],
 ['str.replace', 're.compile', 26],
 ['str.replace', 'str.replace', 26],
 ['str.replace', 'str.replace', 're.compile', 26],
 ['str.replace', 'str.replace', 'str.replace', 26],
 ['str.replace', 'str.replace', 'str.replace', 'str.replace', 26],
 ['str.replace',
  'str.replace',
  'str.replace',
  'str.replace',
  'str.replace',
  26],
 ['argparse._ActionsContainer.add_argument',
  'argparse._ActionsContainer.add_argument',
  24],
 ['argparse._ActionsContainer.add_argument',
  'argparse._ActionsContainer.add_argument',
  'argparse._ActionsContainer.add_argument',
  24],
 ['argparse._ActionsContainer.add_mutually_exclusive_group',
  'argparse._ActionsContainer.add_argument',
  24],
 ['argparse._ActionsContainer.add_mutually_exclusive_group',
  'argparse._ActionsContainer.add_argument',
  'argparse._ActionsContainer.add_argument',
  24],
 ['argparse._ActionsContainer.add_mutually_exclusive_group',
  'argparse._ActionsContainer.add_argument',
  '