# APSEC2019 Results

Date: 2019-07-19

Description: An analysis of the semantic clone detection capabilities of `doLLy`, using solutions to Google Code Jam problems as the benchmark.

In [36]:
# Record the Python interpreter version this notebook was run with,
# so the results below can be tied to a concrete environment.
import sys
print(sys.version)

3.7.3 (default, Mar 27 2019, 09:23:15) 
[Clang 10.0.1 (clang-1001.0.46.3)]


In [1]:
# Import Core libraries
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import seaborn as sns

# Graphics
# Render figures inline, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Matplotlib defaults: figure size, font, and title/label spacing.
mpl.rcParams['figure.figsize'] = (8,6)
mpl.rcParams['font.family'] = 'SF Mono'
mpl.rcParams.update({'font.size': 12})
mpl.rcParams['axes.titlepad'] = 20 
mpl.rcParams['axes.titlesize'] = 12
mpl.rcParams['axes.labelpad'] = 10

# Seaborn settings
# NOTE(review): sns.set() installs seaborn's own rc defaults and may override
# some of the mpl.rcParams values assigned above (e.g. font size/family) —
# confirm the intended order of these two groups.
sns.set(font_scale = 1.25)
# 'ticks' style with a light dotted grid and the same monospace font.
sns.set_style('ticks', {'font.family' : 'SF Mono', 
                        'axes.grid' : True, 
                        'grid.linestyle': 'dotted',
                        'grid.color': '0.8'})
# NOTE(review): the value returned by sns.axes_style(...) is discarded here;
# it appears to be a getter / context manager rather than a global setter,
# so this line likely has no effect — confirm against the seaborn docs.
sns.axes_style({'axes.grid' : 'True'})
# Single-colour (black) palette for all plots.
sns.set_palette(sns.color_palette(['Black']))

In [38]:
import os
import glob
import itertools

# Build the ground-truth candidate set: every unordered pair of C files
# found under ../data, one row per pair.
# glob returns matches in an unspecified order, so the names are sorted to keep
# the pair orientation (which file is filename0 vs filename1) stable across
# runs and machines. Materialising the list also lets `files` be reused.
files = sorted(map(os.path.basename, glob.glob('../data/*.c')))
pairs = pd.DataFrame(list(itertools.combinations(files, 2)), columns=['filename0', 'filename1'])
def is_clone(row):
    """Return True when both files of a pair share the same name prefix.

    The prefix is the part of each file name before the first underscore
    (presumably the problem-set identifier — confirm against the data layout).

    Parameters
    ----------
    row : mapping
        Must provide string values under 'filename0' and 'filename1'.

    Returns
    -------
    bool
        True if the two prefixes are equal, False otherwise.
    """
    prefixes = [row[column].split('_')[0] for column in ('filename0', 'filename1')]
    return prefixes[0] == prefixes[1]
# Ground-truth label: evaluate is_clone once per candidate pair (row-wise).
pairs = pairs.assign(clone=pairs.apply(is_clone, axis=1))


# parse the doLLy csv files
def parse_doLLy_csv(pairs, threshold):
    """Flag the candidate pairs that doLLy reported at one threshold.

    Reads ``doLLy/gcj-doLLy-{threshold}.csv`` and left-merges it onto
    ``pairs``, adding a boolean column ``doLLy-{threshold}`` that is True for
    every pair listed in the CSV (in either file order) and False otherwise.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Candidate pairs with ``filename0`` and ``filename1`` columns.
    threshold : str
        Threshold label exactly as it appears in the CSV file name, e.g. '0.30'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``pairs`` plus the ``doLLy-{threshold}``
        column. Only that new column is filled; NaN values in other columns
        are left untouched.
    """
    column = f'doLLy-{threshold}'
    keys = ['filename0', 'filename1']

    reported = pd.read_csv(f'doLLy/gcj-doLLy-{threshold}.csv')
    reported = reported.rename(columns={'filename': 'filename0', 'filename.1': 'filename1'})[keys]

    # Mirror every reported pair so it matches whichever orientation `pairs` uses.
    mirrored = reported.rename(columns={'filename0': 'filename1', 'filename1': 'filename0'})[keys]
    # pd.concat instead of DataFrame.append (removed in pandas 2.0). Duplicates are
    # dropped AFTER mirroring: a CSV that lists both (a, b) and (b, a) would
    # otherwise yield repeated keys and multiply rows of `pairs` in the merge.
    both = pd.concat([reported, mirrored], ignore_index=True).drop_duplicates()
    both[column] = True

    # 'm:1' asserts the lookup table is unique per key, so no left row is duplicated.
    merged = pd.merge(pairs, both, how='left', on=keys, validate='m:1')
    # Unmatched pairs come back as NaN; eq(True) maps them to False and yields
    # a real boolean column.
    merged[column] = merged[column].eq(True)
    return merged
# Add one doLLy-<threshold> column per evaluated threshold.
for doLLy_threshold in ('0.30', '0.40', '0.50', '0.60'):
    pairs = parse_doLLy_csv(pairs, doLLy_threshold)


# parse NiCad clone pairs
import xml.etree.ElementTree as ET

def parse_NiCad_xml(pairs, threshold):
    """Flag the candidate pairs that NiCad reported at one threshold.

    Parses ``NiCad/gcj_functions-blind-clones-{threshold}.xml``, collects the
    file names of the sources under each top-level ``<clone>`` element, and
    left-merges them onto ``pairs`` as a boolean column ``NiCad-{threshold}``
    (True when the pair was reported in either file order, False otherwise).

    Parameters
    ----------
    pairs : pandas.DataFrame
        Candidate pairs with ``filename0`` and ``filename1`` columns.
    threshold : str
        Threshold label exactly as it appears in the XML file name, e.g. '0.30'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``pairs`` plus the ``NiCad-{threshold}``
        column. Only that new column is filled; NaN values in other columns
        are left untouched.
    """
    column = f'NiCad-{threshold}'
    keys = ['filename0', 'filename1']
    root = ET.parse(f'NiCad/gcj_functions-blind-clones-{threshold}.xml').getroot()

    reported = []
    for clone in root:
        if clone.tag != 'clone':
            continue
        # Keep each source's base name minus its last 8 characters
        # (presumably a NiCad-added suffix such as '.ifdefed' — TODO confirm).
        reported.append([os.path.basename(source.get('file'))[:-8] for source in clone])

    forward = pd.DataFrame(reported, columns=keys)
    # Mirror every reported pair so it matches whichever orientation `pairs` uses.
    backward = forward.rename(columns={'filename0': 'filename1', 'filename1': 'filename0'})[keys]
    # pd.concat instead of DataFrame.append (removed in pandas 2.0). Duplicates are
    # dropped AFTER mirroring: the same file pair reported in both orientations
    # would otherwise yield repeated keys and multiply rows of `pairs` in the merge.
    both = pd.concat([forward, backward], ignore_index=True).drop_duplicates()
    both[column] = True

    # 'm:1' asserts the lookup table is unique per key, so no left row is duplicated.
    merged = pd.merge(pairs, both, how='left', on=keys, validate='m:1')
    # Unmatched pairs come back as NaN; eq(True) maps them to False and yields
    # a real boolean column.
    merged[column] = merged[column].eq(True)
    return merged
# Add one NiCad-<threshold> column per evaluated threshold.
for NiCad_threshold in ('0.30', '0.40', '0.50'):
    pairs = parse_NiCad_xml(pairs, NiCad_threshold)


# parse CCCD clone pairs
import re

# Load the CCCD comparison report; the code below reads its 'Files' and
# 'LevenDistance' columns.
CCCD_pairs = pd.read_csv('CCCD/gcj_comparisionReport.csv')
def parse_CCCD_files(files):
    """Extract the two C file names from one CCCD 'Files' entry.

    The entry is split at every 'gcj.' that is directly followed by a word
    character; exactly three pieces are expected (anything else raises
    ValueError on unpacking). The first piece is discarded, and from each of
    the other two only the text before the first '..' is kept, with '.c'
    appended.

    Parameters
    ----------
    files : str
        Raw value of the report's 'Files' column.

    Returns
    -------
    pandas.Series
        Two strings indexed by 'filename0' and 'filename1'.
    """
    pieces = re.split(r'gcj\.(?=[\w])', files)
    _, first, second = pieces
    names = [piece.split('..')[0] + '.c' for piece in (first, second)]
    return pd.Series(names, index=['filename0', 'filename1'])
    

# Replace the raw 'Files' field with explicit filename0 / filename1 columns.
CCCD_pairs = CCCD_pairs.join(CCCD_pairs['Files'].apply(parse_CCCD_files))
CCCD_pairs = CCCD_pairs.drop('Files', axis=1)

CCCD_keys = ['filename0', 'filename1']
# Collapse repeated reports of the same ordered pair to their column-wise minimum.
CCCD_pairs = CCCD_pairs.groupby(CCCD_keys).min().reset_index()
# Mirror every pair so it matches whichever orientation `pairs` uses.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
CCCD_mirrored = CCCD_pairs.rename(columns={"filename0": "filename1", "filename1": "filename0"})
CCCD_pairs = pd.concat([CCCD_pairs, CCCD_mirrored], sort=False, ignore_index=True)
# If both orientations were reported with different distances, keep the smaller
# one. This makes the keys unique, so the merge below cannot duplicate rows and
# the outcome no longer depends on row order.
CCCD_pairs = CCCD_pairs.sort_values('LevenDistance').drop_duplicates(subset=CCCD_keys)
# Threshold labels 0.30/0.40/0.50 are compared against 30/40/50 here
# (presumably LevenDistance is on a 0-100 scale — TODO confirm).
CCCD_pairs['CCCD-0.30'] = CCCD_pairs['LevenDistance'] <= 30.0
CCCD_pairs['CCCD-0.40'] = CCCD_pairs['LevenDistance'] <= 40.0
CCCD_pairs['CCCD-0.50'] = CCCD_pairs['LevenDistance'] <= 50.0
# Pairs absent from the CCCD report keep NaN in the CCCD-* columns after this
# left merge. The trailing drop_duplicates also removes any repeated candidate
# rows that were already present in `pairs`.
pairs = pd.merge(pairs, CCCD_pairs[['CCCD-0.30', 'CCCD-0.40', 'CCCD-0.50', 'filename0', 'filename1']], how='left', on=CCCD_keys, validate='m:1').drop_duplicates(subset=CCCD_keys)

In [39]:
def confusion_matrix(tool, pairs):
    """Score one tool's clone predictions against the ground truth.

    Parameters
    ----------
    tool : str
        Name of the column in ``pairs`` holding the tool's verdict per pair.
    pairs : pandas.DataFrame
        Must contain a ``clone`` column (ground truth) and the ``tool`` column.
        Any value that is not True (including NaN) counts as "not a clone".

    Returns
    -------
    list
        ``[precision, recall, f1, accuracy, tp, fp, tn, fn]``.

    Notes
    -----
    Zero denominators follow the conventions already used here: recall is 1.0
    when there are no false negatives, precision is 1.0 when there are no
    false positives, and f1 is 0.0 when precision + recall is 0. Accuracy is
    defined as 1.0 for an empty frame (no pair was misclassified) instead of
    raising ZeroDivisionError.
    """
    # Explicit boolean masks: eq(True) maps NaN / non-bool entries to False, so
    # the `~` negations below are real logical NOTs even on object-dtype columns.
    actual = pairs['clone'].eq(True)
    predicted = pairs[tool].eq(True)

    tp = int((actual & predicted).sum())
    fp = int((~actual & predicted).sum())
    tn = int((~actual & ~predicted).sum())
    fn = int((actual & ~predicted).sum())

    recall = 1.0 if fn == 0 else tp / (tp + fn)
    precision = 1.0 if fp == 0 else tp / (tp + fp)
    if (precision + recall) == 0:
        f1 = 0.0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)

    total = tp + tn + fp + fn
    accuracy = 1.0 if total == 0 else (tp + tn) / total
    return [precision, recall, f1, accuracy, tp, fp, tn, fn]

## Evaluation

In [45]:
# Score every tool/threshold column over all candidate pairs.
# Pairs a tool did not report (NaN) count as "not a clone" for scoring. The
# filled frame gets its own name so `pairs` keeps its NaN markers: the
# "doLLy vs CCCD" cell filters on them, and overwriting `pairs` here would make
# that filter a no-op on a top-to-bottom run.
tool_names = ['doLLy-0.30', 'doLLy-0.40', 'doLLy-0.50', 'doLLy-0.60',
              'NiCad-0.30', 'NiCad-0.40', 'NiCad-0.50',
              'CCCD-0.30', 'CCCD-0.40', 'CCCD-0.50']
pairs_filled = pairs.replace(np.nan, False)
results = pd.DataFrame([confusion_matrix(tool, pairs_filled) for tool in tool_names],
                       columns=['precision', 'recall', 'f1', 'accuracy', 'tp', 'fp', 'tn', 'fn'],
                       index=tool_names)
# Rows are ordered by F1 (best first); only the raw counts are exported.
print(results.sort_values('f1', ascending=False).round(3)[['tp', 'fp', 'tn', 'fn']].to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &   tp &    fp &     tn &    fn \\
\midrule
doLLy-0.50 &  296 &  1906 &  41594 &  1054 \\
doLLy-0.40 &  130 &   527 &  42973 &  1220 \\
doLLy-0.60 &  588 &  7340 &  36160 &   762 \\
CCCD-0.40  &  119 &   923 &  42577 &  1231 \\
CCCD-0.50  &  216 &  3836 &  39664 &  1134 \\
doLLy-0.30 &   61 &   259 &  43241 &  1289 \\
NiCad-0.50 &   47 &   365 &  43135 &  1303 \\
CCCD-0.30  &   37 &   220 &  43280 &  1313 \\
NiCad-0.40 &   20 &   177 &  43323 &  1330 \\
NiCad-0.30 &    8 &    85 &  43415 &  1342 \\
\bottomrule
\end{tabular}



Candidate pairs to showcase cases where doLLy detects a clone and NiCad does not:
* oversized-pancake-flipper: mimo31 and Neelesh.Sinha
* alphabet-cake: shikhar1997 and squeekeek
* oversized-pancake-flipper: helloneo.pe.kr and algmyr
* mushroom-monster: syuxuan and maups
* alphabet-cake: Ioannis70 and lemmaa

## doLLy vs CCCD

In [8]:
# Score every tool only on the pairs for which CCCD produced a result.
# The filter relies on `pairs` still holding NaN in the CCCD columns for pairs
# CCCD did not report; if NaN has already been replaced upstream, the subset is
# simply all pairs.
pair_subset = pairs[pairs['CCCD-0.50'].notna()]
tool_names = ['doLLy-0.30', 'doLLy-0.40', 'doLLy-0.50', 'doLLy-0.60',
              'NiCad-0.30', 'NiCad-0.40', 'NiCad-0.50',
              'CCCD-0.30', 'CCCD-0.40', 'CCCD-0.50']
# confusion_matrix returns eight values per tool; all eight must be named or the
# DataFrame constructor raises on the column-count mismatch.
results = pd.DataFrame([confusion_matrix(tool, pair_subset) for tool in tool_names],
                       columns=['precision', 'recall', 'f1', 'accuracy', 'tp', 'fp', 'tn', 'fn'],
                       index=tool_names)
# Rows are ordered by F1 (best first); only the four rate metrics are exported.
print(results.sort_values('f1', ascending=False).round(3)[['precision', 'recall', 'f1', 'accuracy']].to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &  precision &  recall &     f1 &  accuracy \\
\midrule
doLLy-0.50 &      0.132 &   0.239 &  0.170 &     0.930 \\
doLLy-0.40 &      0.209 &   0.109 &  0.144 &     0.961 \\
CCCD-0.40  &      0.114 &   0.191 &  0.143 &     0.920 \\
doLLy-0.60 &      0.074 &   0.465 &  0.128 &     0.810 \\
CCCD-0.30  &      0.144 &   0.069 &  0.093 &     0.958 \\
CCCD-0.50  &      0.053 &   0.300 &  0.091 &     0.792 \\
doLLy-0.30 &      0.135 &   0.038 &  0.059 &     0.964 \\
NiCad-0.50 &      0.062 &   0.030 &  0.040 &     0.958 \\
NiCad-0.40 &      0.054 &   0.012 &  0.020 &     0.964 \\
NiCad-0.30 &      0.000 &   0.000 &  0.000 &     0.966 \\
\bottomrule
\end{tabular}

