## Analysing the identical lines of code within notebooks in the DASWOW dataset
* Number of lines of code that are identically (i.e., type-1) cloned, appearing three or more times.

* The purpose of the code is identified using the primary label of each cell in which the code appears.

In [None]:
no_clones = 3  # minimum number of occurrences of an identical line within a notebook for it to be counted in the statistics below

In [None]:
#os related operations
import os 
from os import listdir
from os.path import isfile, join

import random
import json
import numpy as np
import pandas as pd
import collections
from shutil import copyfile,copy

from glob import glob

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

In [None]:
# Resolve the input (features) and output (results) directories relative to
# the current working directory. The trailing '' makes both paths end with a
# separator so file names can be appended to them directly.
path = os.getcwd()
features_path = os.path.join(path, 'features', '')
results_path = os.path.join(path, 'results', 'f_label_distribution', '')

# makedirs also creates the intermediate 'results' directory (os.mkdir fails
# when it is missing), and exist_ok avoids a separate existence check.
os.makedirs(results_path, exist_ok=True)

In [None]:
# Read the pickled DASWOW feature table from the features directory.
# NOTE: unpickling executes arbitrary code; only load files you trust.
dataset_file = os.path.join(features_path, 'f_DASWOW.pkl')
daswow_df = pd.read_pickle(dataset_file)
daswow_df.shape

In [None]:
# Group the rows by notebook file; iterating over the result yields
# (filename, rows-of-that-notebook) pairs.
# NOTE(review): presumably one row per notebook cell - confirm against the dataset.
notebooks = daswow_df.groupby('filename')

In [None]:
def get_clone_line_sets(unique_loc_clone_list):
    """Group pairwise clone matches into clone sets, notebook by notebook.

    Args:
        unique_loc_clone_list: iterable of ``(notebook, pairs)`` tuples, where
            ``pairs`` is an iterable of 2-item sequences holding the (hashable)
            identifiers of two lines that are clones of each other.

    Returns:
        A dict mapping every notebook to a list of disjoint sets. Each set
        holds all identifiers that are connected through the given pairs.
        Sets are listed in the order in which their first pair was seen.
    """
    clone_sets_in_dataset = {}
    for notebook, pairs in unique_loc_clone_list:
        clone_sets = []
        # identifier -> the set in clone_sets that currently contains it;
        # avoids scanning every set for every pair.
        set_of = {}
        for pair in pairs:
            first, second = pair[0], pair[1]
            first_set = set_of.get(first)
            second_set = set_of.get(second)

            if first_set is None and second_set is None:
                # Neither element is known yet: start a new clone set.
                target = {first, second}
                clone_sets.append(target)
            elif first_set is None or second_set is None or first_set is second_set:
                # At most one existing set is involved: extend it.
                target = second_set if first_set is None else first_set
                target.add(first)
                target.add(second)
            else:
                # The pair links two different sets: merge them so the result
                # stays disjoint, keeping the set that was created first.
                target, absorbed = first_set, second_set
                for existing in clone_sets:
                    if existing is first_set:
                        break
                    if existing is second_set:
                        target, absorbed = second_set, first_set
                        break
                clone_sets = [s for s in clone_sets if s is not absorbed]
                target.update(absorbed)
                for member in absorbed:
                    set_of[member] = target

            set_of[first] = target
            set_of[second] = target

        clone_sets_in_dataset[notebook] = clone_sets

    return clone_sets_in_dataset

In [None]:
def get_clone_lines(cells):
    """Return the index pairs of identical lines across the given cells.

    The lines of all cells are flattened into one sequence (cell order, then
    line order). Every pair ``(i, j)`` with ``i < j`` whose lines compare
    equal is reported, ordered by ``i`` and then ``j``.
    """
    flat = [line for cell in cells for line in cell]
    total = len(flat)
    return [
        (i, j)
        for i, current in enumerate(flat)
        for j in range(i + 1, total)
        if current == flat[j]
    ]

In [None]:
# For every notebook, collect the index pairs of identical lines found in the
# 'text' column of its rows: [(filename, [(i, j), ...]), ...].
loc_clone_list_daswow = [
    (filename, get_clone_lines(rows['text'].values))
    for filename, rows in notebooks
]

In [None]:
# Turn the clone pairs into clone sets per notebook: {filename: [set of line indices, ...]}.
# All clone sets are kept here regardless of size; filtering by no_clones happens later.
loc_clone_sets_daswow = get_clone_line_sets(loc_clone_list_daswow)

### Append the information on where the clones appear

In [None]:
# For every notebook, translate each clone set (a set of flattened line
# indices) into the primary labels of the cells those lines belong to:
# {filename: [[label, label, ...], ...]} - one inner list per clone set.
clones_labels = {}
for k, v in notebooks:
    cell_texts = v['text'].values
    cellno = v['cell_number'].values

    # cell_matches[i] is the cell number that owns flattened line index i,
    # i.e. each cell number repeated once per line of that cell.
    cell_matches = [number for number, cell in zip(cellno, cell_texts) for _ in cell]

    # Primary label per cell number, built once from this notebook's rows
    # instead of filtering the whole dataset for every cloned line.
    # setdefault keeps the first row should a cell number occur twice.
    label_of_cell = {}
    for number, label in zip(cellno, v['primary_label'].values):
        label_of_cell.setdefault(number, label)

    # Direct dictionary lookup; a notebook without an entry has no clone sets.
    clones_labels[k] = [
        [label_of_cell[cell_matches[el]] for el in clone_set]
        for clone_set in loc_clone_sets_daswow.get(k, [])
    ]

### Insights

In [None]:
# An example record in loc_clone_sets_daswow: 'nb_108860.ipynb': [{177, 186}]
# lines_of_code_clones_found_per_notebook: number of clone sets, for every
#   notebook that has at least one clone set.
# unique_lines_of_code_clones_found_per_notebook: number of clone sets with at
#   least no_clones members, for every notebook that has at least one such set.
lines_of_code_clones_found_per_notebook = []
unique_lines_of_code_clones_found_per_notebook = []
for clone_sets in loc_clone_sets_daswow.values():
    if clone_sets:
        lines_of_code_clones_found_per_notebook.append(len(clone_sets))
    frequent_sets = sum(1 for members in clone_sets if len(members) >= no_clones)
    if frequent_sets:
        unique_lines_of_code_clones_found_per_notebook.append(frequent_sets)

In [None]:
# Summary statistics over the notebooks that contain at least one clone set
# with no_clones or more members.
clone_counts = np.asarray(unique_lines_of_code_clones_found_per_notebook)
notebooks_with_clones = int(clone_counts.size)

# Use the number of notebooks actually analysed instead of a hard-coded 470.
# NOTE(review): assumes every notebook of the dataset has an entry in
# loc_clone_sets_daswow - confirm.
total_notebooks = len(loc_clone_sets_daswow)
share_of_notebooks = round(notebooks_with_clones / total_notebooks * 100, 2) if total_notebooks else 0.0

print("No. of notebooks with atleast one clone that appears {0} times: {1} ({2}% of {3} notebooks)".format(
    no_clones, notebooks_with_clones, share_of_notebooks, total_notebooks))

if notebooks_with_clones:
    mean_clones = np.mean(clone_counts)
    # count_nonzero is also correct when no notebook lies above the mean
    # (e.g. all counts equal), where indexing np.unique's counts would fail.
    above_average = int(np.count_nonzero(clone_counts > mean_clones))
    print("Of those that have clones, average number of clones appearing {0}".format(round(mean_clones, 2)))
    print("Of those that have clones, median number of clones appearing {0}".format(round(np.median(clone_counts), 2)))
    print("Of those that have clones, {0}% of notebooks have above average (>{1}) clones".format(
        round(above_average / notebooks_with_clones * 100, 2), round(mean_clones, 2)))
else:
    # Avoid NaN statistics and a division by zero on an empty selection.
    print("No notebook has a clone that appears {0} times".format(no_clones))

In [None]:
# Count, per primary label, the clone sets whose line occurs at least
# no_clones times inside cells of that same label.
#   clones_to_plot[filename] - one Counter per clone set, holding only the
#                              labels that reach the threshold
#   label_list[label]          - number of such clone sets over all notebooks
#   notebook_share_list[label] - number of notebooks with at least one such set
label_names = ('comment_only', 'data_exploration', 'data_preprocessing',
               'evaluation', 'helper_functions', 'load_data', 'modelling',
               'prediction', 'result_visualization', 'save_results')
label_list = dict.fromkeys(label_names, 0)
notebook_share_list = dict.fromkeys(label_names, 0)
clones_to_plot = {}

for k, v in clones_labels.items():
    # The threshold follows no_clones (previously a hard-coded 3) so that this
    # cell stays consistent with the rest of the analysis when it is changed.
    clones_to_plot[k] = [
        collections.Counter({label: hits
                             for label, hits in collections.Counter(values).items()
                             if hits >= no_clones})
        for values in v
        if len(values) >= no_clones
    ]

    label_set = set()
    for each in clones_to_plot[k]:
        for key in each:
            # A label outside label_names raises KeyError on purpose: it would
            # otherwise be silently missing from the result table.
            label_list[key] += 1
            label_set.add(key)

    # update the notebook share for each label (once per notebook)
    for each in label_set:
        notebook_share_list[each] += 1

In [None]:
# Labels ordered by their clone count, highest first.
clone_data = dict(sorted(label_list.items(), key=lambda item: item[1], reverse=True))

# Denominators of the two percentage columns.
clone_total = sum(clone_data.values())
notebooks_with_frequent_clones = len(unique_lines_of_code_clones_found_per_notebook)

share_of_clones = [round((count / clone_total) * 100, 2) for count in clone_data.values()]
share_of_notebooks = [
    round((notebook_share_list[label] / notebooks_with_frequent_clones) * 100, 2)
    for label in clone_data
]

clone_df = pd.DataFrame({
    'labels': clone_data.keys(),
    '% in total clones across steps': share_of_clones,
    '% notebooks with >1 clone': share_of_notebooks,
})
clone_df

### Visualise the distribution of type-1 clones in DASWOW

In [None]:
# Reshape to long format (one row per label and percentage column) so both
# percentages can be drawn as grouped bars.
share_columns = ['% in total clones across steps', '% notebooks with >1 clone']
data = pd.melt(clone_df, id_vars=['labels'], value_vars=share_columns)
data.head()

In [None]:
# Horizontal grouped bar chart of both percentage columns per label, saved as
# an EPS file in the results directory.
title = "no_of_lines_of_code_with_type-1_clones_in_a_notebook"

# catplot is a figure-level function and creates its own figure, so no
# plt.figure() call precedes it (that only produced a stray empty figure and
# its figsize never applied to this plot). Use catplot's height/aspect
# arguments to resize the chart.
g = sns.catplot(x='value', y='labels', hue='variable', data=data, kind='bar',
                order=clone_data.keys(), legend=False)

for ax in g.axes.ravel():
    # Write each bar's value next to the bar.
    for p in ax.patches:
        ax.annotate(format(p.get_width(), '.2f'),
                    (p.get_x() + p.get_width() + 10, p.get_y() + .5),
                    ha='center', va='center',
                    xytext=(0, 9),
                    textcoords='offset points')
    # Axis styling is applied inside the loop instead of relying on the loop
    # variable still being bound after the loop has finished.
    ax.set_xlabel("% share", fontsize=12, fontweight='bold')
    ax.set_ylabel("", fontsize=12, fontweight='bold')
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.tick_params(axis='both', which='minor', labelsize=12)
    ax.set(xlim=(0, 100))

plt.legend(loc='lower right')
plt.savefig(results_path + title + '.eps', format='eps')