### Distribution of labels
In this notebook, we perform an analysis to investigate the presence and distribution of data science activities in 470 Jupyter notebooks (performing a data science task). 

In [None]:
# import libraries
import os 
from os import listdir
from os.path import isfile, join
from os.path import dirname as up

import random
import json
import numpy as np
import pandas as pd
from shutil import copyfile,copy
import itertools
from itertools import chain, combinations
from collections import Counter

from glob import glob
import scipy.stats as stats
from scipy.stats import ks_2samp

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

In [None]:
# set path
# All inputs and outputs are resolved relative to the directory the notebook runs in.
path = os.getcwd()
features_path = os.path.join(path,'features','')
results_path = os.path.join(path,'results','f_label_distribution','')

# makedirs (rather than mkdir) also creates the parent 'results' directory when it
# does not exist yet, and exist_ok=True keeps the cell safe to re-run.
os.makedirs(results_path, exist_ok=True)

In [None]:
# load the dataset
# features_path already ends with a path separator, so joining gives the same file path
features_file = os.path.join(features_path, 'f_DASWOW.pkl')
features_df = pd.read_pickle(features_file)
features_df.shape

In [None]:
# summary statistics of the loaded feature table (pandas' default column selection)
features_df.describe()

In [None]:
# look at the features: list the column names available in the loaded table
features_df.columns

In [None]:
# data preprocessing
#features_df.fillna(0,inplace=True)
#features_df['execution_count'].fillna(-1,inplace=True) # if no execution count is available, set it to -1

In [None]:
# activity labels; each one is used below as an indicator column of the feature table
labels = [
    'comment_only',
    'data_exploration',
    'data_preprocessing',
    'evaluation',
    'helper_functions',
    'load_data',
    'modelling',
    'prediction',
    'result_visualization',
    'save_results',
]

In [None]:
# Work on a copy so the loaded feature table itself is never modified.
selected = features_df.copy()
# Per-notebook view of the 'modelling' indicator. This has to come after
# `selected` is created, otherwise a fresh kernel raises NameError.
ds = selected.groupby('filename')['modelling']
print(len(set(selected['filename'].values)))

In [None]:
# a notebook is a "modelling notebook" when at least one of its cells has the modelling flag set
modelling_nbs = [key for key, item in ds if item.values.sum() >= 1]
is_modelling_nb = selected.filename.isin(modelling_nbs)

ds1 = selected[is_modelling_nb].reset_index(drop=True) # modelling notebooks
ds1files = ds1.groupby('filename')

ds2 = selected[~is_modelling_nb].reset_index(drop=True) #non-modelling notebooks
ds2files = ds2.groupby('filename')

In [None]:
# verify that non-modelling notebooks have at least one of the labels: data exploration or result visualization
# (the list contains the notebooks that have neither, so an empty list means the check passes)
[key for key, item in ds2files
 if (item['data_exploration'].values.sum() + item['result_visualization'].values.sum()) < 1]

In [None]:
### distribution of labels throughout the modelling notebooks dataset
# share (in %) of cells per primary label, relative to all cells of the modelling notebooks
labels_modelling = ds1['primary_label'].value_counts().div(ds1.shape[0]).mul(100)
labels_modelling

In [None]:
### distribution of labels throughout the non modelling notebooks dataset
# share (in %) of cells per primary label, relative to all cells of the non-modelling notebooks
labels_nonmodelling = ds2['primary_label'].value_counts().div(ds2.shape[0]).mul(100)
labels_nonmodelling

In [None]:
# display order of the labels in the comparison bar chart
order = [
    'data_exploration',
    'data_preprocessing',
    'modelling',
    'helper_functions',
    'load_data',
    'evaluation',
    'result_visualization',
    'prediction',
    'comment_only',
    'save_results',
]

In [None]:
# Build a long-format table with one row per (notebook type, label) that holds
# the share of cells whose primary label is that label.
model,value,label=[],[],[]
for nb_type, shares in (('modelling', labels_modelling),
                        ('non-modelling', labels_nonmodelling)):
    for o in order:
        label.append(o)
        model.append(nb_type)
        # A label that never occurs as primary label is missing from the
        # value_counts result; report it as 0 instead of hiding every possible
        # error behind a bare except.
        value.append(shares.get(o, 0))

dfs = pd.DataFrame(data={'type': model,
                         'value': value,
                         'label': label})
dfs

In [None]:
sns.set(rc={'figure.figsize':(9,3)})
g = sns.catplot(x = 'value', y='label', hue='type', data=dfs, kind='bar',order=order,legend=False)
title = "distribution_of_labels_by_type_of_data_science_task"
# Annotate and format each axes inside the loop, instead of relying on the loop
# variable `ax` still being bound once the loop has finished.
for ax in g.axes.ravel():
    for p in ax.patches:
        # write the bar's value to the right of the bar
        ax.annotate(format(p.get_width(), '.2f'),
                   (p.get_x() + p.get_width()+10,p.get_y()+.5),
                   ha = 'center', va = 'center',
                   xytext = (0, 9),
                   textcoords = 'offset points')
    ax.set_xlabel("% of code cells in the dataset",fontsize=12,fontweight='bold')
    ax.set_ylabel("",fontsize=12,fontweight='bold')
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.tick_params(axis='both', which='minor', labelsize=12)
    # the values are percentages, so fix the axis to the full 0-100 range
    ax.set(xlim=(0, 100))
plt.legend(loc='lower right')
plt.tight_layout()
plt.savefig(results_path+title+'.eps', format='eps')

### Commonly occurring sequences

In [None]:
# for every cell, the tuple of labels whose indicator column equals 1 (in the order of `labels`)
all_seq = [
    tuple(l for l in labels if row[l] == 1)
    for idx, row in selected.iterrows()
]

In [None]:
# top  : every label combination with its count and its share (in %) of all cells
# topN : the (at most) five most frequent combinations consisting of exactly N labels
top,top1,top2,top3,top4,top5 = [],[],[],[],[],[]
top_by_size = {1: top1, 2: top2, 3: top3, 4: top4, 5: top5}

topsequences = Counter(all_seq).most_common()
for combo, n_cells in topsequences:
    entry = (combo, n_cells, round((n_cells/features_df.shape[0])*100,2))
    top.append(entry)
    bucket = top_by_size.get(len(combo))
    # most_common() is sorted by frequency, so the first five per size are the top five
    if bucket is not None and len(bucket) < 5:
        bucket.append(entry)

In [None]:
def powerset(iterable):
    """Lazily yield every subset of `iterable` as a tuple, smallest subsets first.

    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    items = list(iterable)
    subsets_by_size = (combinations(items, size) for size in range(len(items) + 1))
    return chain.from_iterable(subsets_by_size)


In [None]:
# for every subset of labels: share (in %) of cells whose label combination contains that subset
labelsets = powerset(labels)
labelsets_dict = {}
for labelset in labelsets:
    count = sum(1 for seq in all_seq if all(s in seq for s in labelset))
    labelsets_dict[labelset] = round((count/selected.shape[0])*100,2)

In [None]:
# label triples, most frequent first
triples = [(k, v) for k, v in labelsets_dict.items() if len(k) == 3]
dict(sorted(triples, key=lambda item: item[1], reverse=True))

### Analyse the distribution of labels

In [None]:
### distribution of number of labels throughout the dataset
# number of labels set on each cell, then how many cells have each label count
labels_per_cell = features_df[labels].sum(axis=1).values
c = Counter(labels_per_cell)
step_dict = pd.Series(data=c, index=c.keys())

In [None]:
### distribution of the number of labels per cell throughout the dataset
# one row per label count: number of cells, the count as a string, and its share in %
label_count_share = ((step_dict/selected.shape[0])*100).values.round(2)
d = pd.DataFrame([dict(step_dict)]).T.rename(columns={0: 'count'})
d = d.assign(step=d.index.astype(str), percentage=label_count_share).reset_index(drop=True)
truedistdf = d.copy()
d

In [None]:
def plot_distribution(d,col,title,xlabel,ylabel,order):
    """Draw a horizontal bar chart of `d[col]` per `d['step']` and save it as EPS.

    d      -- DataFrame with a 'step' column (bar categories) and the column `col`
    col    -- name of the column plotted along the x axis
    title  -- file stem; the figure is written to results_path + title + '.eps'
    xlabel -- x axis label
    ylabel -- y axis label
    order  -- accepted but not used by the body; seaborn's default category
              ordering applies
    """
    plt.figure(figsize=(9,3))
    ax = sns.barplot(x=col, y='step', data=d, color="skyblue")

    # write every bar's value just past its right-hand end
    for bar in ax.patches:
        bar_length = bar.get_width()
        ax.annotate(f"{bar_length:.2f}",
                    (bar.get_x() + bar_length + 1.2, bar.get_y() + 0.5),
                    ha='center', va='center',
                    xytext=(0, 0),
                    textcoords='offset points')

    ax.xaxis.grid(False)
    ax.set_xlabel(xlabel, fontsize=12, fontweight='bold')
    ax.set_ylabel(ylabel, fontsize=12, fontweight='bold')
    for which in ('major', 'minor'):
        ax.tick_params(axis='both', which=which, labelsize=12)
    plt.xticks(rotation=0)

    plt.tight_layout()
    plt.savefig(results_path+title+'.eps', format='eps')

In [None]:
# bar chart: share of cells per number of labels
plot_distribution(
    d, 'percentage',
    title="no_of_data_science_activities_in_the_dataset",
    xlabel="% of code cells in the dataset",
    ylabel="no of labels",
    order=d.step.values,
)

In [None]:
import operator

def get_truth(inp, op, cut):
    """Apply the binary comparison `op` to `inp` and `cut` and return its result."""
    return op(inp, cut)

def get_n_label_notebooks(df,labels,n,op):
    """Return the rows of `df` that belong to notebooks selected by `op` and `n`.

    For every notebook (group of rows sharing a 'filename') the largest number
    of labels set on any single cell is computed; the notebook is kept when
    `op(largest, n)` is true, e.g. `operator.eq` with n=1.

    df     -- DataFrame with a 'filename' column and one indicator column per label
    labels -- names of the indicator columns to sum per cell
    n      -- threshold the per-notebook maximum is compared with
    op     -- binary comparison such as operator.eq / operator.le

    Prints the number and share of selected notebooks and returns the filtered frame.
    """
    # Group by the plain column name, not a one-element list: with a list key,
    # pandas 2.x yields 1-tuples as group keys, which never match the string
    # filenames in the isin() filter below.
    filegroup = df.groupby('filename')
    single_label_nbs = [k for k,v in filegroup if get_truth(max(v[labels].sum(axis=1).values),op,n)]

    # avoid a ZeroDivisionError when the frame holds no notebooks at all
    n_notebooks = len(set(df.filename.values))
    share = len(single_label_nbs)/n_notebooks*100 if n_notebooks else 0.0
    print("No. of single label notebooks in the dateset: {0}.".format(len(single_label_nbs)),
          "% of single label notebooks in the dataset: {0:.2f}".format(share))

    #### keep only the cells of the selected notebooks
    single_label_df = df[df.filename.isin(single_label_nbs)]
    return single_label_df

In [None]:
# select the notebooks in which no cell carries more than one label (maximum per cell == 1)
single_label_df = get_n_label_notebooks(df=selected, labels=labels, n=1, op=operator.eq)
print(single_label_df.shape)

In [None]:
### distribution of primary labels throughout the dataset
# one row per primary label: number of cells and share (in %) of all cells
step_dict = selected['primary_label'].value_counts()
primary_share = ((step_dict/selected.shape[0])*100).values.round(2)
d = pd.DataFrame([dict(step_dict)]).T.rename(columns={0: 'count'})
d = d.assign(step=d.index, percentage=primary_share).reset_index(drop=True)
d

In [None]:
# bar chart: share of cells per primary label
plot_distribution(
    d, 'percentage',
    title="primary_label_data_science_activities_in_the_dataset",
    xlabel="% of code cells in the dataset",
    ylabel="",
    order=step_dict.keys(),
)

In [None]:
# number of cells carrying each label (a cell may carry several), most frequent label first
unsorted_multi_dict = selected[labels].sum(axis=0)
multi_dict = sorted(unsorted_multi_dict.items(), key=lambda item: item[1], reverse=True)

d = pd.DataFrame([dict(multi_dict)]).T.rename(columns={0: 'count'})
d['step'] = d.index
### distribution of labels throughout the dataset
# share relative to the number of cells, so the percentages need not add up to 100
d['percentage'] = (d['count'].div(selected.shape[0]) * 100).round(2)
d = d.reset_index(drop=True)
dtrue = d.copy()
d

In [None]:
# bar chart: share of cells carrying each label
# Pass the categories of the table being plotted as `order`, like the other calls do,
# rather than the stale keys of `step_dict` left over from the primary-label cell.
plot_distribution(d,'percentage',
                  "all_labels_data_science_activities_in_the_dataset",
                  "% of code cells in the dataset","",
                 d.step.values)

### Analyse the distribution of code and comment using primary label

In [None]:
# Count lines by summing per-cell lengths. Series.sum() on list-valued cells
# concatenates all lists into one (quadratic) and fails for a label without rows.
# assumes every 'text' / 'comment' entry is a sized sequence of lines -- TODO confirm
code_lines_per_cell = selected['text'].map(len)
comment_lines_per_cell = selected['comment'].map(len)
tot_linesofcode = int(code_lines_per_cell.sum())
tot_linesofcomments = int(comment_lines_per_cell.sum())
print("Total lines of code: ", tot_linesofcode, " Total lines of comment: ", tot_linesofcomments)
for l in labels:
    has_primary_label = selected['primary_label']==l
    label_code = int(code_lines_per_cell[has_primary_label].sum())
    label_comments = int(comment_lines_per_cell[has_primary_label].sum())
    print(l, " -> ", "linesofcode {0:.2f}".format((label_code/tot_linesofcode)*100), "linesofcomment {0:.2f}".format((label_comments/tot_linesofcomments)*100))

In [None]:
# total lines of code, and the lines in cells whose primary label is data_exploration
lines_of_code = {}
cell_sizes = [(len(text), primary)
              for text, primary in zip(selected['text'], selected['primary_label'])]
code = sum(size for size, _ in cell_sizes)
prep_code = sum(size for size, primary in cell_sizes if primary == 'data_exploration')
print(code,prep_code,prep_code/code*100)

### Analyse the distribution of lines of code 

In [None]:
# restrict to code cells
df = features_df[features_df.cell_type=='code'].copy()
#lines of code
sum(1 for _ in itertools.chain.from_iterable(df['text'].values))

In [None]:
#lines of comment
sum(1 for _ in itertools.chain.from_iterable(df['comment'].values))

In [None]:
# number of lines per code cell, binned into the ranges used by the bar chart
cell_length = [len(cell) for cell in df['text'].values]
c = Counter(cell_length)
loc_dict = {'0':0,'1-5':0,'6-10':0,'11-20':0,'>20':0}
for k, v in c.items():
    # k is a length, hence never negative
    if k == 0:
        bucket = '0'
    elif k <= 5:
        bucket = '1-5'
    elif k <= 10:
        bucket = '6-10'
    elif k <= 20:
        bucket = '11-20'
    else:
        bucket = '>20'
    loc_dict[bucket] += v

In [None]:
# one row per lines-of-code bin with its cell count and share
d = pd.DataFrame([loc_dict]).T.rename(columns={0: 'count'})
d['lines of code'] = d.index
# despite its name, 'percentage' holds a fraction (count divided by the total count)
d['percentage'] = (d['count']/d['count'].sum()).round(2)
d = d.reset_index(drop=True)

In [None]:
# Horizontal bar chart of the share of cells per lines-of-code bin, saved as EPS.
plt.figure(figsize=(9,3))

# bars are drawn in the insertion order of loc_dict's keys
ax = sns.barplot(x='percentage',y='lines of code',data=d,color="skyblue",order=loc_dict.keys())
# write every bar's value just past its right-hand end
for p in ax.patches:
    ax.annotate(format(p.get_width(), '.2f'), 
                   (p.get_x() + p.get_width()+0.02, p.get_y()+0.7), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 9), 
                   textcoords = 'offset points')
# not used further in this cell
title = "updated no. of lines of code per cell"

ax.xaxis.grid(False)
# NOTE(review): the x limit of 0.8 suggests the plotted values are fractions, while
# the axis label says "%" -- confirm which one is intended
ax.set_xlabel("% of cells in the dataset",fontsize=12,fontweight='bold')
ax.set_ylabel("no. of lines of code per cell",fontsize=12,fontweight='bold')
ax.tick_params(axis='both', which='major', labelsize=12)
ax.tick_params(axis='both', which='minor', labelsize=12)
ax.set(xlim=(0, 0.8))
plt.xticks(rotation=0)

# not used further in this cell
axes=plt.gca()
plt.tight_layout()
plt.savefig(results_path+"codelinespercell.eps",format='eps')

### Analyse the distribution of predicted labels

In [None]:
# load the predictions of the multi-label classification model
predictions_file = os.path.join(path, 'results',
                                'multi-label-classification-model-02-20-2021',
                                'prediction_multi.pkl')
dfpred = pd.read_pickle(predictions_file)

In [None]:
# number of predicted cells carrying each label, most frequent label first
unsorted_multi_dict = dfpred[labels].sum(axis=0)
multi_dict = sorted(unsorted_multi_dict.items(), key=lambda item: item[1], reverse=True)

d = pd.DataFrame([dict(multi_dict)]).T.rename(columns={0: 'count'})
d['step'] = d.index
### distribution of labels throughout the dataset
# share relative to the number of predicted cells
d['percentage'] = (d['count'].div(dfpred.shape[0]) * 100).round(2)
d = d.reset_index(drop=True)
dpred = d.copy()
d

In [None]:
# bar chart: share of predicted cells carrying each label
plot_distribution(
    d, 'percentage',
    title="predicted_label_data_science_activities_in_the_dataset",
    xlabel="% of code cells in the dataset",
    ylabel="",
    order=d.step.values,
)

In [None]:
# per-label percentages of the predicted and the true distribution, aligned on `labels`
predvals = [dpred[dpred['step']==l]['percentage'].values[0] for l in labels]
truevals = [dtrue[dtrue['step']==l]['percentage'].values[0] for l in labels]

In [None]:
from sklearn.metrics import mean_squared_error 
from scipy.stats import spearmanr,kendalltau
    
# calculate kendall's correlation
coef, p = kendalltau(truevals, predvals)
print('Kendall correlation coefficient: %.3f' % coef)
# interpret the significance
alpha = 0.05
verdict = ('Samples are uncorrelated (fail to reject H0) p=%.3f' if p > alpha
           else 'Samples are correlated (reject H0) p=%.3f')
print(verdict % p)

In [None]:
### distribution of number of labels throughout the dataset
# number of predicted labels on each cell, then how many cells have each label count
predicted_labels_per_cell = dfpred[labels].sum(axis=1).values
c = Counter(predicted_labels_per_cell)
step_dict = pd.Series(data=c, index=c.keys())

### distribution of the number of predicted labels per cell
label_count_share = ((step_dict/dfpred.shape[0])*100).values.round(2)
d = pd.DataFrame([dict(step_dict)]).T.rename(columns={0: 'count'})
d = d.assign(step=d.index.astype(str), percentage=label_count_share).reset_index(drop=True)
preddistdf = d.copy()
d


In [None]:
# bar chart: share of predicted cells per number of labels
plot_distribution(
    d, 'percentage',
    title="no_of_data_science_activities_in_the_predicted_dataset",
    xlabel="% of code cells in the dataset",
    ylabel="no of labels",
    order=d.step.values,
)

In [None]:
def _share_for(dist_df, n_labels):
    """Return the percentage stored for `n_labels` in `dist_df`, or 0 when no row has that count."""
    match = dist_df.loc[dist_df['step'] == str(n_labels), 'percentage']
    # Only a missing row maps to 0; other problems (e.g. a missing column)
    # surface as errors instead of being swallowed by a bare except.
    return match.values[0] if len(match) else 0

# share of cells with 0..5 labels in the true and in the predicted distribution
truevals,predvals = [],[]
for l in [0,1,2,3,4,5]:
    truevals.append(_share_for(truedistdf, l))
    predvals.append(_share_for(preddistdf, l))
print(truevals)
print(predvals)
# calculate kendall's correlation
coef, p = kendalltau(truevals, predvals)
print('Kendall correlation coefficient: %.3f' % coef)
# interpret the significance
alpha = 0.05
if p > alpha:
    print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
    print('Samples are correlated (reject H0) p=%.3f' % p)

In [None]:
# Label combinations of the predicted cells.
# NOTE: this overwrites all_seq / top* / labelsets_dict from the earlier analysis of `selected`.
all_seq = [tuple(l for l in labels if row[l]==1) for idx,row in dfpred.iterrows()]

# The sequences come from dfpred, so the shares must be relative to the number
# of predicted cells rather than to features_df / selected.
n_pred_cells = dfpred.shape[0]

# top  : every combination with its count and share (in %)
# topN : the (at most) five most frequent combinations of exactly N labels
top,top1,top2,top3,top4,top5 = [],[],[],[],[],[]
top_by_size = {1: top1, 2: top2, 3: top3, 4: top4, 5: top5}
topsequences = Counter(all_seq).most_common()
for each in topsequences:
    entry = (each[0],each[1],round((each[1]/n_pred_cells)*100,2))
    top.append(entry)
    bucket = top_by_size.get(len(each[0]))
    # most_common() is sorted by frequency, so the first five per size are the top five
    if bucket is not None and len(bucket)<5:
        bucket.append(entry)

# for every subset of labels: share (in %) of predicted cells containing that subset
labelsets = powerset(labels)
labelsets_dict = {}
for labelset in labelsets:
    count = sum(1 for seq in all_seq if all(s in seq for s in labelset))
    labelsets_dict[labelset] = round((count/n_pred_cells)*100,2)
    

In [None]:
# single labels, most frequent first
singles = [(k, v) for k, v in labelsets_dict.items() if len(k) == 1]
dict(sorted(singles, key=lambda item: item[1], reverse=True))

### Distribution of number of lines of code per label

In [None]:
# Box plot of the 'linesofcode' column per primary label for code cells, saved as EPS.
# NOTE: this rebinds the notebook-level `labels` list to a different ordering,
# which is used as the category order of the plot below.
labels = ['helper_functions','load_data',
              'data_preprocessing','data_exploration',
              'modelling','evaluation','prediction',
              'result_visualization','save_results',
              'comment_only']

plt.figure(figsize=(12, 6))
sns.set_theme(style="ticks", palette="colorblind")

# the style is switched to darkgrid right after the theme call above
sns.set_style("darkgrid")
# showmeans draws each group's mean as a black '+' marker; outliers are drawn as well
ax = sns.boxplot(x="linesofcode", y="primary_label", 
            showmeans=True, meanprops={"marker": "+",
                       "markeredgecolor": "black",
                       "markersize": "10"}, 
            data=features_df[features_df['cell_type']=='code'],
            showfliers=True,order=labels, 
           flierprops = dict(markerfacecolor='0.75', markersize=5,
              linestyle='none'))
#sns.despine(offset=10, trim=False)
# logarithmic x axis for the lines-of-code values
ax.set_xscale("log")
plt.ylabel("data science step (primary_label)", size=12)
plt.xlabel("number of lines of code", size=12)
plt.tight_layout()
plt.savefig(results_path+"loc_log.eps",format='eps')