In [111]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.stats.contingency_tables as statsmodels
import statsmodels.stats.multitest as multitest
from statsmodels.stats.contingency_tables import Table2x2

In [112]:
# Compares response times of correctly answered questions against those of
# incorrectly answered questions, pooled over every row and question and
# independently of atoms.
atoms = pd.read_csv('results_JS_replication_study.csv')
# Drop (in place) every row that has fewer than 10 non-missing values.
# NOTE(review): presumably each row is one participant - confirm against the CSV.
atoms.dropna(thresh=10, inplace=True)
# Label-based column slices: the per-question times and the per-question
# correctness flags.
times = atoms.loc[:, 'Question 1 time':'Question 48 time']
correct = atoms.loc[:,'Question 1 correct':'Question 48 correct']
# Give the correctness frame the same column labels as the times frame so that
# `where` below lines the two frames up cell by cell.
correct.columns = times.columns
# Keep a time only where the matching answer is flagged as correct; all other
# cells become NaN and are removed once the frame is flattened to one Series.
correctTimesTemp = times.where(correct)
correctTimes = pd.Series(correctTimesTemp.values.flatten()).dropna()
# Same procedure with the correctness flags negated.
incorrectTimesTemp = times.where(np.logical_not(correct))
incorrectTimes = pd.Series(incorrectTimesTemp.values.flatten()).dropna()
# Discard every entry whose type is exactly `str`, so only non-string values
# reach the statistics below.
correctTimes = correctTimes[correctTimes.map(lambda t: type(t) != type('ok'))]
incorrectTimes = incorrectTimes[incorrectTimes.map(lambda t: type(t) != type('ok'))]
# Rank-sum test between the two samples, followed by their mean, median and
# standard deviation (correct first, incorrect second).
print(stats.ranksums(correctTimes, incorrectTimes))
print(np.mean(correctTimes), np.mean(incorrectTimes))
print(np.median(correctTimes), np.median(incorrectTimes))
print(np.std(correctTimes), np.std(incorrectTimes))


RanksumsResult(statistic=-5.042914598471609, pvalue=4.5849391631250766e-07)
35.44412803398052 44.50444720101791
24.362499999999997 29.2825
38.427615862599204 62.48393041895574


In [113]:
# Auxiliary functions
def getSnippetId(questionNum:int) -> (int, str, int):
    """Map a 1-based question number to its (atom, version, variant) key.

    Questions are numbered in runs of six per atom. Within a run,
    odd-numbered questions are the "obfuscated" version and even-numbered
    ones the "clear" version, and each consecutive pair of questions shares
    one variant number (1, 2 or 3).
    """
    zeroBased = questionNum - 1
    atomNumber = zeroBased // 6 + 1
    if questionNum % 2 == 1:
        version = "obfuscated"
    else:
        version = "clear"
    variant = (zeroBased % 6) // 2 + 1
    return (atomNumber, version, variant)

def processAtomsOfInterest(theAtoms, results):
    """Copy each answered question's recorded value into `results`.

    Every row of `theAtoms` is read positionally: the first `numQuestions`
    (48) entries are question numbers, and the entry `numQuestions` positions
    after a question number is the value stored for that question
    (presumably its correctness flag - confirm against the column order of
    the caller's slice). The destination column is the
    (atom, version, variant) key produced by `getSnippetId`.

    `results` is modified in place and acts as the output; nothing is
    returned.
    """
    numQuestions = 48
    for rowPosition in range(len(theAtoms)):
        answers = theAtoms[rowPosition]
        for slot in range(numQuestions):
            recorded = answers[slot + numQuestions]
            snippetColumn = getSnippetId(answers[slot])
            results.loc[rowPosition, snippetColumn] = recorded

def countAtoms(atomData, counts):
    """Tally, per atom, how often each version was answered right or wrong.

    For every row label 0..len(atomData)-1 and every atom 1..24, the first
    non-missing value under (atom, 'obfuscated') and under (atom, 'clear')
    is read. A truthy value increments the matching '<version>_ok' cell of
    `counts`, a falsy one the '<version>_wrong' cell.

    `counts` is modified in place and acts as the output; nothing is
    returned. An IndexError is raised if a row has no non-missing value for
    some (atom, version) pair.
    """
    for participant in range(len(atomData)):
        answers = atomData.loc[participant]
        for atom in range(1, 25):
            # Read both outcomes before touching `counts`.
            obfuscatedOutcome = answers[(atom, 'obfuscated')].dropna().values[0]
            clearOutcome = answers[(atom, 'clear')].dropna().values[0]

            obfuscatedColumn = 'obfuscated_ok' if obfuscatedOutcome else 'obfuscated_wrong'
            counts.loc[atom, obfuscatedColumn] += 1

            clearColumn = 'clear_ok' if clearOutcome else 'clear_wrong'
            counts.loc[atom, clearColumn] += 1

def printOddsRatio(contingencyTable, label):
    """Print the odds-ratio summary of a 2x2 contingency table.

    Output, in order: `label`, the odds ratio, the p-value of the odds
    ratio, the confidence interval at alpha=0.05 (method 'normal'), and a
    blank line.
    """
    twoByTwo = Table2x2(contingencyTable)
    print(label)
    print("Odds ratio: ", twoByTwo.oddsratio)
    print("p-value: ", twoByTwo.oddsratio_pvalue())
    interval = twoByTwo.oddsratio_confint(alpha=0.05, method='normal')
    print("Confidence interval at 95% confidence level:", interval)
    print()

def outputOddsRatios(counts):
    """Print each row of `counts` followed by its odds-ratio summary.

    Parameters
    ----------
    counts : pandas.DataFrame
        One row per atom. The first four entries of each row are passed to
        `buildContingencyTable` to form the 2x2 table.

    The rows are taken from the `counts` argument itself (previously the
    global `atomCounts` was read, so the argument was ignored), and they are
    visited through the frame's own index rather than an assumed 1..n
    numbering.
    """
    for atom, row in counts.iterrows():
        print(row)
        printOddsRatio(buildContingencyTable(row), 'Atom ' + str(atom))

def outputMcNemar(counts):
    """Print each row of `counts` followed by its exact McNemar test result.

    Parameters
    ----------
    counts : pandas.DataFrame
        One row per atom. The first four entries of each row are passed to
        `buildContingencyTable` to form the 2x2 table.

    The rows are taken from the `counts` argument itself (previously the
    global `atomCounts` was read, so the argument was ignored), and they are
    visited through the frame's own index rather than an assumed 1..n
    numbering.

    NOTE(review): McNemar's test is defined on a table of paired outcomes.
    When `counts` has the layout of `atomCounts` (obfuscated_ok,
    obfuscated_wrong, clear_ok, clear_wrong) the table holds per-version
    totals instead - confirm that this is what is intended.
    """
    for atom, row in counts.iterrows():
        print(row)
        print(statsmodels.mcnemar(buildContingencyTable(row), exact=True, correction=True), 'Atom ' + str(atom))

def buildContingencyTable(aSeries):
    """Arrange the first four positional entries of `aSeries` as a 2x2 nested list.

    Entries 0 and 1 form the first row, entries 2 and 3 the second row.
    Any further entries are ignored.
    """
    cells = [aSeries.iloc[position] for position in range(4)]
    return [cells[:2], cells[2:]]



In [114]:
# Processes and analyzes the correctness data.

# Raw values of every column from 'Question 1 number' through
# 'Question 48 correct'; consumed positionally by processAtomsOfInterest.
atomsOfInterest = atoms.loc[:, 'Question 1 number':'Question 48 correct'].values

# Three parallel label lists, one entry per result column:
#   topLevel    - atom number 1..24, each repeated six times
#   secondLevel - three "obfuscated" columns followed by three "clear" ones
#   thirdLevel  - variant number cycling 1, 2, 3
topLevel = [x//6 for x in range(6, 150)]
secondLevel = ["obfuscated" if ((x%6) <3) else "clear" for x in range(0, 144)]
thirdLevel = [(x%3)+1 for x in range(0, 144)]

# One result row per data row. The row count is taken from the data instead of
# being hard-coded, so countAtoms never meets a row that was never filled in
# and no row is added implicitly when the number of participants changes.
temp = np.zeros((len(atomsOfInterest), len(topLevel)))
atomResults = pd.DataFrame(temp).replace(0, np.nan)  # NaN marks "not answered"
atomResults.columns=[topLevel, secondLevel, thirdLevel]

# Fill in the recorded value of every answered question.
processAtomsOfInterest(atomsOfInterest, atomResults)

# Per-atom tallies of right/wrong answers for each version, indexed 1..24.
atomCounts = pd.DataFrame(np.zeros((24, 4)), columns=['obfuscated_ok', 'obfuscated_wrong', 'clear_ok', 'clear_wrong'])
atomCounts.index=range(1,25)

countAtoms(atomResults, atomCounts)

  return runner(coro)


In [115]:
# Name the three column levels so they can be referred to when grouping.
atomResults.columns.names = ['atom', 'version', 'variant']
# Average over the variants of every (atom, version) pair.
# As a result, True will become 1 and False 0.
# The frame is transposed so that the grouping runs over index levels:
# `groupby(..., axis=1)` is deprecated in recent pandas releases, and
# transposing before and after is the documented replacement.
groupedResults = atomResults.T.groupby(level=['atom', 'version']).mean().T
groupedResults


atom,1,1,2,2,3,3,4,4,5,5,...,20,20,21,21,22,22,23,23,24,24
version,clear,obfuscated,clear,obfuscated,clear,obfuscated,clear,obfuscated,clear,obfuscated,...,clear,obfuscated,clear,obfuscated,clear,obfuscated,clear,obfuscated,clear,obfuscated
0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
67,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [116]:
def _pairedOutcomeTable(versionScores):
    """Build the paired 2x2 table for one atom.

    `versionScores` has one row per participant and the columns 'obfuscated'
    and 'clear', each holding 1 (correct) or 0 (incorrect). Rows of the
    table: obfuscated correct / incorrect. Columns: clear correct /
    incorrect. Participants whose score is neither 0 nor 1 in a column are
    not counted in any cell.
    """
    obfuscatedOk = versionScores['obfuscated'] == 1
    obfuscatedWrong = versionScores['obfuscated'] == 0
    clearOk = versionScores['clear'] == 1
    clearWrong = versionScores['clear'] == 0
    return [
        [int((obfuscatedOk & clearOk).sum()), int((obfuscatedOk & clearWrong).sum())],
        [int((obfuscatedWrong & clearOk).sum()), int((obfuscatedWrong & clearWrong).sum())],
    ]


contingencyTablesForMcNemar = []
atomNames = ['IMPLICIT PREDICATE', 'INFIX OPERATOR PRECEDENCE', 'POST INC DEC', 'PRE INC DEC', 'CONSTANT VARIABLES', 'INDENTATION NO BRACES', 'CONDITIONAL OPERATOR', 'ARITHMETIC AS LOGIC', 'COMMA OPERATOR', 'ASSIGNMENT AS VALUE', 'LOGIC AS CONTROL FLOW', 'REPURPOSED VARIABLES', 'DEAD UNREACHABLE REPEATED', 'CHANGE LITERAL ENCODING', 'OMITTED CURLY BRACES', 'TYPE CONVERSION', 'INDENTATION WITH BRACES', 'OMIT SEMICOLON', 'PROPERTY ACCESS', 'ARROW FUNCTION', 'ARRAY SPREAD', 'OBJECT SPREAD', 'ARRAY DESTRUCTURING', 'OBJECT DESTRUCTURING']

# Atom numbers in column order. Selecting each atom directly from the column
# index replaces the deprecated `groupby(..., axis=1)` and the hand-maintained
# counter that used to pair names with groups.
atomNumbers = groupedResults.columns.get_level_values('atom').unique()
assert len(atomNumbers) == len(atomNames), "atomNames must have one entry per atom"

for atomNumber, atomName in zip(atomNumbers, atomNames):
    df = groupedResults[atomNumber]  # columns: 'clear' and 'obfuscated'
    contingencyTablesForMcNemar.append((atomName, _pairedOutcomeTable(df)))

print(contingencyTablesForMcNemar)

[('IMPLICIT PREDICATE', [[30, 11], [14, 15]]), ('INFIX OPERATOR PRECEDENCE', [[43, 4], [10, 13]]), ('POST INC DEC', [[6, 2], [11, 51]]), ('PRE INC DEC', [[12, 1], [16, 41]]), ('CONSTANT VARIABLES', [[59, 0], [2, 9]]), ('INDENTATION NO BRACES', [[28, 2], [22, 18]]), ('CONDITIONAL OPERATOR', [[54, 6], [8, 2]]), ('ARITHMETIC AS LOGIC', [[41, 11], [6, 12]]), ('COMMA OPERATOR', [[6, 2], [23, 39]]), ('ASSIGNMENT AS VALUE', [[15, 3], [28, 24]]), ('LOGIC AS CONTROL FLOW', [[17, 16], [11, 26]]), ('REPURPOSED VARIABLES', [[15, 8], [8, 39]]), ('DEAD UNREACHABLE REPEATED', [[61, 2], [3, 4]]), ('CHANGE LITERAL ENCODING', [[8, 1], [28, 33]]), ('OMITTED CURLY BRACES', [[32, 3], [19, 16]]), ('TYPE CONVERSION', [[3, 2], [24, 41]]), ('INDENTATION WITH BRACES', [[50, 3], [5, 12]]), ('OMIT SEMICOLON', [[1, 14], [20, 35]]), ('PROPERTY ACCESS', [[37, 14], [5, 14]]), ('ARROW FUNCTION', [[52, 2], [6, 10]]), ('ARRAY SPREAD', [[18, 4], [13, 35]]), ('OBJECT SPREAD', [[16, 5], [14, 35]]), ('ARRAY DESTRUCTURING', 

In [119]:
# The calculation of the Odds Ratio was taken from the following book:
# Mangiafico, S.S. 2016. Summary and Analysis of Extension Program Evaluation in R, version 1.19.10. rcompanion.org/handbook/. (Pdf version: rcompanion.org/documents/RHandbookProgramEvaluation.pdf.)

# Still missing:
# Names of the atoms
correction = len(contingencyTablesForMcNemar)
pvalues = {}

for ctm in contingencyTablesForMcNemar:
    print(ctm[0])
    table = ctm[1]
    # Run the test once and read both fields from the same result object.
    mcnemarResult = statsmodels.mcnemar(table)
    pvalue, statistic = mcnemarResult.pvalue, mcnemarResult.statistic
    pvalues[ctm[0]] = pvalue
#    print('p-value, original:', pvalue)
#    print('p-value, corrected:', pvalue*correction)
#    print('statistic:', statistic)
    # considering the Odds Ratio to be max(b/c, c/b), ignoring cases where the denominator is 0.
    # b and c are the two discordant cells; a ratio whose denominator is 0 is
    # reported as 0, so "0.0" in the output means the ratio is undefined.
    b, c = table[0][1], table[1][0]
    print("Odds ratio: ", max(0 if c == 0 else b/c, 0 if b == 0 else c/b))
# Disabled alternative that computes the odds ratio with Table2x2:
#    ct = Table2x2(ctm[1])
#    print("Odds ratio: ", ct.oddsratio)
#    print("p-value: ", ct.oddsratio_pvalue())
#    print("Confidence interval at 95% confidence level:", ct.oddsratio_confint(alpha=0.05, method='normal'))

# using Benjamini-Hochberg correction for repeated tests.
# With returnsorted=False the corrected values come back in the order of the
# input list, i.e. the insertion order of `pvalues`.
correctedValues = multitest.multipletests(list(pvalues.values()), alpha=0.05, method='fdr_bh', is_sorted=False, returnsorted=False)[1]
# Pair the corrected values with the dict keys in that same order. Zipping
# against a set of the keys (as was done before) iterates in arbitrary order
# and can attach a corrected p-value to the wrong atom.
correctedPvalues = dict(zip(pvalues.keys(), correctedValues))

print('Original p-values:')
print(pvalues)
print('Corrected p-values:')
print(correctedPvalues)

#outputOddsRatios(atomCounts)

# 1 - Arithmetic as Logic
# 2 - Infix Operator Precedence
# 3 - Post Inc Dec (or Assignment as Value?)
# 4 - Pre Inc Dec
# 5 - Constant Variables????
# 6 - Omitted Curly Braces
# 7 - Conditional Operator
# 8 - ????
# 9 - Comma Operator
# 10 - Assignment as Value
# 11 - Logic as Control Flow????
# 12 -

# Atom list kept as notes (written as comments so the cell does not echo a
# string as its output):
# 1-IMPLICIT PREDICATE *
# INFIX OPERATOR PRECEDENCE
# POST INC DEC *
# PRE INC DEC *
# CONSTANT VARIABLES
# INDENTATION NO BRACES
# CONDITIONAL OPERATOR *
# ARITHMETIC AS LOGIC *
# COMMA OPERATOR *
# ASSIGNMENT AS VALUE *
# LOGIC AS CONTROL FLOW *
# REPURPOSED VARIABLES
# DEAD UNREACHABLE REPEATED
# CHANGE LITERAL ENCODING
# OMITTED CURLY BRACES *
# TYPE CONVERSION
# INDENTATION WITH BRACES
# OMIT SEMICOLON
# PROPERTY ACCESS
# ARROW FUNCTION
# ARRAY SPREAD
# OBJECT SPREAD
# ARRAY DESTRUCTURING
# 24-OBJECT DESTRUCTURING


IMPLICIT PREDICATE
Odds ratio:  1.2727272727272727
INFIX OPERATOR PRECEDENCE
Odds ratio:  2.5
POST INC DEC
Odds ratio:  5.5
PRE INC DEC
Odds ratio:  16.0
CONSTANT VARIABLES
Odds ratio:  0.0
INDENTATION NO BRACES
Odds ratio:  11.0
CONDITIONAL OPERATOR
Odds ratio:  1.3333333333333333
ARITHMETIC AS LOGIC
Odds ratio:  1.8333333333333333
COMMA OPERATOR
Odds ratio:  11.5
ASSIGNMENT AS VALUE
Odds ratio:  9.333333333333334
LOGIC AS CONTROL FLOW
Odds ratio:  1.4545454545454546
REPURPOSED VARIABLES
Odds ratio:  1.0
DEAD UNREACHABLE REPEATED
Odds ratio:  1.5
CHANGE LITERAL ENCODING
Odds ratio:  28.0
OMITTED CURLY BRACES
Odds ratio:  6.333333333333333
TYPE CONVERSION
Odds ratio:  12.0
INDENTATION WITH BRACES
Odds ratio:  1.6666666666666667
OMIT SEMICOLON
Odds ratio:  1.4285714285714286
PROPERTY ACCESS
Odds ratio:  2.8
ARROW FUNCTION
Odds ratio:  3.0
ARRAY SPREAD
Odds ratio:  3.25
OBJECT SPREAD
Odds ratio:  2.8
ARRAY DESTRUCTURING
Odds ratio:  7.5
OBJECT DESTRUCTURING
Odds ratio:  0.0
Original p-va

'\n1-IMPLICIT PREDICATE *\nINFIX OPERATOR PRECEDENCE\nPOST INC DEC *\nPRE INC DEC *\nCONSTANT VARIABLES\nINDENTATION NO BRACES\nCONDITIONAL OPERATOR *\nARITHMETIC AS LOGIC *\nCOMMA OPERATOR *\nASSIGNMENT AS VALUE *\nLOGIC AS CONTROL FLOW *\nREPURPOSED VARIABLES\nDEAD UNREACHABLE REPEATED\nCHANGE LITERAL ENCODING\nOMITTED CURLY BRACES *\nTYPE CONVERSION\nINDENTATION WITH BRACES\nOMIT SEMICOLON\nPROPERTY ACCESS\nARROW FUNCTION\nARRAY SPREAD\nOBJECT SPREAD\nARRAY DESTRUCTURING\n24-OBJECT DESTRUCTURING'