In [1]:
import Plot
from datetime import datetime
import numpy
import numpy as np
import json_storage
from scipy.optimize import curve_fit
import scipy.stats as stats
from matplotlib.backends.backend_pdf import PdfPages

In [2]:
# Directory prefix under which the generated PDF graphs are saved. The plotting
# functions build paths by plain string concatenation, so the trailing slash
# is required.
# NOTE(review): absolute, user-specific path -- adjust before running on
# another machine.
graphsFolder = "/Users/andreleone/SeniorProject/Graphs/GeneratedGraphs/"
# Directory prefix from which the evaluation JSON files are read (also used by
# string concatenation, so the trailing slash is required).
folderPath = "/Users/andreleone/SeniorProject/EvalDiagnostics/"

In [3]:
def plotEditDistanceAnalysisLine():
    """Scatter-plot the match outcome of every record against its edit distance.

    Reads ``evalData_javaarraylist-instrumenting_add_remove.json`` from
    ``folderPath``. For each record, ``pairing[2]`` (converted to float) is the
    x value and the truthiness of ``pairing[6]`` is mapped to 1.0 (match) or
    -1.0 (non-match) for the y value. The series is handed to
    ``Plot.plotLineGraph`` in scatter mode and saved as a PDF under
    ``graphsFolder``.

    Returns:
        None. The only result is the plot produced by ``Plot.plotLineGraph``.
    """
    fileName = folderPath + "evalData_javaarraylist-instrumenting_add_remove.json"
    jsonData = json_storage.readDataFromJson(fileName)

    xs = list()
    ys = list()
    for pairing in jsonData:
        xs.append(float(pairing[2]))
        ys.append(1.0 if pairing[6] else -1.0)

    # Plot.plotLineGraph takes one list per series; there is a single series here.
    xArr = [xs]
    yArr = [ys]
    legendArr = ["Hackerrank Java Arraylist - Using Edit Dist. Sums"]

    # Axis configuration: fixed ranges, linear scales, scatter markers.
    useLogYAxis = False
    usingDates = False
    xMax = 2250.0
    xMin = -0.000001  # slightly below zero, presumably so points at x == 0 are not clipped
    yAxisMin = -1.5
    yAxisMax = 1.5
    useLogXAxis = False
    useScatter = True

    colors = None
    symbols = None
    secondAsLine = False
    saveFileLocation = graphsFolder + "Edit Distance Analysis Hackerrank Javaarraylist_With Sums instead of individual distances.pdf"
    linewidth = 1
    legendFontSize = 10
    # NOTE(review): the meaning of the positional 0.2 argument is defined by
    # Plot.plotLineGraph, which is not visible here -- confirm against that module.
    Plot.plotLineGraph("", "Edit Distance", "Score Match (1=Match and -1=Nonmatch)", xArr, yArr, legendArr, 0.2, useLogYAxis, usingDates, xMax, xMin, yAxisMin, yAxisMax, useLogXAxis, useScatter, colors, symbols, secondAsLine, saveFileLocation, linewidth, legendFontSize)

In [4]:
def plotEditDistanceAnalysisLineWithForcast():
    """Plot per-distance match outcomes with a linear fit and 95% bands; save as PDF.

    Reads ``evalData_javaarraylist-instrumenting_add_remove.json`` from
    ``folderPath``. For every record, each value in ``pairing[3]`` (converted
    to float) becomes one x value, and the truthiness of ``pairing[6]`` gives
    the matching y value: 1.0 for a match, -1.0 for a non-match.

    A degree-1 least-squares line is fitted to the points. The figure shows
    the raw points, the fitted line, the 95% confidence band of the fit and
    the 95% prediction band, and is written to
    ``graphsFolder + "Edit Distance Analysis Hackerrank Javaarraylist With Forcast.pdf"``.

    Returns:
        None. The only result is the PDF file.

    Raises:
        ValueError: if fewer than three points are available, because the
            residual degrees of freedom (n - 2) would not be positive.
    """
    import matplotlib.pyplot as plt

    fileName = folderPath + "evalData_javaarraylist-instrumenting_add_remove.json"
    jsonData = json_storage.readDataFromJson(fileName)

    # One (distance, outcome) point per inner distance of every record.
    xs = list()
    ys = list()
    for pairing in jsonData:
        for innerDist in pairing[3]:
            xs.append(float(innerDist))
            ys.append(1.0 if pairing[6] else -1.0)

    x = np.array(xs)
    y = np.array(ys)

    if x.size < 3:
        raise ValueError("need at least 3 data points for a linear fit with error bands, got " + str(x.size))

    # Modeling with Numpy
    p = np.polyfit(x, y, 1)                       # fit coefficients, highest power first
    y_model = np.polyval(p, x)                    # fitted values at the observed x

    # Statistics
    n = y.size                                    # number of observations
    m = p.size                                    # number of parameters
    DF = n - m                                    # degrees of freedom
    # A two-sided 95% interval leaves 2.5% in each tail, hence the 0.975 quantile
    # (0.95 would produce 90% bands while the legend says 95%).
    t = stats.t.ppf(0.975, DF)                    # used for CI and PI bands

    # Estimates of Error in Data/Model
    resid = y - y_model
    s_err = np.sqrt(np.sum(resid**2) / DF)        # standard deviation of the error

    # Evenly spaced grid over the observed x range, with the fitted line
    # evaluated on it (correct for either sign of the slope).
    x2 = np.linspace(np.min(x), np.max(x), 100)
    y2 = np.polyval(p, x2)

    xMean = np.mean(x)
    spread = (x2 - xMean)**2 / np.sum((x - xMean)**2)
    CI = t * s_err * np.sqrt(1/n + spread)        # confidence interval half-width of the fit
    PI = t * s_err * np.sqrt(1 + 1/n + spread)    # prediction interval half-width

    # Plotting --------------------------------------------------------------------
    fig, ax = plt.subplots(figsize=(8,6))

    # Data
    ax.plot(x, y, 'o', color='#b9cfe7', markersize=4, markeredgewidth=1, markerfacecolor='None', markeredgecolor='b', label="Hackerrank Java Arraylist")

    # Fit
    ax.plot(x, y_model, '-', color='0.1', linewidth=2, alpha=0.5, label='Linear Fit')

    # Confidence Interval
    # 'none' suppresses the band outline; an empty string is not a valid
    # matplotlib colour specification.
    ax.fill_between(x2, y2+CI, y2-CI, color='#b9cfe7', edgecolor='none')
    # Minor hack for labeling CI fill_between(): a line in the same colour
    # carries the legend entry.
    ax.plot(x2, y2+CI, '-', color='#b9cfe7', label='95% Confidence Limits')

    # Prediction Interval
    ax.fill_between(x2, y2+PI, y2-PI, color='None', linestyle='--')
    ax.plot(x2, y2-PI, '--', color='0.5', label='95% Prediction Limits')
    ax.plot(x2, y2+PI, '--', color='0.5')

    # Figure Modifications --------------------------------------------------------
    # Borders
    for side in ('top', 'bottom', 'left', 'right'):
        ax.spines[side].set_color('0.5')
    ax.get_xaxis().set_tick_params(direction='out')
    ax.get_yaxis().set_tick_params(direction='out')
    ax.xaxis.tick_bottom()
    ax.yaxis.tick_left()

    # Labels
    plt.gca().yaxis.grid(True) #horizontal gridlines
    plt.xlabel("Edit Distance", fontsize=16)
    plt.ylabel("Score Match (1=Match and -1=Nonmatch)", fontsize=16)
    plt.xlim(np.min(x), np.max(x))
    # Upper limit sits well above the +1 data row, leaving headroom in the axes.
    plt.ylim(-1.5, 3.25)

    plt.legend(loc='best', framealpha=1.0, borderpad=0.8, fontsize=14)

    # We change the fontsize of minor ticks label
    plt.tick_params(axis='both', which='major', labelsize=12)
    plt.tick_params(axis='both', which='minor', labelsize=10)

    # Save the plot to a file; the context manager closes the PDF even if
    # savefig raises.
    saveFileLocation = graphsFolder + "Edit Distance Analysis Hackerrank Javaarraylist With Forcast.pdf"
    with PdfPages(saveFileLocation) as pp:
        pp.savefig(fig)

In [5]:
def plotEditDistanceAnalysisSumScatter():
    """Scatter-plot, for each distinct edit distance, the net match score.

    Reads ``evalData_javaarraylist-instrumenting_add_remove.json`` from
    ``folderPath``. For every record, each value in ``pairing[3]`` (converted
    to float) contributes +1.0 to that distance's total when ``pairing[6]`` is
    truthy and -1.0 otherwise. One point per distinct distance is plotted, in
    order of first appearance, through ``Plot.plotLineGraph`` in scatter mode,
    and the plot is saved as a PDF under ``graphsFolder``.

    Returns:
        None. The only result is the plot produced by ``Plot.plotLineGraph``.

    Raises:
        ValueError: if the data contains no inner distances, since the axis
            limits are derived from the minimum and maximum of the data.
    """
    fileName = folderPath + "evalData_javaarraylist-instrumenting_add_remove.json"
    jsonData = json_storage.readDataFromJson(fileName)

    # Net score per distinct distance; dict insertion order keeps the
    # first-seen ordering of the distances.
    scoreSumByDist = dict()
    for pairing in jsonData:
        for innerDist in pairing[3]:
            dist = float(innerDist)
            score = 1.0 if pairing[6] else -1.0
            scoreSumByDist[dist] = scoreSumByDist.get(dist, 0.0) + score

    if not scoreSumByDist:
        raise ValueError("no edit distances found in " + fileName)

    xs = list(scoreSumByDist.keys())
    ys = list(scoreSumByDist.values())

    x = np.array(xs)
    y = np.array(ys)

    # Plot.plotLineGraph takes one list per series; there is a single series here.
    xArr = [xs]
    yArr = [ys]
    legendArr = ["Hackerrank Java Arraylist"]

    # Axis configuration: limits padded around the data, linear scales, scatter markers.
    useLogYAxis = False
    usingDates = False
    xMax = np.max(x) + 1
    xMin = np.min(x) - 0.05
    yAxisMin = np.min(y) - 1
    yAxisMax = np.max(y) + 1
    useLogXAxis = False
    useScatter = True

    colors = None
    symbols = None
    secondAsLine = False
    saveFileLocation = graphsFolder + "Edit Distance Analysis Hackerrank Javaarraylist Sum Scatter Individual.pdf"
    linewidth = 1
    legendFontSize = 10
    # NOTE(review): the meaning of the positional 0.2 argument is defined by
    # Plot.plotLineGraph, which is not visible here -- confirm against that module.
    Plot.plotLineGraph("", "Edit Distance", "Score Match Sum (1=Match and -1=Nonmatch)", xArr, yArr, legendArr, 0.2, useLogYAxis, usingDates, xMax, xMin, yAxisMin, yAxisMax, useLogXAxis, useScatter, colors, symbols, secondAsLine, saveFileLocation, linewidth, legendFontSize)

In [6]:
# Driver cell: uncomment the graph(s) to (re)generate. Each function reads the
# evaluation JSON from folderPath and saves its PDF under graphsFolder.
#plotEditDistanceAnalysisLine()
#plotEditDistanceAnalysisLineWithForcast()
plotEditDistanceAnalysisSumScatter()