In this script we extract metrics for each commit. To do so, you need to download and install the Understand tool, available at https://scitools.com/trial-download-3/ . Running it requires an Understand license; students can get one for free at https://scitools.com/non-commercial-license/

In [None]:
def printStatus(index, size):
    """Print the share of samples handled so far, as a percentage.

    index: number of samples counted as done.
    size: total number of samples.
    """
    percentage = (index / size) * 100
    print(f"{percentage}% Completed samples")

In [None]:
from git import Repo

In [None]:
def getCommitsFrom(project):
    """Return every commit reachable from "master" for the given sample.

    The repository is looked up inside the "repositories" folder and is
    force-checked-out to "master" before the history is read, so any local
    modifications in the working tree are discarded.

    project: sample path relative to the "repositories" folder.
    Returns a list of commits in the order produced by `iter_commits()`.
    """
    repository = Repo("repositories\\" + project)
    repository.git.checkout("master", "-f")
    return list(repository.iter_commits())

In [None]:
def checkoutTo(project, sha):
    """Force-checkout revision `sha` in the sample's repository.

    project: sample path relative to the "repositories" folder.
    sha: revision passed to `git checkout -f`.
    """
    Repo("repositories\\" + project).git.checkout(sha, "-f")

In [None]:
import os

In [None]:
def extractMetricsWithUnderstand(owner, project):
    """Run the Understand command line tool (`und`) on one sample.

    The commands create a Java project named after `project`, add the
    sample's sources, enable all metrics, set "<project>.csv" as the metrics
    output file, analyze the project and finally export the metrics.

    owner: first component of the sample path below "repositories".
    project: second component; also used as the Understand project name.
    """
    und = "understand\\SciTools\\bin\\pc-win64\\und"
    sourcePath = "repositories\\" + owner + "\\" + project

    # Executed one after another, in this exact order.
    undArguments = (
        "create -languages java {0}".format(project),
        "add {0} {1}".format(sourcePath, project),
        "settings -metrics all {0}".format(project),
        "settings -metricsOutputFile {0}.csv {0}".format(project),
        "-quiet analyze {0}".format(project),
        "metrics {0}".format(project),
    )
    for arguments in undArguments:
        os.system('cmd /c "{0} {1}"'.format(und, arguments))

In [None]:
import pandas as pd
import numpy as np

In [None]:
def sumMetricsPerSample(owner, project):
    """Sum the metrics of all "Public Class" rows found in "<project>.csv".

    owner, project: combined as "owner\\project" to label the single result row.
    Returns a one-row DataFrame indexed by that label, holding the sum of
    every numeric metric column plus a "numberJavaFiles" column with the
    number of rows whose Kind is "File". The result is empty when there are
    no "Public Class" rows.
    """
    dataframe = pd.read_csv(project + ".csv")

    # Counting with a boolean mask yields 0 when there are no "File" rows,
    # where value_counts()["File"] would raise KeyError.
    numberJavaFiles = (dataframe["Kind"] == "File").sum()

    publicClasses = dataframe[dataframe["Kind"] == "Public Class"]
    # assign() returns a new frame, so no column is written onto a slice.
    publicClasses = publicClasses.assign(sample=owner + "\\" + project)

    # numeric_only keeps the text columns (Kind, Name) out of the result on
    # every pandas version instead of concatenating them on recent ones.
    totals = publicClasses.groupby("sample").sum(numeric_only=True)
    totals["numberJavaFiles"] = numberJavaFiles
    return totals

In [None]:
def averageMetricsPerSample(dataframe, owner, project):
    """Average every numeric metric over the class rows of one sample.

    dataframe: metrics table with a "Kind" column; it is not modified.
    owner, project: combined as "owner\\project" to label the result row.
    Returns a DataFrame indexed by that label with the mean of each numeric
    column over the rows whose Kind is "Public Class", "Private Class" or
    "Class". The result is empty when no such row exists.
    """
    classKinds = ["Public Class", "Private Class", "Class"]
    classes = dataframe[dataframe["Kind"].isin(classKinds)]
    # assign() returns a new frame, so no column is written onto a slice.
    classes = classes.assign(sample=owner + "\\" + project)
    # numeric_only is required on recent pandas, which otherwise raises a
    # TypeError when asked for the mean of the text columns (Kind, Name).
    return classes.groupby("sample").mean(numeric_only=True)

In [None]:
def medianMetricsPerSample(dataframe, owner, project):
    """Take the median of every numeric metric over the class rows of one sample.

    dataframe: metrics table with a "Kind" column; it is not modified.
    owner, project: combined as "owner\\project" to label the result row.
    Returns a DataFrame indexed by that label with the median of each numeric
    column over the rows whose Kind is "Public Class", "Private Class" or
    "Class". The result is empty when no such row exists.
    """
    classKinds = ["Public Class", "Private Class", "Class"]
    classes = dataframe[dataframe["Kind"].isin(classKinds)]
    # assign() returns a new frame, so no column is written onto a slice.
    classes = classes.assign(sample=owner + "\\" + project)
    # numeric_only is required on recent pandas, which otherwise raises a
    # TypeError when asked for the median of the text columns (Kind, Name).
    return classes.groupby("sample").median(numeric_only=True)

In [None]:
def otherMetricsPerSample(dataframe, owner, project):
    """Compute the absolute (non-aggregated-per-class) metrics of one sample.

    dataframe: metrics table with a "Kind" column; it is not modified.
    owner, project: combined as "owner\\project" to label the result row.
    Returns a DataFrame indexed by that label with "CountDeclClass" summed
    over the "Package" rows and "numberJavaFiles" set to the number of rows
    whose Kind is "File". The result is empty when there are no "Package" rows.
    """
    # Counting with a boolean mask yields 0 when there are no "File" rows,
    # where value_counts()["File"] would raise KeyError.
    numberJavaFiles = (dataframe["Kind"] == "File").sum()

    # Select the single column of interest instead of dropping all the others
    # in place; intersection() tolerates a table without that column.
    keptColumns = dataframe.columns.intersection(["CountDeclClass"])
    packages = dataframe.loc[dataframe["Kind"] == "Package", keptColumns]
    packages = packages.assign(sample=owner + "\\" + project)

    totals = packages.groupby("sample").sum()
    totals["numberJavaFiles"] = numberJavaFiles
    return totals

In [None]:
import fnmatch

In [None]:
def readMetricsFromCsv(project):
    """Load "<project>.csv" with explicit column types.

    "Kind" and "Name" are read as text; every listed metric column is read
    as float64.

    project: path of the CSV file without the ".csv" extension.
    Returns the parsed DataFrame.
    """
    floatColumns = (
        'AvgCyclomatic',
        'AvgCyclomaticModified',
        'AvgCyclomaticStrict',
        'AvgEssential',
        'AvgLine',
        'AvgLineBlank',
        'AvgLineCode',
        'AvgLineComment',
        'CountClassBase',
        'CountClassCoupled',
        'CountClassCoupledModified',
        'CountClassDerived',
        'CountDeclClass',
        'CountDeclClassMethod',
        'CountDeclClassVariable',
        'CountDeclExecutableUnit',
        'CountDeclFile',
        'CountDeclFunction',
        'CountDeclInstanceMethod',
        'CountDeclInstanceVariable',
        'CountDeclMethod',
        'CountDeclMethodAll',
        'CountDeclMethodDefault',
        'CountDeclMethodPrivate',
        'CountDeclMethodProtected',
        'CountDeclMethodPublic',
        'CountInput',
        'CountLine',
        'CountLineBlank',
        'CountLineCode',
        'CountLineCodeDecl',
        'CountLineCodeExe',
        'CountLineComment',
        'CountOutput',
        'CountPath',
        'CountPathLog',
        'CountSemicolon',
        'CountStmt',
        'CountStmtDecl',
        'CountStmtExe',
        'Cyclomatic',
        'CyclomaticModified',
        'CyclomaticStrict',
        'Essential',
        'Knots',
        'MaxCyclomatic',
        'MaxCyclomaticModified',
        'MaxCyclomaticStrict',
        'MaxEssential',
        'MaxEssentialKnots',
        'MaxInheritanceTree',
        'MaxNesting',
        'MinEssentialKnots',
        'PercentLackOfCohesion',
        'PercentLackOfCohesionModified',
        'RatioCommentToCode',
        'SumCyclomatic',
        'SumCyclomaticModified',
        'SumCyclomaticStrict',
        'SumEssential',
    )

    # The builtin `str` is what the removed alias `np.str` pointed to, so the
    # parsed result is the same while also working on current NumPy releases.
    dtype = {'Kind': str, 'Name': str}
    dtype.update({column: np.float64 for column in floatColumns})

    return pd.read_csv(project + ".csv", dtype=dtype)

In [None]:
def getUnderstandMetrics(owner, project):
    """Run Understand on one sample and return its aggregated metrics.

    owner, project: the two components of the sample path.
    Returns the row-wise concatenation of three aggregations of the exported
    metrics, each tagged through a "type" column: "average", "median" and
    "absolute".
    """
    extractMetricsWithUnderstand(owner, project)

    dataframe = readMetricsFromCsv(project)

    aggregations = (
        ("average", averageMetricsPerSample),
        ("median", medianMetricsPerSample),
        ("absolute", otherMetricsPerSample),
    )

    labelled = []
    for label, aggregate in aggregations:
        # Every aggregation works on its own copy of the raw table.
        metrics = aggregate(dataframe.copy(), owner, project)
        # "type" is kept as a regular column. The former set_index("type")
        # calls discarded their result and therefore never changed anything,
        # so they were removed.
        metrics["type"] = label
        labelled.append(metrics)

    return pd.concat(labelled, axis=0)

In [None]:
def findPaths(pattern, path):
    """Collect the files below `path` whose name matches `pattern`.

    Git metadata is skipped: directories named exactly ".git" are never
    entered. Other names that merely contain ".git" (".github", or a
    repository called "site.github.io") are searched normally.

    pattern: fnmatch-style file name pattern, e.g. "*.java".
    path: directory to search recursively.
    Returns a list of matching file paths, each joined with its directory.
    """
    result = []
    for root, dirs, files in os.walk(path):
        # Pruning in place stops os.walk from descending into .git at all.
        dirs[:] = [name for name in dirs if name != ".git"]
        result.extend(
            os.path.join(root, name) for name in fnmatch.filter(files, pattern)
        )
    return result

In [None]:
# Notebook cell: installs the JPype1 package, which provides the `jpype` module imported in the next cell.
pip install JPype1

In [None]:
# Boiler plate stuff to start the module
import jpype
from jpype import *
from statistics import mean

class Readability(object):
    """Scores the Java sources of one sample with the readability Java tool.

    Creating an instance starts the JVM (once per process) with
    "readability.jar" on the class path and resolves the Java class
    raykernel.apps.readability.eval.Main, whose getReadability method does
    the actual scoring.
    """

    def __init__(self, project):
        """project: sample path relative to the "repositories" folder."""
        super(Readability, self).__init__()
        self.project = project
        self.startJVM()
        self.readabilityPackage = JPackage("raykernel").apps.readability.eval.Main
        self.repositoryPath = "repositories\\"+project

    def startJVM(self):
        """Start the JVM unless one is already running in this process."""
        if not jpype.isJVMStarted():
            # -ea enables Java assertions; -Xmx4096M caps the heap at 4096 MB.
            jpype.startJVM(jpype.getDefaultJVMPath(), '-Djava.class.path=readability.jar',  '-ea', '-Xmx4096M', convertStrings=True)

    def shutdownJVM(self):
        """Shut the JVM down."""
        jpype.shutdownJVM()

    def getReadability(self):
        """Return the mean readability score over all "*.java" files.

        Returns 0 when the sample contains no Java file. A file that cannot
        be read or decoded is scored as an empty source instead of aborting
        the whole sample.
        """
        javaFiles = findPaths("*.java", self.repositoryPath)
        if not javaFiles:
            return 0

        scores = []
        for javaFile in javaFiles:
            try:
                # The context manager closes the handle even if read() fails.
                with open(javaFile) as handle:
                    source = handle.read()
            except (OSError, ValueError):
                # OSError: missing/unreadable file. ValueError covers
                # UnicodeDecodeError and malformed paths.
                source = ""
            scores.append(float(self.readabilityPackage.getReadability(source)))
        return mean(scores)

In [None]:
def getReadability(owner, project):
    """Return the readability score of the sample "owner\\project".

    A fresh Readability helper is built for the call and dropped afterwards.
    """
    return Readability(owner + "\\" + project).getReadability()

In [None]:
def getMetrics(commit, owner, project):
    """Gather all metrics of the currently checked-out sample for one commit.

    commit: object exposing `hexsha` and `authored_datetime`.
    owner, project: the two components of the sample path.
    Returns the Understand metrics table extended with the columns
    "commitSha", "commitDate" and "readability".
    """
    result = getUnderstandMetrics(owner, project)

    commitColumns = (
        ("commitSha", commit.hexsha),
        ("commitDate", commit.authored_datetime),
    )
    for column, value in commitColumns:
        result[column] = value

    # Readability is computed last, after the Understand run has finished.
    result["readability"] = getReadability(owner, project)
    return result

In [None]:
def deleteUnusedFiles(sample):
    """Delete the temporary "<sample>.csv" and "<sample>.udb" files.

    Raises whatever `os.remove` raises when one of the files is missing.
    """
    for extension in (".csv", ".udb"):
        os.remove(sample + extension)

In [None]:
def createDirectoryIfNotExists(dirName):
    """Create `dirName` (and missing parents) unless the path already exists."""
    if os.path.exists(dirName):
        return
    os.makedirs(dirName)

In [None]:
def replaceSamplePathForWindowsLike(sample):
    """Return `sample` with every forward slash turned into a backslash."""
    parts = sample.split("/")
    return "\\".join(parts)

In [None]:
def extractMetricsByCommit(sample, commit):
    """Check out `commit` in the sample's repository and measure it.

    sample: sample path in the form "owner\\project".
    commit: object exposing `hexsha` (plus whatever getMetrics reads).
    Returns the metrics table produced by getMetrics for that commit.
    """
    owner, project = sample.split("\\")
    sha = commit.hexsha
    checkoutTo(sample, sha)
    print("commit ======= " + sha)
    return getMetrics(commit, owner, project)

In [None]:
def extractMetricsForAllCommits(commits, sample):
    """Measure every commit of a sample and stack the results.

    commits: sequence of commits, processed in the given order.
    sample: sample path in the form "owner\\project".
    Returns one DataFrame holding the metrics of all commits with a fresh
    0..n-1 index; an empty DataFrame when `commits` is empty.
    """
    total = len(commits)
    perCommit = []
    for index, commit in enumerate(commits):
        perCommit.append(extractMetricsByCommit(sample, commit))
        # index + 1 commits are finished at this point, so the last line
        # printed reports 100%.
        print("{0}% of commits completed from sample {1}".format(((index + 1) / total * 100), sample))

    if not perCommit:
        # pd.concat refuses an empty list.
        return pd.DataFrame()

    # A single concat replaces the per-iteration DataFrame.append, which was
    # removed in pandas 2.0 and copied the whole table on every commit.
    return pd.concat(perCommit, ignore_index=True)

In [None]:
def metricsByCommit(framework, samples):
    """Extract per-commit metrics for every sample and save one CSV per sample.

    framework: label of the sample set; not used by the computation.
    samples: iterable of sample paths in the form "owner/project" (forward or
        back slashes); it must support len().
    For each sample the commits are processed oldest first and the result is
    written to "metrics\\<owner>\\<project>.csv".
    """
    for index, sample in enumerate(samples):
        sample = replaceSamplePathForWindowsLike(sample)

        printStatus(index+1, len(samples))

        createDirectoryIfNotExists("metrics")

        commits = getCommitsFrom(sample)
        # Reverse the order returned by getCommitsFrom before processing.
        commits.reverse()

        owner, project = sample.split("\\")

        createDirectoryIfNotExists("metrics\\"+owner)

        ######### If the script crashes, copy the sha of the last successfully
        ######### analysed commit, paste it below and uncomment this code to
        ######### resume right after that commit.
#         for position, commit in enumerate(commits):
#             if commit.hexsha == "b6e4c433c0298ae765ef79143f6e67ebbf9a67d0":
#                 lastDone = position
#         commits = commits[lastDone+1:]

        allCommits = extractMetricsForAllCommits(commits, sample)

        # Persist the results before cleaning up: if removing the temporary
        # Understand files fails, the computed metrics are already on disk.
        allCommits.to_csv("metrics\\"+sample+".csv", index=False)

        deleteUnusedFiles(project)

In [None]:
# Load the sample lists of each framework from the CSV files in the
# "1-GettingQuestions\samplesWithQuestions" folder next to this one.
# The cells below read the "path" column of each table.
androidSamples = pd.read_csv("..\\1-GettingQuestions\\samplesWithQuestions\\androidSamples.csv")
awsSamples = pd.read_csv("..\\1-GettingQuestions\\samplesWithQuestions\\awsSamples.csv")
azureSamples = pd.read_csv("..\\1-GettingQuestions\\samplesWithQuestions\\azureSamples.csv")
springSamples = pd.read_csv("..\\1-GettingQuestions\\samplesWithQuestions\\springSamples.csv")

In [None]:
# Extract the per-commit metrics for every sample path listed in azureSamples.
metricsByCommit("Azure", azureSamples["path"])

In [None]:
# Extract the per-commit metrics for every sample path listed in awsSamples.
metricsByCommit("AWS", awsSamples["path"])

In [None]:
# Extract the per-commit metrics for every sample path listed in springSamples.
metricsByCommit("Spring", springSamples["path"])

In [None]:
# Extract the per-commit metrics for every sample path listed in androidSamples.
metricsByCommit("Android", androidSamples["path"])