# Wie giftig ist der Quelltext?

## 1. Verbindung zur Datenbank
Es wird eine Verbindung zur Neo4j-Datenbank aufgebaut.

In [1]:
import py2neo

graph = py2neo.Graph(bolt=True, host='localhost', user='neo4j', password='neo4j')

## 2. Cypher-Abfrage
Es werden mehrere Abfragen an die Datenbank gestellt. Die Ergebnisse werden in Dataframes (pandas) gespeichert und schließlich in einem Dataframe zusammengeführt.

In [2]:
import pandas as pd

classfanoutcomplexity_query ="MATCH (c:Type)-[:DEPENDS_ON]->(d:Type) WHERE NOT c.name CONTAINS '$' RETURN DISTINCT c.fqn as Type, count(d) AS ClassFanOutComplexity ORDER BY ClassFanOutComplexity DESC"
classfanoutcomplexity_df = pd.DataFrame(graph.run(classfanoutcomplexity_query).data())

cyclomaticcomplexity_query ="MATCH (c:Type)-[DECLARES]->(m:Method) WHERE NOT c.name CONTAINS '$' AND EXISTS(m.cyclomaticComplexity) RETURN DISTINCT m.signature AS Method, c.fqn AS Type, m.cyclomaticComplexity as CyclomaticComplexity ORDER BY CyclomaticComplexity DESC"
cyclomaticComplexity_df = pd.DataFrame(graph.run(cyclomaticcomplexity_query).data())

methodlength_query ="MATCH (c:Type)-[DECLARES]->(m:Method) WHERE NOT c.name CONTAINS '$' AND EXISTS(m.effectiveLineCount) RETURN DISTINCT m.signature AS Method, c.fqn AS Type, m.effectiveLineCount AS MethodLength ORDER BY MethodLength DESC"
methodLength_df = pd.DataFrame(graph.run(methodlength_query).data())

parameternumber_query ="MATCH (c:Type)-[DECLARES]->(m:Method)-[HAS]->(p:Parameter) WHERE NOT c.name CONTAINS '$' RETURN m.signature AS Method, c.fqn AS Type, count(p) AS ParameterNumber ORDER BY ParameterNumber DESC"
parameterNumber_df = pd.DataFrame(graph.run(parameternumber_query).data())

## 3. Datenaufbereitung
Zunächst werden die vier Dataframes bereinigt. Hierzu müssen alle Zeilen (Klassen oder Methoden) mittels der Methode ```drop()``` und der Option ```inplace = True```gelöscht werden, die geringer als folgende Schwellenwerte sind.

Metrik |	Schwellenwert
---- |----
ClassFanOutComplexity | 30
CyclomaticComplexity | 10
MethodLength | 30
ParameterNumber | 6

In [3]:
# Entferne alle Klassen, die weniger als 30 Abhängigkeiten zu anderen Klassen besitzen.
classfanoutcomplexity_df.drop(classfanoutcomplexity_df[classfanoutcomplexity_df['ClassFanOutComplexity'] < 30 ].index , inplace=True)
display(classfanoutcomplexity_df.head())

# Entferne alle Methoden, deren zyklomatische Komplexität kleiner als 10 ist.
cyclomaticComplexity_df.drop(cyclomaticComplexity_df[cyclomaticComplexity_df['CyclomaticComplexity']<10].index, inplace=True)
display(cyclomaticComplexity_df.head())

# Entferne alle Methoden, deren Anzahl der Quelltextzeilen kleiner als 30 ist.
methodLength_df.drop(methodLength_df[methodLength_df['MethodLength'] < 30 ].index , inplace=True)
display(methodLength_df.head())

# Entferne alle Methoden, die weniger als 6 Paramter besitzen.
parameterNumber_df.drop(parameterNumber_df[parameterNumber_df['ParameterNumber'] < 6 ].index , inplace=True)
display(parameterNumber_df.head())


Unnamed: 0,ClassFanOutComplexity,Type
0,71,org.junit.experimental.categories.CategoryTest
1,61,org.junit.rules.ExpectedExceptionTest
2,58,org.junit.tests.running.methods.AnnotationTest
3,50,org.junit.runners.ParentRunner
4,50,org.junit.tests.running.classes.ParameterizedT...


Unnamed: 0,CyclomaticComplexity,Method,Type
0,18,"void arrayEquals(java.lang.String,java.lang.Ob...",org.junit.internal.ComparisonCriteria
1,13,junit.framework.TestResult start(java.lang.Str...,junit.textui.TestRunner
2,13,java.lang.String[] parseOptions(java.lang.Stri...,org.junit.runner.JUnitCommandLineParseResult
3,12,"void execTest(java.lang.String,boolean)",junit.tests.runner.TextRunnerTest
4,12,void validateFields(java.util.List),org.junit.runners.parameterized.BlockJUnit4Cla...


Unnamed: 0,Method,MethodLength,Type
0,java.lang.Object[][] testsWithEventMatcher(),60,org.junit.rules.ErrorCollectorTest
1,java.util.Collection testsWithEventMatcher(),55,org.junit.rules.ExpectedExceptionTest
2,"void arrayEquals(java.lang.String,java.lang.Ob...",35,org.junit.internal.ComparisonCriteria
3,junit.framework.Test getTest(java.lang.String),34,junit.runner.BaseTestRunner


Unnamed: 0,Method,ParameterNumber,Type


Als Nächstes werden die drei Dataframes ```cyclomaticComplexity_df```, ```methodLength_df``` und ```parameterNumber_df``` mittels der Methode ```merge()``` über die gemeinsame Spalte ```on = 'Type'``` mit der Option ``` how = 'outer'``` zusammengeführt und im Dataframe ```method_metrics_df``` gespeichert. Alle Zellen, die keinen Wert besitzen (NaN), werden mittels der Methode ```fillna(0)``` auf 0 gesetzt. 

In [4]:
# Die drei Dataframes mit Methodenmetriken werden zusammengeführt und im Dataframe method_metrics_df abgebildet.
method_metrics_df = pd.merge(cyclomaticComplexity_df, methodLength_df[['Type', 'MethodLength']], how='outer', on = 'Type').merge(parameterNumber_df[['Type', 'ParameterNumber']], how='outer', on = 'Type')


# Fehlende Werte werden auf 0 gesetzt.
method_metrics_df = method_metrics_df.fillna(0)
method_metrics_df.head(20)

Unnamed: 0,CyclomaticComplexity,Method,Type,MethodLength,ParameterNumber
0,18.0,"void arrayEquals(java.lang.String,java.lang.Ob...",org.junit.internal.ComparisonCriteria,35.0,0.0
1,13.0,junit.framework.TestResult start(java.lang.Str...,junit.textui.TestRunner,0.0,0.0
2,13.0,java.lang.String[] parseOptions(java.lang.Stri...,org.junit.runner.JUnitCommandLineParseResult,0.0,0.0
3,12.0,"void execTest(java.lang.String,boolean)",junit.tests.runner.TextRunnerTest,0.0,0.0
4,12.0,void validateFields(java.util.List),org.junit.runners.parameterized.BlockJUnit4Cla...,0.0,0.0
5,12.0,void readPreferences(),junit.runner.BaseTestRunner,34.0,0.0
6,11.0,java.lang.String processArguments(java.lang.St...,junit.runner.BaseTestRunner,34.0,0.0
7,12.0,void evaluate(),org.junit.internal.runners.statements.RunAfters,0.0,0.0
8,10.0,void runBare(),junit.framework.TestCase,0.0,0.0
9,10.0,java.io.File newFolder(java.lang.String[]),org.junit.rules.TemporaryFolder,0.0,0.0


Im folgenden Codeabschnitt werden im Dataframe ```method_metrics_df``` mehrfach auftretende Typen mittels der Methode ```groupBy()``` zusammengeführt.

In [5]:
# Führe mehrfach auftretende Typen zusammen.
method_metrics_df = method_metrics_df.groupby("Type", as_index=False).sum()
method_metrics_df.head(20)

Unnamed: 0,Type,CyclomaticComplexity,MethodLength,ParameterNumber
0,junit.framework.TestCase,10.0,0.0,0.0
1,junit.runner.BaseTestRunner,23.0,68.0,0.0
2,junit.tests.runner.TextRunnerTest,12.0,0.0,0.0
3,junit.textui.TestRunner,13.0,0.0,0.0
4,org.junit.internal.ComparisonCriteria,18.0,35.0,0.0
5,org.junit.internal.Throwables,10.0,0.0,0.0
6,org.junit.internal.runners.MethodValidator,10.0,0.0,0.0
7,org.junit.internal.runners.statements.RunAfters,12.0,0.0,0.0
8,org.junit.rules.ErrorCollectorTest,0.0,60.0,0.0
9,org.junit.rules.ExpectedExceptionTest,0.0,55.0,0.0


Als Nächstes werden die beiden Dataframes ```classfanoutcomplexity_df``` und ```method_metrics_df``` mittels der Methode ```merge()``` über die gemeinsame Spalte ```on = 'Type'``` mit der Option ``` how = 'outer'``` zusammengeführt und im Dataframe ```toxicity_df``` gespeichert. Alle Zellen, die keinen Wert besitzen (NaN), werden mittels der Methode ```fillna(0)``` auf 0 gesetzt. 

In [6]:
# Die Dataframes classfanoutcomplexity_df und method_metrics_df werden zusammengeführt.
toxicity_df = pd.merge(classfanoutcomplexity_df, method_metrics_df, how='outer', on = 'Type')

# Fehlende Werte werden auf 0 gesetzt.
toxicity_df= toxicity_df.fillna(0)
toxicity_df.head(20)

Unnamed: 0,ClassFanOutComplexity,Type,CyclomaticComplexity,MethodLength,ParameterNumber
0,71.0,org.junit.experimental.categories.CategoryTest,0.0,0.0,0.0
1,61.0,org.junit.rules.ExpectedExceptionTest,0.0,55.0,0.0
2,58.0,org.junit.tests.running.methods.AnnotationTest,0.0,0.0,0.0
3,50.0,org.junit.runners.ParentRunner,0.0,0.0,0.0
4,50.0,org.junit.tests.running.classes.ParameterizedT...,0.0,0.0,0.0
5,48.0,org.junit.runners.BlockJUnit4ClassRunner,0.0,0.0,0.0
6,45.0,org.junit.rules.TestRuleTest,0.0,0.0,0.0
7,43.0,junit.runner.BaseTestRunner,23.0,68.0,0.0
8,42.0,org.junit.tests.running.methods.TimeoutTest,0.0,0.0,0.0
9,41.0,org.junit.runners.model.TestClass,0.0,0.0,0.0


Jetzt wird mittels ```apply()``` im resultierenden Dataframe für jede Spalte der Toxicity-Wert berechnet, indem die absoluten Werte durch den jeweiligen Schwellenwert geteilt werden. Der absolute Wert wird dabei stets überschrieben. Zudem wird die Spalte ```ToxicityScore``` als Summe aller Toxicity-Werte für eine Klasse eingeführt und alle Werte entsprechend des Toxicity-Scores absteigend sortiert.

In [7]:
# Berechne den Toxicity-Wert für ClassFanOutComplexity.
toxicity_df['ClassFanOutComplexity'] = toxicity_df['ClassFanOutComplexity'].apply(lambda value: value/30)

# Berechne den Toxicity-Wert für CyclomaticComplexity.
toxicity_df['CyclomaticComplexity'] = toxicity_df['CyclomaticComplexity'].apply(lambda value: value/10)

# Berechne den Toxicity-Wert für MethodLength.
toxicity_df['MethodLength'] = toxicity_df['MethodLength'].apply(lambda value: value/30)

# Berechne den Toxicity-Wert für ParameterNumber.
toxicity_df['ParameterNumber'] = toxicity_df['ParameterNumber'].apply(lambda value: value/6)

# Erzeuge die Spalte ToxicityScore und berechne die Zeilensumme aus ClassFanOutComplexity, CyclomaticComplexity, MethodLength und ParameterNumber.
toxicity_df['ToxicityScore'] = toxicity_df.ClassFanOutComplexity + toxicity_df.CyclomaticComplexity + toxicity_df.MethodLength + toxicity_df.ParameterNumber

# Sortiere alle Werte absteigend nach dem ToxicityScore.
toxicity_df = toxicity_df.sort_values('ToxicityScore', ascending=False)

toxicity_df.head(20)

Unnamed: 0,ClassFanOutComplexity,Type,CyclomaticComplexity,MethodLength,ParameterNumber,ToxicityScore
7,1.433333,junit.runner.BaseTestRunner,2.3,2.266667,0.0,6.0
1,2.033333,org.junit.rules.ExpectedExceptionTest,0.0,1.833333,0.0,3.866667
11,1.3,org.junit.rules.ErrorCollectorTest,0.0,2.0,0.0,3.3
34,0.0,org.junit.internal.ComparisonCriteria,1.8,1.166667,0.0,2.966667
15,1.2,org.junit.runners.parameterized.BlockJUnit4Cla...,1.2,0.0,0.0,2.4
0,2.366667,org.junit.experimental.categories.CategoryTest,0.0,0.0,0.0,2.366667
2,1.933333,org.junit.tests.running.methods.AnnotationTest,0.0,0.0,0.0,1.933333
3,1.666667,org.junit.runners.ParentRunner,0.0,0.0,0.0,1.666667
4,1.666667,org.junit.tests.running.classes.ParameterizedT...,0.0,0.0,0.0,1.666667
5,1.6,org.junit.runners.BlockJUnit4ClassRunner,0.0,0.0,0.0,1.6


## 4. Visualisierung
Die Daten werden mittels eines Stacked Barcharts visualisiert. Die Grundlage für diese Visualisierung bildett das Toxcitiy Chart (https://erik.doernenburg.com/2008/11/how-toxic-is-your-code/).



In [8]:
from IPython.display import display, HTML

base_html = """
<!DOCTYPE html>
<html>
  <head>
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script>
  <script type="text/javascript" src="https://kozea.github.io/pygal.js/2.0.x/pygal-tooltips.min.js""></script>
  </head>
  <body>
    <figure>
      {rendered_chart}
    </figure>
  </body>
</html>
"""

In [9]:
import pygal
stacked_bar_chart = pygal.StackedBar(show_legend=True, human_readable=True, fill=False, x_label_rotation=90, truncate_label=-1, truncate_legend=-1)
stacked_bar_chart.title = 'Toxicity Chart'
stacked_bar_chart.x_labels = toxicity_df['Type'].tolist()
stacked_bar_chart.add('ClassFanOutComplexity', toxicity_df['ClassFanOutComplexity'].tolist())
stacked_bar_chart.add('CyclomaticComplexity', toxicity_df['CyclomaticComplexity'].tolist())
stacked_bar_chart.add('MethodLength', toxicity_df['MethodLength'].tolist())
stacked_bar_chart.add('ParameterNumber', toxicity_df['ParameterNumber'].tolist())
display(HTML(base_html.format(rendered_chart=stacked_bar_chart.render(is_unicode=True))))