In [135]:
import pandas
import plotly.express as px
from IPython.display import display, Markdown

%load_ext cypher
%config CypherMagic.uri='http://neo4j:neo@localhost:7474/db/data'

The cypher extension is already loaded. To reload it, use:
  %reload_ext cypher


# Experten-Analyse anhand der Git-Historie

## Fragestellung
1. Welche Entwickler können als Experten der fachlichen Komponenten identifiziert werden?
2. Wie verteilt sich das Wissen über die fachlichen Komponenten auf die Entwickler? 
    * Tragen einzelne Entwickler eher zu vielen oder eher nur zu einer einzigen fachlichen Komponente etwas bei? 
    * Gibt es Komponenten, an denen praktisch nur ein Entwickler arbeitet, sodass das Wissen stark konzentriert ist?

## Datenquelle
* Java-Strukturen der Spring-Data-MongoDB-Anwendung mittels jQAssistant gescannt und in Neo4j abfragbar
* Git-Historie mittels jQAssistant gescannt und in Neo4j abfragbar


* Identifikation der fachlichen Komponenten im Source Code (siehe 0)
* Matching zwischen Entwicklern und fachlichen Komponenten

## Annahmen
* Viele Commits eines Entwicklers sind gleichbedeutend mit vielen Beiträgen (?) (Anzahl der geänderten Zeilen eines Commits wird im jQAssistant nicht erfasst)
* **TODO**: "Committer" und "Author" eines Commits manchmal verschieden 
    * Author hat inhaltliche Änderungen gemacht. Committer kann abweichen, z.B. beim Mergen oder wenn Author keine Commit-Rechte hat.
    * Hier erstmal nur Betrachtung des Authors.

## Validierung
* Zur Auswertung der Fragestellung betrachten wir
    * Anzahl Commits der Entwickler, die Änderungen in den fachlichen Komponenten enthalten
    * Anzahl der geänderten Dateien und Art der Modifikation, die Entwickler durch die Commits vornehmen
    * Zeitraum, über den Entwickler die Commits erstellt haben

## Implementierung

* Prüfung der Author-Knoten nach Duplikaten

In [138]:
%%cypher
// Check whether the same author name occurs with different e-mail addresses.
// Returns the number of distinct author names and the number of distinct
// author e-mail addresses over all Author nodes.
// NOTE(review): equal counts do not prove a 1:1 mapping between names and e-mails.
MATCH (author:Author) 
RETURN count(DISTINCT author.name) AS AuthorNames, count(DISTINCT author.email) AS AuthorEmails

1 rows affected.


AuthorNames,AuthorEmails
149,149


* Es gibt genauso viele unterschiedliche E-Mailadressen wie Namen, also keine Namen mit mehreren E-Mail-Adressen.
* Händische Prüfung ergibt folgende (vermutete) Duplikate:
    * `Mark Pollack` `mark.pollack@springsource.com` und `mpollack` `mpollack@vmware.com`
    * `Greg Turnquist` `gturnquist@vmware.com` und `Greg L. Turnquist` `gturnquist@pivotal.io`
    * `owen.qqq` `owen.qqq@kakaocommerce.com` und `owen-q` `owen.q.dev@gmail.com`


In [139]:
%%cypher
// Clean-up of duplicated author nodes (manual post-processing).
// Each list entry is [name to keep, e-mail of the author node to keep, e-mail of the duplicate node].
WITH [
  ["Mark Pollack", "mark.pollack@springsource.com", "mpollack@vmware.com"],
  ["Greg Turnquist", "gturnquist@vmware.com", "gturnquist@pivotal.io"],
  ["owen.qqq", "owen.qqq@kakaocommerce.com", "owen.q.dev@gmail.com"]
] AS authors
UNWIND authors AS duplicateAuthor
// Both nodes must exist; if either e-mail is not found, nothing happens for that entry.
MATCH (author:Author{email: duplicateAuthor[1]}),
      (duplicate:Author{email: duplicateAuthor[2]})
SET author.name = duplicateAuthor[0]      
WITH author, duplicate
// Re-attach every commit of the duplicate to the kept author before deleting the duplicate.
// NOTE(review): a duplicate without any COMMITTED relationship is not matched here
// and therefore would not be deleted — confirm this cannot occur.
MATCH (duplicate)-[:COMMITTED]->(c:Commit)
MERGE (author)-[:COMMITTED]->(c)
DETACH DELETE duplicate
// NOTE(review): `duplicate` is referenced after DETACH DELETE — verify this is accepted by the Neo4j version in use.
RETURN author.name AS AuthorName, author.email AS AuthorMail, count(DISTINCT duplicate) AS Duplicates

0 rows affected.


AuthorName,AuthorMail,Duplicates


## Ergebnisse

### Entwickler und Anzahl Commits - allgemein

In [141]:
# Gruppierung Entwickler nach Anzahl Commits
authorsByCommitCount = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(file:File) \
                           WHERE NOT c:Merge AND NOT a.name CONTAINS "Spring" \
                           RETURN a.name as Entwickler, count(DISTINCT c) AS Commits

authorsByCommitCount_df = authorsByCommitCount.get_dataframe()

count_1_to_10 = 0
count_11_to_100 = 0
count__101_to_500 = 0
count__501_to_inf = 0
for i, row in authorsByCommitCount_df.iterrows():
    if row['Commits'] >= 501:
        count__501_to_inf += 1
    elif row['Commits'] >= 101:
        count__101_to_500 += 1
    elif row['Commits'] >= 11:
        count_11_to_100 += 1
    elif row['Commits'] >= 1:
        count_1_to_10 += 1

data = dict()
data['commit_count_intervals'] = ['1 bis 10', '11 bis 100', '101 bis 500', 'ab 501']
data['author_count'] = [count_1_to_10, count_11_to_100, count__101_to_500, count__501_to_inf]

authorCountGroupedByCommitCount_df = pandas.DataFrame(data=data)
fig = px.fig = px.bar(authorCountGroupedByCommitCount_df, x='commit_count_intervals', y='author_count', 
                      title='Gruppierung der Entwickler nach Anzahl ihrer Commits',
                      labels={'commit_count_intervals':'Anzahl der Commits', 'author_count': 'Anzahl der Entwickler'})
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1, opacity=0.6)
fig.show()


146 rows affected.


In [165]:
topAuthorsByCommitCount = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit), \
                                 (c)-[:CONTAINS_CHANGE]->(:Change)-[]->(file:File) \
                           WHERE NOT c:Merge AND NOT a.name STARTS WITH "Spring" \
                           WITH a.name as Author, count(DISTINCT c) AS Commits \
                           WHERE Commits > 10 \
                           RETURN Author, Commits \
                           ORDER BY Commits DESC
                            
topAuthorsByCommitCount_df = topAuthorsByCommitCount.get_dataframe()
fig = px.fig = px.bar(topAuthorsByCommitCount_df, x='Author', y='Commits', 
                      labels={'Author': 'Entwickler', 'Commits': 'Anzahl Commits'},
                      title='Top-13-Entwickler mit den meisten Commits (mit > 10 Commits)')
fig.update_traces(marker_color='rgb(42, 105, 137)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1, opacity=0.6)
fig.show()

13 rows affected.


In [143]:
commitCountByAuthor = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit), \
                                 (c)-[:CONTAINS_CHANGE]->(:Change)-[:MODIFIES]->(file:File) \
                           WHERE NOT c:Merge AND NOT a.name STARTS WITH 'Spring' \
                           RETURN a.name as Entwickler, count(DISTINCT c) AS Commits \
                           ORDER BY Commits DESC LIMIT 13
                            

commitCountByAuthor_df = commitCountByAuthor.get_dataframe()

fig = px.pie(commitCountByAuthor_df, values='Commits', names='Entwickler', title='Verteilung der Commits auf Top-13-Entwickler (mit min. 10 Commit und 0,2% Anteil an allen Commits)')
fig.show()

13 rows affected.


In [144]:
# Names of the top developers, in the row order of topAuthorsByCommitCount_df.
TOP_13_AUTHORS = [record['Author'] for _, record in topAuthorsByCommitCount_df.iterrows()]

Betrachtung der Top-13-Entwickler im Folgenden

### Entwickler und Anzahl Commits - bezogen auf fachliche Komponenten

In [145]:
%%cypher
// Committers per bounded context:
// number of distinct non-merge commits of each author that touch a Git file
// which is the source of a Java type contained in the bounded context.
MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File),
      (f)<-[:HAS_SOURCE]-(:Type:Java)<-[:CONTAINS]-(bC:BoundedContext)
WHERE NOT c:Merge
RETURN bC.name AS BoundedContext, a.name AS Author, count(DISTINCT c) AS Commits
ORDER BY BoundedContext, Commits Desc

262 rows affected.


BoundedContext,Author,Commits
aggregation,Christoph Strobl,146
aggregation,Mark Paluch,119
aggregation,Thomas Darimont,39
aggregation,Spring Operator,20
aggregation,Oliver Gierke,13
aggregation,Oliver Drotbohm,11
aggregation,Eddú Meléndez,4
aggregation,Matt Morrissette,3
aggregation,Gustavo de Geus,3
aggregation,Christian Ivan,3


In [146]:
%%cypher
// Top committer per bounded context, grouped by author:
// for every bounded context the author with the most non-merge commits is picked,
// then the bounded contexts are collected per top author.
MATCH    (c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File),
         (f)<-[:HAS_SOURCE]-(:Type:Java)<-[:CONTAINS]-(bC:BoundedContext),
         (a:Author)-[:COMMITTED]->(c)
WHERE    NOT c:Merge
WITH     bC.name AS BoundedContext, a.name AS Author, count(DISTINCT c) AS Commits
ORDER BY BoundedContext, Commits Desc
// [..1] keeps only the first collected author of each bounded context.
// NOTE(review): this relies on collect() receiving rows in the ORDER BY order above — confirm.
WITH     BoundedContext, collect(Author)[..1] AS TopAuthorList
UNWIND   TopAuthorList AS TopAuthor
RETURN   TopAuthor, collect(BoundedContext) AS BoundedContexts

3 rows affected.


TopAuthor,BoundedContexts
Christoph Strobl,"['aggregation', 'convert', 'core', 'index', 'mapping', 'query', 'repository', 'spel', 'timeseries', 'util']"
Mark Paluch,"['config', 'geo', 'gridfs', 'mapreduce', 'messaging', 'schema', 'validation']"
Spring Operator,"['monitor', 'script']"


In [147]:
%%cypher
// BC, TopAuthor with CommitCount:
// per bounded context, the author with the most non-merge commits and that commit count.
MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File),
      (f)<-[:HAS_SOURCE]-(:Type:Java)<-[:CONTAINS]-(bC:BoundedContext)
WHERE NOT c:Merge
WITH     bC.name AS BoundedContext, a.name AS Author, count(DISTINCT c) AS Commits
ORDER BY BoundedContext, Commits Desc 
// Both lists are cut to their first element, so each UNWIND below yields one row per bounded context.
// NOTE(review): relies on collect() receiving rows in the ORDER BY order above — confirm.
WITH   BoundedContext, collect(Commits)[..1] AS CommitCountByTopAuthorList, collect(Author)[..1] AS TopAuthorList
UNWIND CommitCountByTopAuthorList AS CommitCountByTopAuthor
UNWIND TopAuthorList AS TopAuthor
RETURN BoundedContext, TopAuthor, CommitCountByTopAuthor

19 rows affected.


BoundedContext,TopAuthor,CommitCountByTopAuthor
aggregation,Christoph Strobl,146
config,Mark Paluch,46
convert,Christoph Strobl,257
core,Christoph Strobl,231
geo,Mark Paluch,23
gridfs,Mark Paluch,41
index,Christoph Strobl,73
mapping,Christoph Strobl,57
mapreduce,Mark Paluch,22
messaging,Mark Paluch,23


In [148]:

bcCommitAndAuthorCount = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File), \
                                (f)<-[:HAS_SOURCE]-(:Type:Java)<-[:CONTAINS]-(bC:BoundedContext) \
                                WHERE NOT c:Merge \
                                RETURN 	 bC.name AS BoundedContext, count(DISTINCT c) AS TotalCommitCount, count(DISTINCT a.name) AS AuthorCount \
                                ORDER BY TotalCommitCount DESC


df = bcCommitAndAuthorCount.get_dataframe()

fig = px.line(df, x='BoundedContext', y=['AuthorCount', 'TotalCommitCount'], height=1500, width=1000,
              labels={'value': 'Anzahl Commits bzw. Entwickler', 'variable': ''},
              title='Gegenüberstellung: Anzahl Commits und Anzahl beteiligter Entwickler in einer Komponente', markers=True)
fig.data[0].name = 'Anzahl beteiligter Entwickler'
fig.data[1].name = 'Anzahl Commits'
fig.show()

19 rows affected.


In [149]:
# Vergleich Commit-Anteile des Top-Entwicklers and aller anderen Entwickler
# BC, CommitCountByBC, TopAuthor, CommitCountByTopAuthor, CommitRatio
commitRatioForBc = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File), \
                                (f)<-[:HAS_SOURCE]-(:Type:Java)<-[:CONTAINS]-(bC:BoundedContext) \
                                WHERE NOT c:Merge \
                                WITH bC.name AS BoundedContext, count(DISTINCT c) AS TotalCommitCount, a.name AS Author \
                                ORDER BY TotalCommitCount DESC \
                                WITH DISTINCT BoundedContext, sum(TotalCommitCount) AS CommitCountByBC, collect(Author)[..1] AS TopAuthorList, collect(TotalCommitCount)[..1] AS CommitCountByTopAuthorList \
                                UNWIND CommitCountByTopAuthorList AS CommitCountByTopAuthor \
                                UNWIND TopAuthorList AS TopAuthor \
                                RETURN BoundedContext, CommitCountByBC, TopAuthor, CommitCountByTopAuthor, (100*CommitCountByTopAuthor/CommitCountByBC) AS CommitRatio \
                                ORDER BY BoundedContext

commitRatioForBc_query_df = commitRatioForBc.get_dataframe()
print(commitRatioForBc)

commit_ratio_other_authors_list = []
for i, row in commitRatioForBc_query_df.iterrows():
    commit_ratio_other_authors = 100 - row['CommitRatio']
    commit_ratio_other_authors_list.append(commit_ratio_other_authors)

commitRatioTopAuthor_data = dict()
commitRatioTopAuthor_data['bounded_context'] = commitRatioForBc_query_df['BoundedContext']
commitRatioTopAuthor_data['commit_ratio_top_author'] = commitRatioForBc_query_df['CommitRatio']
commitRatioTopAuthor_data['commit_ratio_other_authors'] = commit_ratio_other_authors_list


commitRatioForBc_df = pandas.DataFrame(data=commitRatioTopAuthor_data)
fig = px.bar(commitRatioForBc_df, x='bounded_context', y=['commit_ratio_top_author', 'commit_ratio_other_authors'],
             title='Anteil des Top-Comitters an allen Commits einer fachlichen Komponete', 
             labels={'bounded_context': 'Bounded Context', 'value': 'Commit-Anteile in %', 'variable': ''},
             color_discrete_map={'commit_ratio_top_author': 'rgb(42, 105, 137)', 'commit_ratio_other_authors': 'rgb(157, 203, 225)'})
fig.update_traces(marker_line_color='rgb(42, 105, 137)', marker_line_width=1, opacity=0.7)
fig.data[0].name = 'Commit-Anteil des Top-Entwicklers'
fig.data[1].name = 'Commit-Anteil aller anderen Entwickler'


fig.show()

19 rows affected.
+----------------+-----------------+------------------+------------------------+-------------+
| BoundedContext | CommitCountByBC |    TopAuthor     | CommitCountByTopAuthor | CommitRatio |
+----------------+-----------------+------------------+------------------------+-------------+
|  aggregation   |       371       | Christoph Strobl |          146           |      39     |
|     config     |       180       |   Mark Paluch    |           46           |      25     |
|    convert     |       690       | Christoph Strobl |          257           |      37     |
|      core      |       725       | Christoph Strobl |          231           |      31     |
|      geo       |        61       |   Mark Paluch    |           23           |      37     |
|     gridfs     |       108       |   Mark Paluch    |           41           |      37     |
|     index      |       200       | Christoph Strobl |           73           |      36     |
|    mapping     |       206    

In [150]:
# Vergleich Commit-Anteile der Top-2-Entwicklers and aller anderen Entwickler
# BC, CommitCountByBC, TopAuthor, CommitCountByTopAuthor, CommitRatio
commitRatioForBc = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File), \
                                (f)<-[:HAS_SOURCE]-(:Type:Java)<-[:CONTAINS]-(bC:BoundedContext) \
                                WHERE NOT c:Merge \
                                WITH bC.name AS BoundedContext, count(DISTINCT c) AS TotalCommitCount, a.name AS Author \
                                ORDER BY TotalCommitCount DESC \
                                RETURN BoundedContext, sum(TotalCommitCount) AS CommitCountByBC, collect(Author)[..3] AS Top3Authors, collect(TotalCommitCount)[..3] AS CommitCountByTop3Authors ORDER BY BoundedContext
        
commitRatioForBc_query_df = commitRatioForBc.get_dataframe()

author_list = []
commit_ratio_top_authors_list = []
commit_ratio_other_authors_list = []
for i, row in commitRatioForBc_query_df.iterrows():
    authors = row['Top3Authors']
    for author in authors:
        if author not in author_list:
            author_list.append(author)
    
    commit_sum_top_authors = sum(row['CommitCountByTop3Authors'])
    commit_ratio_top_authors = int(round(100 * commit_sum_top_authors / row['CommitCountByBC']))
    commit_ratio_other_authors = 100 - commit_ratio_top_authors
    commit_ratio_other_authors_list.append(commit_ratio_other_authors)
    commit_ratio_top_authors_list.append(commit_ratio_top_authors)

commitRatioTop3Authors_data = dict()
commitRatioTop3Authors_data['bounded_context'] = commitRatioForBc_query_df['BoundedContext']
commitRatioTop3Authors_data['commit_ratio_top_authors'] = commit_ratio_top_authors_list
commitRatioTop3Authors_data['commit_ratio_other_authors'] = commit_ratio_other_authors_list

commitRatioForBc_df = pandas.DataFrame(data=commitRatioTop3Authors_data)
fig = px.bar(commitRatioForBc_df, x='bounded_context', y=['commit_ratio_top_authors', 'commit_ratio_other_authors'],
             title='Anteil der Top-3-Entwickler an allen Commits einer fachlichen Komponete', 
             labels={'bounded_context': 'Bounded Context', 'value': 'Commit-Anteile in %', 'variable': ''},
             color_discrete_map={'commit_ratio_top_authors': 'rgb(42, 105, 137)', 'commit_ratio_other_authors': 'rgb(157, 203, 225)'})
fig.update_traces(marker_line_color='rgb(42, 105, 137)', marker_line_width=1, opacity=0.7)
fig.data[0].name = 'Commit-Anteil der Top-3-Entwickler'
fig.data[1].name = 'Commit-Anteil aller anderen Entwickler'
fig.show()

display(Markdown('#### Alle Namen der Top-Entwickler: '))
for author in author_list:
    display(Markdown(f'{author}'))

19 rows affected.


#### Alle Namen der Top-Entwickler: 

Christoph Strobl

Mark Paluch

Thomas Darimont

Oliver Gierke

Spring Operator

Oliver Drotbohm

### Zu wie vielen Komponenten tragen einzelne Entwickler bei?
* Sind die Entwickler, die zu den vielen Komponenten etwas beigetragen haben, identisch mit denen, die die meisten Commits erstellt haben?

In [270]:
# Entwickler, die zu den meisten Komponenten etwas beigetagen haben

bcCountByAuthor_query = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File), \
            (f)<-[:HAS_SOURCE]-(:Type:Java)<-[:CONTAINS]-(bC:BoundedContext) \
            WHERE NOT c:Merge AND a.name <> "Spring Operator" \
            WITH bC.name AS BoundedContext, a.name AS Author, count(DISTINCT c) AS Commits \
            WITH collect(BoundedContext) AS BoundedContextsListForCount, Author \
            UNWIND BoundedContextsListForCount AS BoundedContextForCount \
            WITH  Author, count(BoundedContextForCount) AS BoundedContextCount \
            WHERE BoundedContextCount > 1 \
            RETURN Author, BoundedContextCount \
            ORDER BY BoundedContextCount DESC

bcCountByAuthor_query_df = bcCountByAuthor_query.get_dataframe()
rows_to_remove = []
for i, row in bcCountByAuthor_query_df.iterrows():
    if row['Author'] not in TOP_13_AUTHORS and row['BoundedContextCount'] < 4:
        rows_to_remove.append(i)

for index in rows_to_remove:
    bcCountByAuthor_query_df = bcCountByAuthor_query_df.drop(index)

column_index_non_top_authors = []
for i, row in bcCountByAuthor_query_df.iterrows():
    if row['Author'] not in TOP_13_AUTHORS:
        column_index_non_top_authors.append(i)

colors = ['rgb(179, 204, 204)'] * bcCountByAuthor_query_df.shape[0]
for columnIndex in column_index_non_top_authors:
    colors[columnIndex] = 'rgb(92, 138, 138)'

fig = px.fig = px.bar(bcCountByAuthor_query_df, x='Author', y='BoundedContextCount', 
                      labels={'Author': 'Entwickler', 'BoundedContextCount': 'Anzahl der fachlichen Komponenten'},
                      title='Entwickler und Anzahl der fachlichen Komponenten, zu denen sie etwas beigetragen haben (ab 3 Komponenten)')
fig.update_traces(marker_color=colors, marker_line_color='rgb(92, 138, 138)',
                  marker_line_width=1, opacity=0.6)
fig.show()


37 rows affected.


* Die beiden Top-Entwickler (Christoph Strobl und Mark Paluch) haben in allen 19 Komponenten Beiträge geleistet.
* Die weiteren vorderen Plätze in den Top 5 decken sich ungefähr mit den Top-Entwicklern nach Commit-Anzahl.
* Ausreißer:
    * Thomas Darimont hat Beiträge zu 13 Komponenten geleistet, aber nur 3,6% aller Commits beigetragen.
    * Thomas Risberg hat 4,4% aller Commits beigetragen, aber nur in 3 fachlichen Komponenten.

In [283]:
# Ratio between the relative commit count and the relative bounded-context count per top developer.

# Reference values used for normalisation.
# NOTE(review): hard-coded; presumably taken from an earlier run of the queries — confirm they are still current.
MAX_COMMIT_COUNT = 1574
MAX_BC_COUNT = 19

# One record per top developer; rel_bc_count defaults to 0 for developers that do not
# appear in bcCountByAuthor_query_df, so the column always exists.
relative_commit_and_bc_counts = dict()
for i, row in topAuthorsByCommitCount_df.iterrows():
    relative_commit_count = round((row['Commits'] / MAX_COMMIT_COUNT), 3)
    relative_commit_and_bc_counts[row['Author']] = dict(name=row['Author'],
                                                        rel_commit_count=relative_commit_count,
                                                        rel_bc_count=0)

for i, row in bcCountByAuthor_query_df.iterrows():
    # Only developers that have a record above are updated.
    if row['Author'] in relative_commit_and_bc_counts:
        relative_bc_count = round((row['BoundedContextCount'] / MAX_BC_COUNT), 3)
        relative_commit_and_bc_counts[row['Author']]['rel_bc_count'] = relative_bc_count

relativeCommitAndBcCounts_df = pandas.DataFrame(data=list(relative_commit_and_bc_counts.values()))

# Assign only to `fig`; the former `px.fig = ...` chained assignment also wrote a
# stray attribute onto the plotly.express module.
fig = px.bar(relativeCommitAndBcCounts_df, x='name', y=['rel_commit_count', 'rel_bc_count'], barmode='group',
             color_discrete_map={'rel_commit_count': 'rgb(92, 138, 138)', 'rel_bc_count': 'rgb(179, 204, 204)'},
             labels={'name': 'Top 13 Entwickler', 'value': 'Relative Anzahl', 'variable': ''},
             title='Verhältnis relative Anzahl der Komponenten zu relativer Anzahl der Commits')
# Replace the column names in the legend by readable labels.
fig.data[0].name = 'Relative Anzahl Commits'
fig.data[1].name = 'Relative Anzahl Komponenten'
fig.update_traces(opacity=0.6)
fig.show()
    

* Wenn rel. Komponenten-Anzahl deutlich höher als rel. Commit-Anzahl: 
    * Entwickler hat in verhältnismäßig vielen Komponenten ein bisschen was beigetragen
* Wenn rel. Komponenten-Anzahl deutlich niedriger als die rel. Commit-Anzahl oder eher gleich groß ist:
    * Entwickler hat zu wenigen Komponenten verhältnismäßig viel beigetragen
    
    
**TODO: Spider-Diagramm?**

### Betrachtung über Zeitspanne

In [156]:
# Zeiträume
timeSpanAuthors_query = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File) \
    WHERE NOT c:Merge AND NOT a.name STARTS WITH "Spring" \
    WITH a.name as Author,  min(c.date) AS MinDate, max(c.date) AS MaxDate \
    WHERE MinDate <> MaxDate \
    RETURN Author, MinDate, MaxDate \
    ORDER BY MinDate

timeSpanAuthors_query_df = timeSpanAuthors_query.get_dataframe()

timeSpanAuthors_list = []
for i, row in timeSpanAuthors_query_df.iterrows():
    if row['Author'] not in TOP_13_AUTHORS:
        continue  
    min_date = pandas.to_datetime(row['MinDate'])
    max_date = pandas.to_datetime(row['MaxDate'])
    time_delta_months = (max_date.to_period('M') - min_date.to_period('M')).n
    timespan_entry = dict(Author=row['Author'], FirstCommit=row['MinDate'], LastCommit=row['MaxDate'], DurationMonths=time_delta_months)
    timeSpanAuthors_list.append(timespan_entry)
    
timeSpanAuthors_df = pandas.DataFrame(timeSpanAuthors_list)
fig_timeline = px.timeline(timeSpanAuthors_df, x_start="FirstCommit", x_end="LastCommit", y="Author", color="Author", 
                  labels={'Author': 'Entwickler'}, height=600, 
                  title="Zeitspanne der Projekt-Beteiligung der Top-13-Entwickler")
fig_timeline.update_layout(showlegend=False)
fig_timeline.show()

38 rows affected.


In [153]:
def create_list_with_commit_counts(author):
    commitsOverTime_query = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File) \
                                WHERE NOT c:Merge AND a.name ="{author}" \
                                WITH DISTINCT c.epoch AS MilliSec, substring(c.date, 0, 7) AS Month \
                                RETURN  Month, count(MilliSec) AS CommitCount \
                                ORDER BY Month
        
    commitsOverTime_query_df = commitsOverTime_query.get_dataframe()
    commit_counts_list = []
    for i, row in commitsOverTime_query_df.iterrows():
        commit_count_entry = dict(month=row['Month'], commit_count=row['CommitCount'], author=author)
        commit_counts_list.append(commit_count_entry)
    return commit_counts_list

In [154]:
# Collect the monthly commit counts of the selected developers into one frame.
# The developers are queried in exactly this order.
commitsOverTime_data = []
for developer in ('Mark Paluch', 'Christoph Strobl', 'Oliver Gierke', 'Oliver Drotbohm',
                  'Thomas Risberg', 'Thomas Darimont', 'Jens Schauder'):
    commitsOverTime_data += create_list_with_commit_counts(developer)

commitsOverTime_df = pandas.DataFrame(data=commitsOverTime_data)

68 rows affected.
84 rows affected.
48 rows affected.
55 rows affected.
17 rows affected.
24 rows affected.
9 rows affected.


In [155]:
# Scatter plot: commits per month, one colour per developer; legend placed below the plot.
fig = px.scatter(
    data_frame=commitsOverTime_df,
    x='month',
    y='commit_count',
    color='author',
    height=850,
    title='Anzahl Commits der Top-Entwickler pro Monat',
    labels={'commit_count': 'Anzahl Commits', 'author': 'Entwickler', 'month': 'Monat/Jahr'},
)
fig.update_layout(legend=dict(xanchor='center', yanchor='top', y=-0.1, x=0.1))
fig.show()

* Jens Schauder hat bisher wenige Commits beigetragen, ist aber einer der Entwickler, die bis aktuell aktiv dabei sind.

In [157]:
# Top developers ordered by the number of months between their first and last commit (longest first).
maxDurationAuthors_list = sorted(timeSpanAuthors_list, key=lambda k: k['DurationMonths'], reverse=True)
maxDurationAuthors_df = pandas.DataFrame(maxDurationAuthors_list)
# Assign only to `fig_max_duration`; the former `px.fig = ...` chained assignment also
# wrote a stray attribute onto the plotly.express module.
fig_max_duration = px.bar(maxDurationAuthors_df, x='Author', y='DurationMonths',
                          labels={'Author': 'Entwickler', 'DurationMonths': 'Anzahl Monate zwischen ältestem und jüngstem Commit'},
                          title='Dauer der Projekt-Beteiligung der Top-13-Entwickler')
fig_max_duration.update_traces(marker_color='rgb(179, 204, 204)', marker_line_color='rgb(92, 138, 138)',
                               marker_line_width=1, opacity=0.6)
fig_max_duration.show()

### Betrachtung Anzahl und Art der geänderten Dateien der Commits

### Anzahl der modifizierten Dateien pro Commit
* Ausreißer:
    * mehr als 1000 Dateien: "Update copyright year to..."

In [158]:
# Anzahl der modifizierten Dateien pro Commit
numberModifiedFilesPerCommit = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File) \
                        WHERE NOT c:Merge AND NOT a.name STARTS WITH "Spring" \
                        WITH a.name as Author, c.shortMessage As CommitMessage, c.date AS CommitDate, count(DISTINCT f) AS FileCount \
                        RETURN Author, CommitMessage, CommitDate, FileCount \
                        ORDER BY FileCount DESC
    
numberModifiedFilesPerCommit_df = numberModifiedFilesPerCommit.get_dataframe()

fig = px.scatter(data_frame=numberModifiedFilesPerCommit_df, x='CommitDate', y='FileCount', color='Author', height=600,
                 labels={'FileCount': 'Anzahl der durch einen Commit geänderten Dateien', 'Author': 'Entwickler', 'CommitDate': 'Datum des Commits'},
                 title='Anzahl geänderter Dateien pro Commit (mit min. 10 geänderten Datien)')
fig.update_layout(showlegend=False)
fig.show()

3765 rows affected.


In [159]:
# Commit messages of the commits with the most changed files.
# numberModifiedFilesPerCommit_df is ordered by FileCount (descending), so the first
# ten rows are the ten largest commits.

display(Markdown('#### Commit Messages der Top-10-Commits mit den meisten geänderten Dateien: '))  
for _, commit in numberModifiedFilesPerCommit_df.head(10).iterrows():
    display(Markdown(f'* {commit["CommitMessage"]} ({commit["FileCount"]} Dateien)'))

#### Commit Messages der Top-10-Commits mit den meisten geänderten Dateien: 

* Update copyright year to 2021. (957 Dateien)

* DATAMONGO-2444 - Update copyright years to 2020. (865 Dateien)

* DATAMONGO-2175 - Update copyright years to 2019. (852 Dateien)

* DATAMONGO-1844 - Update copyright years to 2018. (642 Dateien)

* DATADOC-191 - Removed 'document' from package names. (452 Dateien)

* DATADOC-175 - Broke up cyclic dependencies and added architecture management file. (351 Dateien)

* Format, organize imports, remove eclipse artifacts (217 Dateien)

* DATAMONGO-1176 - Switch to Document API. (214 Dateien)

* DATAMONGO-2481 - Polishing. (177 Dateien)

* DATAMONGO-1587 - Migrate ticket references in test code to Spring Framework style. (171 Dateien)

In [300]:
# Durchschnittl. Anzahl der modifizierten Dateien pro Commit (ohne Ausreißer über 300 Dateien)
numberModifiedFilesPerCommit = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File) \
                WHERE NOT c:Merge AND NOT a.name STARTS WITH "Spring" \
                WITH a.name as Author, c.sha AS Sha, count(DISTINCT f) AS FileCount \
                WHERE FileCount <= 300 \
                WITH Author, avg(FileCount) AS AvgFileCount \
                WHERE AvgFileCount >= 2.5 \
                RETURN Author, AvgFileCount \
                ORDER BY AvgFileCount DESC
    
numberModifiedFilesPerCommit_df = numberModifiedFilesPerCommit.get_dataframe()

column_index_top_authors = []
for i, row in numberModifiedFilesPerCommit_df.iterrows():
    if row['Author'] in TOP_13_AUTHORS:
        column_index_top_authors.append(i)

print('rows', numberModifiedFilesPerCommit_df.shape[0])

colors = ['rgb(42, 105, 137)'] * numberModifiedFilesPerCommit_df.shape[0]
for columnIndex in column_index_top_authors:
    colors[columnIndex] = 'rgb(204, 204, 0)'

fig = px.bar(data_frame=numberModifiedFilesPerCommit_df, x='Author', y='AvgFileCount',
             labels={'AvgFileCount': 'durchschnittl. Anzahl geänderten Dateien je Commit', 'Author': 'Entwickler'},
             title='Durchschnittliche Anzahl geänderter Dateien pro Commit (ohne Ausreißer von über 300 Dateien in einem Commit)')
fig.update_traces(marker_color=colors, marker_line_color='rgb(8,48,107)',
                  marker_line_width=1, opacity=0.6)
fig.show()

56 rows affected.
rows 56


* Gelb eingefärbt: 12 der Top-13-Entwickler mit den meisten Commits
    * Martin Macko fehlt in der Liste, da er immer nur 1 Datei pro Commit geändert hat
* auffällig viele Entwickler, die nur max. 10 Commits beigesteuert haben

In [305]:
# Ratio between the relative commit count and the relative average number of changed
# files per commit, for the top developers.
# MAX_COMMIT_COUNT is reused from the cell that compares commit and bounded-context counts.
# NOTE(review): hard-coded normalisation value — confirm it is still current.
MAX_AVERAGE_FILE_COUNT = 31

# Martin Macko is added manually with an average of 1 file per commit.
# DataFrame.append was removed in pandas 2.0, hence pandas.concat.
martin_macko_df = pandas.DataFrame([dict(Author='Martin Macko', AvgFileCount=1)])
relativeNumberModifiedFilesPerCommit_df = pandas.concat(
    [numberModifiedFilesPerCommit_df, martin_macko_df], ignore_index=True)

# Keep only the top developers.
relativeNumberModifiedFilesPerCommit_df = relativeNumberModifiedFilesPerCommit_df[
    relativeNumberModifiedFilesPerCommit_df['Author'].isin(TOP_13_AUTHORS)
].reset_index()

# One record per top developer. The dict was previously created under a different name
# (`relative_commit_and_file_counts_dict`) than the one used afterwards, which raised a
# NameError on a fresh kernel.
relative_commit_and_file_counts = dict()
for i, row in topAuthorsByCommitCount_df.iterrows():
    relative_commit_count = round((row['Commits'] / MAX_COMMIT_COUNT), 3)
    relative_commit_and_file_counts[row['Author']] = dict(name=row['Author'], rel_commit_count=relative_commit_count) 

for i, row in relativeNumberModifiedFilesPerCommit_df.iterrows():
    entry = relative_commit_and_file_counts.get(row['Author'])
    if entry is not None:
        entry['rel_file_count'] = round((row['AvgFileCount'] / MAX_AVERAGE_FILE_COUNT), 3)

relativeCommitAndFileCounts_df = pandas.DataFrame(data=list(relative_commit_and_file_counts.values()))

# Assign only to `fig`; the former `px.fig = ...` chained assignment also wrote a
# stray attribute onto the plotly.express module.
fig = px.bar(relativeCommitAndFileCounts_df, x='name', y=['rel_commit_count', 'rel_file_count'], barmode='group',
             color_discrete_map={'rel_commit_count': 'rgb(92, 138, 138)', 'rel_file_count': 'rgb(179, 204, 204)'},
             labels={'name': 'Top 13 Entwickler', 'value': 'Relative Anzahl', 'variable': ''},
             title='Verhältnis durchschnittl. Anzahl geänderter Dateien zu relativer Anzahl der Commits')
# Replace the column names in the legend by readable labels.
fig.data[0].name = 'Relative Anzahl Commits'
fig.data[1].name = 'Relative durchschnittl. Anzahl geänderter Dateien'
fig.update_traces(opacity=0.6)
fig.show()

In [317]:
# Sum of the relative values per developer: commit count, bounded-context count and
# average number of changed files per commit.

# Join the bounded-context share by developer name rather than by row position, so the
# result does not depend on both frames having the same row order.
relativeCommitAndFileAndBcCounts_df = relativeCommitAndFileCounts_df.merge(
    relativeCommitAndBcCounts_df[['name', 'rel_bc_count']], on='name', how='left')

# Computed column-wise; no variable called `sum` is created, so the builtin sum()
# used by other cells is no longer shadowed after this cell has run.
relativeCommitAndFileAndBcCounts_df['sum'] = (
    relativeCommitAndFileAndBcCounts_df['rel_commit_count']
    + relativeCommitAndFileAndBcCounts_df['rel_file_count']
    + relativeCommitAndFileAndBcCounts_df['rel_bc_count']
)

relativeCommitAndFileAndBcCounts_df = relativeCommitAndFileAndBcCounts_df.sort_values(by=['sum'], ascending=False)

fig = px.bar(data_frame=relativeCommitAndFileAndBcCounts_df, x='name', y='sum',
             labels={'sum': 'Summe relativer Werte', 'name': 'Entwickler'},
             title='Summe relativer Werte (Anzahl Commits, Anzahl fachl. Komponenten, durchschnittl. Anzahl geänderter Dateien)')
fig.update_traces(marker_color='rgb(92, 138, 138)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1, opacity=0.6)
fig.show()

## Nächste Schritte
**TODO**

In [284]:
# Ratio of commits per author to bounded contexts per author, each divided by a fixed maximum.

# Divisors for the two relative values.
# NOTE(review): presumably the largest commit count and bounded-context count in the data
# (the top author ends up at 1.0 in the printed frame) -- TODO confirm.
MAX_COMMIT_COUNT = 1574
MAX_BC_COUNT = 19

# author name -> {'commitCount': ...}; 'bcCount' is filled in below.
author_commitCount_map = {
    row['Entwickler']: {'commitCount': row['Commits']}
    for _, row in commitCountByAuthor_df.iterrows()
}

# Attach the bounded-context count, but only for authors that already have a commit count.
for _, row in bcCountByAuthor_query_df.iterrows():
    counts = author_commitCount_map.get(row['Author'])
    if counts is not None:
        counts['bcCount'] = row['BoundedContextCount']

# Authors without a bounded-context row get a count of 1.
for counts in author_commitCount_map.values():
    counts.setdefault('bcCount', 1)

# Column-oriented data for the frame; all three lists follow the insertion order of the map.
relativeCommitAndBcCount_data = {
    'author': list(author_commitCount_map),
    'relative_commit_count': [
        counts['commitCount'] / MAX_COMMIT_COUNT for counts in author_commitCount_map.values()
    ],
    'relative_bc_count': [
        counts['bcCount'] / MAX_BC_COUNT for counts in author_commitCount_map.values()
    ],
}

relativeCommitAndBcCount_df = pandas.DataFrame(data=relativeCommitAndBcCount_data)
print(relativeCommitAndBcCount_df)

# Grouped bar chart per author: relative commit count next to the relative number of
# bounded contexts. The figure is bound to `fig` only; the chained assignment
# `px.fig = ...` used before additionally set an attribute on the plotly.express module.
fig = px.bar(relativeCommitAndBcCount_df, x='author', y=['relative_commit_count', 'relative_bc_count'], barmode='group',
             color_discrete_map={'relative_commit_count': 'rgb(92, 138, 138)', 'relative_bc_count': 'rgb(179, 204, 204)'},
             labels={'author': 'Top 13 Entwickler', 'value': 'Relative Anzahl', 'variable': ''},
             title='Verhältnis relative Anzahl der Komponenten zu relativer Anzahl der Commits')

# Legend entries are renamed by column name rather than by trace position, so the labels
# stay correct even if the order of the y columns changes.
legend_names = {'relative_commit_count': 'Relative Anzahl Commits',
                'relative_bc_count': 'Relative Anzahl Komponenten'}
fig.for_each_trace(lambda trace: trace.update(name=legend_names.get(trace.name, trace.name)))
fig.update_traces(opacity=0.6)
fig.show()

               author  relative_commit_count  relative_bc_count
0         Mark Paluch               1.000000           1.000000
1    Christoph Strobl               0.797332           1.000000
2       Oliver Gierke               0.504447           0.684211
3     Oliver Drotbohm               0.480940           0.736842
4      Thomas Risberg               0.149301           0.157895
5     Thomas Darimont               0.122618           0.684211
6        Mark Pollack               0.087675           0.315789
7         Jon Brisbin               0.041296           0.052632
8       Jens Schauder               0.035578           0.421053
9      Greg Turnquist               0.033672           0.052632
10  Sebastien Deleuze               0.010165           0.157895
11       Martin Macko               0.007624           0.157895
12      Graeme Rocher               0.006989           0.052632


In [137]:
%%cypher
// Committers who committed changes authored by someone else:
// selects every commit whose committer differs from its author and
// returns each such committer value once (DISTINCT).
MATCH (n:Commit) 
WHERE n.committer <> n.author 
RETURN DISTINCT n.committer

18 rows affected.


n.committer
Christoph Strobl <cstrobl@vmware.com>
Mark Paluch <mpaluch@vmware.com>
GitHub <noreply@github.com>
Mark Paluch <mpaluch@pivotal.io>
Jens Schauder <jschauder@vmware.com>
Christoph Strobl <cstrobl@pivotal.io>
Greg Turnquist <gturnquist@pivotal.io>
Jens Schauder <jschauder@pivotal.io>
Oliver Drotbohm <odrotbohm@pivotal.io>
Oliver Gierke <ogierke@pivotal.io>


Ohne Duplikate (8 Committer)
* Christoph Strobl
* Jens Schauder
* Jon Brisbin
* Greg Turnquist
* Mark Paluch
* Oliver Drotbohm 
* Oliver Gierke
* Thomas Darimont


In [163]:
# Which developers have worked on the project over the longest time span?
# The query looks at non-merge commits that contain file changes and skips authors whose
# name starts with "Spring". Per author it takes the smallest and largest commit epoch,
# divides the difference by 60*60*24*1000 (epoch is presumably in milliseconds, as the
# MilliSec* aliases suggest -- TODO confirm) to get days, keeps only spans of more than
# 180 days and orders the result from longest to shortest span.
maxTimeSpanBetweenCommits = %cypher MATCH (a:Author)-[:COMMITTED]->(c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File) \
    WHERE NOT c:Merge AND NOT a.name STARTS WITH "Spring" \
    WITH a.name as Author,  min(c.epoch) AS MilliSecMin, max(c.epoch) AS MilliSecMax \
    WITH Author, abs(MilliSecMax - MilliSecMin) as TimeDifference \
    WITH Author, abs(TimeDifference / (60*60*24*1000)) AS TimeDifferenceDays \
    WHERE TimeDifferenceDays > 180 \
    RETURN Author, TimeDifferenceDays \
    ORDER BY TimeDifferenceDays DESC

# Convert the query result into a DataFrame with the columns Author and TimeDifferenceDays.
maxTimeSpanBetweenCommits_df = maxTimeSpanBetweenCommits.get_dataframe()

# Bar chart of the number of days between the oldest and the newest commit per author.
# The figure is bound to `fig` only; the chained assignment `px.fig = ...` used before
# additionally set an attribute on the plotly.express module.
fig = px.bar(maxTimeSpanBetweenCommits_df, x='Author', y='TimeDifferenceDays',
             labels={'Author': 'Entwickler', 'TimeDifferenceDays': 'Anzahl Tage zwischen ältestem und jüngstem Commit'},
             title='Länge des Zeitraums, in dem Entwickler Commits beigetragen haben')
fig.update_traces(marker_color='rgb(179, 204, 204)', marker_line_color='rgb(92, 138, 138)',
                  marker_line_width=1, opacity=0.6)
fig.show()

21 rows affected.


In [164]:
%%cypher
// List of authors per bounded context.
// Considers non-merge commits that change a Git file which is the source of a Java type
// contained in a bounded context. Per bounded context and author the distinct commits are
// counted; the rows are ordered by bounded context and descending commit count before the
// author names are collected into one list per bounded context.
MATCH    (c:Commit)-[:CONTAINS_CHANGE]->(:Change)-[]->(f:Git:File),
         (f)<-[:HAS_SOURCE]-(:Type:Java)<-[:CONTAINS]-(bC:BoundedContext),
         (a:Author)-[:COMMITTED]->(c)
WHERE    NOT c:Merge
WITH     bC.name AS BoundedContext, a.name AS Author, count(DISTINCT c) AS Commits
ORDER BY BoundedContext, Commits Desc
WITH     BoundedContext, collect(Author) AS Authors
RETURN   BoundedContext, Authors

19 rows affected.


BoundedContext,Authors
aggregation,"['Christoph Strobl', 'Mark Paluch', 'Thomas Darimont', 'Spring Operator', 'Oliver Gierke', 'Oliver Drotbohm', 'Eddú Meléndez', 'Matt Morrissette', 'Gustavo de Geus', 'Christian Ivan', 'Sergey Shcherbakov', 'Alessio Fachechi', 'Shashank Sharma', 'Jens Schauder', 'Yadhukrishna S Pai', 'Jérome GUYON', 'Sebastian Herold', 'Tobias Trelle', 'Nikolai Bogdanov']"
config,"['Mark Paluch', 'Oliver Gierke', 'Christoph Strobl', 'Spring Operator', 'Thomas Darimont', 'Oliver Drotbohm', 'Thomas Dudouet', 'Zied Yaich', 'Stephen Tyler Conrad', 'Viktor Khoroshko', 'Mark Pollack', 'Martin Baumgartner', 'Jens Schauder', 'Maciej Walkowiak', 'Ryan Tenney', 'John Blum', 'Mike Saavedra']"
convert,"['Christoph Strobl', 'Oliver Gierke', 'Mark Paluch', 'Oliver Drotbohm', 'Thomas Darimont', 'Spring Operator', 'Thiago Diniz da Silveira', 'Kevin Dosey', 'Ken Dombeck', 'Patryk Wąsik', 'Heesu Jung', 'Christian Ivan', 'David Julia', 'Jordi Llach Fernandez', 'Divya Srivastava', 'Roman Puchkovskiy', 'Jens Schauder']"
core,"['Christoph Strobl', 'Mark Paluch', 'Oliver Gierke', 'Oliver Drotbohm', 'Spring Operator', 'Thomas Darimont', 'Mark Pollack', 'Sebastien Deleuze', 'Laszlo Csontos', 'Michal Vich', 'Eddú Meléndez', 'Mikhail Mikhaylenko', 'Ken Dombeck', 'Borislav Rangelov', 'Martin Macko', 'Brice Vandeputte', 'Jens Schauder', 'Domenique Tilleuil', 'Amol Nayak', 'Ivan Sopov', 'Tobias Trelle', 'Philipp Schneider', 'Cimon Lucas (LCM)', 'Ryan Cloherty', 'nkey', 'Jacob Botuck', 'Sola', 'Komi Serge Innocent', 'kostya05983', 'wonwoo', 'Mathieu Ouellet', 'Roman Puchkovskiy', 'Yadhukrishna S Pai', 'Martin Baumgartner', 'Niko Schmuck', 'Sebastian Herold', 'Andreas Zink', 'Mainder Singh', 'Jan Kronquist', 'A. B. M. Kowser Patwary', 'Ilho Ahn', 'eric', 'Christoph Leiter', 'Chuong Ngo', 'Patryk Wąsik', 'Michael Simons', 'GotoFinal', 'abarkan', 'Juergen Zimmermann', 'Minsu']"
geo,"['Mark Paluch', 'Spring Operator', 'Christoph Strobl', 'larsw', 'Thomas Darimont', 'Oliver Drotbohm', 'Bjorn Harvold', 'Oliver Gierke']"
gridfs,"['Mark Paluch', 'Christoph Strobl', 'Spring Operator', 'Oliver Gierke', 'Hartmut Lang', 'Thomas Darimont', 'Oliver Drotbohm', 'Nick Stolwijk', 'konradend', 'Niklas Helge Hanft', 'Mathieu Ouellet', 'Denis Zavedeev', 'Martin Baumgartner', 'Philipp Schneider']"
index,"['Christoph Strobl', 'Mark Paluch', 'Oliver Gierke', 'Spring Operator', 'Oliver Drotbohm', 'Thomas Darimont', 'Martin Macko', 'Eddú Meléndez', 'Dave Perryman', 'Mark Pollack', 'Jens Schauder', 'Laurent Canet', 'Johno Crawford', 'Jordi Llach Fernandez', 'Philipp Schneider', 'Thomas Risberg']"
mapping,"['Christoph Strobl', 'Mark Paluch', 'Oliver Gierke', 'Oliver Drotbohm', 'Spring Operator', 'Thomas Darimont', 'Eddú Meléndez', 'BraveLeeLee', 'Gatto', 'Kim Toms', 'Martin Baumgartner', 'Roman Puchkovskiy', 'Patryk Wąsik', 'Divya Srivastava', 'Michael Simons', 'Maciej Walkowiak', 'Christoph Leiter']"
mapreduce,"['Mark Paluch', 'Spring Operator', 'Christoph Strobl', 'Oliver Gierke', 'Mark Pollack', 'Oliver Drotbohm', 'Thomas Darimont', 'Jens Schauder']"
messaging,"['Mark Paluch', 'Christoph Strobl', 'Spring Operator']"
