In [51]:
import glob
import json
import re
from collections import Counter, defaultdict
from itertools import combinations
from pprint import pprint

from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chisquare
from statsmodels.stats.inter_rater import aggregate_raters, fleiss_kappa


In [52]:
def extract_annotated_lines(files, user, bug=None):
    """Flatten per-file line annotations into a list of tuples.

    Parameters
    ----------
    files : iterable of dict
        Each entry must provide ``'fileName'``, ``'category'`` and ``'lines'``;
        ``'lines'`` maps an image key to a list of dicts carrying
        ``'lineNumber'`` and ``'category'``.
    user : object
        Copied unchanged into every returned tuple.
    bug : object, optional
        Copied unchanged into the first slot of every returned tuple.
        The function used to read an undefined name ``bug`` here (NameError
        on a fresh kernel), so it is now an explicit keyword argument.
        NOTE(review): presumably the identifier of the bug/commit the files
        belong to -- confirm with the callers.

    Returns
    -------
    list of tuple
        ``(bug, user, file_name, file_category, image, line_number,
        line_category)``, one tuple per annotated line, in input order.
    """
    ret = []
    for file in files:
        fname = file['fileName']
        fcat = file['category']

        for image, annotated in file['lines'].items():
            for line in annotated:
                ret.append((bug, user, fname, fcat, image, line['lineNumber'], line['category']))
    return ret
    

In [53]:
def _sankey_csv_field(name):
    """Return ``name`` as one CSV field, quoted if it holds ``,`` or ``"``."""
    text = str(name)
    if "," in text or '"' in text:
        # Mermaid's sankey syntax is CSV: quote the field, double inner quotes.
        return '"' + text.replace('"', '""') + '"'
    return text


def plot_sankey_lines(global_commits_files_name_line_counter, output):
    """Append a Mermaid ``sankey-beta`` diagram of path -> change-type flows.

    Every ``(file_path, change_type)`` key is split on ``/``.  Links are
    counted from ``project`` to the top-level directory, between consecutive
    directories, and from the last directory to a ``[change_type]`` node.
    The file name itself is never a node.  Files without a directory go
    through the ``__/__`` node, files directly inside a top-level directory
    through ``__<dir>__``.

    The change type is overridden before counting:

    * ``translation`` when a directory component is exactly ``po`` or the
      file name ends in ``.po``/``.pot``.  (A plain substring test would also
      catch ``support.c``, ``report.c``, ``pom.xml``...)
    * ``test`` when the path contains ``test`` and the type is ``code``.

    Parameters
    ----------
    global_commits_files_name_line_counter : mapping
        Maps ``(file_path, change_type)`` to a count.
    output : list
        Text chunks; the diagram is appended in place.  Nothing is appended
        when no link was counted.

    Returns
    -------
    None
    """
    path_counter = Counter()
    for key, count in global_commits_files_name_line_counter.items():
        file_path = key[0]
        type_change = key[1]
        components = file_path.split("/")

        if "po" in components[:-1] or components[-1].endswith((".po", ".pot")):
            type_change = "translation"
        if "test" in file_path and type_change == "code":
            type_change = "test"
        leaf = f"[{type_change}]"

        # File in the repository root.
        if len(components) == 1:
            path_counter[("project", "__/__")] += count
            path_counter[("__/__", leaf)] += count
            continue

        top = components[0]
        if top:  # empty when the path starts with "/"
            path_counter[("project", top)] += count

        # File directly inside a top-level directory.
        if len(components) == 2:
            bucket = "__" + top + "__"
            path_counter[(top, bucket)] += count
            path_counter[(bucket, leaf)] += count
            continue

        # Deeper paths: chain consecutive directories, dropping the file name.
        for parent, child in zip(components[:-2], components[1:]):
            if not parent and child:
                continue
            path_counter[(parent, child)] += count
        path_counter[(components[-2], leaf)] += count

    if not path_counter:
        return
    output.append("""
# Sankey files -> lines -> annotation

```mermaid
---
config:
  sankey:
    showValues: false
---

sankey-beta

""")
    for link, count in path_counter.items():
        row = ",".join(_sankey_csv_field(node) for node in link)
        output.append(f"{row},{count}\n")
    output.append("\n```\n")


In [54]:
def get_commit_data(fcommit):
    """Return the date found in the first ``Date:`` line of a patch file.

    Parameters
    ----------
    fcommit : str
        Path of the text file to scan.

    Returns
    -------
    datetime.datetime or None
        Timezone-aware datetime parsed with ``'%a %b %d %H:%M:%S %Y %z'``
        (e.g. ``Sat May 13 00:44:41 2023 +0800``), or ``None`` when no line
        starts with ``Date:``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a ``Date:`` line exists but does not match the expected format.
    """
    date_pattern = re.compile(r'^Date:\s+(.+)$', re.MULTILINE)

    # Patches may contain bytes in any encoding (author names, diffed file
    # contents).  Only the ASCII "Date:" header is needed, so undecodable
    # bytes are replaced rather than aborting the whole run.
    with open(fcommit, "r", encoding="utf-8", errors="replace") as f:
        match = date_pattern.search(f.read())

    if match is None:
        return None
    date_str = match.group(1).strip()
    return datetime.strptime(date_str, '%a %b %d %H:%M:%S %Y %z')

In [55]:
def parse_repository(repo_name):
    """Summarise the annotation JSON files of one repository into a report.

    Reads every ``../affected_commits/{repo_name}/*/annotation/*.json`` file.
    Each file is expected to map a file name to a dict with the keys
    ``'purpose'``, ``'language'``, ``'type'``, ``'+'`` and ``'-'``; ``'+'``
    and ``'-'`` hold lists of dicts that carry a ``'type'`` key.

    Side effects:

    * prints the date returned by ``get_commit_data`` for each annotation
      file (the patch is looked up under ``.../{commit}/patches/``);
    * writes ``doc/raport_{repo_name}.md`` with commit counts, the share of
      each changed-line type and the Sankey diagram.  The ``doc`` directory
      must already exist.

    Parameters
    ----------
    repo_name : str
        Directory name below ``../affected_commits``.

    Returns
    -------
    collections.Counter
        Maps ``(file_name, change_type)`` to the number of changed lines.
    """
    # NOTE(review): only the line counters feed the report and the return
    # value; the other counters are collected but not reported yet.
    global_commits_counter = Counter()
    global_commits_lines_counter = Counter()
    global_commits_files_purpose_counter = Counter()
    global_commits_files_name_counter = Counter()
    global_commits_files_name_line_counter = Counter()

    global_commits_files_language_counter = Counter()
    global_commits_files_type_counter = Counter()

    non_doc_commits = []
    all_commits = []
    # glob returns matches in arbitrary order; sort so the printed dates and
    # the generated report are reproducible between runs.
    pattern = f"../affected_commits/{repo_name}/*/annotation/*.json"
    for fjson in sorted(glob.glob(pattern)):
        commit = fjson.split("/")[-1].split(".")[0]

        print(get_commit_data(f"../affected_commits/{repo_name}/{commit}/patches/{commit}.diff"))

        with open(fjson, encoding="utf-8") as fin:
            fdata = json.load(fin)

        commit_counter = Counter()
        # Stays True for a commit with no changed lines at all.
        all_doc = True

        for file, info in fdata.items():
            global_commits_files_name_counter.update([file])
            global_commits_files_purpose_counter.update([info['purpose']])
            global_commits_files_language_counter.update([info['language']])
            global_commits_files_type_counter.update([info['type']])

            for change_t in ["+", "-"]:
                for change in info[change_t]:
                    line_type = change['type']
                    commit_counter.update([line_type])
                    global_commits_files_name_line_counter.update([(file, line_type)])
                    global_commits_lines_counter.update([line_type])
                    if line_type != 'documentation':
                        all_doc = False

        if not all_doc:
            non_doc_commits.append(fjson)

        all_commits.append(fjson)
        # Sorted so the same combination of types always yields the same key
        # (iteration order of a set is not stable).
        global_commits_counter.update([tuple(sorted(commit_counter, key=str))])

    output = []
    # Each chunk needs its own newline: writelines() adds no separators.
    output.append("# Commits stats\n")
    output.append(f"* All commits {len(all_commits)}\n")
    output.append(f"* Pure doc commits {len(all_commits) - len(non_doc_commits)}\n")
    total_lines = sum(global_commits_lines_counter.values())
    output.append("# Lines stats\n")
    for k in global_commits_lines_counter:
        output.append(f"* {k} {global_commits_lines_counter[k]/total_lines * 100:.2f} %\n")

    plot_sankey_lines(global_commits_files_name_line_counter, output)
    with open(f"doc/raport_{repo_name}.md", "w", encoding="utf-8") as f:
        f.writelines(output)
    return global_commits_files_name_line_counter
    
        


In [56]:
# Build the per-repository reports and merge their (file, change type)
# counters into one cross-repository Sankey diagram.
out = Counter()
# glob returns matches in arbitrary order; sort so the order of the links in
# all_commits.md is reproducible between runs.
for repo_name in sorted(glob.glob("../affected_commits/*")):
    rn = repo_name.split("/")[-1]
    out.update(parse_repository(rn))
    print(rn)

outlines = []
plot_sankey_lines(out, outlines)

with open("all_commits.md", "w", encoding="utf-8") as fout:
    fout.writelines(outlines)

xz-embedded
oss-fuzz
seatest
2023-05-13 00:44:41+08:00
2023-05-16 23:07:35+08:00
2023-12-20 22:43:44+08:00
2024-02-26 23:02:06+08:00
2023-05-02 20:39:56+08:00
2023-09-12 22:36:12+08:00
2023-05-13 21:21:54+08:00
2022-12-29 01:55:19+08:00
2023-02-22 20:59:41+08:00
2023-03-16 21:38:32+08:00
2023-06-27 17:27:09+03:00
2023-08-28 21:31:25+08:00
2022-09-21 20:29:28+08:00
2024-01-09 16:40:56+08:00
2023-12-16 20:28:21+08:00
2022-05-05 20:53:42+08:00
2023-01-12 22:58:36+08:00
2023-01-13 20:37:06+08:00
2023-02-10 21:35:23+08:00
2024-02-15 22:26:43+08:00
2023-10-18 19:57:10+08:00
2023-03-16 22:07:15+08:00
2023-12-21 16:39:53+08:00
2022-12-21 23:59:43+08:00
2023-01-06 00:02:29+08:00
2023-05-04 19:25:20+08:00
2023-12-07 21:48:07+08:00
2022-10-06 17:00:38+08:00
2022-08-17 20:20:16+08:00
2023-09-26 01:17:11+08:00
2023-03-17 01:30:36+08:00
2022-01-28 20:47:55+08:00
2024-02-07 20:56:57+08:00
2023-05-17 20:13:01+08:00
2023-02-03 22:52:55+08:00
2024-02-13 22:37:07+08:00
2023-05-31 20:15:53+08:00
2023-10-1

In [None]:
# Count directory-to-directory links per file (no change types) and print
# them as "source,target,count" rows.
# NOTE(review): `global_commits_files_name_counter` is not assigned at
# notebook level in the cells shown here (the only visible assignment is a
# local inside parse_repository), so this cell presumably relies on state
# left over from an earlier session -- confirm before running on a fresh
# kernel.
path_counter = Counter()
for file_path, count in global_commits_files_name_counter.items():
    parts = file_path.split("/")
    depth = len(parts)

    if depth == 1:
        # File in the repository root.
        path_counter[("project", "__/__")] += count
    elif parts[0]:
        path_counter[("project", parts[0])] += count

    if depth == 2:
        # File directly inside a top-level directory.
        path_counter[(parts[0], f"__{parts[0]}__")] += count

    # Consecutive directory pairs; the file name is never part of a pair.
    for parent, child in zip(parts[:-2], parts[1:]):
        if child and not parent:
            continue
        path_counter[(parent, child)] += count

for link, count in path_counter.items():
    print(f"{','.join(link)},{count}")