## Get list of commits to analyse

In [2]:
import numpy as np
from git import Repo
import os
import pandas as pd
from datetime import datetime, timedelta
from git.exc import InvalidGitRepositoryError


repos_path = os.path.join('data', 'repos', 'simulation')
analyzed_repos_path = os.path.join('data', 'analyzed_repos_commits')
dj_file_path = os.path.join('apps', 'DJ', 'DesigniteJava.jar')

for folder in os.listdir(repos_path):
    folder_path = os.path.join(repos_path, folder)
    result_path = os.path.join(analyzed_repos_path, folder)
    if os.path.isdir(folder_path):
        
        try:
            repo = Repo(folder_path)
        except InvalidGitRepositoryError as e:
            print("Skipping this repository because its invalid")
            continue

        print("analyzing "+ repo.remotes.origin.url)
        if not repo.bare:
            commit_changes = []

            commits = list(repo.iter_commits())
            min_time_diff = timedelta(weeks=1)

            prev_commit = commits[0]
            for i in range(1, len(commits)):
                commit = commits[i]

                time_diff = datetime.fromtimestamp(prev_commit.committed_date) - datetime.fromtimestamp(commit.committed_date)

                if time_diff >= min_time_diff and commit.stats.total:
                    commit_datetime = datetime.fromtimestamp(commit.committed_date)
                    diff = repo.git.diff(prev_commit, commit, '--numstat').splitlines()
                    num_changes = len(diff)
                    commit_changes.append({'commit':commit, 'changes':num_changes, 'date':datetime.fromtimestamp(commit.committed_date)})

                    prev_commit = commit



            df = pd.DataFrame(commit_changes)
            print(df)

            if not df.empty:
                sorted_commits = df.sort_values(by=['changes', 'date'], ascending=[False, True])
                print(sorted_commits)
                res_path = os.path.join(result_path, "commits.csv")
                print(os.path.exists(result_path), result_path)
                if not os.path.exists(result_path):
                    print("Created new folder")
                    os.makedirs(result_path)

                sorted_commits.to_csv(res_path, index=False)

analyzing https://github.com/PhysicDev/1DcellularAutomata_Library
                                     commit  changes                date
0  c67df954684d26547962cf6514e9da4ad272c4bb        4 2021-09-17 12:39:23
                                     commit  changes                date
0  c67df954684d26547962cf6514e9da4ad272c4bb        4 2021-09-17 12:39:23
True data\analyzed_repos_commits\1DcellularAutomata_Library
analyzing https://github.com/YuYen/2048_simulator
Empty DataFrame
Columns: []
Index: []
analyzing https://github.com/quietsamurai98/2D-Accretion-Simulation
                                     commit  changes                date
0  e46ece97819cce013c21f31eac51599945cc8ba4        2 2016-10-18 14:50:09
1  e67c2ee8ecfb38916e8b02c2fd4dbe1ed920b64b        5 2016-09-23 18:42:13
2  f57f675e5ec5a645a82b9ddf202b708be6a0521a        5 2016-08-31 02:08:55
                                     commit  changes                date
2  f57f675e5ec5a645a82b9ddf202b708be6a0521a        5 2016-08-

KeyboardInterrupt: 

## Get Reports for each commit

In [7]:
from git import Repo
import pandas as pd
import os
import analyze_code
import shutil

repos_path = os.path.join('data', 'repos', 'simulation')
analyzed_repos_path = os.path.join('data', 'analyzed_repos', 'simulation')
analyzed_commits_path = os.path.join('data', 'analyzed_repos_commits')
dj_file_path = os.path.join('apps', 'DJ', 'DesigniteJava.jar')

for folder in os.listdir(analyzed_commits_path):
    print("Analyzing ", folder)
    folder_path = os.path.join(analyzed_commits_path, folder)

    res_path = os.path.join(folder_path, "commits.csv")
    df = pd.read_csv(res_path)
    df = df.sort_values('date', ascending=False)
    df_commits = df['commit'].tolist()
    print(df_commits)

    #check if the head compiled during analysis and skip entire preocess for no compilation 
    analyzed_repo_folder = os.path.join(analyzed_repos_path, folder)
    adf = pd.read_csv(os.path.join(analyzed_repo_folder, "MethodMetrics.csv"))

    if adf.empty:
        shutil.rmtree(folder_path)
        continue

    # check if folder is already analyzed
    if os.path.isfile(os.path.join(folder_path, "completed.txt")):
        continue

    res_path = os.path.join(folder_path, "commits.csv")
    df = pd.read_csv(res_path)
    df = df.sort_values('date', ascending=False)
    df_commits = df['commit'].tolist()
    print(df_commits)

    repo_path = os.path.join(repos_path, folder)
    repo = Repo(repo_path)
    head_commit = repo.head.commit.hexsha


    count=0
    print("Analyzing Head commit")
    result_path = os.path.join(folder_path, folder+str(count))
    commit_info = repo.commit(head_commit)
    analyze_code.analyze_repo(repo_path, result_path, os.path.abspath(dj_file_path))
    with open(os.path.join(result_path,'commit_date.txt'), 'w') as file:
        file.write(str(commit_info.authored_datetime))

    count+=1
    for commit in df_commits:
        print("Analyzing the ", count, "th commit of ", folder)
        repo.git.reset('--hard', commit)
        result_path = os.path.join(folder_path, folder+str(count))
        analyze_code.analyze_repo(repo_path, result_path, os.path.abspath(dj_file_path))
        commit_info = repo.commit(commit)
        with open(os.path.join(result_path,'commit_date.txt'), 'w') as file:
            file.write(str(commit_info.authored_datetime))


        if os.path.isfile(os.path.join(result_path, "MethodMetrics.csv")):
            # do a check to see if we get metrics and stop 
            adf = pd.read_csv(os.path.join(result_path, "MethodMetrics.csv"))
            if adf.empty:
                print("Finished Analysis of ", folder, " Early at ", count, "th step")
                shutil.rmtree(result_path)
                break

        count+=1

    # analyze_code.analyze_repo_multi(repo_path, folder_path, os.path.abspath(dj_file_path), main_branch)

    # set a file as a merker to indicate its finished analyzing
    end_file = open(os.path.join(folder_path, "completed.txt"), 'w')
    end_file.write("COMPLETED")
    end_file.close()
    print("Finished Analysis of ", folder)
    repo.git.reset('--hard', head_commit)
    print("Resetting repo back to head")

Analyzing  1DcellularAutomata_Library
['c67df954684d26547962cf6514e9da4ad272c4bb']
['c67df954684d26547962cf6514e9da4ad272c4bb']
Analyzing Head commit
Analyzingdata\repos\simulation\1DcellularAutomata_Library ...
Attempting compilation...
Did not compile
Analyzing ...
done analyzing.
Analyzing the  1 th commit of  1DcellularAutomata_Library
Analyzingdata\repos\simulation\1DcellularAutomata_Library ...
Attempting compilation...
Did not compile
Analyzing ...
done analyzing.
Finished Analysis of  1DcellularAutomata_Library
Resetting repo back to head
Analyzing  2D-Accretion-Simulation
['e46ece97819cce013c21f31eac51599945cc8ba4', 'e67c2ee8ecfb38916e8b02c2fd4dbe1ed920b64b', 'f57f675e5ec5a645a82b9ddf202b708be6a0521a']
['e46ece97819cce013c21f31eac51599945cc8ba4', 'e67c2ee8ecfb38916e8b02c2fd4dbe1ed920b64b', 'f57f675e5ec5a645a82b9ddf202b708be6a0521a']
Analyzing Head commit
Analyzingdata\repos\simulation\2D-Accretion-Simulation ...
Attempting compilation...
Did not compile
Analyzing ...


KeyboardInterrupt: 

## Group data by projects

In [16]:
import os
import pandas as pd
from datetime import  datetime

# Define the path to the main folder containing subfolders
main_folder = 'data\\analyzed_repos_commits'

# Initialize an empty list to store dataframes
idfs = []
ddfs = []
adfs = []
mdfs = []

for subfolder in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, subfolder)
    if os.path.isdir(subfolder_path):
        commit_number = 0
        for commit_subfolder in os.listdir(subfolder_path):
            print("Parsing ", commit_number, "th commit of ", subfolder)
            commit_subfolder_path = os.path.join(subfolder_path, commit_subfolder)

            if os.path.exists(commit_subfolder_path) and os.path.isdir(commit_subfolder_path):
                implementaion = os.path.join(commit_subfolder_path, "ImplementationSmells.csv")
                design = os.path.join(commit_subfolder_path, "DesignSmells.csv")
                arch = os.path.join(commit_subfolder_path, "ArchitectureSmells.csv")
                metrics = os.path.join(commit_subfolder_path, "TypeMetrics.csv")


                with open(os.path.join(commit_subfolder_path, 'commit_date.txt'), 'r') as file:
                    datetime_str = file.readline().strip()


                datetime_obj = datetime.strptime(datetime_str, '%Y-%m-%d %H:%M:%S%z')

                # Extract date from datetime object
                date = datetime_obj.date()


                try:
                    if os.path.exists(implementaion):
                        idf = pd.read_csv(implementaion,on_bad_lines='warn')
                        idf['Subfolder'] = subfolder
                        idf['commit_number'] = commit_number
                        idf['date'] = date
                        idfs.append(idf)

                    if os.path.exists(design):
                        ddf = pd.read_csv(design, on_bad_lines='warn')
                        ddf['Subfolder'] = subfolder
                        ddf['commit_number'] = commit_number
                        ddf['date'] = date
                        ddfs.append(ddf)

                    if os.path.exists(arch):
                        adf = pd.read_csv(arch, on_bad_lines='warn')
                        adf['Subfolder'] = subfolder
                        adf['commit_number'] = commit_number
                        adf['date'] = date
                        adfs.append(adf)

                    if os.path.exists(metrics):
                        mdf = pd.read_csv(metrics, on_bad_lines='warn')
                        mdf['Subfolder'] = subfolder
                        mdf['commit_number'] = commit_number
                        mdf['date'] = date
                        mdfs.append(mdf)

                except pd.errors.ParserError as e:
                    print("Error parsing ", subfolder)
                    print(e)


                commit_number +=1
            
 
irdf = pd.concat(idfs, ignore_index=True)
drdf = pd.concat(ddfs, ignore_index=True)
ardf = pd.concat(adfs, ignore_index=True)
mrdf = pd.concat(mdfs, ignore_index=True)

Parsing  0 th commit of  1DcellularAutomata_Library
Parsing  1 th commit of  1DcellularAutomata_Library
Parsing  2 th commit of  1DcellularAutomata_Library
Parsing  2 th commit of  1DcellularAutomata_Library


In [None]:
irdf.to_csv('irdf_1.csv')
drdf.to_csv('drdf_1.csv')
ardf.to_csv('ardf_1.csv')
mrdf.to_csv('mrdf_1.csv')