# RepoStats
Query and generate statistics and information from any local git repository. Supports 3 types of queries.

### Query for modifications and stats
Query within a range of commits (inclusive) for a list of modified files/folders (or ones with a modified file somewhere down the directory subtree). Result list contains only files/folders up to the specified depth relative to the specified directory. Various parameters can be set to restrict the definition of a "modified" file, which will only alter the result list and other stats computed based on modified files.

### Commits affecting paths
Get a dictionary of the number of modifications made by each commit to a path or within its subpaths.

### Existential test
Whether a file existed, came into existence, ceased to exist or never existed within the range of commits specified, for all the files that ever existed in the whole repo history.

## Required libraries

In [None]:
!python -m pip install tqdm pydriller numpy

## Imports

In [6]:
# from repostats import RepoStats, DotDict
from lib import RepoStats
from structures import DotDict
import sys

## Initialize RepoStats with a local repo

In [None]:
# load commits
# from_commit and to_commit represent the entire repo history available to the program
data = RepoStats(
    path = "path/to/git/repo/root",
    from_commit = "<full_length_40_char_hex_string_commit_hash>", # inclusive
    to_commit = "<full_length_40_char_hex_string_commit_hash>", # inclusive
)

In [None]:
# save data to disk
data.save("path/to/save/to/<path>_<from_commit>_<to_commit>.pkl")

In [None]:
# load data from disk
# NOTE(review): the .pkl extension suggests a pickle file — if so, only load files from sources you trust (confirm against RepoStats.load)
filename = "path/to/load/from/<path>_<from_commit>_<to_commit>.pkl"
data = RepoStats.load(filename)
data.update_methods(RepoStats)

In [None]:
# commits (warning: may be a lot)
data.commit_index.keys()

In [None]:
# just 2nd and 2nd last
l = list(data.commit_index.keys())
l[1], l[-2]

## Query for modifications and stats
Query within a range of commits (inclusive) for a list of modified files/folders (or ones with a modified file somewhere down the directory subtree). Result list contains only files/folders up to the specified depth relative to the specified directory. Various parameters can be set to restrict the definition of a "modified" file, which will only alter the result list and other stats computed based on modified files.

In [None]:
# query for modifications and stats
# percentage change is the number of inserted lines divided by the number of lines in the resultant file
# a changed line counts as one line deletion and one line insertion
query_params = DotDict(
    # required params
    commit1 = "<full_length_40_char_hex_string_commit_hash>", # Range query start (commit hash)
    commit2 = "<full_length_40_char_hex_string_commit_hash>", # Range query end (commit hash), set it to the same as commit1 for a single commit
    # optional params
    subfolder = "", # Subfolder (leave blank for root)
    depth = 1, # Output depth (search depth is infinite, negative to output everything)
    min_changes = 1, # Min number of times file is modified (inclusive)
    max_changes = -1, # Max number of times file is modified (inclusive, negative for infinity)
    avg_min = 0, # Min average percentage change of files considered to be modified (inclusive)
    avg_max = -1, # Max average percentage change of files considered to be modified (inclusive, negative for infinity)
    avg_variance_min = 0, # Min variance of the average percentage change of files considered to be modified (inclusive)
    avg_variance_max = -1, # Max variance of the percentage change of files considered to be modified (inclusive, negative for infinity)
    variance_min = 0, # Min average of how evenly spread are the lines changed in each file across the commits
    variance_max = -1, # Max average of how evenly spread are the lines changed in each file across the commits
    variance_variance_min = 0, # Min variance of the average of how evenly spread are the lines changed in each file across the commits
    variance_variance_max = -1, # Max variance of the average of how evenly spread are the lines changed in each file across the commits
    freq_avg_min = 0, # Min change frequency (seconds/change) of files considered to be modified (inclusive)
    freq_avg_max = -1, # Max change frequency (seconds/change) of files considered to be modified (inclusive, negative for infinity)
)

# run the query with the parameters defined above
stats = data.query(query_params)

In [None]:
# files/folders containing valid modifications within query range
stats.results

In [None]:
# list stat attributes
list(stats)

In [None]:
stats.files_mod, stats.files, stats.total, stats.total_weighted, stats.change_count

In [None]:
# sanity check: variance cannot be negative, so list any avg_variance entries below -epsilon
[(i, stats.avg_variance[i]) for i in stats.avg_variance if stats.avg_variance[i] < -sys.float_info.epsilon]

In [None]:
# sanity check: variance cannot be negative, so list any variance entries below -epsilon
[(i, stats.variance[i]) for i in stats.variance if stats.variance[i] < -sys.float_info.epsilon]

In [None]:
# negative variances due to catastrophic cancellation...
# should be ignored
[(i, stats.variance_variance[i]) for i in stats.variance_variance if stats.variance_variance[i] < -sys.float_info.epsilon]

In [None]:
[(i, f'{stats.freq_avg[i]} seconds or {stats.freq_avg[i]/3600:.2f} hours') for i in stats.freq_avg if stats.freq_avg[i] != float('inf')]

## Commits affecting paths
Get a dictionary of the number of modifications made by each commit to a path or within its subpaths.

In [None]:
# query for commits affecting paths (TODO: classify by commit type also)
commit_params = DotDict(
    # required params
    commit1 = "<full_length_40_char_hex_string_commit_hash>", # Range query start (commit hash)
    commit2 = "<full_length_40_char_hex_string_commit_hash>", # Range query end (commit hash), set it to the same as commit1 for a single commit
    # optional params
    subfolder = "", # Subfolder (leave blank for root)
)

commits = data.modifications(commit_params)

In [None]:
# number of files affected by each commit
commits

## Existential test
Whether a file existed, came into existence, ceased to exist or never existed within the range of commits specified, for all the files that ever existed in the whole repo history.

In [None]:
# query for file existence
exist_params = DotDict(
    # required params
    commit1 = "<full_length_40_char_hex_string_commit_hash>", # Range query start (commit hash)
    commit2 = "<full_length_40_char_hex_string_commit_hash>", # Range query end (commit hash), set it to the same as commit1 for a single commit
    # optional params
    subfolder = "", # Subfolder (leave blank for root)
)

# run the existence query with the parameters defined in this cell
exists = data.existence(exist_params)

In [None]:
# dictionary of {file path: existence}
exists