# Basic Git Commit Analyzer Model

In [57]:
from pathlib import Path

import numpy as np
import pandas as pd

In [58]:
# Paths
# The data directory sits two levels above the notebook's working directory.
data_dir = Path.cwd().parent.parent / "data"

In [114]:
# Constants
# Name of the repository to analyse; it is joined onto data_dir to locate the
# repository's index and per-commit parquet files.
repo_name = "tensorflow"

### Repository Statistics

In [198]:
# Resolve the repository's data directory and fail early when it is missing
repo_path = data_dir / repo_name
if not repo_path.exists():
    raise ValueError(f"Repo path does not exist: {repo_path}")

# Report where the data lives and how much disk space the parquet files take
print(f"Repo path: {repo_path}")
parquet_files = [path for path in repo_path.glob('**/*.parquet') if path.is_file()]
files_size = sum(path.stat().st_size for path in parquet_files)
print(f"Files size: {files_size / 1024:.2f} KB")

# Read the commit index (one row per indexed commit)
index_df = pd.read_csv(repo_path / "index.csv", encoding="utf-8-sig")

# Summarise the index: row count and available columns
print("# Indexed Files: ", len(index_df))
print(index_df.columns)

# Show the first index entry as an example
print()
print(index_df.iloc[0])

Repo path: C:\Users\punit\OneDrive\Documents\Projects\git-analyzer\data\tensorflow
Files size: 48019.57 KB
# Indexed Files:  2500
Index(['sha', 'node_id', 'commit', 'url', 'html_url', 'comments_url', 'author',
       'committer', 'parents'],
      dtype='object')

sha                      2a7ca476f44c58814d182c23d7f41a813468188e
node_id         C_kwDOArmXAtoAKDJhN2NhNDc2ZjQ0YzU4ODE0ZDE4MmMy...
commit          {'author': {'name': 'Jean-Baptiste Lespiau', '...
url             https://api.github.com/repos/tensorflow/tensor...
html_url        https://github.com/tensorflow/tensorflow/commi...
comments_url    https://api.github.com/repos/tensorflow/tensor...
author          {'login': 'jblespiau', 'id': 534945, 'node_id'...
committer       {'login': 'tensorflower-gardener', 'id': 17151...
parents         [{'sha': '10fe17255335e16aac9f828764050bd2c087...
Name: 0, dtype: object


### Helper Functions

In [152]:
def get_file(sha: str) -> pd.DataFrame:
    """
    Reads the parquet file stored for one commit and indexes it by file name.

    The frame is read from ``<repo_path>/<sha>.parquet``. A ``file`` column holding
    the last "/"-separated component of ``filename`` is derived and used as the index.
    :param sha: SHA used as the parquet file's base name.
    :return: Dataframe indexed by ``file``.
    """
    parquet_path = repo_path.joinpath(f"{sha}.parquet")
    changes = pd.read_parquet(parquet_path)
    # Keep only the base name of each path for a compact, readable index
    base_names = changes["filename"].map(lambda path: path.rsplit("/", 1)[-1])
    return changes.assign(file=base_names).set_index("file")

In [186]:
def parse_patch(
        patch: str,
        file_extension: str = None,
) -> list[list[list[str]]]:
    """
    Parses a diff patch into ordered sets of changed lines.

    Only lines starting with "+" or "-" are inspected; every other line (hunk
    headers such as "@@ ... @@", unchanged context lines) is ignored. For each
    inspected line the leading marker and surrounding whitespace are removed, and
    the line is dropped when it is empty or starts with the single-line comment
    token of the detected language.

    Grouping rule: a new set is started whenever a "-" line is met while additions
    have already been collected for the current set. The flush happens even when
    that "-" line itself is later dropped as blank or as a comment.

    :param patch: Patch string to parse. A value that is not a string (e.g. None or
        NaN for a file without a patch) or an empty string yields an empty list.
    :param file_extension: File extension, with or without a leading dot,
        case-insensitive. "py" selects "#" comments; anything else (including None)
        selects "//" comments.
    :return: List of ``[deletions, additions]`` pairs, each a list of stripped lines.
    """

    # Nothing to parse when the patch is missing or empty
    if not isinstance(patch, str) or not patch:
        return []

    # Normalise the extension so ".py" and "PY" are treated like "py"
    extension = file_extension.lower().lstrip(".") if isinstance(file_extension, str) else ""

    # Single-line comment token per extension; unknown extensions use C-style comments
    comment_tokens = {"py": "#"}
    comment_token = comment_tokens.get(extension, "//")

    # Output list and the set currently being collected
    parsed = []
    additions = []
    deletions = []

    for raw_line in patch.split("\n"):
        is_addition = raw_line.startswith("+")
        is_deletion = raw_line.startswith("-")

        # Get only modified lines
        if not (is_addition or is_deletion):
            continue

        # A deletion following collected additions closes the current set
        if is_deletion and additions:
            parsed.append([deletions, additions])
            additions = []
            deletions = []

        # Drop the +/- marker and surrounding whitespace
        content = raw_line[1:].strip()

        # Skip blank lines and single-line comments
        if not content or content.startswith(comment_token):
            continue

        if is_addition:
            additions.append(content)
        else:
            deletions.append(content)

    # Add last set if not empty
    if additions or deletions:
        parsed.append([deletions, additions])

    return parsed

### Load Files

In [142]:
# Pick a random commit index.
# A fixed seed keeps the pick, and every cell derived from it, identical across
# "Restart Kernel -> Run All"; change RANDOM_SEED to explore a different commit.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
# integers(n) draws uniformly from [0, n), i.e. a valid positional index into index_df
random_index = int(rng.integers(len(index_df)))
print(f"Random index: {random_index}")

Random index: 1143


In [185]:
# Look up the SHA at the chosen index position and load that commit's changed files
file_sha = index_df["sha"].iloc[random_index]
file_df = get_file(file_sha)

# Summarise the commit: its SHA, the number of changed files and per-file line counts
print(f"File (sha): {file_sha}")
print("# Changed: ", len(file_df))
change_columns = ["additions", "deletions", "changes"]
print(file_df[change_columns])

File (sha): b85eb7f22bde6822b686d54330d77cd419e23aa0
# Changed:  5
                                         additions  deletions  changes
file                                                                  
quantized_function_library.mlir                  3          1        4
replace_cast_hacks_with_tf_xla_ops.cc           79         52      131
replace_cast_hacks_with_tf_xla_ops.td           72          3       75
utils.td                                        13          0       13
replace_cast_hacks_with_tf_xla_ops.mlir         84          0       84


### Raw Patch

In [182]:
# Dump the raw patch of every changed file in the selected commit
separator = "=" * 80
for file_label, row in file_df.iterrows():
    # The index label is the short file name; the full path lives in "filename"
    print(f"File name: {file_label}")
    print(f"File path: {row['filename']}")
    print("Patch: ")
    print(row['patch'])
    print("\n", separator, "\n")

File name: quantized_function_library.mlir
File path: tensorflow/compiler/mlir/quantization/tensorflow/passes/quantized_function_library.mlir
Patch: 
@@ -167,9 +167,11 @@ module {
 
     %2 = "tf.Cast"(%filter) {Truncate = false} : (tensor<*xi8>) -> tensor<*xi32>
     %3 = "tf.Sub"(%2, %filter_zp) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32>
+    // Use identity op to avoid the filter being folded.
+    %identity = "tf.Identity"(%3) : (tensor<*xi32>) -> tensor<*xi32>
 
     %cast_1_f32 = "tf.Cast"(%1) {Truncate = false} : (tensor<*xi32>) -> tensor<*xf32>
-    %cast_3_f32 = "tf.Cast"(%3) {Truncate = false} : (tensor<*xi32>) -> tensor<*xf32>
+    %cast_3_f32 = "tf.Cast"(%identity) {Truncate = false} : (tensor<*xi32>) -> tensor<*xf32>
 
     // TODO(b/215633216): Optimize this function with the XLA convolution op.
     %5 = "tf.DepthwiseConv2dNative"(%cast_1_f32, %cast_3_f32) {


File name: replace_cast_hacks_with_tf_xla_ops.cc
File path: tensorflow/compiler/mlir/quantization/tensorf

### Parsed Patch

In [199]:
# Parsed patches keyed by a short "<parent dir>-<file name>" label
parsed_files = {}

# Iterate through each file and parse patch
for _file_label, row in file_df.iterrows():
    file_path = row["filename"]

    # Pass the extension (no dot, lower-case) so parse_patch filters comments with
    # the right token; without it every file is treated as C-style
    file_extension = Path(file_path).suffix.lstrip(".").lower()

    # A row may carry no textual patch (non-string value) - presumably binary or
    # oversized files; treat it as "no parsed changes" instead of crashing
    patch = row["patch"]
    parsed_patch = parse_patch(patch, file_extension) if isinstance(patch, str) else []

    # Cache File
    fn_split = file_path.split("/")
    _file_name = fn_split[-1] if len(fn_split) <= 1 else "-".join(fn_split[-2:])
    parsed_files[_file_name] = parsed_patch

    # Print File Name
    print(f"File: {_file_name}")

    # Print parsed patch, one block per [deletions, additions] set
    for _set in parsed_patch:
        print("_"*90, "\n")

        _dels = _set[0]
        _adds = _set[1]

        for _line in _dels:
            print(f"- {_line}")
        for _line in _adds:
            print(f"+ {_line}")

    print("\n", "="*90, "\n")

File: passes-quantized_function_library.mlir
__________________________________________________________________________________________ 

+ %identity = "tf.Identity"(%3) : (tensor<*xi32>) -> tensor<*xi32>
__________________________________________________________________________________________ 

- %cast_3_f32 = "tf.Cast"(%3) {Truncate = false} : (tensor<*xi32>) -> tensor<*xf32>
+ %cast_3_f32 = "tf.Cast"(%identity) {Truncate = false} : (tensor<*xi32>) -> tensor<*xf32>


File: passes-replace_cast_hacks_with_tf_xla_ops.cc
__________________________________________________________________________________________ 

- const int stride_h = strides[1].cast<mlir::IntegerAttr>().getInt();
- const int stride_w = strides[2].cast<mlir::IntegerAttr>().getInt();
+ const int stride_h = strides[1].cast<IntegerAttr>().getInt();
+ const int stride_w = strides[2].cast<IntegerAttr>().getInt();
__________________________________________________________________________________________ 

- const int dilation

### Basic Checks

In [201]:
# For every parsed file, total the kept addition and deletion lines across all sets
for _file_name, _parsed_patch in parsed_files.items():
    # Each set is a [deletions, additions] pair
    total_added = sum(len(added) for _removed, added in _parsed_patch)
    total_removed = sum(len(removed) for removed, _added in _parsed_patch)

    # Print stats
    print(f"File: {_file_name}")
    print(f"# Additions: {total_added}")
    print(f"# Deletions: {total_removed}")
    print(f"Net Change: {total_added - total_removed}")
    print("\n", "="*80, "\n")

File: passes-quantized_function_library.mlir
# Additions: 2
# Deletions: 1
Net Change: 1


File: passes-replace_cast_hacks_with_tf_xla_ops.cc
# Additions: 71
# Deletions: 49
Net Change: 22


File: passes-replace_cast_hacks_with_tf_xla_ops.td
# Additions: 58
# Deletions: 3
Net Change: 55


File: passes-utils.td
# Additions: 6
# Deletions: 0
Net Change: 6


File: tests-replace_cast_hacks_with_tf_xla_ops.mlir
# Additions: 66
# Deletions: 0
Net Change: 66


