In [1]:
import re
from os import environ

import matplotlib.pyplot as plt
import pandas as pd
import requests
from dotenv import load_dotenv

from tools import *

# Load variables from a local .env file into the process environment
# (presumably where the `user` and `token` entries read later are defined - confirm).
load_dotenv()

True

In [2]:
# Credentials come from the environment, never from literals in the notebook.
github_user = environ['user']
github_token = environ['token']

# `Github` is provided by the star import from `tools`.
g = Github(github_user, github_token)


## Retrieving Commits

We can retrieve the patch/diff information of a commit. Note that each commit's diff costs one API call, and the GitHub API is limited to 5,000 requests per hour, i.e. at most 5,000 commits can be scraped per hour. Let's sample some from SpeechBrain.

In [3]:
# Fetch the first two pages (100 commits each) of the repository's commit listing.
commit_pages = [
    g.get_commits('speechbrain/speechbrain', {'page': page_number, 'per_page': 100}).json()
    for page_number in range(1, 3)
]

# Keep only the SHA of every commit across the fetched pages.
commit_list = [commit_entry['sha'] for commit_entry in flatten(commit_pages)]

In [9]:
# Show the raw patch text of the first file touched by the first sampled commit.
first_commit = g.get_commit_diff('speechbrain/speechbrain', commit_list[0]).json()
first_commit['files'][0]['patch']

'@@ -18,7 +18,7 @@ Here is a list of the different languages that we tested within the CommonVoice\n | Language | CommonVoice Release | hyperparams file | LM | Val. CER | Val. WER | Test CER | Test WER | HuggingFace link | Model link | GPUs |\n | ------------- |:-------------:|:---------------------------:| -----:| -----:| -----:| -----:| -----:| :-----------:| :-----------:| :-----------:|\n | English | 2020-12-11 | train_en_with_wav2vec.yaml | No | 5.01 | 12.57 | 7.32 | 15.58 | Not Avail. | [model](https://drive.google.com/drive/folders/1tYO__An68xrM5pR1UIXzEkwzvKX2Tz2o?usp=sharing) | 2xV100 32GB |\n-| French | 2020-12-11 | train_fr_with_wav2vec.yaml | No | 2.60 | 8.59 | 3.19 | 9.96 | Not Avail. | [model](https://drive.google.com/drive/folders/1T9DfdZwcNI9CURxhLCi8GA5JVz8adiY8?usp=sharing) | 2xV100 32GB |\n+| French | 2020-12-11 | train_fr_with_wav2vec.yaml | No | 2.60 | 8.59 | 3.19 | 9.96 | [model](https://huggingface.co/speechbrain/asr-wav2vec2-commonvoice-fr) | [model](https://dri

## Filter words

I have listed some PyTorch layer names (plus a few hyperparameter names) to pick up while reading line diffs. This list is to be expanded with more hyperparameters, and the matches will eventually be grouped by layer/hyperparameter.

In [11]:
# Names searched for in changed diff lines: PyTorch layer class names plus a few
# hyperparameter identifiers. Fixes the 'MaxPoool3d' typo (which meant MaxPool3d
# was never detected) and drops the duplicated 'ConvTranspose1d' / 'Softmax2d'.
filter_words = [
    # Convolution
    'Conv1d', 'Conv2d', 'Conv3d',
    'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d',
    'LazyConv1d', 'LazyConv2d', 'LazyConv3d',
    'LazyConvTranspose1d', 'LazyConvTranspose2d', 'LazyConvTranspose3d',
    'Unfold', 'Fold',
    # Pooling
    'MaxPool1d', 'MaxPool2d', 'MaxPool3d',
    'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d',
    'AvgPool1d', 'AvgPool2d', 'AvgPool3d',
    'LPPool1d', 'LPPool2d', 'LPPool3d',
    'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d',
    'AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d',
    # Padding
    'ReflectionPad1d', 'ReflectionPad2d',
    'ReplicationPad1d', 'ReplicationPad2d', 'ReplicationPad3d',
    'ZeroPad2d',
    'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d',
    # Non-linear activations
    'ELU', 'Hardshrink', 'Hardsigmoid', 'Hardtanh', 'Hardswish', 'LeakyReLU',
    'LogSigmoid', 'MultiheadAttention', 'PReLU', 'ReLU', 'RReLU', 'SELU', 'CELU',
    'GELU', 'Sigmoid', 'SiLU', 'Mish', 'Softplus', 'Softshrink', 'Softsign',
    'Tanh', 'Tanhshrink', 'Threshold',
    'Softmin', 'Softmax', 'Softmax2d', 'LogSoftmax', 'AdaptiveLogSoftmaxWithLoss',
    # Normalisation
    'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'GroupNorm',
    'InstanceNorm1d', 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm',
    # Recurrent
    'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell',
    # Transformer / linear
    'Transformer', 'TransformerEncoder', 'TransformerDecoder',
    'Linear', 'Bilinear', 'LazyLinear', 'Identity',
    # Dropout
    'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout',
    # Sparse
    'Embedding',
    # Hyperparameter names
    'batch_size', 'num_epochs', 'epochs', 'n_hidden',
]

In [10]:
# Accumulator for the per-commit summaries built by the diff-scanning loop.
# Each entry has the shape [sha, [[filename, [[sign, matched_words], ...]], ...]].
commit_history: list = []


In [14]:
# Start from an empty list so that re-running this cell never appends duplicates.
commit_history = []

# Compile the alternation once instead of re-joining it twice per diff line.
# Regex alternation takes the first alternative that matches, so the words are
# ordered longest-first: otherwise 'RNN' shadows 'RNNCell', 'Dropout' shadows
# 'Dropout2d', 'Transformer' shadows 'TransformerEncoder', and so on.
filter_pattern = re.compile(
    '|'.join(
        re.escape(word)
        for word in sorted(set(filter_words), key=lambda w: (-len(w), w))
    )
)


def summarise_patch(patch):
    """Summarise the added/removed lines of a patch that mention a filter word.

    Args:
        patch: Unified-diff text of one file; may be an empty string.

    Returns:
        A list of ``[sign, words]`` pairs, one per qualifying line, where
        ``sign`` is ``'+'`` or ``'-'`` and ``words`` is the set of filter words
        found on that line. Lines without any filter word are omitted.
    """
    changes = []
    for line in patch.split('\n'):
        # Keep only lines longer than two characters that start with a single
        # '+' or '-'; a doubled sign ('++…' / '--…') is skipped, as before.
        if len(line) <= 2 or line[0] not in '+-' or line[1] == line[0]:
            continue
        words = set(filter_pattern.findall(line))
        if words:
            changes.append([line[0], words])
    return changes


for commit_sha in commit_list[:200]:
    try:
        commit = g.get_commit_diff('speechbrain/speechbrain', commit_sha).json()
        sha, changed_files = commit['sha'], commit['files']
    except KeyError:
        # Response without 'sha'/'files' (presumably an API error payload): skip it.
        continue

    try:
        file_summaries = [
            # 'patch' can be missing for a file; treat that as "no changed lines"
            # instead of discarding the whole commit.
            [changed_file['filename'], summarise_patch(changed_file.get('patch') or '')]
            for changed_file in changed_files
            if changed_file['filename'].split('.')[-1] == 'py'
        ]
    except (KeyError, TypeError) as err:
        # Still best-effort, but report the malformed payload rather than hiding it.
        print(f'skipping {sha}: unexpected commit payload ({err!r})')
        continue

    commit_history.append([sha, file_summaries])

In [15]:
# Display the collected summaries: [sha, [[filename, [[sign, matched_words], ...]], ...]] per commit.
commit_history

[['f968c46fe6e4eb07202e32ca54335112acc42195', []],
 ['6d44f957bad493bba63ffaac86201aa1f31f250c', [['speechbrain/core.py', []]]],
 ['b279e6fc038bbaf1ed0d5c5cdbb2636ee5806bba', [['speechbrain/core.py', []]]],
 ['8030182de32c3188e1a4103fbcd49be9f3b486b2',
  [['speechbrain/utils/metric_stats.py', []],
   ['tests/unittests/test_metrics.py', []]]],
 ['e14d2c15874a22df018edf441af62b105b33ed39',
  [['speechbrain/core.py',
    [['+', {'epochs'}], ['-', {'epochs'}], ['+', {'epochs'}]]]]],
 ['5cab96e21a1e793c8b294de21bc5e6d897e3e62b',
  [['speechbrain/utils/metric_stats.py', []]]],
 ['8b914ff01099ff879cebeabdd18cc207561a55ae',
  [['speechbrain/utils/metric_stats.py', []]]],
 ['e1b7ea21be4a6b711b6bd410f6cfb20427d82d03',
  [['speechbrain/core.py', [['-', {'epochs'}], ['+', {'epochs'}]]]]],
 ['3e4d85ae6f558742f4760bf328230086e54fccc1',
  [['speechbrain/core.py',
    [['+', {'epochs'}], ['-', {'epochs'}], ['+', {'epochs'}]]]]],
 ['01b84a7ef4c90cfffc1111e709c140080e630d1a',
  [['tests/unittests/test_m

We see that each commit has a list of files, each with a summary of its changed lines that mention one of the filter words (mostly layer changes). We can combine this with commit information for comprehensive tagging.