In [None]:
import re
from os import environ

import matplotlib.pyplot as plt
import pandas as pd
import requests
from dotenv import load_dotenv

from tools import *

# Load the variables from a local .env file into the process environment so
# that the `environ[...]` lookups later in the notebook can find them.
load_dotenv()

In [None]:
# API client used by every request below. The credentials come from the
# `user` and `token` environment variables (a missing one raises KeyError).
g = Github(
    environ['user'],
    environ['token'],
)

## Retrieving Commits

We can retrieve the patch/diff information of a commit. Note that each commit requires its own API call, and the GitHub API is limited to 5,000 requests per hour, i.e. at most 5,000 commits can be scraped per hour. Let's sample some commits from SpeechBrain.

In [None]:
# SHAs of the commits listed on pages 1 and 2 of the repository's commit
# listing (100 entries requested per page), flattened into a single list.
commit_list = [
    commit['sha']
    for commit in flatten([
        g.get_commits('speechbrain/speechbrain', {'page': page, 'per_page': 100}).json()
        for page in (1, 2)
    ])
]

In [None]:
# Preview: the patch text of the first file in the first sampled commit.
# Kept as the cell's last expression so the notebook displays it.
(
    g.get_commit_diff('speechbrain/speechbrain', commit_list[0])
    .json()['files'][0]['patch']
)

## Filter words

The list below contains the names of PyTorch layers (plus a few hyperparameter names) to look for while reading the diff lines. It is meant to be expanded with more hyperparameters, and eventually grouped by layer/hyperparameter.

In [None]:
# Names searched for in the added/removed lines of each diff.
#
# Every entry appears exactly once and is spelled as it would be written in
# code (the previous 'MaxPoool3d' typo could never match `MaxPool3d`).
#
# NOTE: when these names are joined into a regex alternation, an entry that is
# a prefix of another one (e.g. 'Softmax' / 'Softmax2d', 'RNN' / 'RNNCell')
# shadows the longer name unless the consumer tries longer names first.
filter_words = [
    # Convolution layers
    'Conv1d', 'Conv2d', 'Conv3d',
    'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d',
    'LazyConv1d', 'LazyConv2d', 'LazyConv3d',
    'LazyConvTranspose1d', 'LazyConvTranspose2d', 'LazyConvTranspose3d',
    'Unfold', 'Fold',
    # Pooling layers
    'MaxPool1d', 'MaxPool2d', 'MaxPool3d',
    'MaxUnpool1d', 'MaxUnpool2d', 'MaxUnpool3d',
    'AvgPool1d', 'AvgPool2d', 'AvgPool3d',
    'LPPool1d', 'LPPool2d', 'LPPool3d',
    'AdaptiveMaxPool1d', 'AdaptiveMaxPool2d', 'AdaptiveMaxPool3d',
    'AdaptiveAvgPool1d', 'AdaptiveAvgPool2d', 'AdaptiveAvgPool3d',
    # Padding layers
    'ReflectionPad1d', 'ReflectionPad2d',
    'ReplicationPad1d', 'ReplicationPad2d', 'ReplicationPad3d',
    'ZeroPad2d',
    'ConstantPad1d', 'ConstantPad2d', 'ConstantPad3d',
    # Activations
    'ELU', 'Hardshrink', 'Hardsigmoid', 'Hardtanh', 'Hardswish', 'LeakyReLU',
    'LogSigmoid', 'MultiheadAttention', 'PReLU', 'ReLU', 'RReLU', 'SELU',
    'CELU', 'GELU', 'Sigmoid', 'SiLU', 'Mish', 'Softplus', 'Softshrink',
    'Softsign', 'Tanh', 'Tanhshrink', 'Threshold',
    # Softmax family
    'Softmin', 'Softmax', 'Softmax2d', 'LogSoftmax', 'AdaptiveLogSoftmaxWithLoss',
    # Normalization layers
    'BatchNorm1d', 'BatchNorm2d', 'BatchNorm3d', 'GroupNorm',
    'InstanceNorm1d', 'InstanceNorm2d', 'InstanceNorm3d', 'LayerNorm',
    # Recurrent layers
    'RNN', 'LSTM', 'GRU', 'RNNCell', 'LSTMCell', 'GRUCell',
    # Transformer, linear and identity layers
    'Transformer', 'TransformerEncoder', 'TransformerDecoder',
    'Linear', 'Bilinear', 'LazyLinear', 'Identity',
    # Dropout layers
    'Dropout', 'Dropout2d', 'Dropout3d', 'AlphaDropout',
    # Embedding layers
    'Embedding',
    # Hyperparameter names
    'batch_size', 'num_epochs', 'epochs', 'n_hidden',
]

In [None]:
# Accumulator for the per-commit results; filled by the scraping loop in the
# next cell.
commit_history = list()


In [None]:
def _changed_line_matches(patch, pattern):
    """Collect the filter-word hits on the added/removed lines of one patch.

    Parameters
    ----------
    patch : str
        Patch text of a single file; it is split on newlines.
    pattern : re.Pattern
        Compiled regular expression matching any of the filter words.

    Returns
    -------
    list
        One ``[sign, words]`` pair per changed line that contains at least one
        match, where ``sign`` is the line's first character (``'+'`` or
        ``'-'``) and ``words`` is the set of distinct matches on that line.
    """
    hits = []
    for line in patch.split('\n'):
        # Keep only lines longer than two characters that start with '+' or
        # '-' and whose second character does not repeat the first one.
        if len(line) <= 2 or line[0] not in '+-' or line[1] == line[0]:
            continue
        words = set(pattern.findall(line))
        if words:  # lines without any filter word are ignored
            hits.append([line[0], words])
    return hits


# Build the pattern once instead of twice per diff line. Regex alternation
# takes the first alternative that matches, so longer names are tried first:
# otherwise 'Softmax' would shadow 'Softmax2d', 'RNN' would shadow 'RNNCell',
# and so on. Words are escaped so they are always matched literally.
filter_pattern = re.compile('|'.join(
    re.escape(word)
    for word in sorted(set(filter_words), key=lambda word: (-len(word), word))
))

# Start from a clean slate so that re-running this cell does not append the
# same commits a second time.
commit_history = []
# SHAs whose response could not be processed, kept for inspection instead of
# being dropped silently.
skipped_commits = []

for commit_sha in commit_list[:200]:
    try:
        commit = g.get_commit_diff('speechbrain/speechbrain', commit_sha).json()
        entry = [commit['sha'], [
            [
                file['filename'],
                # NOTE(review): GitHub appears to omit 'patch' for some files
                # (binary or very large diffs) - confirm against the API docs.
                # Such a file is treated as having no changed lines rather
                # than discarding the whole commit.
                _changed_line_matches(file.get('patch') or '', filter_pattern),
            ]
            for file in commit['files']
            if file['filename'].split('.')[-1] == 'py'  # Python files only
        ]]
    except (KeyError, TypeError):
        # The payload does not look like a commit (for example an API error
        # message with no 'files' entry); remember the SHA and move on.
        skipped_commits.append(commit_sha)
        continue
    commit_history.append(entry)

You can then inspect the collected commit history, which is stored in the `commit_history` list built above:

commit_history