In [1]:
# Prerequisites for data collection
import json
import os
import shlex

import numpy as np
import pandas as pd

# Prerequisites for BoDeGHa
import prerequisites as pq

## Data Collection

### 1. Install Perceval

In [2]:
# Every `!` line runs in its own subshell, so a separate `!source venv/bin/activate`
# line cannot activate the environment for the lines that follow it. Call the
# virtual environment's own pip instead, so that Perceval ends up inside `venv`
# (the environment that is activated again right before Perceval is run).
!pip install virtualenv
!virtualenv venv
!venv/bin/pip install git+https://github.com/chaoss/grimoirelab-perceval

created virtual environment CPython3.8.8.final.0-64 in 780ms
  creator CPython3Posix(dest=/Users/natarajanchidambaram/Documents/GitHub/MSR2022_code/Hackathon-21/Code/venv, clear=False, no_vcs_ignore=False, global=False)
  seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/Users/natarajanchidambaram/Library/Application Support/virtualenv)
    added seed packages: pip==21.3.1, setuptools==59.6.0, wheel==0.37.0
  activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator
Collecting git+https://github.com/chaoss/grimoirelab-perceval
  Cloning https://github.com/chaoss/grimoirelab-perceval to /private/var/folders/77/sqj0d78x4833fl_fcc_d0jzm0000gn/T/pip-req-build-alrtwuld
  Running command git clone -q https://github.com/chaoss/grimoirelab-perceval /private/var/folders/77/sqj0d78x4833fl_fcc_d0jzm0000gn/T/pip-req-build-alrtwuld
  Installing build dependencies ... [?25ldone
[?25h  Getting re

In [3]:
def query(PERCEVAL_LOCATION, owner, repo, TOKEN, from_date='2021-12-01'):
    """
    Run Perceval's GitHub backend for one repository and return what it printed.

    args:
    PERCEVAL_LOCATION: shell command executed right before Perceval (in this notebook,
                       the command that activates the virtual environment). It is put
                       into the command line verbatim, so it must come from a trusted source.
    owner: repository owner
    repo: repository name
    TOKEN: GitHub API token for querying
    from_date: value passed to Perceval's --from-date option (defaults to the date
               that was previously hard-coded)

    return: everything the command wrote to standard output, as one string
            (the caller in this notebook parses it as one JSON document per line)

    Description: Connect to Perceval for querying the data corresponding to the required repository
    """

    # Quote every caller-supplied value so that it reaches Perceval as exactly one
    # argument and cannot inject additional shell commands.
    COMMAND_STRING = '{0} && perceval github {1} {2} --from-date {3} --json-line --sleep-for-rate -t {4}'.format(
        PERCEVAL_LOCATION,
        shlex.quote(str(owner)),
        shlex.quote(str(repo)),
        shlex.quote(str(from_date)),
        shlex.quote(str(TOKEN)),
    )

    # The context manager closes the pipe (and reaps the child process) once the output is read.
    with os.popen(COMMAND_STRING) as command_output:
        return command_output.read()

In [4]:
def fetch_and_save_data(PERCEVAL_LOCATION, REPOSITORY, TOKEN, FILE_SAVE_LOCATION):
    """
    Fetch the comments of every listed repository and save them as one CSV file per repository.

    args:
    PERCEVAL_LOCATION: Perceval tool's location (for passing it to query function)
    REPOSITORY: Iterable of full repository descriptions in the form of owner/repo
    TOKEN: GitHub API token (for passing it to query function)
    FILE_SAVE_LOCATION: Prefix (typically a directory ending in '/') under which the
                        processed data is saved as '<owner>_<repo>.csv'

    Description: Pass the repository information to query function, parse the returned
    JSON lines, extract the fields required by BoDeGHa's ML part (author, body, issue
    number, creation date and an 'empty' flag) and save them as .csv file
    """

    columns = ['author', 'body', 'number', 'created_at', 'empty']

    for repository in pq.tqdm(REPOSITORY):
        owner, repo = repository.split('/')[:2]
        raw_data = query(PERCEVAL_LOCATION, owner, repo, TOKEN)

        # Collect plain dicts and build the frame once at the end: growing a DataFrame
        # row by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
        records = []
        for raw_line in raw_data.splitlines():
            raw_line = raw_line.strip()
            if not raw_line:
                # Blank lines carry no item and would make json.loads fail.
                continue
            item = json.loads(raw_line)

            try:
                issue = item['data']
                issue_comments = list(issue['comments_data'])
            except (KeyError, TypeError):
                print('empty')
                continue

            for comment in issue_comments:
                # Handle each comment on its own so that one malformed comment (missing
                # body or author information) does not discard the remaining comments
                # of the same issue.
                try:
                    body = comment['body']
                    record = {
                        'author': comment['user_data']['login'],
                        'body': body,
                        'number': issue['number'],
                        'created_at': comment['created_at'],
                        # Bodies shorter than two characters are flagged as empty comments.
                        'empty': 1 if len(body) < 2 else 0,
                    }
                except (KeyError, TypeError):
                    print('empty')
                    continue
                records.append(record)

        formated_data = pd.DataFrame(records, columns=columns)

        csv_path = FILE_SAVE_LOCATION + '{0}_{1}.csv'.format(owner, repo)
        target_dir = os.path.dirname(csv_path)
        if target_dir:
            # Make sure the output directory exists before writing into it.
            os.makedirs(target_dir, exist_ok=True)
        formated_data.to_csv(csv_path)

In [5]:
PERCEVAL_LOCATION = 'source venv/bin/activate' # shell command run before perceval (activates the virtual environment)
REPOSITORY = ['SergioBenitez/Rocket', 'servo/servo'] # further candidate: 'rustwasm/wasm-pack'

# Never store an API token in the notebook itself: read it from the environment.
# NOTE: any token that was ever committed in this cell must be treated as leaked
# and revoked in the GitHub settings.
TOKEN = os.environ.get('GITHUB_TOKEN')
if not TOKEN:
    raise RuntimeError('Set the GITHUB_TOKEN environment variable to a GitHub API token before running this cell')

FILE_SAVE_LOCATION = r'Data/'

fetch_and_save_data(PERCEVAL_LOCATION, REPOSITORY, TOKEN, FILE_SAVE_LOCATION)

100%|██████████| 2/2 [04:48<00:00, 144.03s/it]


## BoDeGHa bot classification tool (ML part only)

### 1. Load the model 

In [7]:
def get_model():
    """
    Load and return BoDeGHa's trained machine learning model.

    The file 'model.json' is located through pkg_resources relative to the
    current module name and, despite its extension, is read with pickle.
    Unpickling executes code stored in the file, so only load a model file
    that comes from a trusted source.
    """

    model_path = pq.pkg_resources.resource_filename(__name__, 'model.json')
    with open(model_path, 'rb') as handle:
        return pq.pickle.load(handle)

In [17]:
# Processed comment file that BoDeGHa's trained ML part works on.
# Alternative input from the data collection step: 'Data/SergioBenitez_Rocket.csv'
file_name = r'Data/servo_servo.csv'

# The first CSV column holds the row index that was written when the file was saved.
comments = pd.read_csv(file_name, index_col=0)

In [18]:
# User-defined constants for executing BoDeGHa.
MIN_COMMENTS = 10   # authors need at least this many comments to be considered
MAX_COMMENTS = 100  # at most this many comments are kept per author
EXCLUDE = list()    # exclude these accounts
ACCOUNTS = list()   # consider only these accounts, e.g. ['JoshLind', 'stale[bot]', 'libra-action', 'dimroc']

In [19]:
# Keep, for every author with at least MIN_COMMENTS non-null comment bodies,
# that author's MAX_COMMENTS most recent comments.
body_counts = comments.groupby('author')['body'].count()
active_authors = body_counts[body_counts >= MIN_COMMENTS].index

newest_first = (
    comments[comments['author'].isin(active_authors)]
    .sort_values('created_at', ascending=False)
)
df = newest_first.groupby('author').head(MAX_COMMENTS)

In [20]:
# Apply the optional account filters (EXCLUDE removes accounts, ACCOUNTS restricts
# the analysis to the listed accounts), then warn when no account is left for the
# machine learning model.
if len(EXCLUDE) > 0:
    df = df[~df['author'].isin(EXCLUDE)]

if len(ACCOUNTS) > 0:
    df = df[df['author'].isin(ACCOUNTS)]

if len(df) < 1:
    # Report the configured threshold instead of a hard-coded number.
    print("At least {0} comments are required for each account to predict the type of the account".format(MIN_COMMENTS))

In [21]:
# Prepare one argument tuple per author for the feature computation: the account
# name, a private copy of that author's comments, the comment limit, and the
# settings naming the comment-similarity function (pq.average_jac_lev), the text
# column it reads ('body') and an 'eps' value.
# NOTE(review): the exact meaning of these settings is defined in `prerequisites` — confirm there.
inputs = [
    (
        author,
        group.copy(),
        MAX_COMMENTS,
        {'func': pq.average_jac_lev, 'source': 'body', 'eps': 0.5},
    )
    for author, group in df.groupby('author')
]

In [22]:
# Run pq.task on every prepared input (with a progress bar) and collect the
# returned rows in a data frame holding the classification features.
progress = pq.tqdm(
    inputs,
    desc='Computing features',
    smoothing=.1,
    bar_format='{desc}: {percentage:3.0f}%|{bar}',
)
data = [pq.task(item) for item in progress]

feature_table_columns = ['account', 'comments', 'empty comments', 'patterns', 'dispersion']
result = pd.DataFrame(data=data, columns=feature_table_columns)

Computing features: 100%|██████████


In [23]:
# Predict the type of every account: a model output of 1 is labelled 'Bot',
# anything else 'Human'.
feature_columns = ['comments', 'empty comments', 'patterns', 'dispersion']

# NOTE(review): the second argument is presumably a timeout — confirm in `prerequisites`.
model = pq.run_function_in_thread(get_model, 5)
is_bot = model.predict(result[feature_columns]) == 1
del model  # the model is not needed once the predictions are available

result = (
    result
    .assign(prediction=np.where(is_bot, 'Bot', 'Human'))
    .sort_values(['prediction', 'account'])
    .assign(patterns=lambda frame: frame['patterns'].astype('Int64'))
)

In [24]:
# Formatting and ordering: index the table by account and fix the column order.
ordered_columns = ['comments', 'empty comments', 'patterns', 'dispersion', 'prediction']
result = result.set_index('account')[ordered_columns]

In [25]:
# Render the final table: one row per account with its features and predicted type.
display(result)

Unnamed: 0_level_0,comments,empty comments,patterns,dispersion,prediction
account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bors-servo,100,0,11,0.141083,Bot
servo-wpt-sync,26,0,1,0.02597,Bot
CYBAI,40,0,21,0.09711,Human
highfive,21,0,6,0.174517,Human
jdm,100,0,53,0.10373,Human
negator,26,0,19,0.027161,Human
