In [None]:
# Collect every commit id listed in the author-to-commit (a2c) map and write
# them out one per line, as input for the commands described in README.
import gzip

com = []
with gzip.open('/da4_data/play/botDetection/paper_a2c.gz', 'rt', encoding = 'iso-8859-15') as a2c_file:
    for record in a2c_file:
        # Rows are ';'-separated; the first field is skipped and every
        # remaining field is kept as a commit id.
        fields = record.strip().split(';')
        com.extend(fields[1:])

# The ids are joined with newlines; no trailing newline is written.
with gzip.open('/da4_data/play/botDetection/paper_commits.gz', 'wt') as commits_file:
    commits_file.write('\n'.join(com))

## This script assumes the following files are available:
1. author-to-commit (a2c) map for the suspected bots ('/da4_data/play/botDetection/paper_a2c.gz')
2. commit-to-content (c2cc) map for the commits in question ('/da4_data/play/botDetection/paper_cnt.gz')
3. commit-to-project (c2p) map for the commits ('/da4_data/play/botDetection/paper_c2p.gz')
4. commit-to-file (c2f) map for the commits ('/da4_data/play/botDetection/paper_c2f.gz')

See `README` for the corresponding commands using the World of Code tool.

## This script creates the following data:
1. Data for running BICA ('/da4_data/play/botDetection/test_Info_paper.csv.gz')
2. Data after running BIM ('/da4_data/play/botDetection/paper_template.out')

In [None]:
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner
import ast
import json
import gzip
import time

In [None]:
# Adding Timeout
import signal

class timeout:
    """Context manager that interrupts the enclosed block after a time limit.

    On entry a SIGALRM handler is installed and an alarm is scheduled. If the
    alarm fires before the block finishes, ``TimeoutError`` is raised inside
    the block. On exit the pending alarm is cancelled and the SIGALRM handler
    that was active before entry is restored, so the handler does not stay
    installed for code that runs after the ``with`` statement.

    Parameters
    ----------
    seconds : int
        Delay handed to ``signal.alarm``.
    error_message : str
        Message carried by the raised ``TimeoutError``.

    Notes
    -----
    Built on ``signal.SIGALRM`` / ``signal.alarm``; per the Python ``signal``
    documentation these are Unix-only and handlers can only be set from the
    main thread.
    """

    def __init__(self, seconds=1, error_message='Timeout'):
        self.seconds = seconds
        self.error_message = error_message
        # Handler that was installed before __enter__; put back in __exit__.
        self._previous_handler = None

    def handle_timeout(self, signum, frame):
        """Signal handler: raise ``TimeoutError`` with the configured message."""
        raise TimeoutError(self.error_message)

    def __enter__(self):
        # signal.signal returns the handler it replaces; remember it so that
        # __exit__ can undo the installation.
        self._previous_handler = signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, type, value, traceback):
        # Cancel the alarm first so it cannot fire while handlers are swapped.
        signal.alarm(0)
        # signal.signal returns None for handlers not installed from Python;
        # those cannot be re-installed, so they are left alone.
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)
            self._previous_handler = None

In [None]:
# Preparing Author - Commit Message Map for Generating Data for BIM

# pauthdict: first field of each a2c row -> {'commits': remaining fields,
# 'message': messages found for those commits (filled in below)}.
pauthdict = {}
with gzip.open('/da4_data/play/botDetection/paper_a2c.gz', 'rt', encoding = 'iso-8859-15') as f:
    for line in f:
        parts = line.strip().split(';')
        pauthdict[parts[0]] = {'commits':parts[1:], 'message':[]}

# pcc: first field of each content row -> message text. The message is taken
# as everything from the fourth field onwards, re-joined with ';' so that
# semicolons inside the message survive the split.
pcc = {}
with gzip.open('/da4_data/play/botDetection/paper_cnt.gz', 'rt', encoding = 'iso-8859-15') as f:
    for line in f:
        parts = line.strip().split(';')
        pcc[parts[0]] = ';'.join(parts[3:])

# Attach the available messages to each author. Commits without an entry in
# the content map are skipped; an explicit membership test is used instead of
# a bare `except:` so that unrelated errors (and Ctrl-C) are not swallowed.
for author_info in pauthdict.values():
    author_info['message'].extend(
        pcc[commit_id] for commit_id in author_info['commits'] if commit_id in pcc
    )

In [None]:
# Generating Data for BIM
#
# For every author the commit messages are grouped into "bins" of similar
# messages: a message joins the first bin whose first message it aligns with
# at more than `threshold` percent identity, otherwise it starts a new bin.
# One row "author;commits;bins;ratio" is written per author.
from collections import defaultdict
bin_threshold = [40]  # percent-identity cut-off(s) compared against percentIdentity()
id_threshold = 0.5 # 50 percent
max_bot_bin = 500  # binning of an author stops once more bins than this exist
# nbhumans = defaultdict(list)
# nbbots = defaultdict(list)

# `with` guarantees the output is flushed and closed even if the loop stops
# early or an exception escapes.
with open('/da4_data/play/botDetection/paper_template.out','wb') as fw:
    for threshold in bin_threshold:
        for author, author_info in pauthdict.items():
            msgs = author_info['message']
            print (author, len(msgs))

            if len (msgs) == 0:
                # No message was found for this author: there is nothing to
                # bin and bins/commits below would divide by zero.
                continue
            if len (msgs) == 1:
                ost = ';'.join([author, str(len(msgs)), str(1), str(1)])+'\n'
                fw.write(ost.encode('utf-8'))
                continue
            elif len (msgs) > 100000:
                # Too many messages to align; recorded as one bin with ratio 0.
                ost = ';'.join([author, str(len(msgs)), str(1), str(0)])+'\n'
                fw.write(ost.encode('utf-8'))
                continue
            bins = {}    # bin index -> list of (message, similarity) tuples
            bratio = 0   # forced to 1 when the bin limit is exceeded
            i = 0        # number of messages started so far
            try:
                with timeout(seconds=60):
                    for commit in msgs:
                        i += 1
                        if len(bins) == 0:
                            bins[0] = [(commit, 100)]
                        elif len(commit) >= 200:
                            # Long messages are not aligned; each gets its own bin.
                            bins[len(bins)] = [(commit, 100)]
                            continue
                        else:
                            # Sequences are lists of whitespace-separated tokens, e.g.
                            #   b = Sequence('what a beautiful day'.split())
                            #   a = Sequence('what a disappointingly bad day'.split())
                            a = Sequence(commit.split())
                            added = False
                            brflag = False
                            for bin_id in bins:
                                # Compare against the first message of the bin
                                # (first element of the first tuple in the list).
                                b = Sequence(bins[bin_id][0][0].split())
                                # Create a vocabulary and encode the sequences.
                                v = Vocabulary()
                                try:
                                    aEncoded = v.encodeSequence(a)
                                    bEncoded = v.encodeSequence(b)

                                    # Create a scoring and align the sequences using global aligner.
                                    scoring = SimpleScoring(2, -1)
                                    aligner = GlobalSequenceAligner(scoring, -2)
                                    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

                                    # Keep the best percent identity over the returned alignments.
                                    pi_max = 0
                                    for encoded in encodeds:
                                        alignment = v.decodeSequenceAlignment(encoded)
                                        percentIdentity = alignment.percentIdentity()
                                        if percentIdentity > pi_max:
                                            pi_max = percentIdentity

                                    if pi_max > threshold:
                                        # Store the similarity that justified the match
                                        # (the maximum, not the last alignment's value).
                                        bins[bin_id].append((commit, pi_max))
                                        added = True
                                        break
                                except (KeyboardInterrupt, TimeoutError):
                                    # Control-flow signals must reach the outer handlers;
                                    # TimeoutError is raised by the alarm handler and would
                                    # otherwise be mistaken for an alignment failure below.
                                    raise
                                except Exception:
                                    # Alignment failed: give up on this author's bins.
                                    brflag = True
                                    break
                            if brflag:
                                bins = {}
                                break
                            if added == False:
                                bins[len(bins)] = [(commit, 100)]
                            if len(bins) > max_bot_bin:
                                bratio = 1
                                break

            except KeyboardInterrupt:
                print ('KeyboardInterrupt')
                break
            except TimeoutError:
                print ('Timeout')
                # Report what was binned before the alarm fired. The ratio is
                # computed from the current author's partial result; `i`
                # includes the message that was in progress.
                partial_ratio = max(len(bins) / i, bratio) if i else bratio
                ost = ';'.join([author, str(i), str(len(bins.keys())), str(partial_ratio)])+'\n'
                fw.write(ost.encode('utf-8'))
                continue
            except Exception as e:
                print (e)
                break

            num_commits = len(msgs)
            ratio = max(len(bins.keys()) / num_commits, bratio)
            ost = ';'.join([author, str(num_commits), str(len(bins.keys())), str(ratio)])+'\n'
            fw.write(ost.encode('utf-8'))

### This Concludes Data Generation for BIM

Format of output data:

`Author ID; No. of Commits; No. of Bins; Ratio`

In [None]:
# Converting Clock Time of commit to standardized time with time zone correction
#
# Each input row has its third field, "<epoch seconds> <offset>", removed and
# three fields appended instead: the timestamp of the raw epoch, the timestamp
# of the epoch shifted by the offset, and the offset string itself.
#
# NOTE(review): datetime.fromtimestamp() without a tz argument renders the
# machine's local time, so both timestamps depend on the timezone of the
# machine running this cell - confirm this is intended.
fc = ''
from datetime import datetime
with gzip.open('/da4_data/play/botDetection/paper_cnt.gz', 'rt', encoding = 'iso-8859-15') as f, \
     gzip.open('/da4_data/play/botDetection/paper_cnt2.gz', 'wt', encoding = 'iso-8859-15') as wf:
    for line in f:
        parts = line.strip().split(';')
        # Deliberately not named `time`: that would replace the `time` module
        # imported earlier in the notebook with a string.
        time_fields = parts.pop(2).split()
        old = int(time_fields[0])
        tz = time_fields[1]

        # Offset is "+HHMM" / "-HHMM"; any other leading character leaves the
        # epoch unchanged.
        epoch = old
        if tz[0] == '+':
            epoch = old + (int(tz[1:3])*3600 + int(tz[3:5])*60)
        elif tz[0] == '-':
            epoch = old - (int(tz[1:3])*3600 + int(tz[3:5])*60)

        ts = datetime.fromtimestamp(epoch)
        oldts = datetime.fromtimestamp(old)

        fc = ';'.join(parts) +';'+str(oldts) +';'+str(ts) + ';'+str(tz)+'\n'
        wf.write(fc)

In [None]:
#Creating required dicts for data generation for BICA

# fc2content: commit id -> {'message', 'clock.time', 'timezone', 'files', 'projects'}
fc2content = dict()
with gzip.open('/da4_data/play/botDetection/paper_cnt2.gz', 'rt', encoding = 'iso-8859-15') as f:
    for line in f:
        parts = line.strip().split(';')
        if len(parts) < 6:
            # Fewer fields than the layout read below; skip instead of
            # failing with IndexError.
            continue
        # The three time fields are the last three of the row (raw timestamp,
        # shifted timestamp, offset), so they are read from the end. Reading
        # them by fixed position picks up message fragments whenever the
        # message itself contains ';'. For rows with exactly six fields this
        # selects the same values as positions 2, 3 and 5.
        fc2content[parts[0]] = {
            'message': ';'.join(parts[2:-3]),
            'clock.time': parts[-3],
            'timezone': parts[-1],
            # Defaults so that commits absent from the c2f / c2p maps still
            # expose these keys to later cells.
            'files': [],
            'projects': [],
        }


# Attach the de-duplicated file and project lists. Rows for commits that have
# no content entry are ignored.
# NOTE(review): a later row for the same commit replaces the earlier list -
# this assumes one row per commit in each map; TODO confirm.
for map_path, field in (('/da4_data/play/botDetection/paper_c2f.gz', 'files'),
                        ('/da4_data/play/botDetection/paper_c2p.gz', 'projects')):
    with gzip.open(map_path, 'rt', encoding = 'iso-8859-15') as f:
        for line in f:
            parts = line.strip().split(';')
            entry = fc2content.get(parts[0])
            if entry is None:
                continue
            entry[field] = list(set(parts[1:]))

In [None]:
#Create Final Dataset for BICA - no aliasing
#
# Writes one CSV row of activity features per author of the a2c map, using the
# per-commit information collected in `fc2content`.
import statistics as stat
from datetime import datetime, timedelta
from scipy.stats import circvar
from collections import Counter
from scipy.stats import iqr


def _tz_hours(ptz):
    """Map an offset string ('+HHMM', '-HHMM' or 'HHMM') to hours on a 12h-shifted scale.

    The 12 hour shift keeps negative offsets non-negative. The sign character
    is stripped before slicing: slicing '+0200' as [0:2] / [2:] reads it as
    0 hours and 200 minutes. Raises ValueError when the digits do not parse.
    """
    sign = -1 if '-' in ptz else 1
    digits = ptz.lstrip('+-')
    offset = timedelta(hours=int(digits[0:2]), minutes=int(digits[2:]))
    return (timedelta(hours=12) + sign*offset).total_seconds()/3600.0


def _stdev_or_zero(values):
    """Sample standard deviation, or 0 when fewer than two values are available."""
    return stat.stdev(values) if len(values) > 1 else 0


out = 'Author, No.Commit, Days.Active, Avg.Commit.pYear, Median.Commit.pYear, Activity.Hours, \
Spike.Hours, Circ.Variance.Hour, Tot.uniq.FilesChanged, Tot.FilesChanged, Uniq.File.Exten,\
Avg.File.pCommit, Std.File.pCommit, No.Timezones, Std.Timezones, Tot.Projects, Tot.uniq.Projects, \
Median.Project.pCommit, Std.Project.pCommit \n'

# `with` closes the output even if a row fails part-way through.
with gzip.open('/da4_data/play/botDetection/test_Info_paper.csv.gz','wt',  encoding = 'iso-8859-15') as f:
    f.write(out)
    with gzip.open('/da4_data/play/botDetection/paper_a2c.gz', 'rt', encoding = 'iso-8859-15') as cf:
        for line in cf:
            parts = line.strip().split(';')
            auth = parts[0]
            times, tz, files, n_files, proj, n_proj = ([] for _ in range(6))
            commits = parts[1:]
            nc = len(commits)   # counts every listed commit, with or without content
            print(auth, nc)
            for com in commits:
                cont = fc2content.get(com)
                if cont is None:
                    continue

                # Parse both time fields before touching any list so that a
                # malformed commit is skipped as a whole and the per-commit
                # lists stay aligned.
                try:
                    commit_time = datetime.strptime(cont['clock.time'], '%Y-%m-%d %H:%M:%S')
                    tz_hours = _tz_hours(cont['timezone'])
                except ValueError:
                    continue
                times.append(commit_time)
                tz.append(tz_hours)

                # Commits without a file / project row count as having none.
                commit_files = cont.get('files', [])
                commit_projects = cont.get('projects', [])
                files += commit_files
                n_files.append(len(commit_files))
                proj += commit_projects
                n_proj.append(len(commit_projects))

            if len(times)  == 0:
                continue
            #File
            tuf = len(set(files))
            tf = len(files)
            ufe = len(set([x.split('.')[-1] for x in files]))

            # The per-commit lists are non-empty here, so mean / median are
            # always defined; only the standard deviation needs a fallback
            # (it requires two values) and no longer zeroes the mean / median.
            afpc = stat.mean(n_files)
            vfpc = _stdev_or_zero(n_files)
            #tz
            ntz = len(set(tz))
            vtz = _stdev_or_zero(tz)
            #proj
            tp = len(proj)
            tup = len(set(proj))
            mppc = stat.median(n_proj)
            vppc = _stdev_or_zero(n_proj)
            #time
            yatd = max(times) - min(times)
            ya = yatd.days+1
            y_times = Counter([x.year for x in times])
            acpy = stat.mean(list(y_times.values()))
            mcpy = stat.median(list(y_times.values()))
            h_times = Counter([x.hour for x in times])
            # NOTE(review): circvar and iqr are applied to the per-hour commit
            # counts, not to the hours themselves - confirm this is intended.
            cvhour = circvar(list(h_times.values()))
            ih = iqr(list(h_times.values()))
            ahour = len(h_times.keys())
            sphour = len([x for x in h_times.values() if x > 1.5*ih])

            # join (commas are removed from the author so the CSV stays aligned)
            out = ','.join(str(x) for x in (auth.replace(',',''), nc,ya,acpy,mcpy,\
                                            ahour,sphour,cvhour,tuf,tf,ufe,afpc,vfpc,ntz,vtz,tp,tup,mppc,vppc))+'\n'
            f.write(out)

### This concludes data generation for BICA

Not all of these variables are used in our current model, but you may want to experiment with the other variables as well.