In [1]:
import subprocess
import pandas as pd
from datetime import datetime
import re

In [2]:
class GitLogStatics:
    def __init__(self):
        return
    
    START_COMMIT_RECORD = '______START_COMMIT_RECORD______'
    COMMITID            = 'CommitId'
    SUBJECT             = 'Subject'
    AUTHOR              = 'Author'
    AUTHOR_EMAIL        = 'Author_email'
    AUTHOR_DATE         = 'Author_date'
    COMMITTER           = 'Committer'
    COMMITTER_EMAIL     = 'Committer_email'
    COMMITTER_DATE      = 'Committer_date'
    BODY_STARTS         = 'Body_Starts'
    END_BODY            = 'End_Body'
    FILES_CHANGED       = 'Files_changed'
    
    # It is important the the 'FORMAT' string contain no spaces. Otherwise the subprocess.communicate() call will fail
    # silently and produce no output. For that reason we use a '.' (dot) in lieu of ' ' (a space)
    FORMAT = '' \
    + '%Cred______START_COMMIT_RECORD______%Creset' \
    + '%n' + COMMITID + '..........%H' \
    + '%n' + SUBJECT + '...........%s' \
    + '%n' + AUTHOR + '............%an' \
    + '%n' + AUTHOR_EMAIL + '......%ae' \
    + '%n' + AUTHOR_DATE + '.......%ad' \
    + '%n' + COMMITTER + '.........%cn' \
    + '%n' + COMMITTER_EMAIL + '...%ce' \
    + '%n' + COMMITTER_DATE + '....%cd' \
    + '%n' + BODY_STARTS + '.......%b' \
    + '%n' + END_BODY \
    + '%n' + FILES_CHANGED 

In [3]:
class ChangedLine():
    
    def __init__(self):
        self.filename    = None
        self.loc         = None
        self.loc_added   = None
        self.loc_removed = None
        self.loc_changed = None
        self.other       = None
        
    def buildUp(self, filename, raw_loc_changes): 
        self.filename = filename.strip()
        
        self.loc, self.loc_added, self.loc_removed, self.loc_changed, self.other = self._parseLocChanges(raw_loc_changes)
        
    def _parseLocChanges(self, raw): # raw is like '109 +++++--?' or 'Bin 0 -> 5190 bytes'
        REGEX = '^[0-9]+'
        m = re.search(REGEX, raw)
        if m != None:
            loc = int(m.group(0))
            # Ratios among '+', '-', '?' indicate what proportion of the 'loc'-many lines were added, removed, or changed
            p = raw.count('+')
            m = raw.count('-')
            q = raw.count('?')
            t = p + m + q
            added = 0
            removed = 0
            changed = 0
            if t != 0:
                added = loc * p/t
                removed = loc* m/t
                changed = loc* q/t
            return loc, added, removed, changed, None
        else:
            return 0,0,0,0, raw

In [4]:
class CommitParcel():
    def __init__(self):
        self.commitId        = None
        self.subject         = None
        self.author          = None
        self.author_email    = None
        self.author_date     = None
        self.committer       = None
        self.committer_email = None
        self.committer_date  = None
        self.body            = None
        self.files_changed   = []
        
    def buildUp(self, commit_lines):
        
        self.commitId        = self._stripLabel(commit_lines[1], GitLogStatics.COMMITID)
        self.subject         = self._stripLabel(commit_lines[2], GitLogStatics.SUBJECT)
        self.author          = self._stripLabel(commit_lines[3], GitLogStatics.AUTHOR)
        self.author_email    = self._stripLabel(commit_lines[4], GitLogStatics.AUTHOR_EMAIL)
        self.author_date     = self._stripLabel(commit_lines[5], GitLogStatics.AUTHOR_DATE)
        self.committer       = self._stripLabel(commit_lines[6], GitLogStatics.COMMITTER)
        self.committer_email = self._stripLabel(commit_lines[7], GitLogStatics.COMMITTER_EMAIL)
        self.committer_date  = self._stripLabel(commit_lines[8], GitLogStatics.COMMITTER_DATE)
        
        next_idx = self._readBody(commit_lines)
        self._readFilesChanged(commit_lines, next_idx)
        
    def _readBody(self, commit_lines):       
            idx = 9
            
            result = self._stripLabel(commit_lines[idx], GitLogStatics.BODY_STARTS)

            idx += 1
            while idx < len(commit_lines):
                if self._hasLabel(commit_lines[idx], GitLogStatics.END_BODY): # Done with body
                    break
                result += '\n' + commit_lines[idx]
                idx += 1
            self.body = result
            return idx
        
    def _readFilesChanged(self, commit_lines, start_idx):
        #Label indicating when this beings is GitLogStatics.FILES_CHANGED
        idx = start_idx
        while idx < len(commit_lines):
            line = commit_lines[idx]
            if line.find('|') != -1:
                tokens = line.split('|')
                assert len(tokens)== 2, tokens
                change = ChangedLine()
                change.buildUp(tokens[0].strip(), tokens[1].strip())  
                self.files_changed.append(change)
            idx += 1
            
    def _stripLabel(self, line, label): 
            REGEX  = '^' + label + '[.]*' # like 'Subject...........'
            m = re.search(REGEX, line)
            assert m!=None, line
            prefix = m.group(0)  
            return line[len(prefix):].strip()
        
    def _hasLabel(self, line, label):
            REGEX  = '^' + label + '[.]*' # like 'Subject...........'
            m = re.search(REGEX, line)
            if m==None:
                return False
            else:
                return True

In [5]:
class FilenameParser:   
    # -filename: a string, for the full path in a GIT repo for an artifact, with '/' separators. Might look like:
    #
    #     'MisysPD/UniversalBanking/BFUBInfrastructure/src/com/misys/ub/fatoms/batch/common/uxEnhancement/BatchUXEnhancementAccumulator.java'
    #
    # -layer_depth: integer stating how many folders in the filename's path correspond to the highest level (self.layer)
    #               as apposed to a submodule. In the above example a 'layer_depth' of 2 results in
    #
    #           self.layer     = 'MisysPD/UniversalBanking'
    #           self.submodule = 'BFUBInfrastructure'
    #
    # -package_folders: a list of strings, each of them being the name used for folders containing packages.
    #              For example, for Java the 'package_folder' is usually '[src]' or possibly '[src, src-gen]'
    #              if there are generated Java source files
    def __init__(self, filename, layer_depth=2, package_folders=['src']):
        self.filename          = filename
        self.layer_depth       = layer_depth
        self.package_folders   = package_folders
        
        self.artifact_type     = None
        self.layer             = None
        self.submodule         = None 
        self.package_type      = None
        self.package           = None
        self.classname         = None
        self.symbolic_link     = None
        
    def parse(self):
        
        #  Comments below are with regards to this example for 'self.filename':
        #
        # 'MisysPD/UniversalBanking/BFUBInfrastructure/src/com/misys/ub/fatoms/batch/common/
        #                                                 uxEnhancement/BatchUXEnhancementAccumulator.java'
        #
        # If 'self.filename' does not include a 'src' folder in its path, then the 'self.package' is the full path
        
        massaged_filename, link = self._stripSymbolicLinks()
        self.symbolic_link      = link
        tokens                  = massaged_filename.split('/')

        self.artifact_type      = self._extractArtifactType(tokens) # 'java'

        idx_module_ends         = len(tokens)-2 #Default value, may be changed in loop below if we have a source file
        self.package_type       = ''            #Default value, may be changed in loop below if we have a source file
        self.package            = ''            #Default value, may be changed in loop below if we have a source file
        for src in self.package_folders:
            if src in tokens:
                idx_module_ends   = tokens.index(src)
                self.package_type = src
                self.package      = '.'.join(tokens[idx_module_ends+1:-1]) # like 'com.misys.ub.fatoms.batch.common.uxEnhancement'
                break #Found it, so no need to keep searching

        module_tokens       = tokens[:idx_module_ends] # like '[MisysPD, UniversalBanking, BFUBInfrastructure]
        DEPTH               = self.layer_depth
        self.layer          = '/'.join(module_tokens[:DEPTH]) # like 'MisysPD/UniversalBanking'
        self.submodule      = '/'.join(module_tokens[DEPTH:]) # like 'BFUBInfrastructure'
                                          
        self.classname      = tokens[-1:][0]  # like 'BatchUXEnhancementAccumulator.java'
        
    # Used to massage filenames that contain symbolic links, by replacing the link by the actual file. For example,
    # if 'self.filename' is:
    # 
    #     'MisysPD/UniversalBanking/{BFUBRetail/src-gen => ReferencedBOs/src}/com/trapedza/bankfusion/bo/refimpl/IBOUB_BLK_StaticAccountBlock.java'
    #
    # then the symbolic link portion '{BFUBRetail/src-gen => ReferencedBOs/src}' gets replaced by 'ReferencedBOs/src'
    def _stripSymbolicLinks(self):
        REGEX = '{[ _.0-9a-zA-Z/-]+ => [ _.0-9a-zA-Z/-]+}'
        m = re.search(REGEX, self.filename)
        if m != None:
            link = m.group(0) # Like '{BFUBRetail/src-gen => ReferencedBOs/src}'
            link_tokens = link[1:-1].split(' => ') # like '[BFUBRetail/src-gen, ReferencedBOs/src]'
            massaged_filename = self.filename.replace(link, link_tokens[1])
            return massaged_filename, link
        else:
            return self.filename, ''

        
    def _extractArtifactType(self, tokens):
        pathless_filename = tokens[-1:][0] # Strips the pathname, leaving like BatchUXEnhancementAccumulator.java'
        words             = pathless_filename.split('.')
        if len(words) < 2:
            return '' # There is no sufix to the file
        else:
            #return words[-1:][0] # like 'java' or, if multiple suffixes are used, e.g. like 'properties.bak'
            return '.'.join(words[1:]) # like 'java' or, if multiple suffixes are used, e.g. like 'properties.bak'

In [6]:
class GitLogParser:
    # -git_directory: a string with the full path to the directory in which the git project exists and git commands
    #                 can be run successfully. For example, c:/Alex/Code/Essence/ubrepos'
    # -after_date: either None, or must be a string in format MM/DD/YY, such as '06/01/18' for the 1st of June, 2018.
    #              Used to filter log entries to only include commits after that date.
    # -before_date: similar to 'after_date', but excluding commits before the date.
    # -max_commits: integer used to restrict how many commits to retrieve. If set to 'None' then no restriction is imposed.
    # -diff_line_width: integer to determine how many characters to include in the diff lines product by 
    #                   'git log --stat'. Should be big enough that paths of changed files are not truncated, since
    #                   the 'GitLogParser' needs the full paths to accurately match changed files to entries
    #                   in the git repo. The default value of 350 should usually be enough.
    # -layer_depth: integer stating how many folders in the filename's path correspond to the highest level (self.layer)
    #               as apposed to a submodule. In the above example a 'layer_depth' of 2 results in
    #
    #           self.layer     = 'MisysPD/UniversalBanking'
    #           self.submodule = 'BFUBInfrastructure'
    #
    # -package_folders: a list of strings, each of them being the name used for folders containing packages.
    #              For example, for Java the 'package_folder' is usually '[src]' or possibly '[src, src-gen]'
    #              if there are generated Java source files
    def __init__(self, git_directory, after_date, before_date=None, max_commits=None, diff_line_width=350,
                layer_depth=2, package_folders = ['src']):
        self.git_directory   = git_directory
        self.after_date      = after_date
        self.before_date     = before_date
        self.max_commits     = max_commits
        self.diff_line_width = diff_line_width
        self.layer_depth     = layer_depth
        self.package_folders = package_folders
        self.log_tokens      = None
        self.git_command     = None
        self.parcels         = None

    def _buildGitCommand(self):
        
        FILTERS = ''
        if self.after_date != None:  
            FILTERS += ' --after=' + self.after_date
        if self.before_date != None:
            FILTERS += ' --before=' + self.before_date
        if self.max_commits != None:
            FILTERS += ' -n ' + str(self.max_commits)

        GIT_CMD = 'git log --date=short --format=' + GitLogStatics.FORMAT \
        + ' --stat --stat-width=' + str(self.diff_line_width) \
        + FILTERS
        
        return GIT_CMD
    
    def _generateLogTokens(self):
        self.git_command = glog._buildGitCommand()
        process          = subprocess.Popen(self.git_command.split(), cwd=self.git_directory, stdout=subprocess.PIPE)
        output, error    = process.communicate()
        s                = output.decode('utf-8')
        self.log_tokens  = s.split('\n')
        
    def parse(self):
        
        self._generateLogTokens()
        self.parcels = self._parcelOutCommits(self.log_tokens)
        return self._build_df()
        
    def _build_df(self):
        commitId_list        = []
        subject_list         = []
        author_list          = []
        author_email_list    = []
        author_date_list     = []
        committer_list       = []
        committer_email_list = []
        committer_date_list  = []
        body_list            = []
        filename_list        = []
        loc_list             = []
        loc_added_list       = []
        loc_removed_list     = []
        loc_changed_list     = []
        other_list           = []
        artifact_type_list   = []
        layer_list           = []
        submodule_list       = []
        package_list         = []
        package_type_list    = []
        classname_list       = []
        symbolic_link_list   = []

        for p in self.parcels:
            for f in p.files_changed:
                
                commitId_list        .append(p.commitId)
                subject_list         .append(p.subject)
                author_list          .append(p.author) 
                author_email_list    .append(p.author_email)
                author_date_list     .append(p.author_date)
                committer_list       .append(p.committer)
                committer_email_list .append(p.committer_email)
                committer_date_list  .append(p.committer_date)
                body_list            .append(p.body)
                filename_list        .append(f.filename)
                loc_list             .append(f.loc)
                loc_added_list       .append(f.loc_added)
                loc_removed_list     .append(f.loc_removed)
                loc_changed_list     .append(f.loc_changed)
                other_list           .append(f.other)

                fp = FilenameParser(f.filename, layer_depth=self.layer_depth, package_folders=self.package_folders)
                fp.parse()
                
                artifact_type_list    .append(fp.artifact_type)
                layer_list            .append(fp.layer)
                submodule_list        .append(fp.submodule)
                package_type_list     .append(fp.package_type)
                package_list          .append(fp.package)
                classname_list        .append(fp.classname)
                symbolic_link_list    .append(fp.symbolic_link)

        result_dict = {'CommitId(s)': commitId_list, 'Artifact Type': artifact_type_list, 
                       'Submodule': submodule_list, 'Package Type': package_type_list,
                       'Package': package_list, 'Classname': classname_list,
                       'Loc': loc_list, 'Loc+': loc_added_list, 'Loc-': loc_removed_list, 'Loc?': loc_changed_list,
                       'Loc other': other_list, 'Subject': subject_list, 'Body': body_list,
                       'Layer': layer_list, 'Symbolic Link': symbolic_link_list,'Filename': filename_list, 
                      'Author(s)': author_list, 'Author(s) e-mail': author_email_list, 'Author(s) date': author_date_list,
                      'Committer(s)': committer_list, 'Committer(s) e-mail': committer_email_list, 
                       'Comitter(s) date': committer_date_list,
                      }
        df = pd.DataFrame(result_dict)
        return df
        
    def _parcelOutCommits(self, tokens):
        cursor = 0
        parcelled_commits= []
        while cursor < len(tokens):
            n = self._getNextCommitLines(tokens, cursor)
            if n == None:
                break # We are done, didn't find a parcel
            parcel = CommitParcel()
            parcel.buildUp(n[0])
            parcelled_commits.append(parcel)
            cursor = n[1]
        return parcelled_commits

    def _getNextCommitLines(self, tokens, cursor):
        while tokens[cursor] != GitLogStatics.START_COMMIT_RECORD:
            cursor += 1
            if len(tokens) <= cursor: #No more tokens to see
                return None
        #Found where next commit starts
        commit_start_idx = cursor

        cursor += 1
        #search for where commit ends
        while tokens[cursor] != GitLogStatics.START_COMMIT_RECORD:
            cursor += 1
            if len(tokens) <= cursor: #No more tokens to see
                break

        commit_end_idx = cursor
        return tokens[commit_start_idx: commit_end_idx], cursor 

In [7]:
class GitLogAggregationEngine():
    
    #
    # -glogdf: a DataFrame, as built by the GitLogParser after doing a full parsing run
    def __init__(self, glog_df):
        self.glog_df      = glog_df
        
        self.artifacts_df = None
        self.modules_df   = None
        self.vol_df       = None
        
    # Saves all the dataframes in that are part of the state of 'self'
    def save_all(self, directory):
        if self.glog_df is not None:
            self.glog_df.to_csv(directory + '/raw_parsed_git_log.csv')
        if self.artifacts_df is not None:
            self.artifacts_df.to_csv(directory + '/by_artifact_parsed_git_log.csv')
        if self.modules_df is not None:
            self.modules_df.to_csv(directory + '/by_module_parsed_git_log.csv')
        if self.vol_df is not None:
            self.vol_df.to_csv(directory + '/by_volatility_parsed_git_log.csv')
    
    def aggregateByArtifact(self):
        artifacts_df      = self.glog_df.groupby(['Filename', 'Package Type']).apply(self._collapseFilenameMultiplicities)
        artifacts_df.sort_values(by=['# commits'], ascending=False, inplace=True)
        artifacts_df      = artifacts_df.reset_index()
        cols              = list(artifacts_df.columns)
        cols.remove('level_2')
        cols              = cols[1:] + cols[:1]
        artifacts_df      = artifacts_df[cols]
        self.artifacts_df = artifacts_df
        
        return self.artifacts_df
    
    def aggregateByModule(self):
        if self.artifacts_df is not None: # Optimization: re-use partial aggregation already done to file level
            input_df = self.artifacts_df
        else:
            input_df = self.glog_df # Do from scratch: aggregate from each <commit, file> pair
            
        modules_df = input_df.groupby(['Layer', 'Submodule', 'Package Type']).apply(self._collapseModuleMultiplicities)
        modules_df = modules_df.reset_index()
        cols = list(modules_df.columns)
        cols.remove('level_3')
        modules_df = modules_df[cols]
        modules_df.sort_values(by=['# commits'], ascending=False, inplace=True)
        
        self.modules_df = modules_df
        
        return self.modules_df
    
    # Returns a dataframe, 1 row per module, of all the modules where the changed loc exceeds the given
    # 'loc_limit'
    # Assumes that self.modules_df has already been computed (for example, by calling self.aggregateByModule)
    #
    # -loc_limit: an integer stating the lower bound for how many lines of code (loc) must have changed for
    #             a module to quality as volatile
    def buildVolatility_df(self, loc_limit):
        m_df = self.modules_df
        if m_df is None:
            return None
        vol_df = m_df[m_df['Loc'] > loc_limit]
        vol_df.sort_values(by=['Loc'], ascending=False, inplace=True)
        
        self.vol_df = vol_df
        return self.vol_df

    def _collapseFilenameMultiplicities(self, df):
        result_df = pd.DataFrame()

        assert df['Artifact Type'] .unique().size==1
        assert df['Layer']         .unique().size==1
        assert df['Submodule']     .unique().size==1
        assert df['Classname']     .unique().size==1
    
        result_df['Artifact Type']  = [df['Artifact Type'].iloc[0]]
        result_df['Layer']          = [df['Layer'].iloc[0]]
        result_df['Submodule']      = [df['Submodule'].iloc[0]]
        result_df['Classname']      = [df['Classname'].iloc[0]]
        
        self._aggregateMetrics(df, result_df)
        
        return result_df
    
    def _collapseModuleMultiplicities(self, df):
        result_df = pd.DataFrame()

        result_df['Artifact Types'] = [list(df['Artifact Type'].unique())]
        result_df['# files changed'] = [df['Filename'].unique().size]
        result_df['files changed'] = [list(df['Filename'])]
        
        self._aggregateMetrics(df, result_df)

        return result_df
    
    def _aggregateMetrics(self, input_df, result_df):

        author_list                   = self._mergeLists(list(input_df['Author(s)'])) #Avoid duplicates
        commit_ids                    = self._mergeLists(list(input_df['CommitId(s)']))
        if type(commit_ids) is not list:
            result_df['# commits']        = [1]                
        else:    
            result_df['# commits']        = [len(commit_ids)]                

        result_df['# authors']        = [len(author_list)]
        result_df['Loc']              = [input_df['Loc'].sum()]
        result_df['Loc+']             = [input_df['Loc+'].sum()]
        result_df['Loc-']             = [input_df['Loc-'].sum()]
        result_df['Loc?']             = [input_df['Loc?'].sum()]
        
        result_df['Loc other']        = [self._mergeLists(list(input_df['Loc other']))] #Avoid duplicates
        result_df['CommitId(s)']      = [commit_ids]
        result_df['Author(s)']        = [author_list]
        result_df['Author(s) e-mail'] = [self._mergeLists(list(input_df['Author(s) e-mail']))] #Avoid duplicates
        
    # Merges a list of elements under an agreed approach of defaulting to scalars for empty or singleton lists. 
    # Thus, elements which are themselves lists are concatenated with the result, whereas elements that are not lists
    # are treated as scalars and inserted to the resulting list.
    # Duplicates are avoided, and if the resulting list is a singleton then the unique element of the resulting
    # list is returned. Otherwise the resulting list is returned.
    #
    # -list_of_elts: a list where each element is either a string or another list
    def _mergeLists(self, list_of_elts):
        raw_merge = []
        for elt in list_of_elts:
            if type(elt)==list:
                raw_merge.extend(elt)
            else:
                raw_merge.append(elt)
        # Now eliminate duplicates
        no_duplicates_merge = list(set(raw_merge))
        if len(no_duplicates_merge)==1:
            return no_duplicates_merge[0]
        else:
            if len(no_duplicates_merge)==0:
                return ''
            else:
                return no_duplicates_merge

In [8]:
GIT_DIR  = 'c:/Alex/Code/Essence/ubrepos'
DATA_DIR = 'c:/Alex/Code/Essence/alex_analysis/data/ubrepos'
glog   = GitLogParser(GIT_DIR, '06/01/18', max_commits=None, package_folders=['src', 'src-gen', 'src-api', 'src-test'])
df     = glog.parse()
agg    = GitLogAggregationEngine(df)
a_df   = agg.aggregateByArtifact()
m_df   = agg.aggregateByModule()
vol_df = agg.buildVolatility_df(10000)
agg.save_all(DATA_DIR)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [9]:
df.shape, a_df.shape, m_df.shape, vol_df.shape

((19326, 22), (10194, 16), (178, 16), (35, 16))

In [10]:
df[:5]

Unnamed: 0,CommitId(s),Artifact Type,Submodule,Package Type,Package,Classname,Loc,Loc+,Loc-,Loc?,...,Body,Layer,Symbolic Link,Filename,Author(s),Author(s) e-mail,Author(s) date,Committer(s),Committer(s) e-mail,Comitter(s) date
0,9139a3e30c928abbb94fe11d7fc14ea67c76073c,asd.store,UBDBLoader,,,UB_TXN_GenerateGLExtract.asd.store,0,0.0,0.0,0.0,...,,MisysPD/DBLoader,,MisysPD/DBLoader/UBDBLoader/ASD/UB_TXN_Generat...,CMREUB,cmre.ub@misys.com,2019-06-14,CMREUB,CMREUB@misys.global.ad,2019-06-14
1,9139a3e30c928abbb94fe11d7fc14ea67c76073c,bfg.store,UBDBLoader,,,UB_TXN_GenerateGLExtract_SRV.bfg.store,0,0.0,0.0,0.0,...,,MisysPD/DBLoader,,MisysPD/DBLoader/UBDBLoader/BFG/UB_TXN_Generat...,CMREUB,cmre.ub@misys.com,2019-06-14,CMREUB,CMREUB@misys.global.ad,2019-06-14
2,9139a3e30c928abbb94fe11d7fc14ea67c76073c,properties,UB/servercommon,,,UB-TXN-GenerateGLExtract-SRV.properties,2,2.0,0.0,0.0,...,,MisysPD/bundles,,MisysPD/bundles/UB/servercommon/microflows/UB-...,CMREUB,cmre.ub@misys.com,2019-06-14,CMREUB,CMREUB@misys.global.ad,2019-06-14
3,03d61fe55073b02e58b58f1b0e89e116a024d3b6,bfg,Artefacts/Microflow/Financial,,,UB_TXN_GenerateGLExtract_SRV.bfg,109,109.0,0.0,0.0,...,,MisysPD/UniversalBanking,,MisysPD/UniversalBanking/Artefacts/Microflow/F...,Avinash N S,avinash.ns@misys.com,2019-06-14,Avinash N S,avinash.ns@misys.com,2019-06-14
4,03d61fe55073b02e58b58f1b0e89e116a024d3b6,asd,Artefacts,,,UB_TXN_GenerateGLExtract.asd,2,2.0,0.0,0.0,...,,MisysPD/UniversalBanking,,MisysPD/UniversalBanking/Artefacts/Step/UB_TXN...,Avinash N S,avinash.ns@misys.com,2019-06-14,Avinash N S,avinash.ns@misys.com,2019-06-14


In [11]:
a_df[:5]

Unnamed: 0,Package Type,Artifact Type,Layer,Submodule,Classname,# commits,# authors,Loc,Loc+,Loc-,Loc?,Loc other,CommitId(s),Author(s),Author(s) e-mail,Filename
0,src,java,MisysPD/UniversalBanking,BFUBLending,LendingUtils.java,66,14,1220,997.087879,222.912121,0.0,,"[be1928b3fea155effe330e27bd4e658f78849d7b, fbb...","[ssubramo, Durga N, chandahg, niranjan, nirkum...","[shanmugaprasad.mp@finastra.com, gmahendr@misy...",MisysPD/UniversalBanking/BFUBLending/src/com/m...
1,src,java,MisysPD/UniversalBanking,BFUBLending,LoanPlanGeneratorRqInitialiser.java,48,11,577,372.666667,204.333333,0.0,,"[4807e21a5dd1cbb76082e19fd73039a6f594737b, c25...","[ssubramo, Durga N, chandahg, niranjan, nirkum...","[durga.n@finastra.com, shanmugaprasad.mp@finas...",MisysPD/UniversalBanking/BFUBLending/src/com/m...
2,src,java,MisysPD/UniversalBanking,BFUBLending,LoanPlanGenerator.java,48,9,1623,849.423547,773.576453,0.0,,"[18bd1ca9056ce6a8f4a7fc2efe5fb277c13f2fed, d6d...","[Durga N, chandahg, nirkuma2, G P Mahendra, ge...","[shanmugaprasad.mp@finastra.com, gmahendr@misy...",MisysPD/UniversalBanking/BFUBLending/src/com/m...
3,,bfm,MisysPD/UniversalBanking,Artefacts,UB_LEN_CaptureLoanProdAndRepaymentDtls.bfm,42,10,0,0.0,0.0,0.0,"[Bin 1749735 -> 1749820 bytes, Bin 1722263 -> ...","[7a1d45502e082ff873de5c892769a7db187784a1, 5a0...","[Durga N, chandahg, nirkuma2, ipsita hota, G P...","[shanmugaprasad.mp@finastra.com, gmahendr@misy...",MisysPD/UniversalBanking/Artefacts/Messages/UB...
4,src,java,MisysPD/UniversalBanking,BFUBLending,LoanAccountAppropriation.java,41,11,498,309.3,188.7,0.0,,"[075d0c6c6fb51250e6050d4d75257a458c7c7f44, b83...","[Durga N, chandahg, nirkuma2, ipsita hota, viv...","[shanmugaprasad.mp@finastra.com, gmahendr@misy...",MisysPD/UniversalBanking/BFUBLending/src/com/m...


In [12]:
m_df[:5]

Unnamed: 0,Layer,Submodule,Package Type,Artifact Types,# files changed,files changed,# commits,# authors,Loc,Loc+,Loc-,Loc?,Loc other,CommitId(s),Author(s),Author(s) e-mail
78,MisysPD/UniversalBanking,BFUBLending,src,[java],412,[MisysPD/UniversalBanking/BFUBLending/src/com/...,1082,29,72822,44136.73,28685.27,0.0,,"[0f5bd8e1008d585f12e8ab083189fe46561d49f8, 23b...","[Avinash N S, Durga N, vivverm2, hargupta, Hid...","[shanmugaprasad.mp@finastra.com, geetha.palem@..."
5,MisysPD/DBLoader,UBDBLoader,,"[bfm.store, bfg.store, bod.store, pfd.store, a...",1328,[MisysPD/DBLoader/UBDBLoader/BFM/UB_LEN_Captur...,856,5,0,0.0,0.0,0.0,"[Bin 0 -> 32069 bytes, Bin 47460 -> 48276 byte...","[1967ba8d8be39947745c4f5160c52a85a0ea0e49, 25a...","[nandeemm, G P Mahendra, Unni Krishnan, Prasha...","[nandeesh.mm@misys.com, Prashant.UnniKrishnan@..."
35,MisysPD/UniversalBanking,Artefacts/Microflow,,"[bfg, bfg.mo, asd]",1840,[MisysPD/UniversalBanking/Artefacts/Microflow/...,717,69,4950346,2505224.0,2445122.0,0.0,,"[5383e0f97ce2a488499fcf878e0e16c27aa6273c, 1bd...","[Avinash N S, Durga N, Kumar, vivverm2, hargup...","[shanmugaprasad.mp@finastra.com, harikesh.gupt..."
21,MisysPD/UniversalBanking,Artefacts,,"[bfm, pfd, bfg, bod, asd, bfg.mo, xsd, exd, dr...",721,[MisysPD/UniversalBanking/Artefacts/Messages/U...,661,64,287543,155043.6,132499.4,0.0,"[Bin 1723936 -> 1724408 bytes, Bin 1680469 -> ...","[932809153bbf8edcec49cb1534042e401350a8ed, 3cf...","[Avinash N S, Durga N, gowthven, shmathu1, Kum...","[shanmugaprasad.mp@finastra.com, harikesh.gupt..."
177,MisysPD/bundles,UB/servercommon,,[properties],1046,[MisysPD/bundles/UB/servercommon/bftcMessages/...,632,4,10980,5976.0,5004.0,0.0,,"[1967ba8d8be39947745c4f5160c52a85a0ea0e49, e66...","[nandeemm, CMREUB, cmreub, Unni Krishnan, Pras...","[nandeesh.mm@misys.com, Prashant.UnniKrishnan@..."


In [13]:
vol_df[:5]

Unnamed: 0,Layer,Submodule,Package Type,Artifact Types,# files changed,files changed,# commits,# authors,Loc,Loc+,Loc-,Loc?,Loc other,CommitId(s),Author(s),Author(s) e-mail
35,MisysPD/UniversalBanking,Artefacts/Microflow,,"[bfg, bfg.mo, asd]",1840,[MisysPD/UniversalBanking/Artefacts/Microflow/...,717,69,4950346,2505224.0,2445122.0,0.0,,"[5383e0f97ce2a488499fcf878e0e16c27aa6273c, 1bd...","[Avinash N S, Durga N, Kumar, vivverm2, hargup...","[shanmugaprasad.mp@finastra.com, harikesh.gupt..."
135,MisysPD/UniversalBanking,ReferencedBOs,src,[java],585,[MisysPD/UniversalBanking/ReferencedBOs/src/co...,270,44,426515,261676.1,164838.9,0.0,,"[a19a8d3b766155c64a4e5360996ce1cae2550054, 2b9...","[Avinash N S, Durga N, gowthven, shmathu1, Kum...","[shanmugaprasad.mp@finastra.com, harikesh.gupt..."
159,MisysPD/UniversalBankingInterfaces,FFCComponents/dc-fbe-integration/dc-fbe-mappings,src,"[json, properties, groovy, integration.basetyp...",178,[MisysPD/UniversalBankingInterfaces/FFCCompone...,150,14,335830,328738.7,7091.266,0.0,,"[1330ffd42a7e1eec620c83119c1638244914c724, 322...","[jybhorge, dkhadil2, rajashekaragouda, Kumar, ...","[shiv.soumendrasarangi@misys.com, machamma.dev..."
45,MisysPD/UniversalBanking,Artefacts/Microflow/MoneyMarket/FixedDepositOp...,,"[bfg, bfg.mo]",24,[MisysPD/UniversalBanking/Artefacts/Microflow/...,49,17,332983,168019.5,164963.5,0.0,,"[7b9b8d54c07950f3d253c9d522e42ff61cc8b720, 18a...","[deshett3, datiwari, chethast, Asha, Kulkarni,...","[abhimanyu.singh@finastra.com, nandeesh.mm@mis..."
104,MisysPD/UniversalBanking,Documents,,[pdf],121,[MisysPD/UniversalBanking/Documents/fusion_ess...,14,3,308802,232461.7,76340.32,0.0,"[Bin 19409469 -> 0 bytes, Bin 0 -> 821854 byte...","[70417b9e0f691404c3dc6b8632934c878c394cf6, 359...","[Nisha M, Soniya, agangul2]","[Nisha.Muralidharan@finastra.come, Avishek.Gan..."
