In [None]:
from functools import reduce
from pathlib import Path
from glob import glob
from time import sleep, perf_counter

import pandas as pd

def read_csv(path):
  """Load one commits CSV and make every entry of its 'message' column a string.

  Parameters:
    path: Location of the CSV file, passed straight to `pd.read_csv`.

  Returns:
    The loaded DataFrame. Missing messages become the empty string.

  Raises:
    KeyError: If the file has no 'message' column.
  """
  commits = pd.read_csv(path)

  # `astype(str)` alone renders missing values as the literal text 'nan',
  # which would later be counted as a real word. Blank those cells instead.
  raw_messages = commits['message']
  commits['message'] = raw_messages.astype(str).mask(raw_messages.isna(), '')

  return commits

start = perf_counter()

# glob() returns matches in arbitrary order; sorting keeps the row order of
# `all_commits` the same from run to run.
csv_paths = sorted(glob('results/csv/*.csv'))
if not csv_paths:
  raise FileNotFoundError("No CSV files matched 'results/csv/*.csv'.")

# One DataFrame per CSV file, keyed by the file name without its extension.
commits_per_language = {Path(path).stem: read_csv(path) for path in csv_paths}

# Concatenate all frames in a single call. Folding with pairwise concat
# re-copies the rows accumulated so far on every step.
all_commits = pd.concat(list(commits_per_language.values()), ignore_index=True)

stop = perf_counter()
print(f'Loading files took {stop - start:0.3f} seconds.')

all_commits

In [None]:
# Read the column directly; iterrows() would build a Series for every row
# only to pick this one field out of it.
all_messages = all_commits['message'].tolist()

In [None]:
import nltk
import re

# Gitmoji shortcode found at the start of a commit message -> tag to report.
gitmoji_mappings = {
  ':memo:': 'docs',  # documentation
  ':zap:': 'perf',  # performance
  ':fire:': 'remove',  # removal
  ':sparkles:': 'feat',  # feature
  ':bug:': 'fix',  # bug fix
  ':lipstick:': 'ui',  # UI
  ':wrench:': 'config',  # configuration
  ':hammer:': 'development',  # development scripts
  ':art:': 'refactor',  # improve code structure/format
  ':white_check_mark:': 'test',  # tests
  ':chore:': 'chore',  # chore
  ':up:': 'update',  # update
  ':arrow_up:': 'deps',  # dependency update
  ':arrow_down:': 'deps',  # dependency downgrade
  ':bulb:': 'docs',  # update source code comments
  ':rocket:': 'deploy',  # deployment
  ':pencil2:': 'typo',  # fix typo
  ':green_heart:': 'ci',  # fix CI
  ':construction:': 'wip',  # work in progress
  ':recycle:': 'refactor',  # refactor code
}

# Alternative spellings of a commit tag -> the canonical tag to count them under.
tag_mappings = dict(
  bug='fix',
  bugfix='fix',
  testing='test',
  tests='test',
  tst='test',
  documentation='docs',
  doc='docs',
  changelog='docs',
  feature='feat',
  gui='ui',
)

def message_to_tag(message):
  """Extract a tag from the start of a commit message.

  The message is lower-cased first. Two prefix styles are recognised, in
  this order:

  1. A "type:", "type(scope):" or "type!:" prefix. The type is returned,
     translated through `tag_mappings` when it has an entry there.
  2. A ":shortcode:" prefix made of a-z, 0-9 and underscores. The shortcode
     is returned, translated through `gitmoji_mappings` when it has an entry.

  Returns None when the message starts with neither.
  """
  lowered = message.lower()

  # "type(scope)!:" style prefix.
  conventional = re.match(r'^([^(\s:]+)(\([^)]+\))?!?:', lowered)
  if conventional is not None:
    prefix = conventional.group(1)
    canonical = tag_mappings.get(prefix)
    return canonical if canonical else prefix

  # ":shortcode:" style prefix.
  gitmoji = re.match(r'^(:[a-z0-9_]+:)', lowered)
  if gitmoji is None:
    return None

  shortcode = gitmoji.group(1)
  canonical = gitmoji_mappings.get(shortcode)
  return canonical if canonical else shortcode

# Tags kept when restricting the analysis to a fixed vocabulary.
known_tags = {
  'build',
  'chore',
  'ci',
  'deps',
  'docs',
  'feat',
  'fix',
  'perf',
  'refactor',
  'style',
  'test',
  'examples',
}

def message_to_known_tag(message):
  """Return the tag extracted from `message` if it is in `known_tags`, else None."""
  candidate = message_to_tag(message)
  if candidate in known_tags:
    return candidate
  return None

# Tag every message and discard falsy results (messages with no recognised tag).
tags = list(filter(None, map(message_to_tag, all_messages)))

# Count how often each tag occurs and show the 25 most frequent.
tag_freq_dist = nltk.FreqDist(tags)
tag_freq_dist.most_common(25)

In [None]:
# Keep only the extracted tags that are listed in `known_tags`.
wanted_tags = list(filter(lambda tag: tag in known_tags, tags))

# Count the remaining tags and show the 25 most frequent.
known_tag_freq_dist = nltk.FreqDist(wanted_tags)
known_tag_freq_dist.most_common(25)

In [None]:
import multiprocessing
import mr4mp
import helpers

pool = mr4mp.pool()

# Read the column directly; iterrows() would build a Series for every row
# only to pick this one field out of it.
start = perf_counter()
messages = all_commits['message'].tolist()
stop = perf_counter()
print(f'Creating list of messages took {stop - start:0.3f} seconds.')

# Restart the timer so the next figure covers tokenizing alone, not the
# message-list creation above as well.
start = perf_counter()
# Maps helpers.tokenize over the messages and combines the results with
# helpers.reduce_list. Both live in the local `helpers` module, so their
# exact behaviour is not visible from this notebook.
all_words = pool.mapreduce(helpers.tokenize, helpers.reduce_list, messages)
stop = perf_counter()
print(f'Tokenizing messages took {stop - start:0.3f} seconds.')

In [None]:
import nltk

# Frequency of every entry in `all_words`.
freq_dist = nltk.FreqDist(all_words)

# The m entries with the highest counts.
m = 100
most_common_words = {word for word, _count in freq_dist.most_common(m)}

# The n entries with the lowest counts: walk the full ranking from its tail.
n = 100
least_common_words = {word for word, _count in freq_dist.most_common()[:-n-1:-1]}

In [None]:
# Display the set built from the m highest-count entries of freq_dist.
most_common_words

In [None]:
# Display the set built from the n lowest-count entries of freq_dist.
least_common_words