In [None]:
from functools import reduce
from pathlib import Path
from glob import glob
from time import sleep, perf_counter

import pandas as pd

def read_csv(path):
  """Load one commit CSV and normalise its ``message`` column to strings.

  Args:
    path: Anything ``pd.read_csv`` accepts (a file path or file-like object).

  Returns:
    The parsed ``DataFrame`` whose ``message`` column holds only strings;
    missing messages become the empty string.

  Raises:
    KeyError: If the CSV has no ``message`` column.
  """
  csv = pd.read_csv(path)

  # Empty messages are parsed as NaN. Replace them with '' *before* the cast:
  # ``astype(str)`` alone would turn them into the literal text 'nan', which
  # would then be tokenized and counted like a real word.
  csv['message'] = csv['message'].fillna('').astype(str)

  return csv

start = perf_counter()

# glob() returns paths in arbitrary, OS-dependent order. Sorting makes the row
# order of `all_commits` reproducible, which matters for any later step that
# depends on row position.
csv_paths = sorted(glob('results/csv/*.csv'))
if not csv_paths:
  raise FileNotFoundError("No CSV files found matching 'results/csv/*.csv'.")

# One DataFrame per input file, keyed by the file name without its extension.
commits_per_language = {Path(path).stem: read_csv(path) for path in csv_paths}

# Concatenate every frame in a single call. Folding pairwise with reduce()
# re-copies all previously accumulated rows at each step.
all_commits = pd.concat(list(commits_per_language.values()), ignore_index=True)

stop = perf_counter()
print(f'Loading files took {stop - start:0.3f} seconds.')

all_commits

In [None]:
# Read the message column directly as a plain list. iterrows() would build a
# full Series object for every row only to pick one field out of it.
all_messages = all_commits['message'].tolist()

In [None]:
import nltk
import re

# Gitmoji shortcode -> normalised tag. Several shortcodes deliberately share
# one tag (e.g. both dependency arrows map to 'deps').
gitmoji_mappings = dict([
  (':memo:', 'docs'),               # documentation
  (':zap:', 'perf'),                # performance
  (':fire:', 'remove'),             # removal
  (':sparkles:', 'feat'),           # feature
  (':bug:', 'fix'),                 # bug fix
  (':lipstick:', 'ui'),             # UI
  (':wrench:', 'config'),           # configuration
  (':hammer:', 'development'),      # development scripts
  (':art:', 'refactor'),            # improve code structure/format
  (':white_check_mark:', 'test'),   # tests
  (':chore:', 'chore'),             # chore
  (':up:', 'update'),               # update
  (':arrow_up:', 'deps'),           # dependency update
  (':arrow_down:', 'deps'),         # dependency downgrade
  (':bulb:', 'docs'),               # update source code comments
  (':rocket:', 'deploy'),           # deployment
  (':pencil2:', 'typo'),            # fix typo
  (':green_heart:', 'ci'),          # fix CI
  (':construction:', 'wip'),        # work in progress
  (':recycle:', 'refactor'),        # refactor code
])

# Spelling variants of commit prefixes -> the canonical tag they are counted as.
tag_mappings = dict(
  bug='fix',
  bugfix='fix',
  testing='test',
  tests='test',
  tst='test',
  documentation='docs',
  doc='docs',
  changelog='docs',
  feature='feat',
  gui='ui',
)

def message_to_tag(message):
  """Split a commit message into its remaining text and a normalised tag.

  The message is lower-cased, then checked for a Conventional Commits prefix
  (``type(scope)!: text``) and, failing that, for a leading Gitmoji shortcode
  (``:code: text``).

  Args:
    message: Raw commit message (``str``); it may span several lines.

  Returns:
    A ``(text, tag)`` tuple. When a prefix is recognised, ``text`` is the
    lower-cased message without the prefix and ``tag`` is the prefix looked up
    in ``tag_mappings`` / ``gitmoji_mappings`` (prefixes without a mapping are
    returned unchanged). Otherwise ``text`` is the whole lower-cased message
    and ``tag`` is ``None``.
  """
  message = message.lower()

  # Both patterns use re.DOTALL so the text group can run across line breaks;
  # without it a message that has a body after the subject line never matches.
  # The text group is lazy so `$` still excludes a single trailing newline,
  # exactly as it did for one-line messages.

  # Extract “Conventional Commits”.
  match = re.match(r'^([^(\s:]+)(?:\([^)]+\))?!?:\s*(.*?)$', message, re.DOTALL)
  if match:
    tag = match[1]
    message = match[2]
    return (message, tag_mappings.get(tag) or tag)

  # Extract “Gitmoji Commits”.
  match = re.match(r'^(:[a-z0-9_]+:)\s*(.*?)$', message, re.DOTALL)
  if match:
    tag = match[1]
    message = match[2]
    return (message, gitmoji_mappings.get(tag) or tag)

  return (message, None)

# Tags that are kept as classification labels; any other tag is discarded by
# message_to_known_tag().
known_tags = {
  'build', 'chore', 'ci', 'deps',
  'docs', 'feat', 'fix', 'perf',
  'refactor', 'style', 'test', 'examples',
}

def message_to_known_tag(message):
  """Like ``message_to_tag``, but report only tags listed in ``known_tags``.

  Returns:
    ``(text, tag)`` as produced by ``message_to_tag``; ``tag`` is replaced by
    ``None`` when it is not a member of ``known_tags``.
  """
  message, tag = message_to_tag(message)
  if tag not in known_tags:
    tag = None
  return (message, tag)

# Collect the tag of every message that has one; filter(None, ...) drops the
# messages for which no tag was found.
tags = list(filter(None, (message_to_tag(message)[1] for message in all_messages)))

# Count how often each tag occurs and show the 25 most frequent ones.
tag_freq_dist = nltk.FreqDist(tags)
tag_freq_dist.most_common(25)

In [None]:
import helpers

wanted_tags = [tag for tag in tags if tag in known_tags]

# Split every message into (text without prefix, known tag or None) and build
# both columns in one go. Expanding each tuple with .apply(pd.Series) creates
# a Series per row and is very slow on large frames.
message_tag_pairs = pd.DataFrame(
  [message_to_known_tag(message) for message in all_commits['message']],
  columns=['message', 'tag'],
  index=all_commits.index,
)

tagged_commits = all_commits.copy()
tagged_commits[['message', 'tag']] = message_tag_pairs

# Keep only commits that carry a known tag. reset_index() without drop=True
# preserves the original row labels in an 'index' column.
tagged_commits = tagged_commits.dropna(subset=['tag']).reset_index()

# Tokenize after filtering so that discarded commits are never tokenized.
# NOTE(review): assumes helpers.tokenize handles each message independently.
tagged_commits['message'] = tagged_commits['message'].apply(helpers.tokenize)
tagged_commits.head(10)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the set of distinct tags first, then encode each tag as an integer
# class id.
label_encoder = LabelEncoder().fit(tagged_commits['tag'])

tagged_commits['label'] = label_encoder.transform(tagged_commits['tag'])
target_data = tagged_commits['label']
tagged_commits.head(10)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()

# Each message is a sequence of strings at this point; join every one back
# into a single space-separated document before fitting the vectorizer.
source_data = vect.fit_transform(list(map(" ".join, tagged_commits["message"])))
source_data

In [None]:
import multiprocessing

from sklearn.model_selection import KFold

# NOTE(review): KFold is not shuffled, so every fold is a contiguous slice of
# rows. If the rows are still grouped by input file, folds will not be
# representative — consider shuffle=True with a fixed random_state.
kf = KFold(n_splits=10)

print("Splits:", kf.get_n_splits(source_data))

# helpers.test receives one (fold number, (fold indices, features, labels))
# tuple per fold.
results = [
  helpers.test((fold_number, (fold_indices, source_data, target_data)))
  for fold_number, fold_indices in enumerate(kf.split(source_data))
]

# with multiprocessing.Pool() as p:
#  results = p.map(helpers.test, enumerate(kf.split(source_data)))

In [None]:
import numpy as np

# Average column-wise over the folds; the unpacking requires three values per
# fold, read here as accuracy, F1 micro and F1 macro.
accuracy, f1_micro, f1_macro = np.asarray(results).mean(axis=0)

print("Total Accuracy:", accuracy)
print("Total F1 micro:", f1_micro)
print("Total F1 macro:", f1_macro)

In [None]:
import mr4mp

pool = mr4mp.pool()

start = perf_counter()

# Read the message column directly; iterrows() would build a Series per row.
messages = all_commits['message'].tolist()
stop = perf_counter()
print(f'Creating list of messages took {stop - start:0.3f} seconds.')

# Restart the timer so the figure below covers tokenization only, not the
# list creation measured above.
start = perf_counter()
all_words = pool.mapreduce(helpers.tokenize, helpers.reduce_list, messages)
stop = perf_counter()
print(f'Tokenizing messages took {stop - start:0.3f} seconds.')

In [None]:
import nltk

# Frequency of every token across all commit messages.
freq_dist = nltk.FreqDist(all_words)

# The m most frequent tokens (the counts are dropped).
m = 100
most_common_words = {word for word, _ in freq_dist.most_common(m)}

# The n least frequent tokens: walk the full ranking backwards from its end.
n = 100
least_common_words = {word for word, _ in freq_dist.most_common()[:-n-1:-1]}

In [None]:
# Display the set of the m most frequent tokens.
most_common_words

In [None]:
# Display the set of the n least frequent tokens.
least_common_words