In [None]:
from functools import reduce
from pathlib import Path
from glob import glob
from time import sleep, perf_counter

import pandas as pd

def read_csv(path):
  """Load one commit CSV into a DataFrame whose ``message`` column holds only strings.

  Args:
    path: Anything ``pandas.read_csv`` accepts (a file path or a file-like object).

  Returns:
    The parsed DataFrame, with every value in ``message`` converted to ``str``.

  Raises:
    KeyError: If the parsed file has no ``message`` column.
  """
  csv = pd.read_csv(path)

  # Empty message fields are parsed as NaN, and a plain ``astype(str)`` would
  # turn them into the literal text "nan". Replace missing values with the
  # empty string first so no bogus word is introduced.
  csv['message'] = csv['message'].fillna('').astype(str)

  return csv

start = perf_counter()

# Map each CSV file's stem to the DataFrame loaded from it. Paths are sorted so
# that the row order of `all_commits` does not depend on the order in which the
# filesystem happens to list the files.
commits_per_language = {
  Path(path).stem: read_csv(path)
  for path in sorted(glob('results/csv/*.csv'))
}

# Concatenate all frames in a single call. Folding them pairwise with `reduce`
# re-copied the accumulated rows once per file.
# NOTE: raises ValueError when the glob matched no files.
all_commits = pd.concat(list(commits_per_language.values()), ignore_index=True)

stop = perf_counter()
print(f'Loading files took {stop - start:0.3f} seconds.')

all_commits

In [None]:
import multiprocessing
import mr4mp
import helpers

pool = mr4mp.pool()

start = perf_counter()

# Read the column directly: `iterrows()` materialises a Series for every row
# just to extract this one field.
messages = all_commits['message'].tolist()
stop = perf_counter()
print(f'Creating list of messages took {stop - start:0.3f} seconds.')

# Restart the timer so the figure printed below covers the map-reduce step
# only, rather than also including the list creation timed above.
start = perf_counter()
all_words = pool.mapreduce(helpers.tokenize, helpers.reduce_list, messages)
stop = perf_counter()
print(f'Tokenizing messages took {stop - start:0.3f} seconds.')

all_words

In [None]:
import nltk

# Frequency distribution over every entry of `all_words`.
freq_dist = nltk.FreqDist(all_words)

# How many entries to take from the top (m) and the bottom (n) of the ranking.
m = 100
n = 100

# Keep only the word from each (word, count) pair returned by `most_common`.
most_common_words = {word for word, _count in freq_dist.most_common(m)}

# Reverse-slice the full ranking to pick its last n entries.
least_common_words = {word for word, _count in freq_dist.most_common()[:-n - 1:-1]}

In [None]:
# Display the set of words taken from the m highest-ranked entries of freq_dist.
most_common_words

In [None]:
# Display the set of words taken from the n lowest-ranked entries of freq_dist.
least_common_words