In [None]:
from loading import load_commits

# Load the full commit data set through the project's `loading.load_commits` helper.
# NOTE(review): the cells below call `.copy()` on the result and read a 'message'
# column from that copy, so this is presumably a pandas DataFrame with one row per
# commit -- confirm against `loading.load_commits`.
all_commits = load_commits()

In [None]:
import multiprocessing

import helpers

from tagging import message_to_known_tag

# Fixed seed for the shuffle below, so that Restart & Run All always yields the
# same row order (anything that later splits the rows by position depends on it).
SHUFFLE_SEED = 42

# Work on a copy so `all_commits` stays untouched and this cell can be re-run safely.
tagged_commits = all_commits.copy()

with multiprocessing.Pool() as pool:
  # `message_to_known_tag` is expected to return one (message, tag) pair per input
  # message; the pairs are written back into the 'message' and 'tag' columns.
  tagged_commits[['message', 'tag']] = pool.map(message_to_known_tag, list(tagged_commits['message']))

  # Drop rows whose tag came back missing *before* tokenizing, so the pool does not
  # spend time tokenizing messages that would be thrown away anyway. The explicit
  # copy makes the column assignment below act on an independent frame.
  tagged_commits = tagged_commits.dropna(subset=['tag']).copy()

  # Replace each remaining message with the output of `helpers.tokenize`.
  tagged_commits['message'] = pool.map(helpers.tokenize, tagged_commits['message'])

# Shuffle all rows reproducibly, then renumber them 0..n-1. `reset_index()` without
# `drop=True` keeps the pre-shuffle index as an 'index' column.
tagged_commits = tagged_commits.sample(frac=1, random_state=SHUFFLE_SEED).reset_index()
tagged_commits.head(10)

In [None]:
# Report the row count of `tagged_commits` against the row count of `all_commits`,
# printed as "<tagged>/<all>".
print(f'{len(tagged_commits)}/{len(all_commits)}')

In [None]:
import nltk
import pandas as pd

# Count how often each value occurs in the 'tag' column.
freq_dist = nltk.FreqDist(tagged_commits['tag'])

# Present the (tag, count) pairs as a two-column table; the bare last expression
# is what the notebook renders.
tag_counts = pd.DataFrame(
    freq_dist.items(),
    columns=['tag', 'count'],
)
tag_counts

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit an encoder on the 'tag' column and obtain one encoded label per row.
label_encoder = LabelEncoder()
encoded_tags = label_encoder.fit_transform(tagged_commits['tag'])

# Store the encoded labels next to the tags; `target_data` is that same column.
tagged_commits['label'] = encoded_tags
target_data = tagged_commits['label']

tagged_commits.head(10)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Each entry of the 'message' column is joined back into a single space-separated
# string, because the vectorizer is fed one string per commit.
# NOTE(review): CountVectorizer() with default settings applies its own
# tokenization to these strings -- confirm that is intended on top of the
# tokens already stored in 'message'.
documents = list(map(" ".join, tagged_commits["message"]))

vect = CountVectorizer()
source_data = vect.fit_transform(documents)
source_data

In [None]:
import multiprocessing

import numpy as np
from sklearn.model_selection import KFold

# 10-fold cross-validation over the vectorized messages.
kf = KFold(n_splits=10)

print("Splits:", kf.get_n_splits(source_data))
print()

# One job per fold, shaped as (fold_number, (fold_indices, source_data, target_data)),
# which is the argument layout handed to `helpers.test`.
fold_jobs = [
    (fold_number, (fold_indices, source_data, target_data))
    for fold_number, fold_indices in enumerate(kf.split(source_data))
]

# Evaluate the folds in parallel worker processes.
# Serial alternative (e.g. for debugging): results = list(map(helpers.test, fold_jobs))
with multiprocessing.Pool() as pool:
  results = pool.map(helpers.test, fold_jobs)

# Average the per-fold results column-wise; the mean is unpacked into three metrics.
mean_scores = np.mean(results, axis=0)
accuracy, f1_micro, f1_macro = mean_scores

print("Total Accuracy:", accuracy)
print("Total F1 micro:", f1_micro)
print("Total F1 macro:", f1_macro)