In [17]:
import pandas as pd 
import os 
import numpy
import json 

In [52]:
# Input files: training_commits_samples.csv, bug_doc_clean.json

In [55]:
# Load the commit samples from a CSV in the working directory; the bare
# expression on the last line displays the column names as the cell output.
df = pd.read_csv("training_commits_samples.csv")
df.columns 

Index(['Commit ID', 'total_files', 'deleted_files', 'testing', 'maintenance',
       'build', 'csha', 'commit message'],
      dtype='object')

## Preprocess commit messages

In [56]:
# Pull the 'csha' column out as an array and display its first entry.
csha_list = df['csha'].values
csha_list[0]

'008cd8a209564a958e9065a5f6ff05526a41f4a5'

In [57]:
# The nltk.download calls fetch corpora into the local nltk_data directory at
# run time; packages that are already present are reported as up-to-date.
import nltk
nltk.download('stopwords')  # provides the corpus behind the STOPWORDS set
nltk.download('words')  # NOTE(review): not referenced in the cells visible here - confirm it is needed
nltk.download('wordnet')  # NOTE(review): not referenced in the cells visible here - confirm it is needed
from nltk.corpus import stopwords
from ast import literal_eval  # NOTE(review): unused in the cells visible here
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kelechi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /home/kelechi/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kelechi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [58]:
# Patterns that text_prepare applies to raw commit messages.
#
# All patterns are raw strings so that regex escapes such as \d, \( and \[
# reach the regex engine untouched instead of being parsed as (invalid)
# Python string escapes, which newer interpreters warn about.
#
# The single-line trailers (git-svn-id, the two MOE markers, URLs) end in
# (?:\n|$) so they are also matched when they sit on the last line of a
# message that has no trailing newline.
GIT_SVN_RE = re.compile(r'git-svn-id:.*(?:\n|$)')  # whole "git-svn-id: ..." line
issue_number_re = re.compile(r'[A-Za-z]+[-]\d+')  # letters-dash-digits keys, e.g. AVRO-321
created_by_moe = re.compile(r'Created by MOE:.*(?:\n|$)')  # whole "Created by MOE: ..." line
MOE_ID = re.compile(r'MOE_MIGRATED_REVID=.*(?:\n|$)')  # whole "MOE_MIGRATED_REVID=..." line
developer_name_re = re.compile(r'\(.*\)\n')  # parenthesised text that ends a line
HTTP_RE = re.compile(r'(http|https):.*(?:\n|$)')  # from the URL scheme to the end of its line
EMAIL_RE = re.compile(r'[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z0-9-.]+')
Other_RE = re.compile(r'(Author|Authors|Reviewers|Reviewer).*\n')  # rest of the line from the keyword on
Bracket_RE = re.compile(r'\[.*\]')  # square-bracketed text (greedy within a line)

break_line_re = re.compile(r'\n')
digital = re.compile(r'[0-9]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z ]')  # everything except digits, lowercase letters and spaces
# English stop words from the NLTK stopwords corpus, held in a set because
# text_prepare tests every token of every message for membership.
STOPWORDS = set(stopwords.words('english'))

In [59]:
def text_prepare(text):
    """
        Clean one raw commit message down to lowercase word tokens.

        text: a string; any non-string value (None, or the NaN that
              pandas produces for an empty CSV cell) is treated as an
              empty message instead of raising TypeError

        return: modified initial string - tool trailers, issue keys,
                links, e-mail addresses, bracketed text, digits,
                non-letter symbols and English stop words removed,
                remaining words joined by single spaces
    """
    if not isinstance(text, str):
        return ''

    # These run on the original-case text and in this order: patterns such
    # as 'Created by MOE' and 'Author' are case-sensitive, and several of
    # them need the line breaks to still be present. The '\n' replacements
    # keep a line break in place of what was removed.
    metadata_rules = (
        (GIT_SVN_RE, ''),
        (issue_number_re, ''),
        (created_by_moe, ''),
        (MOE_ID, ''),
        (developer_name_re, '\n'),
        (HTTP_RE, '\n'),
        (EMAIL_RE, '\n'),
        (Other_RE, ''),
        (Bracket_RE, ''),
    )
    for pattern, replacement in metadata_rules:
        text = pattern.sub(replacement, text)

    text = text.lower()  # BAD_SYMBOLS_RE below only keeps lowercase letters
    text = break_line_re.sub(' ', text)  # line breaks become word separators
    text = digital.sub('', text)
    # Symbols are deleted rather than replaced by a space, so 'a-b' -> 'ab'.
    text = BAD_SYMBOLS_RE.sub('', text)

    # str.split() never yields empty tokens, so only the stop-word test is needed.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

In [61]:
# Spot-check the cleaning on one row: print the raw message at position 232,
# then print what text_prepare returns for it. test_commit is rebound to the
# cleaned text by the third statement.
test_commit = df['commit message'].values[232]
print(test_commit)
test_commit = text_prepare(test_commit)
print(test_commit)

AVRO-321.  Restore java RPC interop tests.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/avro/trunk@906635 13f79535-47bb-0310-9956-ffa450edef68

restore java rpc interop tests


In [62]:
# Overwrite the column with the cleaned messages, then preview the first ten.
df['commit message'] = [
    text_prepare(message) for message in df['commit message']
]
for index, line in enumerate(df['commit message'].head(10)):
    print(index, line)

0 merge pull request thiruapachethiru added tests fixed couple bugs also formatted code
1 datafilewriterappendto leads intermittent ioexception write
2 java add command line tool generate schema files protocol contributed bertrand dechoux
3 timeconversions implement getrecommendedschema closes signedoffby gabor szadovszky signedoffby sacharya signedoffby nandor kollar
4 java fix genericdatarecordequals correctly compare schemas fix schemaequals consider order
5 java fix decimal conversion bytebuffer
6 java add support snappy codec newer mapreduce api contributed matt mead
7 improve invalid file format error message
8 java fix builder api correctly handle default values enums
9 java permit maven find imports within project contributed alexandre normand


### Spell Checking 

In [63]:
!pip3 install pyenchant

Collecting pyenchant
Installing collected packages: pyenchant
Successfully installed pyenchant-2.0.0


In [65]:
from enchant.checker import SpellChecker

# Collect every distinct word the en_US spell checker flags in the cleaned
# commit messages. wrong_words stays a list in first-seen order; the
# companion set makes the "already recorded?" test constant-time instead of
# a scan of the list for every flagged word.
chkr = SpellChecker("en_US")
wrong_words = []
seen_wrong_words = set()
for text in df['commit message']:
    chkr.set_text(text)
    for err in chkr:
        if err.word not in seen_wrong_words:
            seen_wrong_words.add(err.word)
            wrong_words.append(err.word)


def remove_noise_words(message_array, noise_words=None):
    """
        Drop noise words from every message.

        message_array: iterable of strings (one message each)
        noise_words: optional iterable of words to drop; when omitted,
                     the notebook-level wrong_words collection is used

        return: list of strings, one per input message, with the noise
                words removed and the remaining words joined by single
                spaces
    """
    # Copy the words into a set once so each token lookup is a hash probe
    # rather than a scan of the whole collection.
    noise = frozenset(wrong_words if noise_words is None else noise_words)
    return [
        ' '.join(word for word in text.split() if word not in noise)
        for text in message_array
    ]


def replace_nosie_words_with_sth(message_array, noise_words=None,
                                 replacement='something'):
    """
        Replace noise words in every message with a placeholder token.

        message_array: iterable of strings (one message each)
        noise_words: optional iterable of words to replace; when omitted,
                     the notebook-level wrong_words collection is used
        replacement: token substituted for each noise word

        return: list of strings, one per input message, with each noise
                word swapped for the placeholder and words joined by
                single spaces
    """
    # Copy the words into a set once so each token lookup is a hash probe
    # rather than a scan of the whole collection.
    noise = frozenset(wrong_words if noise_words is None else noise_words)
    return [
        ' '.join(replacement if word in noise else word
                 for word in text.split())
        for text in message_array
    ]


# Strip the flagged words from every cleaned message and preview ten results.
remove_noise_message = remove_noise_words(df['commit message'].values)
for index, line in enumerate(remove_noise_message[:10]):
    print(index, line)

KeyboardInterrupt: 