# read labels from ./commit data.xlsx

In [1]:
import psycopg2   
from sqlalchemy import create_engine 
import pandas as pd

In [2]:
df =  pd.read_excel('commit data.xlsx')
csha_list = df['commit id'].values

# read commit messages from database - msg

In [3]:
try:
    conn = psycopg2.connect(
        "dbname='msg' user='postgres' password = 'rebecca9581' host='localhost'"
    )
except:
    print("Access denied!")
cur = conn.cursor()

In [4]:
cmt_msg = []
for index, csha in enumerate(csha_list):
    qr =  "SELECT message FROM commits WHERE csha LIKE '{}%'".format(csha_list[index].strip())
    cur.execute(qr)
    fetched_row = cur.fetchone()
    if fetched_row != None:
#         print(index, fetched_row[0].replace('\n',""))
        cmt_msg.append(fetched_row[0].replace('\n',""))
    else:
        print('None')
        cmt_msg.append('None')

None
None
None


In [5]:
df['cmt_msg'] = cmt_msg
df.head()

Unnamed: 0,commit id,class,cmt_msg
0,0b9ea98,NFT,"[CALCITE-1124] Add TIMESTAMPADD, TIMESTAMPDIFF..."
1,980d9f8,RFT,Move code from JdbcImplementor and JdbcRules t...
2,e9d5060,NFT,[CALCITE-968] Stream-to-relation and stream-to...
3,a63639b,NFT,[CALCITE-912] Add Avatica OpenConnectionReques...
4,a611d64,RMN,[CALCITE-296] Re-organize package structure;[C...


In [6]:
if conn:
    conn.close()

# data pre-processing

In [7]:
import nltk
nltk.download('stopwords')
nltk.download('words')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rebecca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /Users/rebecca/nltk_data...
[nltk_data]   Package words is already up-to-date!


## change into lower case

In [8]:
df['cmt_msg'] = df['cmt_msg'].str.lower()

##  SpellCheck

!pip install pyenchant

In [9]:
from enchant.checker import SpellChecker
chkr = SpellChecker("en_US")
wrong_words = []
for index, text in enumerate(df['cmt_msg']):
    chkr.set_text(text)
    for err in chkr:
        if err.word not in wrong_words:
            wrong_words.append(err.word)
#         print(index, err.word)
    

## delete useless information
* website link
* creator
* [ calcite-]
* git-svn-id
* stop words
* wrong spelling words

In [10]:
import re

In [11]:
issue = re.compile('[a-z]+[- ]\d+')
created_by_moe = re.compile('created by moe')
GIT_SVN_RE = re.compile('git-svn-id')
HTTP_SIGN_RE = re.compile('(http|https):\/\/.*')
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]\.')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
CALCITE_RE = re.compile('(calcite)\S*')
STOPWORDS = set(stopwords.words('english'))

In [12]:
def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = issue.sub(' ',text)
    text = created_by_moe.sub(' ',text)
    text = HTTP_SIGN_RE.sub(' ',text)
    text = GIT_SVN_RE.sub(' ',text)
    text = CALCITE_RE.sub(' ',text)
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub(' ', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([x for x in text.split() if x and x not in STOPWORDS and x not in wrong_words]) # delete stopwords from text
    return text

In [13]:
df['cmt_msg'] = [text_prepare(line) for line in df['cmt_msg']]

for index, line in enumerate(df['cmt_msg'].values):
    print(index, line)

## word stemming

In [14]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english") # Choose a language
# stemmer.stem("countries")
for index, text in enumerate(df['cmt_msg']):
#     print(index, df['cmt_msg'][index])
    df['cmt_msg'][index] = ' '.join([stemmer.stem(x) for x in text.split()])
#     print(index, df['cmt_msg'][index])

* lemmatization - doesn't perform as good as stemming

## delete rows with missing values ( no valuable commit messages)

In [15]:
data = df[df['cmt_msg']!='']
data = data.reset_index(drop=True)

In [16]:
data.shape

(910, 3)

In [17]:
data.head()

Unnamed: 0,commit id,class,cmt_msg
0,0b9ea98,NFT,add function close apach
1,980d9f8,RFT,move code new class
2,e9d5060,NFT,stream relat stream stream join rule transform...
3,a63639b,NFT,add goal pass connect properti info driver cor...
4,a611d64,RMN,organ packag structur name convent planner cha...


# text to vector

In [18]:
from collections import Counter
from scipy import sparse as sp_sparse
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [19]:
X, y = data['cmt_msg'].values, data['class'].values

In [20]:
words_counts = Counter()

for words in X:
    for word in words.split():
        words_counts[word] += 1

most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]

# most_common_words

In [21]:
tags_counts = Counter()
for tags in y:
    tags_counts[tags] += 1
most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:10]
most_common_tags

[('NFT', 275),
 ('FIX', 190),
 ('MOD', 155),
 ('DOC', 45),
 ('RFT', 42),
 ('DEL', 40),
 ('RMN', 37),
 ('CLN', 31),
 ('VER', 25),
 ('DPD', 20)]

In [22]:
DICT_SIZE = 1000
INDEX_TO_WORDS = sorted(words_counts.keys(), key=lambda x: words_counts[x], reverse=True)[:DICT_SIZE]
WORDS_TO_INDEX = {word:i for i, word in enumerate(INDEX_TO_WORDS)}
ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split():
        if word in words_to_index:
            result_vector[words_to_index[word]] += 1
    return result_vector

In [23]:
X_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X])
print('X ', X_mybag.shape)

X  (910, 1000)


In [24]:
def tfidf_features(X):
    """
        X_train, X_val, X_test — samples        
        return TF-IDF vectorized representation of each sample and vocabulary
    """
    # Create TF-IDF vectorizer with a proper parameters choice
    # Fit the vectorizer on the train set
    # Transform the train, test, and val sets and return the result
    
    
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern='(\S+)')
    
    X = tfidf_vectorizer.fit_transform(X)
    
    return X,tfidf_vectorizer.vocabulary_

In [25]:
X_tfidf, tfidf_vocab = tfidf_features(X)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

# Logistic Regression Classification (Lasso)

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.33, random_state=42)

* Feature selection with Lasso

In [28]:
clf = LogisticRegression(penalty='l1',
                         random_state=0,
                         solver='saga',
                         n_jobs=-1,
                         max_iter=1000,
                         multi_class='multinomial').fit(X_train, y_train)

y_pre_test = clf.predict(X_test)
test_prob = clf.predict_proba(X_test)
train_prob = clf.predict_proba(X_train)
print('Accuracy is:',clf.score(X_test, y_test))

Accuracy is: 0.48172757475083056


# draw Confusion Matrix

In [29]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from IPython.display import display, HTML

In [30]:
def cm(test_y, predict_y,label,show=False):    
    cm = confusion_matrix(test_y, predict_y,labels=label)
    data = pd.DataFrame(data=cm)
    precision = []
    recall = []
    f_score = []
    for i in range(len(label)):
        precision.append(data[i][i]/sum(data.iloc[i,:]))
        recall.append(data[i][i]/sum(data.iloc[:,i]))
        f_score.append(precision[i] * recall[i] * 2/(precision[i] + recall[i]))
    if show == True:
        cf_rowidx = pd.MultiIndex.from_tuples([("actual", clazz) for clazz in label])
        cf_colidx = pd.MultiIndex.from_tuples([("predict", clazz) for clazz in label])
        display(HTML("<h4>Confusion Matrix</h4>"))
        display(pd.DataFrame(data=cm, columns=cf_colidx, index=cf_rowidx))
        display(HTML("<h4>Precision & Recall</h4>"))
    table = pd.DataFrame(data=[precision,recall,f_score],columns=label,).T 
    table.columns=['precision','recall','F-score']
    display(table.T)  

In [31]:
label = list(tags_counts.keys())
cm(y_test, y_pre_test,label,show=True)

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0_level_0,Unnamed: 1_level_0,predict,predict,predict,predict,predict,predict,predict,predict,predict,predict,predict,predict,predict,predict,predict,predict
Unnamed: 0_level_1,Unnamed: 1_level_1,NFT,RFT,RMN,BRC,FIX,CMT,ANT,DEL,MOD,DOC,DPD,VER,CLN,MDL,IMP,TST
actual,NFT,76,0,2,0,10,0,0,2,3,1,0,0,0,0,0,0
actual,RFT,6,7,1,0,1,0,0,1,1,0,0,0,1,0,0,0
actual,RMN,2,1,5,0,3,0,0,1,0,0,0,0,0,0,0,0
actual,BRC,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
actual,FIX,18,0,0,0,36,0,0,1,0,0,0,0,0,0,0,0
actual,CMT,1,0,0,0,2,1,0,1,0,0,0,0,1,0,0,0
actual,ANT,3,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
actual,DEL,1,0,0,0,1,0,0,4,2,0,0,0,0,0,0,0
actual,MOD,33,0,0,0,5,0,0,2,12,0,0,0,0,0,0,0
actual,DOC,9,0,0,0,7,1,0,0,1,1,0,0,0,0,0,0


Unnamed: 0,NFT,RFT,RMN,BRC,FIX,CMT,ANT,DEL,MOD,DOC,DPD,VER,CLN,MDL,IMP,TST
precision,0.808511,0.388889,0.416667,0.0,0.654545,0.166667,0.0,0.5,0.230769,0.052632,0.0,0.0,0.125,0.0,1.0,0.5
recall,0.460606,0.777778,0.625,,0.493151,0.5,,0.25,0.6,0.5,,0.0,0.333333,,1.0,1.0
F-score,0.586873,0.518519,0.5,,0.5625,0.25,,0.333333,0.333333,0.095238,,,0.181818,,1.0,0.666667
