In [18]:
import pandas as pd
import numpy as np
import psycopg2
from collections import Counter

### Get Commit id and Labels

In [27]:
# Load the manually labelled commits from the 'New Commits' sheet of the workbook.
df_origin = pd.read_excel(
    './data/commit data.xlsx',
    sheet_name='New Commits',
)
print(df_origin.shape)
df_origin.head()

(1925, 8)


Unnamed: 0,project name,Commit ID,label1,label2,label3,label4,label5,label6
0,apache-avro,126e976,Testing,,,,,Bug fix
1,apache-avro,2df0775,Testing,,,,,Maintenance
2,apache-avro,a39e6de,Testing,,,,,Bug fix
3,apache-avro,2020c8a,Testing,,,,Maintenance,Bug fix
4,apache-avro,fa0059c,,,,,Maintenance,Bug fix


### Get Multiple Labels

In [28]:
def create_label(row):
    """Collect the non-missing values stored under 'label1' .. 'label6' of ``row``.

    Args:
        row: mapping-like object (e.g. one DataFrame row) that can be indexed
            with the keys 'label1' through 'label6'.

    Returns:
        list with the values that are not null, in label1..label6 order.
    """
    label_values = (row['label%d' % slot] for slot in range(1, 7))
    return [value for value in label_values if pd.notnull(value)]

# Attach the per-commit list of labels, save a snapshot to CSV, and preview the frame.
df_origin['categories'] = df_origin.apply(create_label, axis=1)
df_origin.to_csv('orignal_commit_data.csv')
df_origin.head()

Unnamed: 0,project name,Commit ID,label1,label2,label3,label4,label5,label6,categories
0,apache-avro,126e976,Testing,,,,,Bug fix,"[Testing, Bug fix]"
1,apache-avro,2df0775,Testing,,,,,Maintenance,"[Testing, Maintenance]"
2,apache-avro,a39e6de,Testing,,,,,Bug fix,"[Testing, Bug fix]"
3,apache-avro,2020c8a,Testing,,,,Maintenance,Bug fix,"[Testing, Maintenance, Bug fix]"
4,apache-avro,fa0059c,,,,,Maintenance,Bug fix,"[Maintenance, Bug fix]"


### Get Commit Message and Full Csha

In [29]:
# Work on a copy that no longer carries the six raw label columns.
df = df_origin.copy().drop(
    columns=['label1', 'label2', 'label3', 'label4', 'label5', 'label6']
)
df.head()

Unnamed: 0,project name,Commit ID,categories
0,apache-avro,126e976,"[Testing, Bug fix]"
1,apache-avro,2df0775,"[Testing, Maintenance]"
2,apache-avro,a39e6de,"[Testing, Bug fix]"
3,apache-avro,2020c8a,"[Testing, Maintenance, Bug fix]"
4,apache-avro,fa0059c,"[Maintenance, Bug fix]"


In [30]:
def get_csha_and_message(df, dsn=None):
    """Look up the full commit hash and commit message for every row of ``df``.

    Each value of ``df['Commit ID']`` is stripped and used as a prefix in a
    ``LIKE`` query against the ``commits`` table; the first row returned by
    the database is taken as the match.

    Args:
        df: DataFrame with a 'Commit ID' column holding commit-hash prefixes.
        dsn: optional connection string passed to ``psycopg2.connect``.
            When omitted, the connection string this notebook was originally
            written with is used.

    Returns:
        (commit_message, cshas): two lists with one entry per row of ``df``.
        Rows without a match get the string 'None' in both lists.

    Raises:
        psycopg2.Error: if the connection cannot be opened or a query fails.
    """
    if dsn is None:
        # NOTE(review): hard-coded credentials are kept only so existing calls
        # keep working; prefer passing `dsn` from configuration instead.
        dsn = "dbname='squad' user='apple' password = 'mst123456' host='localhost'"

    try:
        conn = psycopg2.connect(dsn)
    except psycopg2.Error:
        # Previously execution continued and failed later on an unbound `conn`;
        # report the problem and surface the real connection error instead.
        print ("I am unable to connect to the database")
        raise

    commit_message = []
    cshas = []
    try:
        with conn.cursor() as cur:
            for index, csha in enumerate(df['Commit ID'].values):
                # Parameterized query: the driver quotes the prefix pattern.
                cur.execute(
                    "SELECT message, csha FROM commits WHERE csha LIKE %s",
                    (csha.strip() + '%',),
                )
                # NOTE(review): only the first matching row is used; a short
                # prefix that matches several commits is not detected.
                fetched_row = cur.fetchone()
                if fetched_row is not None:
                    if index < 5:
                        # Preview of the first few lookups.
                        print('============================================')
                        print('csha:%s'%fetched_row[1])
                        print(repr(fetched_row[0]))
                    commit_message.append(fetched_row[0])
                    cshas.append(fetched_row[1])
                else:
                    if index < 5:
                        print('None')
                    # The string 'None' is the sentinel for "no match found".
                    commit_message.append('None')
                    cshas.append('None')
    finally:
        # Always release the database connection, even when a query fails.
        conn.close()

    assert len(commit_message) == df.shape[0]
    assert len(cshas) == df.shape[0]

    return commit_message, cshas

# Run one lookup per commit; both returned lists have exactly one entry per row of df.
commit_message, csha_list = get_csha_and_message(df)

csha:126e9769f45f978f42321c4fc465198982df482b
'AVRO-906. Java: Fix so that ordering of schema properties is consistent.\n\ngit-svn-id: https://svn.apache.org/repos/asf/avro/trunk@1179356 13f79535-47bb-0310-9956-ffa450edef68\n'
csha:2df0775d2f368b326e3ac6442ce4850e3fe62edc
'AVRO-2003: Report specific location of schema incompatibilities\n\nCloses #201\n\nSigned-off-by: Nandor Kollar <nkollar@apache.org>\n'
csha:a39e6deea243dcd9a09df8f67af724003dec1d02
'AVRO-1099. Java: Fix JsonDecoder to permit floats and doubles to be read from JSON values without decimal points, and for ints and longs to be read from JSON values with decimal points.\n\ngit-svn-id: https://svn.apache.org/repos/asf/avro/trunk@1347779 13f79535-47bb-0310-9956-ffa450edef68\n'
csha:2020c8a8cc19c58eaef2d9de75260e7341099038
'AVRO-1046. Java: Fix ReflectDatumReader to be able to read generic and specific arrays.\n\ngit-svn-id: https://svn.apache.org/repos/asf/avro/trunk@1339864 13f79535-47bb-0310-9956-ffa450edef68\n'
csha:fa00

In [31]:
# Add the commit messages and overwrite 'Commit ID' with the values returned by the lookup.
df = df.assign(**{'commit_message': commit_message, 'Commit ID': csha_list})
df.head()

Unnamed: 0,project name,Commit ID,categories,commit_message
0,apache-avro,126e9769f45f978f42321c4fc465198982df482b,"[Testing, Bug fix]",AVRO-906. Java: Fix so that ordering of schema...
1,apache-avro,2df0775d2f368b326e3ac6442ce4850e3fe62edc,"[Testing, Maintenance]",AVRO-2003: Report specific location of schema ...
2,apache-avro,a39e6deea243dcd9a09df8f67af724003dec1d02,"[Testing, Bug fix]",AVRO-1099. Java: Fix JsonDecoder to permit flo...
3,apache-avro,2020c8a8cc19c58eaef2d9de75260e7341099038,"[Testing, Maintenance, Bug fix]",AVRO-1046. Java: Fix ReflectDatumReader to be ...
4,apache-avro,fa0059c55a31813634188eb85e53b68e33644489,"[Maintenance, Bug fix]",AVRO-607: Java: Make SpecificData schema cache...


### Expand Labels to Binary Features

In [32]:
from sklearn.preprocessing import MultiLabelBinarizer

def generate_binary_features(df):
    """Add one 0/1 indicator column per label found in ``df['categories']``.

    The set of labels is collected from the 'categories' column, sorted, and
    handed to ``MultiLabelBinarizer`` as its class list; every class then
    becomes a column of ``df`` named after the label.

    Args:
        df: DataFrame whose 'categories' column holds an iterable of labels
            per row. It is modified in place.

    Returns:
        The same ``df`` object with the indicator columns added.
    """
    label_lists = df['categories'].values

    # Tally every label so that the binarizer gets the complete class list.
    tag_totals = Counter(tag for tags in label_lists for tag in list(tags))

    binarizer = MultiLabelBinarizer(classes=sorted(tag_totals.keys()))
    indicator_matrix = binarizer.fit_transform(label_lists)

    # Column i of the matrix belongs to the i-th class handed to the binarizer.
    for position, label in enumerate(binarizer.classes):
        df[label] = indicator_matrix[:, position]

    return df

# generate_binary_features adds the indicator columns to df itself and returns that same frame.
df = generate_binary_features(df)
df.head()

Unnamed: 0,project name,Commit ID,categories,commit_message,Bug fix,Build,Clean up,Cross,Data,Debug,...,Merge,Module Add,Module Move,Module Remove,Refactoring,Rename,Source Control,Testing,Token Replace,Versioning
0,apache-avro,126e9769f45f978f42321c4fc465198982df482b,"[Testing, Bug fix]",AVRO-906. Java: Fix so that ordering of schema...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,apache-avro,2df0775d2f368b326e3ac6442ce4850e3fe62edc,"[Testing, Maintenance]",AVRO-2003: Report specific location of schema ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,apache-avro,a39e6deea243dcd9a09df8f67af724003dec1d02,"[Testing, Bug fix]",AVRO-1099. Java: Fix JsonDecoder to permit flo...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,apache-avro,2020c8a8cc19c58eaef2d9de75260e7341099038,"[Testing, Maintenance, Bug fix]",AVRO-1046. Java: Fix ReflectDatumReader to be ...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,apache-avro,fa0059c55a31813634188eb85e53b68e33644489,"[Maintenance, Bug fix]",AVRO-607: Java: Make SpecificData schema cache...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Reorder Columns and Drop None Row

In [33]:
# Final column layout: identifiers, text feature, binary label columns, then the label list.
meta_info_cols = ['project name', 'Commit ID']
features_cols = ['commit_message']
label_cols = [
    'Maintenance', 'Testing', 'Feature Add', 'Bug fix', 'Documentation',
    'Clean up', 'Build', 'Refactoring', 'Indentation', 'Token Replace',
    'Source Control', 'Cross', 'Legal', 'Debug', 'Module Remove',
    'Module Move', 'Rename', 'Versioning', 'Merge', 'Initialization',
    'Internationalization', 'Data', 'Module Add',
]
categories_cols = ['categories']

cols = [*meta_info_cols, *features_cols, *label_cols, *categories_cols]

df = df[cols]

df.head()

Unnamed: 0,project name,Commit ID,commit_message,Maintenance,Testing,Feature Add,Bug fix,Documentation,Clean up,Build,...,Module Remove,Module Move,Rename,Versioning,Merge,Initialization,Internationalization,Data,Module Add,categories
0,apache-avro,126e9769f45f978f42321c4fc465198982df482b,AVRO-906. Java: Fix so that ordering of schema...,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[Testing, Bug fix]"
1,apache-avro,2df0775d2f368b326e3ac6442ce4850e3fe62edc,AVRO-2003: Report specific location of schema ...,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[Testing, Maintenance]"
2,apache-avro,a39e6deea243dcd9a09df8f67af724003dec1d02,AVRO-1099. Java: Fix JsonDecoder to permit flo...,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[Testing, Bug fix]"
3,apache-avro,2020c8a8cc19c58eaef2d9de75260e7341099038,AVRO-1046. Java: Fix ReflectDatumReader to be ...,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[Testing, Maintenance, Bug fix]"
4,apache-avro,fa0059c55a31813634188eb85e53b68e33644489,AVRO-607: Java: Make SpecificData schema cache...,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[Maintenance, Bug fix]"


In [34]:
# Rows whose 'Commit ID' is the sentinel string 'None' are removed.
print('Before Drop: %d' % len(df))
df = df.loc[df['Commit ID'].ne('None')].reset_index(drop=True)
print('After Drop: %d' % len(df))

Before Drop: 1925
After Drop: 1922


### Export to CSV

In [35]:
# Persist the cleaned, binarized dataset for later use.
df.to_csv('./data/commit_data_new.csv')

### Useful Functions to Process Labels

#### Convert str to List when reading data

In [None]:
# convert string to list : used for reuse categories
from ast import literal_eval

def convert_str_to_list(df, column_name):
    """Parse a column of stringified Python literals back into real objects.

    Intended for a 'categories'-style column whose lists were turned into
    strings when the frame was written to and read back from CSV.

    Args:
        df: DataFrame to update; the column is replaced in place.
        column_name: name of the column whose string values are parsed with
            ``ast.literal_eval``.

    Returns:
        The same ``df`` object, with ``column_name`` parsed.
    """
    # Values that are not strings (e.g. already-parsed lists) are left as they
    # are, so running the conversion twice does not fail.
    df[column_name] = df[column_name].apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

    # Show the element type as a quick sanity check; skip it for an empty frame.
    if len(df) > 0:
        print(type(df[column_name].values[0]))

    return df

#### Count labels and return the label columns

In [37]:
# tag counts and label columns
def get_tag_counts(df):
    """Count how often each label occurs in ``df['categories']`` and print the tally.

    Args:
        df: DataFrame whose 'categories' column holds an iterable of labels
            per row.

    Returns:
        (tags_counts, target_columns): a ``Counter`` mapping label -> number of
        occurrences, and the list of labels in first-seen order.
    """
    tags_counts = Counter()
    for row_tags in df['categories'].values:
        tags_counts.update(list(row_tags))

    # Print the labels from most to least frequent.
    ranked = sorted(tags_counts.items(), key=lambda pair: pair[1], reverse=True)
    for tag, total in ranked:
        print(tag, ":", total)

    return tags_counts, list(tags_counts)

#### Drop some of the labels

In [36]:
# drop labels
def drop_labels(df, labels):
    """
    Remove a set of labels from both the label columns and the 'categories' lists.

    Args:
    df - DataFrame with a 'categories' column and one column per label
    labels - list of label names to drop

    Returns:
    new_df - new DataFrame without the dropped labels; rows that end up with
             an empty 'categories' list are removed and the index is reset
    """
    def without_dropped(tags):
        # Keep only the tags that are not being dropped.
        return [tag for tag in tags if tag not in labels]

    pruned = df.copy()
    pruned['categories'] = pruned['categories'].apply(without_dropped)

    # Remove the indicator columns of the dropped labels.
    pruned = pruned.drop(labels, axis = 1)

    # Discard rows that have no label left, using a temporary helper column.
    pruned['number_of_labels'] = pruned['categories'].apply(len)
    still_labelled = pruned['number_of_labels'] != 0
    pruned = pruned[still_labelled].reset_index(drop = True)

    return pruned.drop(['number_of_labels'], axis = 1)

#### Group Labels

In [39]:
# group some of labels
def group_labels(df, labels_to_group, new_label):
    '''
    Merge several binary label columns into one binary label column.

    A row gets 1 in the new column when at least one of the grouped columns
    equals 1, otherwise 0. The grouped columns are removed, and when the frame
    has a 'categories' column its lists are rewritten so the grouped labels
    are replaced by the new label.

    Args:
        df - dataframe with one 0/1 column per label
        labels_to_group - List of label column names you want to group
        new_label - string - name of the column holding the grouped label;
                    it may be one of the names in labels_to_group

    Returns:
        new_df - new dataframe after grouping (df itself is left unchanged)
    '''
    grouped = list(labels_to_group)

    # Vectorized "any of the grouped columns is 1", computed from the input
    # frame before any column is removed.
    merged_flags = (df[grouped] == 1).any(axis = 1).astype(int)

    # Drop the old columns first: adding the new column before the drop would
    # delete it again whenever new_label is itself one of the grouped names.
    new_df = df.drop(grouped, axis = 1)
    new_df[new_label] = merged_flags

    if 'categories' in new_df.columns:
        def regroup(tags):
            # Values that are not lists/tuples (e.g. unparsed strings) are
            # left untouched.
            if not isinstance(tags, (list, tuple)):
                return tags
            merged = []
            for tag in tags:
                if tag in grouped:
                    tag = new_label
                # Several grouped tags collapse into a single new_label entry.
                if tag == new_label and new_label in merged:
                    continue
                merged.append(tag)
            return merged

        # Keep the label lists consistent with the remaining label columns.
        new_df['categories'] = new_df['categories'].apply(regroup)

    return new_df

#### Label_Counter

In [None]:
def categories_count(df, target_labels, verbose = True):
    '''
    Count, for every requested label, the rows whose label column equals 1.

    Args:
        df - Dataframe with one column per label
        target_labels - List of label column names to count
        verbose - Boolean - whether to print the counts

    Returns:
        Dict mapping each label to its number of rows

    '''
    count_cat = {label: len(df[df[label] == 1]) for label in target_labels}

    # Print one "label : count" line per entry when requested.
    if verbose == True:
        for label, total in count_cat.items():
            print(label + ' : ' + str(total))

    return count_cat

# NOTE(review): `new_df` and `new_target` are not defined in any cell shown in this notebook;
# they must be created (e.g. via drop_labels / group_labels) before this call can run.
categories_count(new_df, new_target)