In [21]:
import pandas as pd
import numpy as np
import json
import traceback

# Input files, given as relative paths.
# JSON file holding the labelled bug pairs (read by get_pair_frame below).
pair_path = 'esmall_pairs.json'
# JSON file holding the bug reports themselves (read by extract_bugrepo below).
bugrepo_path = 'esmall_clear.json'

In [14]:
def get_pair_frame(file_path, encoding='utf-8'):
    '''
        Load labelled bug pairs from a JSON-lines file into a DataFrame.

        Every non-blank line of the file must be one JSON object carrying
        the keys "bug1", "bug2" and "dec"; any other keys are ignored.

        Parameters
        ----------
        file_path : str or path-like
            Location of the JSON-lines file.
        encoding : str, default 'utf-8'
            Text encoding used to decode the file. Passed explicitly so the
            result does not depend on the platform's locale default.

        Returns
        -------
        pandas.DataFrame
            Columns ['bug1', 'bug2', 'dec'], one row per record, in file
            order. An empty file yields an empty frame with those columns.

        Raises
        ------
        KeyError
            If a record lacks one of the three required keys.
        json.JSONDecodeError
            If a non-blank line is not valid JSON.
    '''
    # The context manager closes the handle as soon as parsing is done.
    with open(file_path, encoding=encoding) as handle:
        # Whitespace-only lines carry no record, so skip them instead of
        # letting json.loads fail on an empty string.
        records = [json.loads(line) for line in handle if line.strip()]
    rows = [[rec['bug1'], rec['bug2'], rec['dec']] for rec in records]
    return pd.DataFrame(rows, columns=['bug1', 'bug2', 'dec'])

In [15]:
# Load the bug pairs from pair_path into a DataFrame and preview its first rows.
pairs = get_pair_frame(pair_path)
pairs.head()

Unnamed: 0,bug1,bug2,dec
0,214301,214611,1
1,214623,214825,-1
2,214445,214451,1
3,214466,214452,1
4,214181,214620,-1


In [1]:
def extract_bugrepo(file_path, encoding='utf-8'):
    '''
        Load bug reports from a JSON-lines file into a DataFrame.

        Every non-blank line of the file must be one JSON object shaped
        like this:
        {
            "_id"         :{"$oid":"52e9a43354dc1c24f597bef8"},
            "bug_id"      :"214065",
            "product"     :"BIRT",
            "description" :"Description:\\n[Regression] ...",
            "bug_severity":"normal",
            "dup_id"      :[],
            "short_desc"  :"[Regression]Group TOC are create ... PDF",
            "priority"    :"P3",
            "version"     :"2.3.0",
            "component"   :"Report Engine",
            "delta_ts"    :"2008-01-02 21:38:46 -0500",
            "bug_status"  :"CLOSED",
            "creation_ts" :"2008-01-02 00:34:00 -0500",
            "resolution"  :"FIXED"
        }
        The "_id" key is not copied into the result, and "short_desc" is
        exposed under the column name "summary".

        Parameters
        ----------
        file_path : str or path-like
            Location of the JSON-lines file.
        encoding : str, default 'utf-8'
            Text encoding used to decode the file. Passed explicitly so the
            result does not depend on the platform's locale default.

        Returns
        -------
        pandas.DataFrame
            One row per record, in file order, with the columns
            bug_id, product, description, bug_severity, dup_id, summary,
            priority, version, component, delta_ts, bug_status,
            creation_ts, resolution.

        Raises
        ------
        KeyError
            If a record lacks one of the extracted keys.
        json.JSONDecodeError
            If a non-blank line is not valid JSON.
    '''
    # Keys pulled out of each record, in output column order. Keeping a single
    # list prevents the row layout and the column names from drifting apart.
    fields = ('bug_id', 'product', 'description', 'bug_severity',
              'dup_id', 'short_desc', 'priority', 'version',
              'component', 'delta_ts', 'bug_status', 'creation_ts',
              'resolution')

    # The context manager closes the handle as soon as parsing is done.
    with open(file_path, encoding=encoding) as handle:
        # Whitespace-only lines carry no record, so skip them instead of
        # letting json.loads fail on an empty string.
        records = [json.loads(line) for line in handle if line.strip()]

    rows = [[rec[field] for field in fields] for rec in records]
    # change short_desc to summary
    columns = ['summary' if field == 'short_desc' else field for field in fields]
    return pd.DataFrame(rows, columns=columns)

In [19]:
# Load the bug reports from bugrepo_path into a DataFrame and preview its first rows.
bug_repos = extract_bugrepo(bugrepo_path)
bug_repos.head()

Unnamed: 0,bug_id,product,description,bug_severity,dup_id,summary,priority,version,component,delta_ts,bug_status,creation_ts,resolution
0,214065,BIRT,Description:\n[Regression]Group TOC are create...,normal,[],[Regression]Group TOC are created automaticall...,P3,2.3.0,Report Engine,2008-01-02 21:38:46 -0500,CLOSED,2008-01-02 00:34:00 -0500,FIXED
1,214070,BIRT,Output column page in data set editor used Res...,normal,[],ResultSetColumnHandle should not be cached in ...,P3,2.3.0,Data,2008-01-02 04:14:41 -0500,RESOLVED,2008-01-02 01:55:00 -0500,FIXED
2,214068,BIRT,Description:\n[Regression]Failed to preview Ch...,critical,[],[Regression]Failed to preview Chart Viewer Exa...,P3,2.3.0,Build,2008-01-02 04:32:33 -0500,CLOSED,2008-01-02 01:35:00 -0500,FIXED
3,214072,BIRT,Description:\n Exception is thrown out when l...,normal,[],[Automation]Exception is thrown out when link ...,P3,2.3.0,Report Engine,2008-01-02 21:42:39 -0500,CLOSED,2008-01-02 02:10:00 -0500,WORKSFORME
4,214071,Platform,Build ID: M20070921-1145\n\nSteps To Reproduce...,normal,[],[Help] About eclipse help pop-up information d...,P3,3.3.1,SWT,2009-01-23 15:01:34 -0500,RESOLVED,2008-01-02 01:58:00 -0500,WORKSFORME


In [38]:
def gen(col, bug_repos, pairs):
    '''
        Choose a specific column of the bug reports and line it up per pair.

        For every row of `pairs`, look up the `col` value of the two bugs
        in `bug_repos` and emit them next to the pair's label.

        Parameters
        ----------
        col : str
            Name of the `bug_repos` column to extract (e.g. 'summary').
        bug_repos : pandas.DataFrame
            Must contain a 'bug_id' column and the column `col`. Ids are
            matched against str(bug1) / str(bug2), so they are expected to
            be stored as strings. When an id occurs more than once, the
            first occurrence is used.
        pairs : pandas.DataFrame
            Must contain the columns 'bug1', 'bug2' and 'dec'.

        Returns
        -------
        pandas.DataFrame
            Columns [col + '_bug1', col + '_bug2', 'dec'], one row per pair
            for which both bugs were found, in the order of `pairs`.
            Pairs with an unknown bug id are skipped, and a one-line summary
            of the skipped pairs is printed.

        Raises
        ------
        KeyError
            If `bug_repos` lacks 'bug_id' or `col`, or `pairs` lacks one of
            'bug1', 'bug2', 'dec'.
    '''
    # Build the id -> value table once instead of scanning the whole frame
    # twice for every pair. keep='first' picks the first row for a repeated id.
    first_rows = bug_repos.drop_duplicates(subset='bug_id', keep='first')
    lookup = dict(zip(first_rows['bug_id'], first_rows[col]))

    res = []
    skipped = []
    # Iterate column-wise rather than with iterrows(): iterrows() upcasts a
    # mixed-dtype row to a common type, which can turn an integer id into a
    # float and make str() produce '214301.0'.
    for bug1, bug2, dec in zip(pairs['bug1'], pairs['bug2'], pairs['dec']):
        key1, key2 = str(bug1), str(bug2)
        if key1 in lookup and key2 in lookup:
            res.append([lookup[key1], lookup[key2], dec])
        else:
            # Best effort: a pair that references an unknown bug is dropped.
            skipped.append((bug1, bug2))

    if skipped:
        print('gen: skipped %d pair(s) whose bug ids are missing from bug_repos, e.g. %s'
              % (len(skipped), skipped[:10]))

    return pd.DataFrame(res, columns=[col + '_bug1', col + '_bug2', 'dec'])

In [39]:
# Pair up the 'summary' text of both bugs in every pair, keeping the pair's
# 'dec' label, and preview the first ten rows.
summary = gen('summary', bug_repos, pairs)
summary.head(10)

Unnamed: 0,summary_bug1,summary_bug2,dec
0,[update] could not load tasklist hyperlink det...,[update] Sometimes but not selden i get the er...,1
1,WSE hangs in external browser after invoking f...,Loading model aborts on non-fatal error,-1
2,[Regression]<Select value...> can not select a...,Select value in table filter condition panel d...,1
3,Group completion options issue,[Group Code Assist] No code completion for und...,1
4,Add org.apache.bcel,[api tooling] comments from Eugene,-1
5,Support cube filter in chart,add API-3.0 and other common tags to project s...,-1
6,Max Rydahl Andersen's blog feed contains comments,Move my feed from blog.xam.dk to in.relation.to,1
7,[Regression] Highlight can not be added and th...,Submitting task fails with invalid date / time...,-1
8,Notification e-mails not sent for committer el...,[Regression] The error is of no default value ...,-1
9,unable to reassign tasks if no permissions to ...,cmdbf services make eclipse-specific references,-1


In [41]:
# Write the three frames to CSV files, without the index column.
# NOTE(review): GB18030 is presumably chosen so the files open cleanly in a
# Chinese-locale spreadsheet tool; confirm that downstream readers use the
# same encoding.
pairs.to_csv('esmall_pairs.csv', index=False, encoding='GB18030')
bug_repos.to_csv('esmall_bug_repos.csv', index=False, encoding='GB18030')
summary.to_csv('esmall_summary.csv', index=False, encoding='GB18030')