In [14]:
import pandas as pd
import os
import traceback

def read_csvs(path):
    """Load every ``.csv`` file directly inside ``path`` into one DataFrame.

    Files are read in sorted name order so the row order of the result does
    not depend on the order in which the OS lists the directory. A file that
    cannot be read is reported on stdout (with its traceback) and skipped.

    Args:
        path: Directory holding the CSV files. A trailing path separator is
            optional.

    Returns:
        One DataFrame with the rows of all readable files, re-indexed from 0.

    Raises:
        ValueError: If no CSV file in ``path`` could be read.
    """
    dfs = []
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        try:
            # os.path.join is correct with or without a trailing separator;
            # plain `path + file` produced a wrong file name without one.
            df = pd.read_csv(os.path.join(path, file), sep=',', lineterminator='\n')
            dfs.append(df)
        except Exception:
            # Best effort: report the broken file and keep loading the rest.
            print(f"Error in file: {file}")
            print(traceback.format_exc())
    if not dfs:
        # pd.concat([]) would raise a ValueError with an unhelpful message.
        raise ValueError(f"No readable CSV files found in {path!r}")
    return pd.concat(dfs, ignore_index=True)


In [15]:
# Load all per-project CSVs from the fixes directory into one frame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine where this directory exists.
fixes_dir = "/home/r4ph/desenv/phd/exception-miner/output/fixes_2/"
df = read_csvs(path=fixes_dir)

In [16]:
# Number of distinct values in the `file` column.
len(df["file"].unique())

1323

In [17]:
# List the columns of the loaded frame (identifiers plus the n_* / str_* metrics).
df.columns

Index(['file', 'function', 'func_body', 'project', 'commit_fix', 'repo_url',
       'url_issue', 'str_uncaught_exceptions', 'n_try_except', 'n_try_pass',
       'n_finally', 'n_generic_except', 'n_raise', 'n_captures_broad_raise',
       'n_captures_try_except_raise', 'n_captures_misplaced_bare_raise',
       'n_try_else', 'n_try_return', 'str_except_identifiers',
       'str_raise_identifiers', 'str_except_block', 'n_nested_try',
       'n_bare_except', 'n_bare_raise_finally'],
      dtype='object')

In [18]:
# Corpus-wide totals: distinct files and functions, plus the summed count of
# each exception-handling smell. Mixing a bare callable with ['sum'] lists
# yields one row per aggregation name ('nunique', 'sum'), with NaN where an
# aggregation was not requested for a column.
total_aggregations = {
    'file': pd.Series.nunique,
    'function': pd.Series.nunique,
    'n_try_pass': ['sum'],
    'n_generic_except': ['sum'],
    'n_nested_try': ['sum'],
    'n_bare_except': ['sum'],
    'n_captures_misplaced_bare_raise': ['sum'],
    'n_captures_broad_raise': ['sum'],
    'n_bare_raise_finally': ['sum'],
}
df_total = df.agg(total_aggregations)

In [19]:
# Show the totals table: one 'nunique' row (file, function) and one 'sum' row (smell counts).
df_total

Unnamed: 0,file,function,n_try_pass,n_generic_except,n_nested_try,n_bare_except,n_captures_misplaced_bare_raise,n_captures_broad_raise,n_bare_raise_finally
nunique,1323.0,24631.0,,,,,,,
sum,,,1132.0,1308.0,106.0,385.0,11.0,292.0,2.0


In [20]:
# Preview of the per-fix aggregation: one row per
# (project, commit_fix, url_issue, repo_url) with distinct file/function
# counts and the summed smell counts. Nothing is assigned here.
(
    df.groupby(['project', 'commit_fix', 'url_issue', 'repo_url'], as_index=False)
    .agg({
        'file': pd.Series.nunique,
        'function': pd.Series.nunique,
        'n_try_pass': ['sum'],
        'n_generic_except': ['sum'],
        'n_nested_try': ['sum'],
        'n_bare_except': ['sum'],
        'n_captures_misplaced_bare_raise': ['sum'],
        'n_captures_broad_raise': ['sum'],
        'n_bare_raise_finally': ['sum'],
    })
    .reset_index()
)

Unnamed: 0_level_0,index,project,commit_fix,url_issue,repo_url,file,function,n_try_pass,n_generic_except,n_nested_try,n_bare_except,n_captures_misplaced_bare_raise,n_captures_broad_raise,n_bare_raise_finally
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,nunique,nunique,sum,sum,sum,sum,sum,sum,sum
0,0,aiohttp,20bfadcbfcf2be8e03e9e7fb0868a196e12b8b1c,https://github.com/aio-libs/aiohttp/issues/3628,https://github.com/aio-libs/aiohttp,2,193,0,0,2,0,0,0,0
1,1,ansible,01ba3a4efcb7aa830e3c726de58a0690756dc6f1,https://github.com/ansible/ansible/issues/35968,https://github.com/ansible/ansible,7,30,1,3,0,2,0,0,0
2,2,ansible,01e1b51e33d2c8611c34e208606286d1efb59a61,https://github.com/ansible/ansible/issues/41142,https://github.com/ansible/ansible,1,18,0,0,0,0,0,0,0
3,3,ansible,06d997b2b2c3034dd5d567127df54b93f8ee0f34,https://github.com/ansible/ansible/issues/65861,https://github.com/ansible/ansible,1,14,0,1,0,0,0,0,0
4,4,ansible,0c73e47a42f69901ea892f9d0e58acb554f4e668,https://github.com/ansible/ansible/issues/61650,https://github.com/ansible/ansible,1,19,2,11,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,435,wagtail,a3473544b8aef16e16147a0cdeb8e3247943a66b,https://github.com/wagtail/wagtail/issues/1785,https://github.com/wagtail/wagtail,1,4,0,0,0,0,0,0,0
436,436,wifiphisher,2f03bcde111542d5a9556747927dbf8944057ebb,https://github.com/wifiphisher/wifiphisher/iss...,https://github.com/wifiphisher/wifiphisher,2,58,0,0,0,0,0,0,0
437,437,youtube-dl,1c11204056566c2983f0a837897d882581880f41,https://github.com/ytdl-org/youtube-dl/issues/1,https://github.com/ytdl-org/youtube-dl,1,1,1,0,0,0,0,0,0
438,438,zulip,298c59f7fd3887422207916aa7d44955a6166d9e,https://github.com/zulip/zulip/issues/6639,https://github.com/zulip/zulip,4,134,0,6,0,0,0,0,0


In [21]:
# Per-fix aggregation: one row per (project, commit_fix, url_issue, repo_url)
# with distinct file/function counts and the summed smell counts.
# NOTE(review): as_index=False already keeps the group keys as columns, so the
# trailing reset_index() only adds an extra 'index' column; it is kept here
# because that column is part of the frame the later cells export.
fix_group_keys = ['project', 'commit_fix', 'url_issue', 'repo_url']
fix_aggregations = {
    'file': pd.Series.nunique,
    'function': pd.Series.nunique,
    'n_try_pass': ['sum'],
    'n_generic_except': ['sum'],
    'n_nested_try': ['sum'],
    'n_bare_except': ['sum'],
    'n_captures_misplaced_bare_raise': ['sum'],
    'n_captures_broad_raise': ['sum'],
    'n_bare_raise_finally': ['sum'],
}
df_rq5 = (
    df.groupby(fix_group_keys, as_index=False)
    .agg(fix_aggregations)
    .reset_index()
)

In [22]:
# Flatten the two-level column header produced by the aggregation by dropping
# the second level (the aggregation names 'nunique' / 'sum').
# Guarded so that re-running this cell is a no-op instead of raising once the
# columns are already single-level.
if df_rq5.columns.nlevels > 1:
    df_rq5 = df_rq5.droplevel(1, axis=1)

In [23]:
# Columns holding the per-smell counts that are summed into `has_smell`.
smells_cols = [
    'n_try_pass',
    'n_generic_except',
    'n_nested_try',
    'n_bare_except',
    'n_captures_misplaced_bare_raise',
    'n_captures_broad_raise',
    'n_bare_raise_finally',
]

In [24]:
# Row-wise total of all smell counts. Despite the name, `has_smell` is a
# count, not a boolean; the cells below test it with `> 0`.
df_rq5['has_smell'] = df_rq5.loc[:, smells_cols].sum(axis='columns')

In [28]:
# Preview the fixes that contain at least one smell (nothing is assigned).
df_rq5.loc[df_rq5['has_smell'] > 0]

Unnamed: 0,index,project,commit_fix,url_issue,repo_url,file,function,n_try_pass,n_generic_except,n_nested_try,n_bare_except,n_captures_misplaced_bare_raise,n_captures_broad_raise,n_bare_raise_finally,has_smell
0,0,aiohttp,20bfadcbfcf2be8e03e9e7fb0868a196e12b8b1c,https://github.com/aio-libs/aiohttp/issues/3628,https://github.com/aio-libs/aiohttp,2,193,0,0,2,0,0,0,0,2.0
1,1,ansible,01ba3a4efcb7aa830e3c726de58a0690756dc6f1,https://github.com/ansible/ansible/issues/35968,https://github.com/ansible/ansible,7,30,1,3,0,2,0,0,0,6.0
3,3,ansible,06d997b2b2c3034dd5d567127df54b93f8ee0f34,https://github.com/ansible/ansible/issues/65861,https://github.com/ansible/ansible,1,14,0,1,0,0,0,0,0,1.0
4,4,ansible,0c73e47a42f69901ea892f9d0e58acb554f4e668,https://github.com/ansible/ansible/issues/61650,https://github.com/ansible/ansible,1,19,2,11,0,0,0,0,0,13.0
5,5,ansible,0d905a0496f4554a9de57cbd3ee90e30d6249b34,https://github.com/ansible/ansible/issues/63140,https://github.com/ansible/ansible,1,35,5,12,0,0,0,1,0,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432,432,vision,d427f36510798863a3953ba4ebf6ab364717bbbb,https://github.com/pytorch/vision/issues/6672,https://github.com/pytorch/vision,1,9,2,0,0,0,0,0,0,2.0
433,433,wagtail,41eccc72bcc68924d801e4048eef872ec5826897,https://github.com/wagtail/wagtail/issues/6352,https://github.com/wagtail/wagtail,28,596,2,1,0,0,0,0,0,3.0
437,437,youtube-dl,1c11204056566c2983f0a837897d882581880f41,https://github.com/ytdl-org/youtube-dl/issues/1,https://github.com/ytdl-org/youtube-dl,1,1,1,0,0,0,0,0,0,1.0
438,438,zulip,298c59f7fd3887422207916aa7d44955a6166d9e,https://github.com/zulip/zulip/issues/6639,https://github.com/zulip/zulip,4,134,0,6,0,0,0,0,0,6.0


In [29]:
# Keep only the fixes whose total smell count is positive.
df_rq5 = df_rq5.loc[df_rq5['has_smell'].gt(0)]

In [31]:
# (rows, columns) of the filtered frame, i.e. how many fixes have at least one smell.
df_rq5.shape

(321, 15)

In [32]:
# Persist the filtered per-fix table. The output directory is created first so
# the export does not fail on a fresh checkout; the file name is a plain
# string because it has no placeholders.
os.makedirs("../output/fixes_val", exist_ok=True)
df_rq5.to_csv("../output/fixes_val/fixes_2.csv", index=False)

In [70]:
import numpy as np

# Shuffle all rows with a fixed seed (reproducible order), then split them
# into 2 nearly equal parts.
shuffled = df_rq5.sample(frac=1, random_state=40)
# Split the row positions and slice with .iloc instead of handing the frame to
# np.array_split: on a DataFrame that call goes through DataFrame.swapaxes,
# which is deprecated in recent pandas. Chunk sizes are the same as before and
# `result` is still a list of DataFrames.
split_positions = np.array_split(np.arange(len(shuffled)), 2)
result = [shuffled.iloc[positions] for positions in split_positions]

In [73]:
# Inspect the second of the two shuffled parts.
result[1]

Unnamed: 0,index,project,commit_fix,url_issue,repo_url,file,function,n_try_pass,n_generic_except,n_nested_try,n_bare_except,n_captures_misplaced_bare_raise,n_captures_broad_raise,n_bare_raise_finally,has_smell
312,312,pandas,89ad14ddd49287dfed1b4586be2d2f8a3d723868,https://github.com/pandas-dev/pandas/issues/2971,https://github.com/pandas-dev/pandas,2,29,3,5,0,1,0,0,0,9.0
17,17,ansible,26fff6f5c32fa4396a8c72872c729b367739499c,https://github.com/ansible/ansible/issues/59527,https://github.com/ansible/ansible,2,32,1,5,0,0,0,3,0,9.0
337,337,pipenv,baef2e78a632168bddce0a428969b9f6fb32693c,https://github.com/pypa/pipenv/issues/3017,https://github.com/pypa/pipenv,60,596,20,11,2,0,1,3,0,37.0
197,197,jupyterhub,cb25d29b0bab438b5fd68b92036e0e25134b7ace,https://github.com/jupyterhub/jupyterhub/issue...,https://github.com/jupyterhub/jupyterhub,1,1,0,1,0,0,0,0,0,1.0
189,189,ipython,811d52cc05312485b22c67b07d9161eb42c60646,https://github.com/ipython/ipython/issues/5790,https://github.com/ipython/ipython,2,68,4,6,0,5,0,0,0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,399,robotframework,af92ca7269cf5edbff9078d0ac4a927782a46a3d,https://github.com/robotframework/robotframewo...,https://github.com/robotframework/robotframework,1,171,2,0,2,6,0,0,0,10.0
216,216,loguru,285f7b7464134b963adea4e24a77324fcac41bbf,https://github.com/delgan/loguru/issues/237,https://github.com/delgan/loguru,2,47,0,0,0,0,0,1,0,1.0
9,9,ansible,16bd93a14eb1b6f3b8292e33c62c9e8383e7bdb4,https://github.com/ansible/ansible/issues/25882,https://github.com/ansible/ansible,2,38,6,6,0,0,0,0,0,12.0
289,289,openbbterminal,b939fbd9b8c68ab207d0a9959ef637bfb32d3643,https://github.com/openbb-finance/openbbtermin...,https://github.com/openbb-finance/openbbterminal,7,66,1,1,0,0,0,0,0,2.0


In [74]:
# Write each part to its own CSV, numbered from 1.
for part, chunk in enumerate(result, start=1):
    chunk.to_csv(f"../output/fixes_val/fixes_part_{part}.csv", index=False)