In [34]:
import pandas as pd
import os

def read_csvs(path):
    """Load every ``.csv`` file directly inside ``path`` into one DataFrame.

    Args:
        path: Directory to scan (non-recursive). A trailing path separator
            is optional.

    Returns:
        A single DataFrame holding the rows of all CSV files stacked on top
        of each other, re-indexed from 0.

    Raises:
        ValueError: If ``path`` contains no ``.csv`` files.
    """
    # Sort the names so the row order of the result does not depend on the
    # arbitrary order in which os.listdir returns directory entries.
    csv_names = sorted(name for name in os.listdir(path) if name.endswith(".csv"))
    if not csv_names:
        raise ValueError(f"No .csv files found in {path!r}")
    # os.path.join builds a valid file path whether or not `path` ends with a
    # separator (plain string concatenation silently required one).
    frames = [pd.read_csv(os.path.join(path, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)


In [35]:
# Load all fix CSVs into one frame.
# NOTE(review): this is a machine-specific absolute path; the notebook will not
# run elsewhere until it is pointed at the local copy of the data.
df = read_csvs("/home/r4ph/desenv/phd/exception-miner/output/fixes/")

In [36]:
# Number of distinct values in the `file` column (a missing value, if any, counts once).
df["file"].nunique(dropna=False)

1429

In [37]:
# List every column of the combined frame.
df.columns

Index(['file', 'function', 'func_body', 'project', 'commit_fix', 'repo_url',
       'url_issue', 'str_uncaught_exceptions', 'n_try_except', 'n_try_pass',
       'n_finally', 'n_generic_except', 'n_raise', 'n_captures_broad_raise',
       'n_captures_try_except_raise', 'n_captures_misplaced_bare_raise',
       'n_try_else', 'n_try_return', 'str_except_identifiers',
       'str_raise_identifiers', 'str_except_block', 'n_nested_try',
       'n_bare_except', 'n_bare_raise_finally'],
      dtype='object')

In [38]:
# Whole-dataset totals: distinct files and functions, plus the sum of each
# smell counter. Mixing a bare aggregator with list aggregators yields one row
# per aggregator name ("nunique", "sum") with NaN where it does not apply.
df_total = df.agg(
    {
        'file': pd.Series.nunique,
        'function': pd.Series.nunique,
        **{
            counter: ['sum']
            for counter in (
                'n_try_pass',
                'n_generic_except',
                'n_nested_try',
                'n_bare_except',
                'n_captures_misplaced_bare_raise',
                'n_captures_broad_raise',
                'n_bare_raise_finally',
            )
        },
    }
)

In [39]:
# Display the dataset-wide totals.
df_total

Unnamed: 0,file,function,n_try_pass,n_generic_except,n_nested_try,n_bare_except,n_captures_misplaced_bare_raise,n_captures_broad_raise,n_bare_raise_finally
nunique,1429.0,29647.0,,,,,,,
sum,,,1301.0,1608.0,108.0,481.0,11.0,622.0,2.0


In [77]:
# Preview of the per-fix summary: one row per
# (project, commit_fix, url_issue, repo_url) with the number of distinct files
# and functions and the summed smell counters.
# as_index=False already keeps the group keys as columns; the trailing
# reset_index() additionally copies the positional index into an "index" column.
(
    df.groupby(['project', 'commit_fix', 'url_issue', 'repo_url'], as_index=False)
    .agg(
        {
            'file': pd.Series.nunique,
            'function': pd.Series.nunique,
            **{
                counter: ['sum']
                for counter in (
                    'n_try_pass',
                    'n_generic_except',
                    'n_nested_try',
                    'n_bare_except',
                    'n_captures_misplaced_bare_raise',
                    'n_captures_broad_raise',
                    'n_bare_raise_finally',
                )
            },
        }
    )
    .reset_index()
)

Unnamed: 0_level_0,index,project,commit_fix,url_issue,repo_url,file,function,n_try_pass,n_generic_except,n_nested_try,n_bare_except,n_captures_misplaced_bare_raise,n_captures_broad_raise,n_bare_raise_finally
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,nunique,nunique,sum,sum,sum,sum,sum,sum,sum
0,0,aiohttp,0d2e43bf2a920975a5da4d9295e0ba887080bf5b,https://github.com/aio-libs/aiohttp/issues/7237,https://github.com/aio-libs/aiohttp,2,143,1,0,0,0,0,0,0
1,1,altair,57c4158bba811bd76b6a37636ebd5d38b782c666,https://github.com/altair-viz/altair/issues/649,https://github.com/altair-viz/altair,1,2,0,0,0,0,0,0,0
2,2,ansible,019d078a5a457823e8d445d4e949b5ed041e2609,https://github.com/ansible/ansible/issues/55986,https://github.com/ansible/ansible,9,46,2,6,0,0,0,1,0
3,3,ansible,05879d331ae23b9c4a77b70425c3f8e48ab2e8ea,https://github.com/ansible/ansible/issues/20356,https://github.com/ansible/ansible,1,11,2,0,0,0,0,0,0
4,4,ansible,082f54eaf4c566f6862cf58f3dc05cf6a024f231,https://github.com/ansible/ansible/issues/28924,https://github.com/ansible/ansible,1,10,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
482,482,wagtail,a318c5ba7982be52d8f2b00336557a3ac476aa87,https://github.com/wagtail/wagtail/issues/2281,https://github.com/wagtail/wagtail,2,29,0,0,0,0,0,0,0
483,483,wifiphisher,2b181c1353d59ec961532daf66adf7fbbbca32b8,https://github.com/wifiphisher/wifiphisher/iss...,https://github.com/wifiphisher/wifiphisher,1,21,0,0,0,0,0,0,0
484,484,xx-net,a183aca3d13ce6dcf7795f4efbdc9502b5016cf2,https://github.com/xx-net/xx-net/issues/10649,https://github.com/xx-net/xx-net,2,16,0,4,0,1,0,0,0
485,485,youtube-dl,b3f0e5304807862ce72c136da90b860df805ee5c,https://github.com/ytdl-org/youtube-dl/issues/...,https://github.com/ytdl-org/youtube-dl,1,25,0,1,0,0,0,1,0


In [62]:
# Per-fix summary: one row per (project, commit_fix, url_issue, repo_url) with
# the number of distinct files and functions and the summed smell counters.
# The list-valued aggregators give the result two-level column labels.
# as_index=False already keeps the group keys as columns; the trailing
# reset_index() additionally copies the positional index into an "index" column.
df_rq5 = (
    df.groupby(['project', 'commit_fix', 'url_issue', 'repo_url'], as_index=False)
    .agg(
        {
            'file': pd.Series.nunique,
            'function': pd.Series.nunique,
            **{
                counter: ['sum']
                for counter in (
                    'n_try_pass',
                    'n_generic_except',
                    'n_nested_try',
                    'n_bare_except',
                    'n_captures_misplaced_bare_raise',
                    'n_captures_broad_raise',
                    'n_bare_raise_finally',
                )
            },
        }
    )
    .reset_index()
)

In [63]:
# Flatten the two-level column labels by dropping the aggregator level
# ("nunique"/"sum"), leaving plain column names.
# NOTE(review): this rebinds df_rq5, so the cell is meant to run once right
# after the aggregation; re-running it on the already-flattened frame is
# expected to fail.
df_rq5 = df_rq5.droplevel(level=1, axis="columns")

In [64]:
# Counter columns that are summed to decide whether a fix contains any smell.
smells_cols = [
    'n_try_pass',
    'n_generic_except',
    'n_nested_try',
    'n_bare_except',
    'n_captures_misplaced_bare_raise',
    'n_captures_broad_raise',
    'n_bare_raise_finally',
]

In [65]:
# Row-wise total of all smell counters. Despite its name, `has_smell` holds a
# count rather than a boolean; values above zero mean at least one smell.
df_rq5['has_smell'] = df_rq5.loc[:, smells_cols].sum(axis='columns')

In [68]:
# Preview only the fixes whose summed smell count is positive.
df_rq5.loc[df_rq5['has_smell'].gt(0)]

Unnamed: 0,index,project,commit_fix,url_issue,repo_url,file,function,n_try_pass,n_generic_except,n_nested_try,n_bare_except,n_captures_misplaced_bare_raise,n_captures_broad_raise,n_bare_raise_finally,has_smell
0,0,aiohttp,0d2e43bf2a920975a5da4d9295e0ba887080bf5b,https://github.com/aio-libs/aiohttp/issues/7237,https://github.com/aio-libs/aiohttp,2,143,1,0,0,0,0,0,0,1.0
2,2,ansible,019d078a5a457823e8d445d4e949b5ed041e2609,https://github.com/ansible/ansible/issues/55986,https://github.com/ansible/ansible,9,46,2,6,0,0,0,1,0,9.0
3,3,ansible,05879d331ae23b9c4a77b70425c3f8e48ab2e8ea,https://github.com/ansible/ansible/issues/20356,https://github.com/ansible/ansible,1,11,2,0,0,0,0,0,0,2.0
5,5,ansible,0b64408f5a2d2135afcb68988c51a3dd50e0124f,https://github.com/ansible/ansible/issues/5531,https://github.com/ansible/ansible,2,9,0,0,0,3,0,0,0,3.0
6,6,ansible,0c3216c5652411549ce32234a18c6ba2329d35ec,https://github.com/ansible/ansible/issues/44740,https://github.com/ansible/ansible,2,10,2,4,0,0,0,1,0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,480,vision,cdbbd6664bbd2e212739519aa0eb70c06252e88c,https://github.com/pytorch/vision/issues/7838,https://github.com/pytorch/vision,2,209,0,1,0,0,0,0,0,1.0
481,481,wagtail,8d1835f55c1700ec003fb11d299a7769a800a54c,https://github.com/wagtail/wagtail/issues/8699,https://github.com/wagtail/wagtail,4,107,1,2,1,0,0,2,0,6.0
484,484,xx-net,a183aca3d13ce6dcf7795f4efbdc9502b5016cf2,https://github.com/xx-net/xx-net/issues/10649,https://github.com/xx-net/xx-net,2,16,0,4,0,1,0,0,0,5.0
485,485,youtube-dl,b3f0e5304807862ce72c136da90b860df805ee5c,https://github.com/ytdl-org/youtube-dl/issues/...,https://github.com/ytdl-org/youtube-dl,1,25,0,1,0,0,0,1,0,2.0


In [69]:
# Keep only the fixes with at least one smell; every later step works on this subset.
df_rq5 = df_rq5.loc[lambda frame: frame['has_smell'] > 0]

In [70]:
import numpy as np

# Shuffle every row with a fixed seed so the split below is reproducible.
shuffled = df_rq5.sample(frac=1, random_state=40)
# Split the row *positions* and slice with iloc instead of handing the
# DataFrame itself to np.array_split: that call goes through
# DataFrame.swapaxes, which pandas has deprecated. The partition is unchanged:
# two consecutive, (near-)equal parts, the first taking the extra row when the
# row count is odd.
result = [
    shuffled.iloc[positions]
    for positions in np.array_split(np.arange(len(shuffled)), 2)
]

In [73]:
# Preview the second of the two shuffled parts.
result[1]

Unnamed: 0,index,project,commit_fix,url_issue,repo_url,file,function,n_try_pass,n_generic_except,n_nested_try,n_bare_except,n_captures_misplaced_bare_raise,n_captures_broad_raise,n_bare_raise_finally,has_smell
312,312,pandas,89ad14ddd49287dfed1b4586be2d2f8a3d723868,https://github.com/pandas-dev/pandas/issues/2971,https://github.com/pandas-dev/pandas,2,29,3,5,0,1,0,0,0,9.0
17,17,ansible,26fff6f5c32fa4396a8c72872c729b367739499c,https://github.com/ansible/ansible/issues/59527,https://github.com/ansible/ansible,2,32,1,5,0,0,0,3,0,9.0
337,337,pipenv,baef2e78a632168bddce0a428969b9f6fb32693c,https://github.com/pypa/pipenv/issues/3017,https://github.com/pypa/pipenv,60,596,20,11,2,0,1,3,0,37.0
197,197,jupyterhub,cb25d29b0bab438b5fd68b92036e0e25134b7ace,https://github.com/jupyterhub/jupyterhub/issue...,https://github.com/jupyterhub/jupyterhub,1,1,0,1,0,0,0,0,0,1.0
189,189,ipython,811d52cc05312485b22c67b07d9161eb42c60646,https://github.com/ipython/ipython/issues/5790,https://github.com/ipython/ipython,2,68,4,6,0,5,0,0,0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,399,robotframework,af92ca7269cf5edbff9078d0ac4a927782a46a3d,https://github.com/robotframework/robotframewo...,https://github.com/robotframework/robotframework,1,171,2,0,2,6,0,0,0,10.0
216,216,loguru,285f7b7464134b963adea4e24a77324fcac41bbf,https://github.com/delgan/loguru/issues/237,https://github.com/delgan/loguru,2,47,0,0,0,0,0,1,0,1.0
9,9,ansible,16bd93a14eb1b6f3b8292e33c62c9e8383e7bdb4,https://github.com/ansible/ansible/issues/25882,https://github.com/ansible/ansible,2,38,6,6,0,0,0,0,0,12.0
289,289,openbbterminal,b939fbd9b8c68ab207d0a9959ef637bfb32d3643,https://github.com/openbb-finance/openbbtermin...,https://github.com/openbb-finance/openbbterminal,7,66,1,1,0,0,0,0,0,2.0


In [74]:
# Write each part to its own CSV, numbering the files from 1.
# The target directory is relative to the notebook's working directory.
for part, r in enumerate(result, start=1):
    r.to_csv(f"../output/fixes_val/fixes_part_{part}.csv", index=False)