## Read Data

In [None]:
import pandas as pd

# Load the complete list of studies from the CSV file; ending the cell with
# the column index shows which fields are available for the later selection.
df = pd.read_csv(filepath_or_buffer='./all_studies.csv')
df.columns

## Define Constants

In [None]:
# Column labels for the bibliographic fields of the working frame.
IDENTIFIER, TITLE, AUTHOR = 'Identifier', 'Title', 'Author'
YEAR, ABSTRACT, DOI = 'Year', 'Abstract', 'DOI'

# Columns written by the title screening: combined result, automatic
# regex result, and manually whitelisted studies.
KEYWORD_MATCH, KEYWORD_MATCH_AUTO, KEYWORD_MATCH_MANUAL = (
    'Keyword Match',
    'Keyword Auto Match',
    'Keyword Manual Match',
)

# Columns written by the abstract screening, same layout as above.
ABSTRACT_MATCH, ABSTRACT_MATCH_AUTO, ABSTRACT_MATCH_MANUAL = (
    'Abstract Match',
    'Abstract Auto Match',
    'Abstract Manual Match',
)

# Flag column: True while a study is still part of the selection.
USE = 'Use'

## Reduce and Sort Data

In [None]:
# Columns taken from the raw export, paired position by position with the
# label each one carries in the working frame ('Custom1' becomes ABSTRACT).
source_columns = ['Identifier', 'Title', 'Author', 'Year', 'Custom1', 'DOI']
target_columns = [IDENTIFIER, TITLE, AUTHOR, YEAR, ABSTRACT, DOI]

ps = (
    df.loc[:, source_columns]
    .rename(columns=dict(zip(source_columns, target_columns)))
    .sort_values(by=[YEAR], ascending=False)  # newest studies first
    .assign(**{USE: True})                    # every study starts out selected
)
ps.head()

## Filter Duplicates

In [None]:
# Row count before de-duplication, needed for the report below.
rows_before = len(ps.index)

# Studies sharing the same title are duplicates; only the first one is kept.
ps = ps.drop_duplicates(subset=TITLE, keep='first')

rows_after = len(ps.index)
removed = rows_before - rows_after

print(f'Removed {str(removed)} duplicates ({rows_after}/{rows_before})')

## Download Studies

from scidownl.scihub import *
import os.path
import sys  # needed by the error handler below; previously never imported explicitly
from os import path

size = len(ps.index)

# Try to download every study into its own folder 'papers/<identifier>'.
# `position` is the 1-based position in the (sorted, de-duplicated) frame and
# is used for the progress message; `index` is the row label, which no longer
# runs from 0 to size-1 after sorting and dropping duplicates.
for position, (index, row) in enumerate(ps.iterrows(), start=1):
    doi = row[DOI]
    out = 'papers/' + row[IDENTIFIER]
    print(f'Download {position} of {size}')
    # A non-empty target folder means the study was fetched by an earlier run.
    # isdir (not exists) keeps os.listdir from failing on a stray regular file.
    if path.isdir(out) and len(os.listdir(out)) > 0:
        print('skipping')
        continue
    try:
        # Rotate over the mirror indices 2..5, keyed on the row label so each
        # study keeps the same mirror choice as before.
        sci = SciHub(doi, out).download(choose_scihub_url_index=(2 + index%4))
    except Exception as e:
        # Best effort: report the failure together with the affected study
        # and continue with the next one.
        print(f'{row[IDENTIFIER]}: {e}', file=sys.stderr)



## Filter all Studies where the Full-Text is not available

In [None]:
import os.path
from os import path

rows_before = len(ps.index)


def _has_full_text(identifier):
    """Return True when 'papers/<identifier>' is a directory with at least one entry.

    A missing directory counts as "no full text" instead of raising
    FileNotFoundError from os.listdir.
    """
    folder = 'papers/' + identifier
    return path.isdir(folder) and len(os.listdir(folder)) > 0


# Scan the file system once and reuse the mask for both subsets.
has_full_text = ps[IDENTIFIER].apply(_has_full_text).astype(bool)

# Record the studies without a full text before dropping them.
nft = ps[~has_full_text]
nft.to_csv('no_full-text.csv', index = False, header=True)

ps = ps[has_full_text]

rows_after = len(ps.index)
removed = rows_before - rows_after

print(f'Removed {str(removed)} papers, because the full-text was not available ({rows_after}/{rows_before})')
assert removed == 1, removed

## Manual search for removed papers

## Check Title for Keywords

In [None]:
# Automatic screening: a title passes when it contains one of the keywords
# (case-insensitive). na=False turns a missing title into "no match", and the
# AND with USE keeps already excluded rows unflagged, so the column is a plain
# boolean column without NaN.
title_hit = ps[TITLE].str.match(pat='.*(test|mutant|mutation|coverage).*', case=False, na=False)
ps[KEYWORD_MATCH_AUTO] = title_hit & ps[USE]

# Manual screening: studies whose title did not match but which are kept
# after inspection. isin() compares whole identifiers; the previous
# str.match() was only anchored at the start, so e.g. 'Li2014' would also
# have accepted a hypothetical 'Li2014a'.
kept_after_title_review = ['Golagha2017', 'Li2014', 'Zhang2013a', 'Gopinath2012']
ps[KEYWORD_MATCH_MANUAL] = ps[IDENTIFIER].isin(kept_after_title_review)
# Manual inspection of marked papers (x -> removed, / -> will be kept)
# x - Degott2019
# x - Vancsics2019
# x - Soto2019
# x - Vancsics2019a
# x - Wang2019
# x - Laemmel2018
# / - Golagha2017
# x - Clapp2015
# x - Smith2015
# / - Li2014
# / - Zhang2013a
# x - Shahriar2012
# / - Gopinath2012
# x - Polikarpova2009
## TODO Why are those papers removed?
ps[KEYWORD_MATCH] = ps[KEYWORD_MATCH_AUTO] | ps[KEYWORD_MATCH_MANUAL]

# Counts for the report; the assertions pin the numbers of the recorded run.
rows_before = int(ps[USE].sum())
rows_auto = int(ps[KEYWORD_MATCH_AUTO].sum())
rows_manual = int(ps[KEYWORD_MATCH_MANUAL].sum())
rows_after = int(ps[KEYWORD_MATCH].sum())
removed = rows_before - rows_auto
assert rows_before == 116, rows_before
assert rows_auto == 102, rows_auto
assert rows_manual == 4, rows_manual
assert rows_after == 106, rows_after
assert removed == 14, removed

print(f'Marked {str(removed)} studies, because their title didn\'t match any keyword ({rows_auto}/{rows_before})')
print(f'{rows_manual} of those {str(removed)} studies met the requirements through manual analysis ({rows_after}/{rows_before})')

# Only studies that passed the title screening stay selected.
ps[USE] = ps[USE] & ps[KEYWORD_MATCH]

## Check Abstract for Keywords

In [None]:
# Automatic screening: an abstract passes when it contains 'test suite'
# (case-insensitive). na=False turns a missing abstract into "no match", and
# the AND with USE keeps already excluded rows unflagged, so the column is a
# plain boolean column without NaN.
# NOTE(review): '.' does not match a newline, so an abstract with line breaks
# only matches when 'test suite' occurs on its first line - confirm that the
# abstracts are single-line.
abstract_hit = ps[ABSTRACT].str.match(pat='.*(test suite).*', case=False, na=False)
ps[ABSTRACT_MATCH_AUTO] = abstract_hit & ps[USE]

# Manual screening: studies whose abstract did not match but which are kept
# after inspection. isin() compares whole identifiers; the previous
# str.match() was only anchored at the start, so e.g. 'Wang2020' would also
# have accepted a hypothetical 'Wang2020a'.
kept_after_abstract_review = [
    'Magalhaes2020', 'GomezAbajo2020', 'Wang2020', 'Wong2020', 'Bertolino2019',
    'Chen2019', 'EscobarVelasquez2019', 'Gergely2019', 'Minhas2018', 'Gergely2018',
    'Groce2018', 'Yi2018', 'Bowes2017', 'Felbinger2017', 'Fellner2017',
    'Golagha2017', 'Kazmi2017', 'Magalhaes2017', 'Giannakopoulou2014', 'Mirzaaghaei2014',
    'Jehan2013', 'Schuler2013', 'Zhang2013a', 'Selim2012', 'Gopinath2012',
    'Dobolyi2010', 'Halfond2009', 'Fraser2007', 'Xie2006', 'Bradbury2005a',
]
ps[ABSTRACT_MATCH_MANUAL] = ps[IDENTIFIER].isin(kept_after_abstract_review)
# Manual inspection of the marked studies (x -> removed, / -> will be kept)
# / - Magalhaes2020
# / - GomezAbajo2020
# x - Krotkov2020
# / - Wang2020
# / - Wong2020
# x - Ghanbari2019
# / - Bertolino2019
# / - Chen2019
# / - EscobarVelasquez2019
# / - Gergely2019
# / - Minhas2018
# / - Gergely2018
# / - Groce2018
# / - Yi2018
# / - Bowes2017
# / - Felbinger2017
# / - Fellner2017
# / - Golagha2017
# / - Kazmi2017
# / - Magalhaes2017
# / - Giannakopoulou2014
# / - Mirzaaghaei2014
# / - Jehan2013
# / - Schuler2013
# / - Zhang2013a
# / - Selim2012
# / - Gopinath2012
# x - Schuler2011 (duplicate)
# / - Dobolyi2010
# / - Halfond2009
# / - Fraser2007
# / - Xie2006
# / - Bradbury2005a
## TODO Why are those papers removed?
ps[ABSTRACT_MATCH] = ps[ABSTRACT_MATCH_AUTO] | ps[ABSTRACT_MATCH_MANUAL]

# Counts for the report; the assertions pin the numbers of the recorded run.
rows_before = int(ps[USE].sum())
rows_auto = int(ps[ABSTRACT_MATCH_AUTO].sum())
rows_manual = int(ps[ABSTRACT_MATCH_MANUAL].sum())
rows_after = int(ps[ABSTRACT_MATCH].sum())
removed = rows_before - rows_auto
assert rows_before == 106, rows_before
assert rows_auto == 73, rows_auto
assert rows_manual == 30, rows_manual
assert rows_after == 103, rows_after
assert removed == 33, removed

print(f'Marked {str(removed)} studies, because their abstract didn\'t match any keyword ({rows_auto}/{rows_before})')
print(f'{rows_manual} of those {str(removed)} studies met the requirements through manual analysis ({rows_after}/{rows_before})')

# Only studies that passed the abstract screening stay selected.
ps[USE] = ps[USE] & ps[ABSTRACT_MATCH]

## Accumulate

In [None]:
# Keep only the studies still flagged for use and reduce the frame to the
# bibliographic columns; the helper columns of the screening are dropped.
ps = ps.loc[ps[USE] == True, [IDENTIFIER, TITLE, AUTHOR, YEAR, ABSTRACT, DOI]]

rows_used = len(ps)
assert rows_used == 103, rows_used
print(f'{rows_used} papers remain in the study')

## Export Auto Filtered Studies

In [None]:
# Write the columns needed for the manual review to disk.
ps_auto = ps[[IDENTIFIER, TITLE, ABSTRACT]]
ps_auto.to_csv('filtered_studies_auto.csv', header=True, index=False)

## Import Manual Filtered Studies

In [None]:
# Read back the manually reviewed list.
ps_manual = pd.read_csv('filtered_studies_manual.csv')

# Every study needs a decision: no empty cell is allowed in the USE column.
undef = ps_manual[USE].isna().sum()
assert undef == 0, undef

print(ps_manual.columns)

## Check for Compatibility

In [None]:
# The exported and the manually reviewed list must contain exactly the same
# identifiers; any identifier present in only one of them is reported.
auto_ids = set(ps_auto[IDENTIFIER].tolist())
manual_ids = set(ps_manual[IDENTIFIER].tolist())

difference2 = manual_ids - auto_ids
difference = auto_ids ^ manual_ids  # symmetric difference: missing on either side

assert len(difference) == 0, difference

## Summary

In [None]:
# Compare the number of studies handed to the manual review with the number
# the review marked for use.
rows_before = ps_auto.shape[0]
rows_after = len(ps_manual.loc[ps_manual[USE] == True])

assert rows_after == 73, rows_after

print(f'After manual filtering, {rows_after} studies remain ({rows_after}/{rows_before})')

## Export Manual Filtered Studies

In [None]:
# Studies the manual review marked for use, reduced to the merge keys.
selected = ps_manual.loc[ps_manual[USE] == True, [IDENTIFIER, TITLE]]

# Bring the DOI back in from the automatically filtered frame.
ps_final = pd.merge(selected, ps, how="inner", on=[IDENTIFIER, TITLE])

# An inner merge silently drops every row whose identifier/title pair has no
# exact counterpart in `ps` (e.g. a title edited during the manual review).
# Fail loudly instead of exporting an incomplete list.
assert len(ps_final) == len(selected), (len(ps_final), len(selected))

ps_final = ps_final.loc[:, [IDENTIFIER, TITLE, DOI]]
ps_final.to_csv('filtered_studies_final.csv', index = False, header=True)