In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
def pandas_df_to_latex_table(df, 
                             indent=False, 
                             vert_lines_locs=None, 
                             wrap_in_table=False, 
                             caption_string=None, 
                             label_str=None):
    r"""Render ``df`` as a ``\footnotesize`` LaTeX ``tabular`` snippet.

    The frame is converted with ``df.to_latex(index=False)``; escaped
    underscores of the form ``name\_suffix`` are rewritten to math-mode
    subscripts (``$name_{suffix}$``).

    Parameters
    ----------
    df : object exposing ``to_latex(index=False)`` (e.g. a pandas DataFrame)
    indent : bool
        When false, the snippet is prefixed with ``\noindent``.
    vert_lines_locs : sequence of int, optional
        Positions inside the column-format string (e.g. ``llrr``) after which
        a ``|`` is inserted. ``[2, 4]`` turns ``llrr`` into ``ll|rr|``.
        Positions are expected in ascending order.
    wrap_in_table : bool
        Wrap the snippet in a ``table`` environment.
    caption_string, label_str : str, optional
        Emitted as ``\caption{...}`` / ``\label{tab:...}`` inside the table
        wrapper; each one is skipped when ``None``. Both require
        ``wrap_in_table=True``.

    Returns
    -------
    str
        The LaTeX source.

    Raises
    ------
    AssertionError
        If a caption or label is given without ``wrap_in_table``.
    """
    tabular_header_re = re.compile(r'\\begin{tabular}\s*{([^}]*)}')

    # Prefix with table stuff
    res = '' if indent else r'\noindent' + '\n'
    if wrap_in_table:
        res += r'\begin{table}[]' + '\n'
    else:
        assert (caption_string is None),"Caption String requires table wrapping"
        assert (label_str is None),"Label String requires table wrapping"

    res += r'\footnotesize' + '\n'
    table = df.to_latex(index=False)
    # Turn "Hit\_P" style column names into "$Hit_{P}$".
    tabular = re.sub(r'([a-zA-Z]+)\\_(\S+)', r'$\1_{\2}$', table)
    if vert_lines_locs:
        header = tabular_header_re.search(tabular)
        # A missing header must fall through to the message below instead of
        # failing on subscripting ``None``.
        orientations = header.group(1) if header else ''
        if orientations:
            pieces = []
            last_insert = 0
            for val in vert_lines_locs:
                pieces.append(orientations[last_insert:val] + '|')
                last_insert = val
            if last_insert < len(orientations):
                pieces.append(orientations[last_insert:])
            new_header = r'\begin{tabular}{' + ''.join(pieces) + '}'
            # A callable replacement keeps the new header literal (no
            # backslash/group-reference processing by ``re``).
            tabular = tabular_header_re.sub(lambda _match: new_header, tabular)
        else:
            print("Could not parse tabular format")
    res += tabular
    res += r'\normalsize' + '\n'
    if wrap_in_table:
        # Caption and label are optional; concatenating ``None`` would raise.
        if caption_string is not None:
            res += r'\caption{' + caption_string + '}\n'
        if label_str is not None:
            res += r'\label{tab:' + label_str + '}\n'
        res += r'\end{table}'
    return res
    
    

In [None]:
# Load the dataset from the CSV file under ./DATA into a DataFrame.
df = pd.read_csv('./DATA/SCAM2021_Data.csv')

In [None]:
# Let pandas show up to 85 columns and 85 rows before truncating displayed frames.
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Group the rows by the (Tool, Severity) pair.
severity_group = df.groupby(['Tool', 'Severity'])

In [None]:
# Keep only the two hit-count columns of each group and display their per-group sums.
tool_severity_group = severity_group[['Production Hits', 'Testcode Hits']]
tool_severity_group.sum()

In [None]:
# Side-by-side (non-stacked) bar chart of the per-(Tool, Severity) hit sums, drawn on one axes with figsize=(20, 20).
plot = tool_severity_group.sum().plot.bar(stacked=False,subplots=False, figsize=(20,20))

In [None]:
# Re-read the CSV and turn every 'ONLY PRODUCTION' marker into NaN so that
# those rows can be removed with dropna() afterwards.
df = pd.read_csv('./DATA/SCAM2021_Data.csv').replace('ONLY PRODUCTION', np.nan)

In [None]:
# Drop incomplete rows, store 'Weighted Ratio' as a float rounded to four
# decimals, and shorten the column names used in the exported tables.
df = (
    df.dropna()
    .astype({'Weighted Ratio': float})
    .round({'Weighted Ratio': 4})
    .rename(columns={'Production Hits': 'Hit_P', 'Severity': 'Sev', 'Testcode Hits': 'Hit_T', 'Weighted Ratio': 'Ratio'})
)
df

In [None]:
# Order the rows by Ratio, largest first, and display the result.
df = df.sort_values(by='Ratio', ascending=False)
df

In [None]:
# Group the rows by the 'Tool' column; per-tool rows are later fetched with get_group().
toolgroup = df.groupby('Tool')

In [None]:
# Passed to filter_for_review as `remove_less_than` (lower bound compared against Hit_T);
# also embedded in the names of the exported files.
remove_less_than_this_hits = 5

In [None]:
# Passed to filter_for_review as `remove_by_factor`: Ratio is compared against x and against 1/x;
# also embedded in the names of the exported files.
remove_factors_under_x = 3.0

In [None]:
# Passed to filter_for_review as `include_this_high`: rows whose Hit_T or Hit_P reaches this value
# always pass the filter; also embedded in the names of the exported files.
high_hit_amount = 300

In [None]:
def filter_for_review(df, remove_less_than, remove_by_factor, include_this_high):
    """Return a boolean mask selecting the rows of ``df`` worth reviewing.

    A row is selected when any of the following holds:

    * ``Hit_T >= remove_less_than`` and ``Ratio >= remove_by_factor``
    * ``Ratio <= 1 / remove_by_factor``
    * ``Hit_T`` or ``Hit_P`` is at least ``include_this_high``

    ``df`` must provide the columns ``Hit_T``, ``Hit_P`` and ``Ratio``.
    """
    test_hits = df['Hit_T']
    ratio = df['Ratio']

    # `&` binds tighter than `|`, so the hit threshold only guards the
    # high-ratio case; the low-ratio case is accepted on its own.
    # NOTE(review): confirm the low-ratio case is meant to ignore the hit threshold.
    enough_hits_and_high_ratio = (test_hits >= remove_less_than) & (ratio >= remove_by_factor)
    low_ratio = ratio <= 1.0 / remove_by_factor

    many_hits = (test_hits >= include_this_high) | (df['Hit_P'] >= include_this_high)
    return enough_hits_and_high_ratio | low_ratio | many_hits

In [None]:
# Boolean mask over the rows of df, built from the three thresholds defined above.
filter_low_hitnums = filter_for_review(df, remove_less_than_this_hits, remove_factors_under_x, high_hit_amount)

In [None]:
# Per-tool groups restricted to the five columns used for the per-tool tables.
it = toolgroup[['Warning', 'Sev', 'Hit_P', 'Hit_T','Ratio']]

In [None]:
# Rows of the 'Clang Tidy' group, sorted by Ratio in descending order.
tidy_hits = it.get_group('Clang Tidy').sort_values(by=['Ratio'], ascending=[False])

In [None]:
# The review mask was computed on the full frame, so align it with this tool's
# rows before indexing; an unaligned mask makes pandas emit the
# "Boolean Series key will be reindexed" UserWarning.
tidy_hits_filtered_lows = tidy_hits[filter_low_hitnums.reindex(tidy_hits.index, fill_value=False)]
# Show the raw LaTeX pandas generates for the filtered rows.
tidy_hits_filtered_lows.to_latex(index=False)

In [None]:
# Preview the snippet produced with the function's defaults (no vertical rules, no table wrapper).
pandas_df_to_latex_table(tidy_hits_filtered_lows)

In [None]:
# The interactive post-mortem debugger magic was removed from this cell: it
# blocks "Restart & Run All" whenever a traceback is pending. Invoke it
# manually after an exception when debugging.

In [None]:
# Export the Clang Tidy table; the thresholds are encoded in the file name.
with open(f'DATA/Clang_Tidy_Less_Than_{remove_less_than_this_hits}_{remove_factors_under_x}_{high_hit_amount}_removed.txt', 'w') as file:
    file.write(
        pandas_df_to_latex_table(
            tidy_hits_filtered_lows,
            indent=False,
            vert_lines_locs=[2, 4],  # rules after the 2nd and the 4th column
            wrap_in_table=False,
            caption_string=None,
            label_str=None,
        )
    )

In [None]:
# Rows of the 'Clang Static Analyzer' group, sorted by Ratio in descending order.
sa_hits = it.get_group('Clang Static Analyzer').sort_values(by=['Ratio'], ascending=[False])

In [None]:
# The review mask was computed on the full frame, so align it with this tool's
# rows before indexing; an unaligned mask makes pandas emit the
# "Boolean Series key will be reindexed" UserWarning.
sa_hits_filtered_lows = sa_hits[filter_low_hitnums.reindex(sa_hits.index, fill_value=False)]
sa_hits_filtered_lows

In [None]:
# Export the Clang Static Analyzer table; the thresholds are encoded in the file name.
with open(f'DATA/Clang_SA_Less_Than_{remove_less_than_this_hits}_{remove_factors_under_x}_{high_hit_amount}_removed.txt', 'w') as file:
    file.write(
        pandas_df_to_latex_table(
            sa_hits_filtered_lows,
            indent=False,
            vert_lines_locs=[2, 4],  # rules after the 2nd and the 4th column
            wrap_in_table=False,
            caption_string=None,
            label_str=None,
        )
    )

In [None]:
# Rows of the 'CppCheck' group, sorted by Ratio in descending order.
cppcheck_hits = it.get_group('CppCheck').sort_values(by=['Ratio'], ascending=[False])
# The review mask was computed on the full frame, so align it with this tool's
# rows before indexing; an unaligned mask makes pandas emit the
# "Boolean Series key will be reindexed" UserWarning.
cppcheck_hits_filtered_lows = cppcheck_hits[filter_low_hitnums.reindex(cppcheck_hits.index, fill_value=False)]

In [None]:
# Display only the Warning, Sev, Hit_P and Hit_T columns; the result is not stored.
cppcheck_hits_filtered_lows.filter(items=['Warning', 'Sev', 'Hit_P', 'Hit_T'])

In [None]:
# Export the CppCheck table; the thresholds are encoded in the file name.
with open(f'DATA/CppCheck_Less_Than_{remove_less_than_this_hits}_{remove_factors_under_x}_{high_hit_amount}_removed.txt', 'w') as file:
    file.write(
        pandas_df_to_latex_table(
            cppcheck_hits_filtered_lows,
            indent=False,
            vert_lines_locs=[2, 4],  # rules after the 2nd and the 4th column
            wrap_in_table=False,
            caption_string=None,
            label_str=None,
        )
    )