# Tokenize and anonymize code samples

In [1]:
import re
import subprocess

import fastparquet
import pandas as pd
from tqdm import tqdm
#from pandas import Panel

## Load function definitions with their `clang -dump-tokens` output

In [3]:
# Read the parquet file that holds the code samples together with their
# dump_tokens output, then show its dimensions and the first rows.
df = pd.read_parquet(
    r'/mnt/md0/user/scheuererra68323/JTT/JTT_DumpTokens_wExtFuncCalls.parquet'
)
print(df.shape)  # (number of rows, number of columns)
df.head()

(25000, 6)


Unnamed: 0_level_0,path,line_start,line_stop,code_snippet,external_function_names,dump_tokens_output
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
83452,/mnt/md0/user/scheuererra68323/testset_jtt/C/t...,22,39,"[unsigned int data;, data = 0;, if(glo...","[printUnsignedLine, globalReturnsTrue]",unsigned 'unsigned'\t [StartOfLine]\tLoc=<<std...
248990,/mnt/md0/user/scheuererra68323/testset_jtt/C/t...,199,203,"[goodG2B();, goodB2G();]",[],identifier 'goodG2B'\t [StartOfLine]\tLoc=<<st...
220067,/mnt/md0/user/scheuererra68323/testset_jtt/C/t...,56,67,"[int64_t * data;, /* Initialize data*/, ...",[CWE762_Mismatched_Memory_Management_Routines_...,identifier 'int64_t'\t [StartOfLine]\tLoc=<<st...
283612,/mnt/md0/user/scheuererra68323/testset_jtt/C/t...,52,57,"[printIntLine(data->intOne);, /* POTENTIAL...",[],identifier 'printIntLine'\t [StartOfLine]\tLoc...
142227,/mnt/md0/user/scheuererra68323/testset_jtt/C/t...,46,52,"[if(GLOBAL_CONST_FIVE==5), {, sign...",[signal],if 'if'\t [StartOfLine]\tLoc=<<stdin>:1:1>\nl_...


## Parse output of clang -dump-tokens

In [4]:
def handle_identifiers(row, external_function_names):
    """Pick the anonymized representation of a single token.

    Parameters
    ----------
    row
        Record exposing ``token_text`` and ``token_type`` as attributes and,
        for identifier tokens, an ``identifier_anon`` entry.
    external_function_names
        Container of names that must be kept verbatim; it is queried with
        the ``in`` operator.

    Returns
    -------
    The token text itself when it is one of ``external_function_names``;
    ``row['identifier_anon']`` for any other identifier; a fixed
    ``<token_type>`` placeholder for string, wide-string, UTF-16 string,
    character and numeric literals; the unchanged token text otherwise.
    """
    # Literal token types are collapsed into one placeholder per type.
    literal_placeholders = {
        'string_literal': '<string_literal>',
        'wide_string_literal': '<wide_string_literal>',
        'utf16_string_literal': '<utf16_string_literal>',
        'char_constant': '<char_constant>',
        'numeric_constant': '<numeric_constant>',
    }

    # External function names win over every other rule.
    if row.token_text in external_function_names:
        return row.token_text

    kind = row.token_type
    if kind == 'identifier':
        return row['identifier_anon']

    placeholder = literal_placeholders.get(kind)
    if placeholder is not None:
        return placeholder

    # Keywords, punctuation and everything else pass through untouched.
    return row.token_text

def parse_dump_tokens(row):
    """Convert one sample's dump_tokens output into a list of anonymized tokens.

    Every line of ``row.dump_tokens_output`` is split into a token type (the
    first word on the line) and a token text (the quoted part that follows a
    word). Lines without a quoted token text and lines that look like
    command diagnostics are discarded. Each distinct identifier is numbered
    in order of first appearance (``identifier0``, ``identifier1``, ...), and
    ``handle_identifiers`` then decides what is emitted for every token.

    Parameters
    ----------
    row
        Record with a newline-separated ``dump_tokens_output`` string and an
        ``external_function_names`` container that is forwarded to
        ``handle_identifiers``.

    Returns
    -------
    list or None
        The anonymized tokens in their original order, or ``None`` when no
        usable token line remains.
    """
    tokens = pd.DataFrame(row.dump_tokens_output.split('\n'), columns=['dump_tokens_output'])

    # Raw strings keep the regex escapes intact; expand=False yields a Series.
    raw_lines = tokens['dump_tokens_output']
    tokens['token_type'] = raw_lines.str.extract(r'(\w+)', expand=False)
    tokens['token_text'] = raw_lines.str.extract(r"\w+ '(.+)'", expand=False)

    # remove eof, empty lines and error messages from command output
    # (the pattern requires at least one character between the quotes, so a
    # non-null token_text is never empty)
    tokens = tokens[tokens['token_text'].notnull()]
    if tokens.empty:
        return None

    # Lines starting with "<stdin>", "#", or optional whitespace followed by
    # "~" or "^" are treated as diagnostics rather than tokens.
    diagnostic_pattern = r"<stdin>|#|\s*~|\s*\^"
    is_diagnostic = tokens['dump_tokens_output'].str.match(diagnostic_pattern)
    tokens = tokens[~is_diagnostic].copy()
    if tokens.empty:
        return None

    # create a lookup table for identifier anonymization
    identifier_names = tokens.loc[tokens['token_type'] == 'identifier', 'token_text'].unique()
    anon_by_name = {name: 'identifier' + str(position)
                    for position, name in enumerate(identifier_names)}
    # Token texts that are not identifiers simply get NaN here.
    tokens['identifier_anon'] = tokens['token_text'].map(anon_by_name)

    token_anon = tokens.apply(handle_identifiers, axis='columns',
                              args=(row.external_function_names,))

    return list(token_anon)


# Register tqdm with pandas so that progress_apply shows a progress bar.
tqdm.pandas(desc="Get Function Calls")

# One anonymized token list per sample; parse_dump_tokens returns None for
# samples without usable tokens, and those rows are dropped afterwards.
df['token_anon'] = df.progress_apply(parse_dump_tokens, axis='columns')
df = df.loc[df['token_anon'].notna()]

  from pandas import Panel
Get Function Calls: 100%|██████████| 25000/25000 [04:56<00:00, 84.33it/s]


In [5]:
print(df.shape)
# Leave the frame as the cell's last expression so the notebook renders it as
# a table (same pattern as the load cell) instead of a wrapped text dump.
df.head()

(24999, 7)
                                                     path  line_start  \
index                                                                   
83452   /mnt/md0/user/scheuererra68323/testset_jtt/C/t...          22   
248990  /mnt/md0/user/scheuererra68323/testset_jtt/C/t...         199   
220067  /mnt/md0/user/scheuererra68323/testset_jtt/C/t...          56   
283612  /mnt/md0/user/scheuererra68323/testset_jtt/C/t...          52   
142227  /mnt/md0/user/scheuererra68323/testset_jtt/C/t...          46   

        line_stop                                       code_snippet  \
index                                                                  
83452          39  [unsigned int data;,     data = 0;,     if(glo...   
248990        203                       [goodG2B();,     goodB2G();]   
220067         67  [int64_t * data;,     /* Initialize data*/,   ...   
283612         57  [printIntLine(data->intOne);,     /* POTENTIAL...   
142227         52  [if(GLOBAL_CONST_FIVE==5),

In [6]:
#df2 = df.copy()#.sample(n=2000, random_state=0)
#unique_token_anon = pd.DataFrame(df2.token_anon.sum())[0].unique()
#print(len(unique_token_anon),"distinct tokens found")
#print(unique_token_anon)

## Save output

In [6]:
# Write the anonymized samples to disk. fastparquet is imported with the other
# dependencies at the top of the notebook, so a missing package is reported
# before the long tokenization step instead of after it.
fastparquet.write(
    "/mnt/md0/user/scheuererra68323/JTT/JTT_TokenAnon_wExtFuncCalls.parquet",
    df,
    row_group_offsets=1000,
    compression="gzip",
)

## Test: Show token_anon

In [None]:
#print(len(unique_token_anon), "unique tokens found")
#print(unique_token_anon)