# Tokenize and anonymize code samples

In [3]:
import pandas as pd
import subprocess
import re
from tqdm import tqdm

## Load function defs with dump_tokens output

In [4]:
df = pd.read_parquet(r'./FuncDef_15_DumpTokens.parquet')
print(df.shape)
df.head(10)

  Numpy8 = numba.jitclass(spec8)(NumpyIO)
  Numpy32 = numba.jitclass(spec32)(NumpyIO)
  from pandas.core.index import CategoricalIndex, RangeIndex, Index, MultiIndex


(304846, 5)


Unnamed: 0_level_0,path,line_start,line_stop,code_snippet,dump_tokens_output
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,code_input/libreoffice-6.4.2.2/bean/native/uni...,38,51,"[jclass c;, (*env)->ExceptionClear(env);, ...",identifier 'jclass'\t [StartOfLine]\tLoc=<<std...
1,code_input/libreoffice-6.4.2.2/bean/native/win...,42,55,"[jclass c;, (*env)->ExceptionClear(env);, ...",identifier 'jclass'\t [StartOfLine]\tLoc=<<std...
2,code_input/libreoffice-6.4.2.2/cppuhelper/sour...,42,82,"[HKEY hkey;, DWORD type;, wchar_t* dat...",identifier 'HKEY'\t [StartOfLine]\tLoc=<<stdin...
3,code_input/libreoffice-6.4.2.2/cppuhelper/sour...,90,114,"[const wchar_t* UNOPATHVARNAME = L""UNO_PATH"";,...",const 'const'\t [StartOfLine]\tLoc=<<stdin>:1:...
4,code_input/libreoffice-6.4.2.2/cppuhelper/sour...,210,217,[return platformSpecific();],return 'return'\t [StartOfLine]\tLoc=<<stdin>:...
5,code_input/libreoffice-6.4.2.2/desktop/source/...,47,55,"[int ret = soffice_main();, #ifdef DBG_UTIL, #...",int 'int'\t [StartOfLine]\tLoc=<<stdin>:1:1>\n...
6,code_input/libreoffice-6.4.2.2/desktop/source/...,24,26,[return unopkg_main();],return 'return'\t [StartOfLine]\tLoc=<<stdin>:...
7,code_input/libreoffice-6.4.2.2/desktop/test/de...,23,33,"[//prevent warning about unused parameters, ...",l_paren '('\t [StartOfLine] [LeadingSpace]\tLo...
8,code_input/libreoffice-6.4.2.2/desktop/unx/sou...,16,20,[return !rtl_ustr_ascii_compare_WithLength (st...,return 'return'\t [StartOfLine]\tLoc=<<stdin>:...
9,code_input/libreoffice-6.4.2.2/desktop/unx/sou...,60,128,"[Args *args;, sal_uInt32 nArgs, i, j;, , ...",identifier 'Args'\t [StartOfLine]\tLoc=<<stdin...


## Parse output of clang -dump-tokens

In [5]:
def handle_identifiers(row):
    if row.token_type == 'identifier':
        return row['identifier_anon']
    elif row.token_type == 'string_literal':
        return '<string_literal>'
    elif row.token_type == 'wide_string_literal':
        return '<wide_string_literal>'
    elif row.token_type == 'utf16_string_literal':
        return '<utf16_string_literal>'
    elif row.token_type == 'char_constant':
        return '<char_constant>'
    elif row.token_type == 'numeric_constant':
        return '<numeric_constant>'
    else:
        #if row.token_text == 'token_text_to_investigate':
        #    print("-------")
        #    print("token_text_to_investigate -\t", row.dump_tokens_output)
        #    print("-------")
        return row['token_text']

def parse_dump_tokens(row):
    tokens = pd.DataFrame(row.dump_tokens_output.split('\n'), columns=['dump_tokens_output'])

    tokens['token_type'] = tokens['dump_tokens_output'].str.extract('(\w+)')
    tokens['token_text'] = tokens['dump_tokens_output'].str.extract(r'\w+ \'(.+)\'') #.reset_index()[0]
    
    # remove eof, empty lines and error messages from command output
    tokens = tokens[ tokens.token_text.notnull() ]
    tokens = tokens[ tokens.token_text.map(len) >= 1 ]
    
    regex = r"<stdin>|#|\s*~|\s*\^"    
    if not tokens[ tokens.dump_tokens_output.str.match( regex ) ].empty:
        tokens = tokens[ tokens.dump_tokens_output.str.match( regex ) == False ]
    
    if tokens.empty:
        return None
   
    # create a lookup table for identifier anonymization
    anonymize_identifiers = pd.DataFrame( tokens[tokens.token_type=='identifier'].token_text.unique() , columns=['token_text'])
    anonymize_identifiers['index_txt'] = anonymize_identifiers.reset_index()['index'].astype(str)
    anonymize_identifiers['identifier_anon'] = 'identifier' + anonymize_identifiers.index_txt
    anonymize_identifiers.drop(['index_txt'], axis='columns', inplace=True)
    
    tokens = tokens.merge(anonymize_identifiers, on='token_text', how='left')
    
    tokens['token_anon'] = tokens.apply(handle_identifiers, axis='columns')
    
    return  list(tokens.token_anon)


tqdm.pandas(desc="Get Function Calls")
df['token_anon'] = df.progress_apply(parse_dump_tokens, axis='columns')
df = df[df.token_anon.notnull()]

  from pandas import Panel
Get Function Calls: 100%|██████████| 304846/304846 [1:39:49<00:00, 50.90it/s]  


In [6]:
print(df.shape)
df.head()

(292655, 6)


Unnamed: 0_level_0,path,line_start,line_stop,code_snippet,dump_tokens_output,token_anon
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,code_input/libreoffice-6.4.2.2/bean/native/uni...,38,51,"[jclass c;, (*env)->ExceptionClear(env);, ...",identifier 'jclass'\t [StartOfLine]\tLoc=<<std...,"[identifier0, identifier1, ;, (, *, identifier..."
1,code_input/libreoffice-6.4.2.2/bean/native/win...,42,55,"[jclass c;, (*env)->ExceptionClear(env);, ...",identifier 'jclass'\t [StartOfLine]\tLoc=<<std...,"[identifier0, identifier1, ;, (, *, identifier..."
2,code_input/libreoffice-6.4.2.2/cppuhelper/sour...,42,82,"[HKEY hkey;, DWORD type;, wchar_t* dat...",identifier 'HKEY'\t [StartOfLine]\tLoc=<<stdin...,"[identifier0, identifier1, ;, identifier2, ide..."
3,code_input/libreoffice-6.4.2.2/cppuhelper/sour...,90,114,"[const wchar_t* UNOPATHVARNAME = L""UNO_PATH"";,...",const 'const'\t [StartOfLine]\tLoc=<<stdin>:1:...,"[const, wchar_t, *, identifier0, =, <wide_stri..."
4,code_input/libreoffice-6.4.2.2/cppuhelper/sour...,210,217,[return platformSpecific();],return 'return'\t [StartOfLine]\tLoc=<<stdin>:...,"[return, identifier0, (, ), ;]"


In [None]:
df2 = df.copy()#.sample(n=2000, random_state=0)
unique_token_anon = pd.DataFrame(df2.token_anon.sum())[0].unique()
print(len(unique_token_anon),"distinct tokens found")
print(unique_token_anon)

## Save output

In [8]:
df.to_hdf('./FuncDef_20_TokenAnon.h5', key='FuncDef_20_sample1k_TokenAnon')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['path', 'code_snippet', 'dump_tokens_output', 'token_anon'], dtype='object')]

  encoding=encoding,


OverflowError: Python int too large to convert to C long

In [7]:
import fastparquet
fastparquet.write("./FuncDef_20_TokenAnon.parquet", df, row_group_offsets=1000, compression="gzip")

## Test: Show token_anon

In [1]:
len(pd.DataFrame(token_anon.token_anon.sum())[0].unique())

NameError: name 'pd' is not defined