In [53]:
import pandas
import numpy
import itertools
import random
from tqdm import tqdm
from multiprocessing import Pool

In [12]:
# Lowercase Latin letters; rand_insert/rand_substitution draw random letters from this list
letters = list('qwertyuiopasdfghjklzxcvbnm')
# Probability that corrupt() applies a first typo to a given identifier
TYPO_PROBABILITY = 0.5
# Probability of each additional typo once an identifier has already been corrupted
ADD_TYPO_PROBABILITY = 0.1

def rand_bool(true_prob):
    """
    Draw a random boolean that is True with probability true_prob.

    A float in [0, 1) is sampled and compared with true_prob, so a
    true_prob <= 0 never yields True and a true_prob >= 1 always does.
    """
    draw = random.random()
    return draw < true_prob

## Collecting data

In [54]:
# Load per-identifier statistics, using the 'identifier' column as the index
id_stats = pandas.read_csv('id_stats.csv', index_col='identifier')
# Preview the first rows
id_stats.head()

Unnamed: 0_level_0,num_files,num_occ,num_repos
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1133535,4302461,636834
aa,37524,119497,24676
aaa,5039,27529,2834
aaaa,4036,10504,2288
aaaaa,214,282,111


In [58]:
# Duplicate the index into 'token_split' and 'identifier' columns
# (the same column names that are selected from id_info.csv)
id_stats['token_split'] = id_stats.index.copy()
id_stats['identifier'] = id_stats.index.copy()
id_stats.head()

Unnamed: 0_level_0,num_files,num_occ,num_repos,token_split,identifier
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,1133535,4302461,636834,a,a
aa,37524,119497,24676,aa,aa
aaa,5039,27529,2834,aaa,aaa
aaaa,4036,10504,2288,aaaa,aaaa
aaaaa,214,282,111,aaaaa,aaaaa


In [59]:
# Save the current id_stats frame to 'all_ids.pkl'
id_stats.to_pickle('all_ids.pkl')

In [4]:
# Load the identifier records and keep only the listed columns, in this order
id_info = pandas.read_csv('id_info.csv')
id_info = id_info.loc[:, ['num_files', 'num_occ', 'num_repos', 'token_split', 'identifier']]
id_info.head()

Unnamed: 0,num_files,num_occ,num_repos,token_split,identifier
0,1,3,1,set num tile,set
1,1,3,1,set num tile,num
2,1,3,1,set num tile,tile
3,1,1,1,mc fom none,mc
4,1,1,1,mc fom none,fom


In [28]:
# NOTE(review): 'common_tokens' has no .npy/.npz extension; presumably a saved NumPy array of token strings - confirm
common_tokens = numpy.load('common_tokens')
# Set form for fast membership tests (read as a global by vectorized())
tokens_set = set(common_tokens)

In [29]:
# Print the shapes of both tables and the number of common tokens
print('id_info:', id_info.shape, 'id_stats:', id_stats.shape, 'common_tokens:', len(common_tokens))

id_info: (164581428, 5) id_stats: (2930973, 3) common_tokens: 440791


## Frequencies of common tokens

In [13]:
def get_frequencies(tokens_set, id_stats, file):
    """
    Dump frequencies of tokens from tokens_set to a file.

    Parameters:
        tokens_set: collection of tokens whose frequencies are written.
        id_stats: dataframe indexed by token with a 'num_occ' column.
        file: path of the output text file (overwritten if it exists).

    Each output line has the form "<token> <num_occ>", separated by a space.

    Raises:
        KeyError: if a token from tokens_set is missing from the id_stats index.
    """
    # Only the num_occ column is needed, so iterate over that single Series
    # instead of materialising every row with iterrows().
    frequencies = {
        token: num_occ
        for token, num_occ in id_stats.num_occ.items()
        if token in tokens_set
    }
    with open(file, 'w') as f:
        for token in tokens_set:
            print(token, frequencies[token], file=f)
            
#get_frequencies(tokens_set, id_stats, 'common_frequencies.csv')

## Leaving only identifiers with vectorizable tokens

In [27]:
def vectorized(split):
    """
    Tell whether every whitespace-separated token of 'split' is in tokens_set.

    Anything that is not exactly a str (e.g. a NaN read from a CSV) gives
    False; a string without any tokens gives True.
    """
    if type(split) is not str:
        return False
    return all(token in tokens_set for token in split.split())

def leave_vectorized(id_info, filename, processes=32):
    """
    Leave rows in a dataframe whose tokens are all in the tokens_set
    and pickle the result.

    Parameters:
        id_info: dataframe with a 'token_split' column.
        filename: path of the pickle file to write the filtered rows to.
        processes: number of worker processes used for the check.

    The input dataframe is not modified.
    """
    token_split = list(id_info.token_split)

    # One boolean per row, in row order, computed in parallel
    with Pool(processes) as p:
        vectorizable = p.map(vectorized, token_split)

    # Boolean-mask indexing already builds a new frame, so no explicit copy of
    # the (potentially huge) input is needed before filtering.
    id_info[vectorizable].to_pickle(filename)
    
#leave_vectorized(id_info, 'vectorizable_ids.pkl')

## Small chunk for testing models

In [31]:
# NOTE(review): 'data.pkl' is not written by any visible cell - confirm where it comes from
data = pandas.read_pickle('data.pkl')
# Keep each row independently with probability 0.00015 (the RNG is not seeded here)
pick_indices = [rand_bool(0.00015) for i in range(len(data))]
data = data[pick_indices]

# Save the sampled subset and show its size
data.to_pickle('15k_data.pkl')
print(data.shape)
data.head()

(15580, 5)


Unnamed: 0,num_files,num_occ,num_repos,token_split,identifier
12104,1,2,1,add clear default for types,add
20926,1,4,1,clear tag button,button
35907,4,5,1,string converter property editor,string
46362,12,34,1,result numeric table id,result
65103,7,11,6,stat changed,changed


In [61]:
# Load a previously saved sample; 'supersmall_data.pkl' is not written by any visible cell
sdata = pandas.read_pickle('supersmall_data.pkl')
sdata.head()

Unnamed: 0,num_files,num_occ,num_repos,token_split,identifier
2072779,1,2,1,filter drop down selection changed,changed
2701828,2,2,1,of property atomic,of
2816527,1,1,1,rl print list node,list
4605643,1,1,1,test hashed shard key,hashed
5985604,1,1,1,ep wc convert post object to id,post


## Corrupting data

In [18]:
# Alphabet for random insertions/substitutions.
# NOTE(review): same value as the `letters` defined in the configuration cell near the top.
letters = list('qwertyuiopasdfghjklzxcvbnm')

def rand_insert(string):
    """
    Insert one random letter at a random position of a string.

    The position may be anywhere from before the first symbol to after
    the last one; an empty string becomes the single new letter.
    """
    new_letter = random.choice(letters)
    if not string:
        return new_letter

    # Slicing also covers position == len(string), i.e. appending at the end
    position = random.randrange(len(string) + 1)
    return string[:position] + new_letter + string[position:]

def rand_delete(string):
    """
    Remove one randomly chosen symbol from a string.

    An empty string is returned unchanged.
    """
    if not string:
        return string
    victim = random.randrange(len(string))
    head, tail = string[:victim], string[victim + 1:]
    return head + tail

def rand_substitution(string):
    """
    Replace one randomly chosen symbol of a string with a random letter.

    An empty string is returned unchanged.
    NOTE(review): the new letter may equal the replaced symbol, in which
    case the result is identical to the input - confirm this is intended.
    """
    if not string:
        return string
    # The position is drawn before the letter
    position = random.randrange(len(string))
    replacement = random.choice(letters)
    return ''.join((string[:position], replacement, string[position + 1:]))

def rand_typo(string):
    """
    Apply one random typo to a string: an insertion, a deletion or a
    substitution, each kind chosen with equal probability.
    """
    typo_kinds = (rand_insert, rand_delete, rand_substitution)
    return random.choice(typo_kinds)(string)

def corrupt(data_file):
    """
    Add typos to some identifiers of a pickled dataframe.

    Each identifier gets a first typo with probability TYPO_PROBABILITY;
    after that, further typos are added to the same word while a draw
    with probability ADD_TYPO_PROBABILITY keeps succeeding.

    Two columns are added: 'typo' (the possibly corrupted identifier)
    and 'corrupted' (whether typos were applied to the row). The result
    is pickled to 'c_' + data_file.
    """
    frame = pandas.read_pickle(data_file)
    typos = list(frame.identifier)
    flags = []
    for idx in tqdm(range(len(frame))):
        has_typo = rand_bool(TYPO_PROBABILITY)
        if has_typo:
            # str() guards against non-string identifiers
            mutated = rand_typo(str(typos[idx]))
            while rand_bool(ADD_TYPO_PROBABILITY):
                mutated = rand_typo(mutated)
            typos[idx] = mutated
        flags.append(has_typo)

    frame['typo'] = typos
    frame['corrupted'] = flags
    frame.to_pickle('c_' + data_file)
    
def train_test_split(data_file, test_portion=0.3):
    """
    Split data into train and test chunks without mixing rows.

    The first (1 - test_portion) share of the rows, in stored order, is
    written to 'train_' + data_file and the remaining rows to
    'test_' + data_file. The two chunks never share a row.

    Parameters:
        data_file: path of the pickled dataframe to split.
        test_portion: share of rows for the test chunk, between 0 and 1.

    Raises:
        ValueError: if test_portion is outside [0, 1].
    """
    if not 0 <= test_portion <= 1:
        raise ValueError('test_portion must be between 0 and 1')

    data = pandas.read_pickle(data_file)
    # Split by position: label-based .loc slices include both endpoints, which
    # would put the boundary row into both chunks, and they misbehave on
    # non-unique or unsorted indices.
    split_pos = int(data.shape[0] * (1 - test_portion))
    data.iloc[:split_pos].to_pickle('train_' + data_file)
    data.iloc[split_pos:].to_pickle('test_' + data_file)

In [40]:
# Writes 'c_15k_data.pkl' with the added 'typo' and 'corrupted' columns
corrupt('15k_data.pkl')
# Writes 'train_c_15k_data.pkl' and 'test_c_15k_data.pkl'
train_test_split('c_15k_data.pkl')

100%|██████████| 15580/15580 [00:00<00:00, 318319.90it/s]
