# Precomputing dataset splits
This notebook shows how the dataset splits have been computed beforehand, under different data split approaches and for different dataset versions.
The dataset versions handled are:
* v1: symbols dataset with only 10 classes by topic (network, cryptography, disk, etc.)
* v2: symbols dataset with >100 classes by topic and task (network_send, cryptography_encrypt, network_config,...)
* v3: symbols dataset with a selection of 24 classes containing both topic and task 

The data split approaches are:
* 1) split the dataset as is, without modifying the class imbalance
* 2) remove minority classes, i.e. classes with fewer samples than a minimum threshold
* 3) undersample majority classes, i.e. classes with more samples than a threshold are undersampled


In [2]:
%load_ext autoreload
%autoreload 2
from TFM_function_renaming_baseline_models import *
from TFM_function_renaming_preprocess_dataset_splits import *
from TFM_function_renaming_nlp_models import *

## Inspecting class imbalances

In [2]:
# Inspect dataset v1 (10 topic classes): per the output below, this reports the
# sample count, class/feature counts and the number of samples per class.
inspect_dataset('./tmp/symbols_dataset_1')

Inspecting ./tmp/symbols_dataset_1
processed_path tmp/symbols_dataset_1/processed
num samples: 30885
num classes: 1
num features: 4
 Samples per class:
{'0_': 27,
 '1_memory': 1248,
 '2_disk': 1710,
 '3_users': 697,
 '4_cryptography': 4580,
 '5_process': 1378,
 '6_datastruct': 1093,
 '7_gui': 5350,
 '8_network': 14653,
 '9_computation': 149}


In [3]:
# Inspect dataset v2 (topic + task classes): the per-class counts printed below
# show many classes with only a handful of samples.
inspect_dataset('./tmp/symbols_dataset_2')

Inspecting ./tmp/symbols_dataset_2
processed_path tmp/symbols_dataset_2/processed
num samples: 30885
num classes: 1
num features: 4
 Samples per class:
{'0_': 27,
 '100_users_match': 2,
 '101_computation_delete': 2,
 '102_users_work': 48,
 '103_computation_work': 19,
 '104_computation_config': 78,
 '105_network_encrypt': 74,
 '106_network_compute': 789,
 '107_disk_compute': 21,
 '108_datastruct_compute': 41,
 '109_disk_delete': 47,
 '10_memory_start': 1,
 '110_memory_stop': 4,
 '111_memory_hide': 2,
 '112_memory_set': 5,
 '113_gui_work': 4347,
 '114_gui_sync': 5,
 '115_cryptography_decrypt': 2,
 '116_memory_parse': 2,
 '117_memory_write': 102,
 '118_users_write': 4,
 '119_gui_read': 2,
 '11_memory_save': 1,
 '120_datastruct_delete': 24,
 '121_cryptography_show': 1,
 '122_users_compute': 26,
 '123_users_data': 27,
 '124_users_save': 2,
 '125_cryptography_start': 5,
 '126_disk_show': 1,
 '127_network_delete': 130,
 '128_network_parse': 93,
 '129_cryptography_config': 2484,
 '12_gui_write

In [5]:
# Inspect dataset v3 (selection of 24 topic/task classes) and print its
# per-class sample counts.
inspect_dataset('./tmp/symbols_dataset_3')

Inspecting ./tmp/symbols_dataset_3
processed_path tmp/symbols_dataset_3/processed
num samples: 30885
num classes: 1
num features: 4
 Samples per class:
{'0_': 3521,
 '10_process': 344,
 '11_disk_read': 187,
 '12_network_send': 1191,
 '13_memory_write': 1017,
 '14_cryptography': 522,
 '15_network_config': 2009,
 '16_network_parse': 1444,
 '17_network': 46,
 '18_users': 99,
 '19_disk_file': 674,
 '1_cryptography_encrypt': 2370,
 '20_disk': 12,
 '21_memory_read': 262,
 '22_cryptography_config': 2883,
 '23_disk_write': 314,
 '2_computation': 1804,
 '3_gui_config': 673,
 '4_process_sync': 225,
 '5_process_config': 2178,
 '6_datastruct': 3147,
 '7_gui': 4637,
 '8_memory': 403,
 '9_memory_config': 923}


## Precomputing dataset splits

In [12]:
%load_ext autoreload
%autoreload 2
from TFM_function_renaming_baseline_models import *
from TFM_function_renaming_preprocess_dataset_splits import *
from TFM_function_renaming_nlp_models import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Approach 1: precompute the splits without touching the class distribution.
# Per the log below, the output is written under
# tmp/symbols_dataset_1_precomp_split_unchanged/ (log is truncated, so other
# dataset versions are presumably handled too -- TODO confirm).
precompute_dataset_splits_unchanged_classes()

processed_path tmp/symbols_dataset_1/processed
30885
1
4
{0: 27,
 1: 1248,
 2: 1710,
 3: 697,
 4: 4580,
 5: 1378,
 6: 1093,
 7: 5350,
 8: 14653,
 9: 149}
max class count set to  14653
{0: 27,
 1: 1248,
 2: 1710,
 3: 697,
 4: 4580,
 5: 1378,
 6: 1093,
 7: 5350,
 8: 14653,
 9: 149}
Root folder verification:  tmp/symbols_dataset_1
Destination:  symbols_dataset_1_precomp_split_unchanged/training_set
INside save_partial_dataset_symlink: tmp/symbols_dataset_1 /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp/symbols_dataset_1/processed /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp/symbols_dataset_1_precomp_split_unchanged/training_set/processed
INside save_partial_dataset_symlink: tmp/symbols_dataset_1 /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp/symbols_dataset_1/processed /media/disk/home/

In [14]:
# Approach 2: precompute the splits after removing classes below a minimum
# sample threshold (output goes to *_precomp_split_remove_min/ per the log).
# NOTE(review): for symbols_dataset_1 the class counts logged before and after
# are identical (class 0 with 27 samples is still present) -- confirm that the
# threshold is simply not reached for this dataset version.
precompute_dataset_splits_remove_min_classes()

processed_path tmp/symbols_dataset_1/processed
30885
1
4
{0: 27,
 1: 1248,
 2: 1710,
 3: 697,
 4: 4580,
 5: 1378,
 6: 1093,
 7: 5350,
 8: 14653,
 9: 149}
max class count set to  14653
{0: 27,
 1: 1248,
 2: 1710,
 3: 697,
 4: 4580,
 5: 1378,
 6: 1093,
 7: 5350,
 8: 14653,
 9: 149}
Root folder verification:  tmp/symbols_dataset_1
Destination:  symbols_dataset_1_precomp_split_remove_min/training_set
INside save_partial_dataset_symlink: tmp/symbols_dataset_1 /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp/symbols_dataset_1/processed /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp/symbols_dataset_1_precomp_split_remove_min/training_set/processed
INside save_partial_dataset_symlink: tmp/symbols_dataset_1 /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp/symbols_dataset_1/processed /media/disk/hom

In [15]:
# Approach 3: precompute the splits after undersampling the majority classes
# (output goes to *_precomp_split_undersample_max/ per the log).
# NOTE(review): the log reports "max class count set to 4580" while the
# undersampled classes 7 and 8 end up with 4581 samples -- possible off-by-one
# in the helper; confirm.
precompute_dataset_splits_undersample_max_classes()

processed_path tmp/symbols_dataset_1/processed
30885
1
4
{0: 27,
 1: 1248,
 2: 1710,
 3: 697,
 4: 4580,
 5: 1378,
 6: 1093,
 7: 5350,
 8: 14653,
 9: 149}
 classes to undersample  8
 classes to undersample  7
max class count set to  4580
{0: 27,
 1: 1248,
 2: 1710,
 3: 697,
 4: 4580,
 5: 1378,
 6: 1093,
 7: 4581,
 8: 4581,
 9: 149}
Root folder verification:  tmp/symbols_dataset_1
Destination:  symbols_dataset_1_precomp_split_undersample_max/training_set
INside save_partial_dataset_symlink: tmp/symbols_dataset_1 /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp/symbols_dataset_1/processed /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp/symbols_dataset_1_precomp_split_undersample_max/training_set/processed
INside save_partial_dataset_symlink: tmp/symbols_dataset_1 /media/disk/home/pau/Projectes/GNN-MThesis/src/function_renaming/tmp /media/disk/home/pau/Projectes/GNN-MThesis/src/fun

In [5]:
# Per-class sample counts of symbols_dataset_1 as logged by the remove-min run.
proportions_removemin = {
    0: 27,
    1: 1248,
    2: 1710,
    3: 697,
    4: 4580,
    5: 1378,
    6: 1093,
    7: 5350,
    8: 14653,
    9: 149,
}

# Overall number of samples across all classes.
total = sum(proportions_removemin.values())
print("total ", total)

# Share of the dataset that each class represents.
for class_id, n_samples in proportions_removemin.items():
    print("proportion class ", class_id, " = ", str(n_samples / total))

total  30885
proportion class  0  =  0.0008742107819329772
proportion class  1  =  0.040407965031568724
proportion class  2  =  0.055366682855755224
proportion class  3  =  0.02256758944471426
proportion class  4  =  0.14829205115751984
proportion class  5  =  0.044617128055690465
proportion class  6  =  0.035389347579731265
proportion class  7  =  0.173223247531164
proportion class  8  =  0.47443742917273757
proportion class  9  =  0.004824348389185689


In [4]:
# Per-class sample counts of symbols_dataset_1 as logged by the unchanged run.
proportions_unchanged = {
    0: 27,
    1: 1248,
    2: 1710,
    3: 697,
    4: 4580,
    5: 1378,
    6: 1093,
    7: 5350,
    8: 14653,
    9: 149,
}

# Overall number of samples across all classes.
total = sum(proportions_unchanged.values())
print("total ", total)

# Share of the dataset that each class represents.
for class_id, n_samples in proportions_unchanged.items():
    print("proportion class ", class_id, " = ", str(n_samples / total))

total  30885
proportion class  0  =  0.0008742107819329772
proportion class  1  =  0.040407965031568724
proportion class  2  =  0.055366682855755224
proportion class  3  =  0.02256758944471426
proportion class  4  =  0.14829205115751984
proportion class  5  =  0.044617128055690465
proportion class  6  =  0.035389347579731265
proportion class  7  =  0.173223247531164
proportion class  8  =  0.47443742917273757
proportion class  9  =  0.004824348389185689


In [2]:
# Per-class sample counts of symbols_dataset_1 as logged by the undersample run
# (classes 7 and 8 were reduced to 4581 samples each).
proportions_undersample = {
    0: 27,
    1: 1248,
    2: 1710,
    3: 697,
    4: 4580,
    5: 1378,
    6: 1093,
    7: 4581,
    8: 4581,
    9: 149,
}

# Overall number of samples across all classes.
total = sum(proportions_undersample.values())
print("total ", total)

# Share of the dataset that each class represents.
for class_id, n_samples in proportions_undersample.items():
    print("proportion class ", class_id, " = ", str(n_samples / total))

total  20044
proportion class  0  =  0.0013470365196567552
proportion class  1  =  0.06226302135302335
proportion class  2  =  0.0853123129115945
proportion class  3  =  0.03477349830373179
proportion class  4  =  0.2284973059269607
proportion class  5  =  0.06874875274396328
proportion class  6  =  0.0545300339253642
proportion class  7  =  0.22854719616842945
proportion class  8  =  0.22854719616842945
proportion class  9  =  0.007433645978846538


## Example training
Now that the training code has been adapted, models can be trained a bit faster because the dataset has already been split.

In [2]:
# Load the precomputed train/test split of dataset v3 (undersample-max variant).
# The returned names are reused as kernel globals by the training cells below.
X_train, X_test, y_train, y_test, nclasses = load_dataset_split(
    'tmp/symbols_dataset_3_precomp_split_undersample_max')




In [5]:
# Quick baseline run on the precomputed split, using only the topological
# features. Relies on X_train/X_test/y_train/y_test/nclasses from the
# load_dataset_split cell.
features='x_topo_feats'
dataset_version='v3'
# Presumably the JSON file where results are accumulated -- TODO confirm
# against baseline_training_and_testing.
fileversion='results/precomputed_datasplits_test.json'
baseline_training_and_testing(
    X_train, X_test, 
    y_train, y_test,
    features,
    dataset_version,
    nclasses,
    fileversion,
    baseline_models=prepare_models_quick(),
    baseline_nn_models=prepare_nn_models_quick())


Training  LogisticRegression
GridseachCV for  f1_micro


  'precision', 'predicted', average, warn_for)


Training  RandomForestClassifier
GridseachCV for  f1_micro


  'precision', 'predicted', average, warn_for)



nn_train_models, nclasses= 24
n_X_cols for the nn:  10  and X shape  (18156, 10)
params_set
{'d1': [1], 'd2': [3], 'num_epochs': [2]}


In [9]:
# Quick NLP-model run (document + topological features) on the same
# precomputed split; results are saved to the same JSON file as the baseline
# run (see "Before save results" in the output).
# NOTE(review): this cell reuses X_train/X_test/y_train/y_test/nclasses from an
# earlier cell and the execution counts in this notebook are out of order --
# verify it still works after Restart & Run All.
features='document and topo feats'
dataset_version='v3'
fileversion='results/precomputed_datasplits_test.json'

nlp_models_training_and_testing(
    X_train, X_test, 
    y_train, y_test,
    features,
    dataset_version,
    nclasses,
    fileversion,
    nlp_models=prepare_models_quick(),
    nn_models=prepare_nn_models_quick())




cv_train_nn_nlp_models, nclasses= 24
Before unrolling:
{'mlp1': {'model': <class 'TFM_function_renaming_baseline_models.mlp1'>,
          'params_set': [{'d1': [1], 'd2': [3], 'num_epochs': [2]}]}}
'mlp1'
{'model': <class 'TFM_function_renaming_baseline_models.mlp1'>,
 'params_set': [{'d1': [1], 'd2': [3], 'num_epochs': [2]}]}
{'model': <class 'TFM_function_renaming_baseline_models.mlp1'>,
 'params_set': [{'d1': [1],
                 'd2': [3],
                 'model_class': [<class 'TFM_function_renaming_baseline_models.mlp1'>],
                 'num_classes': [24],
                 'num_epochs': [2],
                 'preprocessor__tfidf__tvec__max_df': [0.8],
                 'preprocessor__tfidf__tvec__max_features': [100],
                 'preprocessor__tfidf__tvec__min_df': [0.1],
                 'preprocessor__tfidf__tvec__ngram_range': [(2, 3)]}]}
[{'d1': 1,
  'd2': 3,
  'model_class': <class 'TFM_function_renaming_baseline_models.mlp1'>,
  'num_classes': 24,
  'num_epochs'

  'precision', 'predicted', average, warn_for)





Before save results, fileversion= results/precomputed_datasplits_test.json 



{'f1_micro': {'0': {'f1-score': 0.5450199203187251,
                    'precision': 0.45178335535006603,
                    'recall': 0.6867469879518072,
                    'support': 498},
              '1': {'f1-score': 0.3352490421455939,
                    'precision': 0.32169117647058826,
                    'recall': 0.35,
                    'support': 500},
              '10': {'f1-score': 0.16260162601626016,
                     'precision': 0.3333333333333333,
                     'recall': 0.10752688172043011,
                     'support': 93},
              '11': {'f1-score': 0.0,
                     'precision': 0.0,
                     'recall': 0.0,
                     'support': 50},
              '12': {'f1-score': 0.15267175572519084,
                     'precision': 0.28846153846153844,
                     'recall': 0.10380622837370242,
                     'support': 289},


  'precision', 'predicted', average, warn_for)





Before save results, fileversion= results/precomputed_datasplits_test.json 



{'f1_micro': {'0': {'f1-score': 0.565947242206235,
                    'precision': 0.4701195219123506,
                    'recall': 0.7108433734939759,
                    'support': 498},
              '1': {'f1-score': 0.4007123775601068,
                    'precision': 0.3611556982343499,
                    'recall': 0.45,
                    'support': 500},
              '10': {'f1-score': 0.0392156862745098,
                     'precision': 0.2222222222222222,
                     'recall': 0.021505376344086023,
                     'support': 93},
              '11': {'f1-score': 0.0,
                     'precision': 0.0,
                     'recall': 0.0,
                     'support': 50},
              '12': {'f1-score': 0.12804878048780488,
                     'precision': 0.5384615384615384,
                     'recall': 0.0726643598615917,
                     'support': 289},
     

In [3]:
# Summarise the results stored in the JSON file; the output below shows one
# row per model. The meaning of the two empty-string arguments is not visible
# here -- TODO confirm against print_training_stats.
print_training_stats('','','results/precomputed_datasplits_test.json')

Unnamed: 0,model,parameters,data features,optimized score,avg score in cv,micro-precision,micro-recall,micro-f1,support
0,RandomForestClassifier,clf__max_depth:8__clf__n_estimators:16__prepro...,document and topo feats,f1_micro2019-09-06_19_42_16,0.345506,0.144338,0.19098,0.15097,6053
1,LogisticRegression,clf__C:1__clf__max_iter:100__clf__multi_class:...,document and topo feats,f1_micro2019-09-06_16_34_11,0.322703,0.059302,0.0943334,0.0404312,6053
2,mlp1,d1:110__d2:3__num_classes:24,document and topo feats,f1_micro2019-09-06_17_34_59,0.175615,"[, ]","[, ]","[, ]","[, ]"
3,DecisionTreeClassifier,max_depth:4,x_topo_feats,f1_micro2019-09-06_14_51_12,0.159286,0.0828513,0.156782,0.0988086,6053


In [4]:
# List the runs stored in the JSON file; unlike print_training_stats, the
# output below contains several rows per model (one per timestamped run).
# The meaning of the two empty-string arguments is not visible here -- TODO confirm.
print_all_training_stats('','','results/precomputed_datasplits_test.json')

Unnamed: 0,model,parameters,data features,optimized score,avg score in cv,micro-precision,micro-recall,micro-f1,support
0,LogisticRegression,C:10__max_iter:200__multi_class:ovr__penalty:l...,x_topo_feats,f1_micro2019-09-06_14_51_11,0.093413,0.059302,0.0943334,0.0404312,6053.0
1,LogisticRegression,C:1__max_iter:100__multi_class:ovr__penalty:l2...,x_topo_feats,f1_micro2019-09-06_14_52_53,0.089722,0.0190616,0.0771518,0.0198481,6053.0
2,LogisticRegression,C:1__max_iter:100__multi_class:ovr__penalty:l2...,x_topo_feats,f1_micro2019-09-06_14_56_17,0.089722,0.0190616,0.0771518,0.0198481,6053.0
3,LogisticRegression,C:1__max_iter:100__multi_class:ovr__penalty:l2...,x_topo_feats,f1_micro2019-09-06_14_59_47,0.089722,0.0190616,0.0771518,0.0198481,6053.0
4,LogisticRegression,clf__C:1__clf__max_iter:100__clf__multi_class:...,document and topo feats,f1_micro2019-09-06_16_34_11,0.322703,0.296631,0.330415,0.293853,6053.0
5,LogisticRegression,clf__C:1__clf__max_iter:100__clf__multi_class:...,document and topo feats,f1_micro2019-09-06_17_20_29,0.322703,0.296631,0.330415,0.293853,6053.0
6,LogisticRegression,clf__C:1__clf__max_iter:100__clf__multi_class:...,document and topo feats,f1_micro2019-09-06_18_57_36,0.322703,0.296631,0.330415,0.293853,6053.0
7,LogisticRegression,clf__C:1__clf__max_iter:100__clf__multi_class:...,document and topo feats,f1_micro2019-09-06_19_42_16,0.322703,0.296631,0.330415,0.293853,6053.0
8,DecisionTreeClassifier,max_depth:4,x_topo_feats,f1_micro2019-09-06_14_51_12,0.159286,0.0828513,0.156782,0.0988086,6053.0
9,RandomForestClassifier,max_depth:8__n_estimators:16,x_topo_feats,f1_micro2019-09-06_14_52_55,0.18688,0.144338,0.19098,0.15097,6053.0
