# Convert tokenized code samples to word2vec vectors

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import gensim 
from gensim.models import Word2Vec 


pd.set_option('display.max_columns', 15)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/scheuererra68323/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the token-anonymized, labeled function definitions (JTT parquet file).
FuncDef_TokenAnon = pd.read_parquet(
    r"/mnt/md0/user/scheuererra68323/JTT/JTT_TokenAnon_wExtFuncCalls_Labeled.parquet"
)
# Alternative input (LO_SARD102 parquet file); enable instead of the line above.
#FuncDef_TokenAnon = pd.read_parquet(r"/mnt/md0/user/scheuererra68323/LO_SARD102/LO_SARD102_TokenAnon_wExtFuncCalls_Labeled.parquet")

In [None]:
# Train on an independent copy so the loaded frame itself is not modified here.
df = FuncDef_TokenAnon.copy(deep=True)
#df.head()

## Apply word2vec

### Train the model

In [4]:
#
# Word2Vec implementation and t-SNE visualization derived from 
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
#

from gensim.test.utils import datapath
from gensim import utils

class MyCorpus(object):
    """An iterable that yields sentences (lists of str).

    Every call to ``iter()`` starts a new pass over the wrapped
    collection, so the corpus can be traversed repeatedly as long as
    the wrapped collection itself supports repeated iteration.
    """

    def __init__(self, _sentences):
        # Only a reference is stored; the sentences are not copied.
        self._sentences = _sentences

    def __iter__(self):
        # Delegate directly to the wrapped collection, one sentence at a time.
        yield from self._sentences


In [5]:
import gensim.models

# Wrap the token column in the re-iterable corpus defined above.
sentences = MyCorpus(df['token_anon'])

# min_count=1: do not discard rare tokens, so that every token of every
# sample can be looked up in the trained model afterwards.
model = gensim.models.Word2Vec(
    sentences=sentences,
    min_count=1,
)

### Retrieve vocabulary

In [6]:
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`.
# Prefer the new attribute and fall back to the old one so the cell runs on
# both gensim 3.x and 4.x.
vocabulary = getattr(model.wv, "key_to_index", None)
if vocabulary is None:
    vocabulary = model.wv.vocab

print("Vocabulary contains {} tokens".format(len(vocabulary)))
# Print only the first 20 tokens as a quick sanity check.
for i, word in enumerate(vocabulary):
    if i == 20:
        break
    print(word)

Vocabulary contains 9697 tokens
unsigned
int
identifier0
;
=
<numeric_constant>
if
(
globalReturnsTrue
)
{
identifier2
identifier3
,
<string_literal>
&
}
identifier4
+
printUnsignedLine


### Visualize the results

In [7]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling
import datetime as dt


def reduce_dimensions(model):
    """Project all word vectors of `model` to two dimensions with t-SNE.

    Parameters
    ----------
    model
        Trained word2vec model; only ``model.wv`` is accessed. Works with
        gensim 4.x (``wv.key_to_index``) as well as gensim 3.x (``wv.vocab``).

    Returns
    -------
    x_vals, y_vals : list
        First and second t-SNE coordinate of every vocabulary token.
    labels : numpy.ndarray
        The vocabulary tokens, in the same order as the coordinates.
    """
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    word_vectors = model.wv
    # gensim >= 4.0 removed `KeyedVectors.vocab`; use its replacement when it
    # exists and fall back to the old attribute on gensim 3.x.
    vocabulary = getattr(word_vectors, "key_to_index", None)
    if vocabulary is None:
        vocabulary = word_vectors.vocab
    words = list(vocabulary)

    # convert both lists into numpy arrays for reduction
    labels = np.asarray(words)  # keep track of words to label our data again later
    vectors = np.asarray([word_vectors[word] for word in words])  # positions in vector space

    # reduce using t-SNE; the fixed random_state keeps the layout reproducible
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    embedded = tsne.fit_transform(vectors)

    x_vals = [point[0] for point in embedded]
    y_vals = [point[1] for point in embedded]
    return x_vals, y_vals, labels


def plot_with_matplotlib(x_vals, y_vals, labels, num_annotate, filename=None):
    """Scatter-plot the points and annotate a random subset of them.

    Parameters
    ----------
    x_vals, y_vals : sequence of float
        Point coordinates, indexed in parallel with `labels`.
    labels : sequence of str
        Annotation text for each point.
    num_annotate : int
        Number of points to annotate. Values larger than ``len(labels)``
        are clamped so that every point is annotated instead of raising
        ``ValueError``.
    filename : str or None, optional
        When given, the figure is saved to this path before it is shown.
    """
    import matplotlib.pyplot as plt
    import random

    # A private generator seeded with 0 draws the same subsample as seeding
    # the module-level RNG with 0, but leaves the global random state alone.
    rng = random.Random(0)

    plt.figure(figsize=(100,100))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled data points
    #
    num_points = len(labels)
    selected_indices = rng.sample(range(num_points), min(num_annotate, num_points))
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

    if filename is not None:
        plt.savefig(filename)
    # Save first, then show: showing may release the figure.
    plt.show()


#x_vals, y_vals, labels = reduce_dimensions(model)
#plot_with_matplotlib(x_vals, y_vals, labels, num_annotate=len(labels), filename="JTT_word2vec.png")

## Convert tokenized samples into vector representation

In [None]:
def get_word2vec_vectors(token_anon):
    """Return the word2vec vector of every token in `token_anon`, in order.

    Each token is looked up in the module-level `model`; the result is a
    list with one vector per input token.
    """
    return [model.wv.get_vector(tkn) for tkn in token_anon]

# Attach to every sample the list of word vectors of its tokens.
FuncDef_TokenAnon['word2vec'] = FuncDef_TokenAnon['token_anon'].apply(get_word2vec_vectors)

print(FuncDef_TokenAnon.shape)
#FuncDef_TokenAnon.head()

### A) Create a single vector for each sample (avg over all word vectors)

In [8]:
dataset = FuncDef_TokenAnon.copy()

# Collapse each sample's list of word vectors into one vector by averaging
# over the token axis (axis 0).
dataset['word2vec_avg'] = dataset['word2vec'].apply(lambda vecs: np.mean(vecs, axis=0))

# Keep only the averaged vector and the label.
dataset = dataset.reset_index().loc[:, ['word2vec_avg', 'is_vulnerable']]
# info() prints its report and returns None, hence the trailing None in the displayed tuple.
dataset.head(), dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292724 entries, 0 to 292723
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   word2vec_avg   292724 non-null  object 
 1   is_vulnerable  292724 non-null  float64
dtypes: float64(1), object(1)
memory usage: 4.5+ MB


(                                        word2vec_avg  is_vulnerable
 0  [-1.0249654, 0.375369, 0.55047935, 0.7751315, ...            0.0
 1  [-1.0249654, 0.375369, 0.55047935, 0.7751315, ...            0.0
 2  [-0.5463387, 0.17863393, 0.30269822, 0.9715602...            0.0
 3  [-0.5565288, -0.19019793, 0.38513714, 0.728798...            0.0
 4  [-0.44383517, 1.0338036, -0.22511859, 0.872823...            0.0,
 None)

In [10]:
# NOTE: 'word2vec_avg' is an object column (one array per row), so PyTables
# pickles it and emits the performance warning shown in the output below.
dataset.to_hdf(
    "/mnt/md0/user/scheuererra68323/JTT_word2vec.h5",
    key="JTT_word2vec",
)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['word2vec_avg'], dtype='object')]

  pytables.to_hdf(


### B) Save zero-padded word2vec sequences

In [17]:
# Samples with more tokens than this are discarded; shorter ones are padded later.
max_sequence_length = 100

# drop() already returns a new frame, so no explicit copy() is needed beforehand.
dataset_seq = FuncDef_TokenAnon.drop(columns=['code_snippet', 'dump_tokens_output', 'token_anon'])
# Number of tokens per sample, computed once and reused for reporting and filtering.
sequence_lengths = dataset_seq.word2vec.map(len)

print("=== word2vec sequences ===")
print("max sequence length (original):", sequence_lengths.max())

# Report the limit from the variable so the message cannot drift from the filter.
print(f"using examples up to sequence length = {max_sequence_length}")
dataset_seq = dataset_seq.loc[sequence_lengths <= max_sequence_length]
print(f"using {dataset_seq.shape[0]} of {FuncDef_TokenAnon.shape[0]} examples. ({dataset_seq.shape[0] / FuncDef_TokenAnon.shape[0] * 100:.2f}%)")
print("shape:", dataset_seq.shape)

=== word2vec sequences ===
max sequence length (original): 11398
using examples up to sequence length = 100
using 21851 of 24999 examples. (87.41%)
shape: (21851, 125)


In [18]:
print("1st example before padding: ", np.array(dataset_seq.word2vec.iloc[0]).shape)

# Zero-pad every sample at the end of the token axis so that all samples have
# exactly max_sequence_length rows. Each sequence is converted to an array once.
dataset_seq['word2vec_seq'] = dataset_seq.word2vec.map(
    lambda _sequence: np.pad(
        np.asarray(_sequence),
        # axis 0 (tokens): append zero rows up to max_sequence_length
        # axis 1 (vector components): no padding
        pad_width=[(0, max_sequence_length - len(_sequence)), (0, 0)],
        mode='constant',
        constant_values=0,
    )
)
print("1st example after padding: ", np.array(dataset_seq.word2vec_seq.iloc[0]).shape)

# Remove the metadata columns and the unpadded vectors.
dataset_seq = dataset_seq.drop(['path', 'line_start', 'line_stop', 'external_function_names', 'word2vec'], axis=1)
dataset_seq.shape
#dataset_seq.head()

1st example before padding:  (49, 100)
1st example after padding:  (100, 100)


(21851, 121)

In [19]:
# Stack the padded per-sample matrices into a single array with a leading sample axis.
X_sequence = np.stack(dataset_seq['word2vec_seq'].tolist())
# Everything except the padded sequences is kept as the target frame.
y = dataset_seq.drop(columns=['word2vec_seq'])
print(X_sequence.shape)
print(y.shape)

(21851, 100, 100)
(21851, 120)


In [22]:
# Prefix shared by both output files.
savename = 'JTT'

# Features: the stacked sequences, written as a compressed .npz archive.
np.savez_compressed(f'/mnt/md0/user/scheuererra68323/{savename}_word2vec_X_sequence.npz', X_sequence)

# Targets: the remaining columns, written to HDF5 under the key '<savename>_y'.
y.to_hdf(
    f'/mnt/md0/user/scheuererra68323/{savename}_word2vec_y_sequence.h5',
    key=f'{savename}_y',
)