In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter

def _read_sequence_dataset(x_npz_path, y_hdf_path):
    """Read one word2vec sequence dataset from disk.

    Parameters
    ----------
    x_npz_path : str
        Path of the .npz archive; the feature array is read from its
        'arr_0' entry.
    y_hdf_path : str
        Path of the HDF5 table; the labels are the values of its
        `is_vulnerable` column.

    Returns
    -------
    (features, labels) : tuple
        The feature array and the label array, in that order.
    """
    # the context manager closes the archive once the array has been read
    with np.load(x_npz_path) as archive:
        features = archive['arr_0']
    labels = pd.read_hdf(y_hdf_path).is_vulnerable.values
    return features, labels


# load LOSARD
losard_X, losard_y = _read_sequence_dataset(
    "/mnt/md0/user/scheuererra68323/LOSARD_word2vec_X_sequence.npz",
    "/mnt/md0/user/scheuererra68323/LOSARD_word2vec_y_sequence.h5",
)

# load JTT
jtt_X, jtt_y = _read_sequence_dataset(
    "/mnt/md0/user/scheuererra68323/JTT_word2vec_X_sequence.npz",
    "/mnt/md0/user/scheuererra68323/JTT_word2vec_y_sequence.h5",
)



## Unprocessed datasets

In [2]:
# create train/val splits for testJTT
# (80% full_train / 20% full_val of LOSARD; the seed is fixed so the split is reproducible)
X_full_train, X_full_val, Y_full_train, Y_full_val = train_test_split(losard_X, losard_y, test_size=0.20, random_state=0)

# create train/val/test splits
# The 20% hold-out doubles as the test set; full_train is split again 75/25,
# which gives 60% / 20% / 20% of LOSARD for train / val / test.
X_test = X_full_val
Y_test = Y_full_val
X_train, X_val, Y_train, Y_val = train_test_split(X_full_train, Y_full_train, test_size=0.25, random_state=0)

print("saving LOSARD datasets...")
# check and save
# Each split is printed and saved from the same (name, X, Y) triple, so the
# array written to disk is always the one whose shape was just reported.
# (Previously the full_train and full_val files were written from X_val/Y_val.)
for _split, _X_split, _Y_split in [
    ("full_train", X_full_train, Y_full_train),
    ("full_val", X_full_val, Y_full_val),
    ("train", X_train, Y_train),
    ("val", X_val, Y_val),
    ("test", X_test, Y_test),
]:
    print(f"X_{_split}:", _X_split.shape)
    print(f"Y_{_split}:", _Y_split.shape)
    np.save(f'/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_X_{_split}.npy', _X_split)
    np.save(f'/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_Y_{_split}.npy', _Y_split)
    print()

# JTT is saved unsplit
print("Saving JTT dataset..:")
print("X:", jtt_X.shape)
print("Y:", jtt_y.shape)
np.save('/mnt/md0/user/scheuererra68323/JTT_w2vseq_X.npy', jtt_X)
np.save('/mnt/md0/user/scheuererra68323/JTT_w2vseq_Y.npy', jtt_y)

saving LOSARD datasets...
X_full_train: (132859, 100, 100)
Y_full_train: (132859,)

X_full_val: (33215, 100, 100)
Y_full_val: (33215,)

X_train: (99644, 100, 100)
Y_train: (99644,)

X_val: (33215, 100, 100)
Y_val: (33215,)

X_test: (33215, 100, 100)
Y_test: (33215,)

Saving JTT dataset..:
X: (21851, 100, 100)
Y: (21851,)


## Prepare class weights

In [4]:
def _balanced_class_weights(labels):
    """Return [weight_0, weight_1] for the binary label array `labels`.

    Each weight is (1 / class_count) * n_samples / 2.0, so the rarer class
    receives the larger weight.
    """
    class_counts = Counter(labels)
    n_samples = labels.shape[0]
    return [(1 / class_counts[label]) * n_samples / 2.0 for label in (0, 1)]


print("train class weights:")
train_weight_0, train_weight_1 = _balanced_class_weights(Y_train)
print([train_weight_0, train_weight_1])
np.save(
    '/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_weights_train.npy',
    np.array([train_weight_0, train_weight_1]),
)

print("full_train class weights:")
full_weight_0, full_weight_1 = _balanced_class_weights(Y_full_train)
print([full_weight_0, full_weight_1])
np.save(
    '/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_weights_full_train.npy',
    np.array([full_weight_0, full_weight_1]),
)

train class weights:
[0.5203559417625802, 12.781426372498716]
full_train class weights:
[0.5201141550723843, 12.929057999221486]


## SMOTE Oversampled datasets

In [5]:
def oversample_smote(X, y, random_state=0):
    """Balance the classes of (X, y) by SMOTE oversampling.

    Prints the class distribution before and after resampling.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``SMOTE.fit_resample``; the callers in this
        notebook flatten every sample to one row before calling.
    y : array-like
        Class label of each row of ``X``.
    random_state : int or None, default 0
        Seed forwarded to ``SMOTE`` so the synthetic samples are reproducible,
        in line with the fixed seeds used by the other sampling steps of this
        notebook. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    (X_resampled, y_resampled) : tuple
        The resampled features and labels as returned by ``fit_resample``.
    """
    from imblearn.over_sampling import SMOTE

    # summarize the original class distribution
    print("Counter output before SMOTE:", Counter(y))

    # transform the dataset
    sampler = SMOTE(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    # summarize the new class distribution
    print("Counter output after SMOTE:", Counter(y_resampled))
    return X_resampled, y_resampled


# SMOTE is given one flat row per sample, so each split is flattened to
# (n_samples, n_features) for resampling and restored to its per-sample shape
# afterwards. The per-sample shape is read from the data rather than hard-coded.

print("train SMOTE:")
SMOTE_X_train, SMOTE_Y_train = oversample_smote(np.reshape(X_train, (X_train.shape[0], -1)), Y_train)
SMOTE_X_train = np.reshape(SMOTE_X_train, (-1,) + X_train.shape[1:])
print(SMOTE_X_train.shape)
print(SMOTE_Y_train.shape)
np.save('/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_SMOTE_X_train.npy', SMOTE_X_train)
np.save('/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_SMOTE_Y_train.npy', SMOTE_Y_train)


print("full_train SMOTE:")
SMOTE_X_full_train, SMOTE_Y_full_train = oversample_smote(np.reshape(X_full_train, (X_full_train.shape[0], -1)), Y_full_train)
SMOTE_X_full_train = np.reshape(SMOTE_X_full_train, (-1,) + X_full_train.shape[1:])
print(SMOTE_X_full_train.shape)
# report the label shape (the X shape used to be printed twice here)
print(SMOTE_Y_full_train.shape)
np.save('/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_SMOTE_X_full_train.npy', SMOTE_X_full_train)
np.save('/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_SMOTE_Y_full_train.npy', SMOTE_Y_full_train)

train SMOTE:
Counter output before SMOTE: Counter({0.0: 95746, 1.0: 3898})
Counter output after SMOTE: Counter({0.0: 95746, 1.0: 95746})
(191492, 100, 100)
(191492,)
full_train SMOTE:
Counter output before SMOTE: Counter({0.0: 127721, 1.0: 5138})
Counter output after SMOTE: Counter({0.0: 127721, 1.0: 127721})
(255442, 100, 100)
(255442, 100, 100)


## Undersampled datasets

In [9]:
def undersample(X, y):
    """Balance (X, y) by randomly dropping class-0 ("good") samples.

    As many class-0 samples are kept as there are class-1 ("bad") samples;
    all class-1 samples are kept. Both the subsampling and the final shuffle
    use fixed seeds, so repeated calls on the same input give the same output.

    Returns
    -------
    (X_balanced, y_balanced) : tuple of np.ndarray
        The kept samples stacked with ``np.vstack`` (samples that are
        themselves 2-D therefore end up concatenated along axis 0 and need to
        be reshaped by the caller) and the matching labels.
    """
    frame = pd.DataFrame({"X": list(X), "y": y})

    # number of class-1 samples = target size for the class-0 subset
    n_bad = frame["y"].value_counts()[1]

    good_subset = frame.loc[frame["y"] == 0].sample(n=n_bad, random_state=0)
    bad_subset = frame.loc[frame["y"] == 1]

    # merge both classes, then shuffle so they are interleaved
    balanced = pd.concat([good_subset, bad_subset])
    balanced = balanced.sample(frac=1, random_state=1).reset_index(drop=True)

    stacked_X = np.vstack(balanced["X"].tolist())
    balanced_y = balanced["y"].values
    return stacked_X, balanced_y


# undersample() returns the kept samples stacked along axis 0, so each result
# is reshaped back to (n_samples, 100, 100) before it is reported and saved.

print("undersample train:")
undersample_X_train, undersample_Y_train = undersample(X_train, Y_train)
undersample_X_train = undersample_X_train.reshape(-1, 100, 100)
for _summary in (undersample_X_train.shape, undersample_Y_train.shape, Counter(undersample_Y_train)):
    print(_summary)
np.save('/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_undersample_X_train.npy', undersample_X_train)
np.save('/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_undersample_Y_train.npy', undersample_Y_train)

print()
print("undersample full_train:")
undersample_X_full_train, undersample_Y_full_train = undersample(X_full_train, Y_full_train)
undersample_X_full_train = undersample_X_full_train.reshape(-1, 100, 100)
for _summary in (undersample_X_full_train.shape, undersample_Y_full_train.shape, Counter(undersample_Y_full_train)):
    print(_summary)
np.save('/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_undersample_X_full_train.npy', undersample_X_full_train)
np.save('/mnt/md0/user/scheuererra68323/LOSARD_w2vseq_undersample_Y_full_train.npy', undersample_Y_full_train)


undersample train:
(7796, 100, 100)
(7796,)
Counter({1.0: 3898, 0.0: 3898})

undersample full_train:
(10276, 100, 100)
(10276,)
Counter({0.0: 5138, 1.0: 5138})
