In [None]:
import json
import os
import random

import numpy as np
import sklearn
import torch
from sklearn import ensemble
from sklearn import svm
from sklearn.model_selection import train_test_split

In [None]:
seed = 151836

def setSeed(seed=seed):
    """
    Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch
    (CPU plus the current and all CUDA devices), exports PYTHONHASHSEED,
    and configures cuDNN for deterministic behaviour.

    Parameters
    ----------
    seed : int, optional
        Seed value to apply everywhere. Defaults to the module-level ``seed``.

    Notes
    -----
    PYTHONHASHSEED is read by the interpreter at start-up, so setting it here
    only affects processes launched after this call, not the running kernel.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the purpose of this function; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

setSeed()

In [None]:
path = 'dataset/finetuned.json'

# Parse the whole JSON file; the comprehensions below iterate it as a
# sequence of dict-like records.
with open(path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Features and targets; a record missing a key contributes None.
X = np.array([record.get('embedding') for record in dataset])
y = np.array([record.get('label') for record in dataset])

setSeed()

# Hold out 20% for testing, then 12.5% of the remainder for validation
# (0.125 * 0.8 = 10% of the full data).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, random_state=42
)
print(len(X_train), len(X_test), len(X_val))

In [None]:
setSeed()

# Random forest: 150 gini trees with a fixed estimator seed.
forest = ensemble.RandomForestClassifier(
    n_estimators=150,
    criterion='gini',
    n_jobs=-1,
    random_state=42,
)
forest.fit(X_train, y_train)

# Accuracy on each of the three splits.
train_score = forest.score(X_train, y_train)
val_score = forest.score(X_val, y_val)
test_score = forest.score(X_test, y_test)

print('train accuracy: ', train_score)
print('val_accuracy: ', val_score)
print('test_accuracy: ', test_score)

In [None]:
setSeed()

# NOTE: despite the variable name, this is scikit-learn's
# HistGradientBoostingClassifier, not a model from the XGBoost library.
# The name is kept because a later cell refers to it.
xgboost = ensemble.HistGradientBoostingClassifier(
    learning_rate=0.3,
    l2_regularization=0,
    max_depth=None,
    random_state=42,
)
xgboost.fit(X_train, y_train)

# Accuracy on each of the three splits.
train_score = xgboost.score(X_train, y_train)
val_score = xgboost.score(X_val, y_val)
test_score = xgboost.score(X_test, y_test)

print('train accuracy: ', train_score)
print('val_accuracy: ', val_score)
print('test_accuracy: ', test_score)

In [None]:
setSeed()

# Bagged RBF-kernel SVM: 15 SVCs, each fitted on a draw of half the training
# samples. `svm` is imported explicitly at the top of the notebook rather than
# reached through `sklearn.svm`, which only resolves if some other import has
# already loaded that submodule.
base_model = svm.SVC(kernel='rbf', C=7.0)
# No random_state is passed, so the bagging draws depend on NumPy's global
# RNG, which the setSeed() call above has just seeded.
svm_rbf = ensemble.BaggingClassifier(
    estimator=base_model,
    n_estimators=15,
    max_samples=0.5,
)
svm_rbf.fit(X_train, y_train)

# Accuracy on each of the three splits.
train_score = svm_rbf.score(X_train, y_train)
val_score = svm_rbf.score(X_val, y_val)
test_score = svm_rbf.score(X_test, y_test)

print('train accuracy: ', train_score)
print('val_accuracy: ', val_score)
print('test_accuracy: ', test_score)

In [None]:
setSeed()

# Bagged degree-3 polynomial-kernel SVM: 15 SVCs, each fitted on a draw of
# half the training samples. `svm` is imported explicitly at the top of the
# notebook rather than reached through `sklearn.svm`, which only resolves if
# some other import has already loaded that submodule.
base_model = svm.SVC(kernel='poly', C=7, degree=3)
# No random_state is passed, so the bagging draws depend on NumPy's global
# RNG, which the setSeed() call above has just seeded.
svm_poly = ensemble.BaggingClassifier(
    estimator=base_model,
    n_estimators=15,
    max_samples=0.5,
)
svm_poly.fit(X_train, y_train)

# Accuracy on each of the three splits.
train_score = svm_poly.score(X_train, y_train)
val_score = svm_poly.score(X_val, y_val)
test_score = svm_poly.score(X_test, y_test)

print('train accuracy: ', train_score)
print('val_accuracy: ', val_score)
print('test_accuracy: ', test_score)

In [None]:
path = 'dataset/poisoned.json'

# NOTE: this cell rebinds path, dataset, X, y and every X_*/y_* split name
# used by the training cells above. Re-running a training cell after this one
# would fit on the poisoned split instead of the finetuned one.
with open(path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Features and targets; a record missing a key contributes None.
X = np.array([record.get('embedding') for record in dataset])
y = np.array([record.get('label') for record in dataset])

setSeed()

# Same split recipe as for the finetuned data: 20% test, then 12.5% of the
# remainder for validation, both with random_state=42.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, random_state=42
)
print(len(X_train), len(X_test), len(X_val))

In [None]:
# Score the already-fitted random forest on X_test/y_test as last assigned
# (the poisoned.json split when the notebook is run top to bottom).
test_accuracy = forest.score(X_test, y_test)
print(f'RF test accuracy: {test_accuracy}')

In [None]:
# Score the already-fitted gradient-boosting model on X_test/y_test as last
# assigned (the poisoned.json split when the notebook is run top to bottom).
test_accuracy = xgboost.score(X_test, y_test)
print(f'xgboost test accuracy: {test_accuracy}')

In [None]:
# Score both already-fitted bagged SVMs on X_test/y_test as last assigned
# (the poisoned.json split when the notebook is run top to bottom).
# test_accuracy is reused, so it ends up holding the polynomial-kernel score.
test_accuracy = svm_rbf.score(X_test, y_test)
print(f'SVM rbf test accuracy: {test_accuracy}')

test_accuracy = svm_poly.score(X_test, y_test)
print(f'SVM poly test accuracy: {test_accuracy}')