In [7]:
import pickle
import zlib

import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss, precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the stored partitions
# NOTE: pickle.load can execute arbitrary code embedded in the file, so these
# paths must only ever point at trusted, locally produced artifacts.

loaded_arrays = []
for pickle_path in (
    './data/X_train_sample.pickle',
    './data/X_test_sample.pickle',
    './data/y_train_sample.pickle',
    './data/y_test_sample.pickle',
):
    with open(pickle_path, 'rb') as f:
        loaded_arrays.append(pickle.load(f))

X_train, X_test, y_train, y_test = loaded_arrays

# Stack the stored train and test portions back into one feature array and one label array.
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [5]:
# Hash partitioning - Without Poisoning

def hash_partition_data(X, y, num_partitions=5, overlap=0):
    """Bucket rows by a content checksum and build leave-one-bucket-out splits.

    Every row is assigned to one of ``num_partitions`` buckets from a CRC-32 of
    its feature values (all columns except the last one). Split ``i`` uses
    bucket ``i`` as the test set and the union of the other buckets as the
    training set.

    Parameters
    ----------
    X : 2-D array supporting ``X[:, :-1]`` and integer-array row indexing.
    y : array of labels indexed the same way as the rows of ``X``.
    num_partitions : int
        Number of buckets, and therefore the number of returned splits.
    overlap : int
        Number of additional hashing passes.
        NOTE(review): groups produced by passes after the first are appended
        to the internal list but the split loop only reads the first
        ``num_partitions`` entries, and for ``overlap >= 1`` the final pass
        contributes no groups at all. As written, ``overlap`` therefore does
        not change the returned splits -- confirm the intended semantics.

    Returns
    -------
    (X_partitions, y_partitions) : two lists of length ``num_partitions``.
        Element ``i`` of each list is a ``(train, test)`` tuple.
    """
    pre_hash_data = X[:, :-1]
    idxgroup_final = []

    for time in range(overlap + 1):
        # zlib.crc32 replaces the built-in hash(): str hashes are salted per
        # interpreter process, so bucket membership changed on every kernel
        # restart. repr(tolist()) is used instead of str(row) because NumPy
        # abbreviates long rows with "..." when printing.
        # The bucket ids must be an ndarray: comparing a plain list with an
        # int evaluates to a single False, which left every bucket empty.
        hash_data = np.array(
            [
                zlib.crc32((repr(np.asarray(row).tolist()) + str(time)).encode('utf-8')) % num_partitions
                for row in pre_hash_data
            ],
            dtype=np.int64,
        )

        if time != overlap:
            n_groups = num_partitions
        else:
            n_groups = num_partitions - overlap * num_partitions

        idxgroup_final += [np.nonzero(hash_data == i)[0] for i in range(n_groups)]

    no_rows = np.array([], dtype=np.intp)
    X_partitions = []
    y_partitions = []

    for i in range(num_partitions):
        other_groups = [idxgroup_final[j] for j in range(num_partitions) if j != i]
        # np.concatenate rejects an empty list, which happens when num_partitions == 1.
        idx_train = np.concatenate(other_groups) if other_groups else no_rows
        idx_test = idxgroup_final[i]

        X_partitions.append((X[idx_train], X[idx_test]))
        y_partitions.append((y[idx_train], y[idx_test]))

    return X_partitions, y_partitions

# Modified partition_data function
def partition_data(X, y, num_partitions=5, overlap=0):
    """Produce ``num_partitions`` train/test splits of ``(X, y)``.

    When ``overlap`` is positive the work is delegated to
    ``hash_partition_data``. Otherwise each split is an independent 80/20
    ``train_test_split`` seeded with its own index, so the splits are repeated
    random splits of the full data rather than disjoint folds.

    Returns two parallel lists of ``(train, test)`` tuples: one for the
    features and one for the labels.
    """
    if overlap > 0:
        return hash_partition_data(X, y, num_partitions, overlap)

    X_partitions, y_partitions = [], []
    for seed in range(num_partitions):
        X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.2, random_state=seed)
        X_partitions.append((X_fit, X_eval))
        y_partitions.append((y_fit, y_eval))

    return X_partitions, y_partitions

def evaluate_ensemble(X_train, y_train, X_test, y_test):
    """Fit a hard-voting ensemble of five decision trees and print its test metrics.

    Prints accuracy followed by weighted-average precision, recall and
    F1-score computed on ``(X_test, y_test)``. Nothing is returned.
    """
    # NOTE(review): every member is built with random_state=0 and fitted on the
    # same data, so the five trees presumably cast identical votes -- confirm
    # whether per-member diversity was intended.
    members = []
    for i in range(5):
        members.append(('dt{}'.format(i), DecisionTreeClassifier(random_state=0)))

    ensemble = VotingClassifier(estimators=members, voting='hard')
    ensemble.fit(X_train, y_train)

    ensemble_score = ensemble.score(X_test, y_test)
    y_predict = ensemble.predict(X_test)

    print('Accuracy of Ensemble: ' + str(ensemble_score))

    # The fourth return value (support) is not used.
    precision, recall, fscore, _ = precision_recall_fscore_support(y_test, y_predict, average='weighted')
    for label, value in (
        ('Precision of Ensemble: ', precision),
        ('Recall of Ensemble: ', recall),
        ('F1-score of Ensemble: ', fscore),
    ):
        print(label + str(value))

# Example usage
# Build the splits, then evaluate on the first one only.
# Note: this rebinds X_train / X_test / y_train / y_test to that first split.
X_partitions, y_partitions = partition_data(X=X, y=y)
(X_train, X_test), (y_train, y_test) = X_partitions[0], y_partitions[0]
evaluate_ensemble(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

Accuracy of Ensemble: 0.9890815671162492
Precision of Ensemble: 0.9887818547656925
Recall of Ensemble: 0.9890815671162492
F1-score of Ensemble: 0.9886774566843135


In [12]:
# Hash partitioning - With Poisoning

def hash_partition_data(X, y, num_partitions=5, overlap=0, poison_fraction=0.1):
    """Bucket rows by a content checksum, build leave-one-bucket-out splits, and poison the training labels.

    Every row is assigned to one of ``num_partitions`` buckets from a CRC-32 of
    its feature values (all columns except the last one). Split ``i`` uses
    bucket ``i`` as the test set and the union of the other buckets as the
    training set. In each training set, ``int(len(train) * poison_fraction)``
    labels are overwritten with integers drawn uniformly from
    ``[0, max(y)]``; the test labels and the input ``y`` are left untouched.

    Parameters
    ----------
    X : 2-D array supporting ``X[:, :-1]`` and integer-array row indexing.
    y : array of labels indexed the same way as the rows of ``X``.
    num_partitions : int
        Number of buckets, and therefore the number of returned splits.
    overlap : int
        Number of additional hashing passes.
        NOTE(review): groups produced by passes after the first are appended
        to the internal list but the split loop only reads the first
        ``num_partitions`` entries, and for ``overlap >= 1`` the final pass
        contributes no groups at all. As written, ``overlap`` therefore does
        not change the returned splits -- confirm the intended semantics.
    poison_fraction : float
        Fraction of each training split whose labels are replaced.

    Returns
    -------
    (X_partitions, y_partitions) : two lists of length ``num_partitions``.
        Element ``i`` of each list is a ``(train, test)`` tuple.

    Notes
    -----
    The poisoned positions and replacement labels come from the global
    ``np.random`` state; seed it beforehand for reproducible poisoning.
    """
    pre_hash_data = X[:, :-1]
    idxgroup_final = []

    for time in range(overlap + 1):
        # zlib.crc32 replaces the built-in hash(): str hashes are salted per
        # interpreter process, so bucket membership changed on every kernel
        # restart. repr(tolist()) is used instead of str(row) because NumPy
        # abbreviates long rows with "..." when printing.
        # The bucket ids must be an ndarray: comparing a plain list with an
        # int evaluates to a single False, which left every bucket empty.
        hash_data = np.array(
            [
                zlib.crc32((repr(np.asarray(row).tolist()) + str(time)).encode('utf-8')) % num_partitions
                for row in pre_hash_data
            ],
            dtype=np.int64,
        )

        if time != overlap:
            n_groups = num_partitions
        else:
            n_groups = num_partitions - overlap * num_partitions

        idxgroup_final += [np.nonzero(hash_data == i)[0] for i in range(n_groups)]

    no_rows = np.array([], dtype=np.intp)
    X_partitions = []
    y_partitions = []

    for i in range(num_partitions):
        other_groups = [idxgroup_final[j] for j in range(num_partitions) if j != i]
        # np.concatenate rejects an empty list, which happens when num_partitions == 1.
        idx_train = np.concatenate(other_groups) if other_groups else no_rows
        idx_test = idxgroup_final[i]

        # Integer-array indexing returns copies, so poisoning never touches X or y.
        X_train_partition, X_test_partition = X[idx_train], X[idx_test]
        y_train_partition, y_test_partition = y[idx_train], y[idx_test]

        # Introduce poison into the training data
        n_poison_samples = int(len(X_train_partition) * poison_fraction)
        poison_indices = np.random.choice(len(X_train_partition), n_poison_samples, replace=False)
        y_train_partition[poison_indices] = np.random.randint(0, np.max(y) + 1, size=n_poison_samples)

        X_partitions.append((X_train_partition, X_test_partition))
        y_partitions.append((y_train_partition, y_test_partition))

    return X_partitions, y_partitions

# Modified partition_data function
def partition_data(X, y, num_partitions=6, overlap=0, poison_fraction=0.1):
    """Produce ``num_partitions`` train/test splits of ``(X, y)`` with poisoned training labels.

    When ``overlap`` is positive the work is delegated to
    ``hash_partition_data``. Otherwise each split is an independent 80/20
    ``train_test_split`` seeded with its own index, after which
    ``int(len(train) * poison_fraction)`` training labels are overwritten with
    integers drawn uniformly from ``[0, max(y)]`` using the global
    ``np.random`` state.

    Returns two parallel lists of ``(train, test)`` tuples: one for the
    features and one for the labels.
    """
    if overlap > 0:
        return hash_partition_data(X, y, num_partitions, overlap, poison_fraction)

    X_partitions, y_partitions = [], []
    for seed in range(num_partitions):
        X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=0.2, random_state=seed)

        # Label poisoning: choose distinct training positions, then replace their labels.
        n_poison_samples = int(len(X_fit) * poison_fraction)
        poison_indices = np.random.choice(len(X_fit), n_poison_samples, replace=False)
        y_fit[poison_indices] = np.random.randint(0, np.max(y) + 1, size=n_poison_samples)

        X_partitions.append((X_fit, X_eval))
        y_partitions.append((y_fit, y_eval))

    return X_partitions, y_partitions

def evaluate_ensemble(X_train, y_train, X_test, y_test):
    """Fit a hard-voting ensemble of five decision trees and print its test metrics.

    Prints accuracy followed by weighted-average precision, recall and
    F1-score computed on ``(X_test, y_test)``. Nothing is returned.
    """
    # NOTE(review): every member is built with random_state=0 and fitted on the
    # same data, so the five trees presumably cast identical votes -- confirm
    # whether per-member diversity was intended.
    members = [
        ('dt{}'.format(member_id), DecisionTreeClassifier(random_state=0))
        for member_id in range(5)
    ]
    ensemble = VotingClassifier(estimators=members, voting='hard')
    ensemble.fit(X_train, y_train)

    ensemble_score = ensemble.score(X_test, y_test)
    y_predict = ensemble.predict(X_test)

    print('Accuracy of Ensemble: ' + str(ensemble_score))

    # Only the first three entries are reported; the support entry is discarded.
    weighted = precision_recall_fscore_support(y_test, y_predict, average='weighted')
    precision, recall, fscore = weighted[0], weighted[1], weighted[2]
    print('Precision of Ensemble: ' + str(precision))
    print('Recall of Ensemble: ' + str(recall))
    print('F1-score of Ensemble: ' + str(fscore))


# Build the poisoned splits, then evaluate on the first one only.
# Note: this rebinds X_train / X_test / y_train / y_test to that first split.
X_partitions, y_partitions = partition_data(X=X, y=y)
(X_train, X_test), (y_train, y_test) = X_partitions[0], y_partitions[0]
evaluate_ensemble(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)


Accuracy of Ensemble: 0.9062299293513166
Precision of Ensemble: 0.9134703907233391
Recall of Ensemble: 0.9062299293513166
F1-score of Ensemble: 0.9080415593572481


In [15]:
# Hash Partitioning - With Poisoning - Robustness

def hash_partition_data(X, y, num_partitions=5, overlap=0, poison_fraction=0.1):
    """Bucket rows by a content checksum, build leave-one-bucket-out splits, and poison the training labels.

    Every row is assigned to one of ``num_partitions`` buckets from a CRC-32 of
    its feature values (all columns except the last one). Split ``i`` uses
    bucket ``i`` as the test set and the union of the other buckets as the
    training set. Splits whose training or test side would be empty are
    skipped. In each kept training set, ``int(len(train) * poison_fraction)``
    labels are overwritten with integers drawn uniformly from
    ``[0, max(y)]``; the test labels and the input ``y`` are left untouched.

    Parameters
    ----------
    X : 2-D array supporting ``X[:, :-1]`` and integer-array row indexing.
    y : array of labels indexed the same way as the rows of ``X``.
    num_partitions : int
        Number of buckets; an upper bound on the number of returned splits.
    overlap : int
        Number of additional hashing passes.
        NOTE(review): groups produced by passes after the first are appended
        to the internal list but the split loop only reads the first
        ``num_partitions`` entries, and for ``overlap >= 1`` the final pass
        contributes no groups at all. As written, ``overlap`` therefore does
        not change the returned splits -- confirm the intended semantics.
    poison_fraction : float
        Fraction of each training split whose labels are replaced.

    Returns
    -------
    (X_partitions, y_partitions) : two parallel lists of ``(train, test)``
        tuples, one entry per non-empty split.

    Notes
    -----
    The poisoned positions and replacement labels come from the global
    ``np.random`` state; seed it beforehand for reproducible poisoning.
    """
    pre_hash_data = X[:, :-1]
    idxgroup_final = []

    for time in range(overlap + 1):
        # zlib.crc32 replaces the built-in hash(): str hashes are salted per
        # interpreter process, so bucket membership changed on every kernel
        # restart. repr(tolist()) is used instead of str(row) because NumPy
        # abbreviates long rows with "..." when printing.
        # The bucket ids must be an ndarray: comparing a plain list with an
        # int evaluates to a single False, which left every bucket empty and
        # made the non-empty guard below skip every split.
        hash_data = np.array(
            [
                zlib.crc32((repr(np.asarray(row).tolist()) + str(time)).encode('utf-8')) % num_partitions
                for row in pre_hash_data
            ],
            dtype=np.int64,
        )

        if time != overlap:
            n_groups = num_partitions
        else:
            n_groups = num_partitions - overlap * num_partitions

        idxgroup_final += [np.nonzero(hash_data == i)[0] for i in range(n_groups)]

    no_rows = np.array([], dtype=np.intp)
    X_partitions = []
    y_partitions = []

    for i in range(num_partitions):
        other_groups = [idxgroup_final[j] for j in range(num_partitions) if j != i]
        # np.concatenate rejects an empty list, which happens when num_partitions == 1.
        idx_train = np.concatenate(other_groups) if other_groups else no_rows
        idx_test = idxgroup_final[i]

        if len(idx_train) == 0 or len(idx_test) == 0:
            # Nothing to train on or nothing to evaluate on: drop this split.
            continue

        # Integer-array indexing returns copies, so poisoning never touches X or y.
        X_train_partition, X_test_partition = X[idx_train], X[idx_test]
        y_train_partition, y_test_partition = y[idx_train], y[idx_test]

        # Introduce poison into the training data
        n_poison_samples = int(len(X_train_partition) * poison_fraction)
        poison_indices = np.random.choice(len(X_train_partition), n_poison_samples, replace=False)
        y_train_partition[poison_indices] = np.random.randint(0, np.max(y) + 1, size=n_poison_samples)

        X_partitions.append((X_train_partition, X_test_partition))
        y_partitions.append((y_train_partition, y_test_partition))

    return X_partitions, y_partitions


# Modified partition_data function
def partition_data(X, y, num_partitions=6, overlap=0, poison_fraction=0.1):
    """Return train/test splits of ``(X, y)`` whose training labels are partly randomized.

    A positive ``overlap`` hands the whole job to ``hash_partition_data``.
    With ``overlap <= 0`` the function draws ``num_partitions`` independent
    80/20 ``train_test_split`` splits (seeded 0, 1, 2, ...) and, in each one,
    replaces ``int(len(train) * poison_fraction)`` training labels with
    integers sampled uniformly from ``[0, max(y)]`` via the global
    ``np.random`` state.

    The result is a pair of parallel lists of ``(train, test)`` tuples:
    features first, labels second.
    """
    if overlap > 0:
        return hash_partition_data(X, y, num_partitions, overlap, poison_fraction)

    feature_splits = []
    label_splits = []
    label_upper_bound = None

    for split_seed in range(num_partitions):
        split = train_test_split(X, y, test_size=0.2, random_state=split_seed)
        X_train_part, X_test_part, y_train_part, y_test_part = split

        # Pick the positions to corrupt first, then draw their replacement labels.
        train_size = len(X_train_part)
        n_poison_samples = int(train_size * poison_fraction)
        poison_indices = np.random.choice(train_size, n_poison_samples, replace=False)
        if label_upper_bound is None:
            # y itself is never modified here, so its maximum is loop-invariant.
            label_upper_bound = np.max(y) + 1
        y_train_part[poison_indices] = np.random.randint(0, label_upper_bound, size=n_poison_samples)

        feature_splits.append((X_train_part, X_test_part))
        label_splits.append((y_train_part, y_test_part))

    return feature_splits, label_splits
    

def evaluate_ensemble(X_partitions, y_partitions, robustness_samples=100):
    """Train one decision tree per split and print metrics averaged over all splits.

    For every ``(train, test)`` pair a ``DecisionTreeClassifier(random_state=0)``
    is fitted on the training side and scored on the test side. Accuracy and
    weighted precision / recall / F1 are averaged over the splits and printed,
    together with an averaged "robustness accuracy". Nothing is returned.

    Parameters
    ----------
    X_partitions, y_partitions : parallel lists of ``(train, test)`` tuples.
    robustness_samples : int
        Maximum number of leading test rows used for the robustness figure.
        The count is capped at the size of each test split.

    Notes
    -----
    NOTE(review): the robustness figure compares the predictions with labels
    drawn uniformly at random from ``[0, max(y_test)]`` (global ``np.random``
    state), so it presumably measures chance-level agreement rather than
    resistance to poisoning -- confirm this is the intended metric.
    """
    ensemble_scores = []
    precision_scores = []
    recall_scores = []
    fscore_scores = []
    robustness_accuracies = []

    for i in range(len(X_partitions)):
        X_train, X_test = X_partitions[i]
        y_train, y_test = y_partitions[i]

        # Despite the name, each split is evaluated with a single tree.
        ensemble = DecisionTreeClassifier(random_state=0)
        ensemble.fit(X_train, y_train)
        ensemble_score = ensemble.score(X_test, y_test)
        y_predict = ensemble.predict(X_test)
        y_true = y_test

        ensemble_scores.append(ensemble_score)
        precision, recall, fscore, _ = precision_recall_fscore_support(y_true, y_predict, average='weighted')
        precision_scores.append(precision)
        recall_scores.append(recall)
        fscore_scores.append(fscore)

        # Evaluate robustness.
        # Cap the sample count at the size of the test split: slicing quietly
        # returns fewer rows than requested, and accuracy_score raises when the
        # two label arrays differ in length.
        n_robust = min(robustness_samples, len(X_test))
        y_poison = np.random.randint(0, np.max(y_true) + 1, size=n_robust)
        # Predictions for the leading rows were already computed above.
        y_predict_poison = y_predict[:n_robust]

        accuracy_poison = accuracy_score(y_poison, y_predict_poison)
        robustness_accuracies.append(accuracy_poison)

    print('Average Accuracy of Ensemble: ' + str(np.mean(ensemble_scores)))
    print('Average Precision of Ensemble: ' + str(np.mean(precision_scores)))
    print('Average Recall of Ensemble: ' + str(np.mean(recall_scores)))
    print('Average F1-score of Ensemble: ' + str(np.mean(fscore_scores)))

    average_robustness = np.mean(robustness_accuracies)
    print('Average Robustness Accuracy: ' + str(average_robustness))
# Example usage
# Build the poisoned splits and report metrics averaged over all of them.
X_partitions, y_partitions = partition_data(X=X, y=y)
evaluate_ensemble(X_partitions=X_partitions, y_partitions=y_partitions)

Average Accuracy of Ensemble: 0.906979233568829
Average Precision of Ensemble: 0.916524249932777
Average Recall of Ensemble: 0.906979233568829
Average F1-score of Ensemble: 0.9096824325181158
Average Robustness Accuracy: 0.16333333333333336
