In [2]:
import numpy as np
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss, precision_recall_fscore_support
from sklearn.ensemble import VotingClassifier

In [3]:
# Load the stored partitions
with open('./data/X_train_sample.pickle', 'rb') as f:
    X_train = pickle.load(f)

with open('./data/X_test_sample.pickle', 'rb') as f:
    X_test = pickle.load(f)

with open('./data/y_train_sample.pickle', 'rb') as f:
    y_train = pickle.load(f)

with open('./data/y_test_sample.pickle', 'rb') as f:
    y_test = pickle.load(f)

X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [4]:
# No Partitions
dt = DecisionTreeClassifier(random_state = 0)
dt.fit(X_train,y_train) 
dt_score=dt.score(X_test,y_test)
y_predict=dt.predict(X_test)
y_true=y_test
print('Accuracy of DT: '+ str(dt_score))
precision,recall,fscore,none= precision_recall_fscore_support(y_true, y_predict, average='weighted') 
print('Precision of DT: '+(str(precision)))
print('Recall of DT: '+(str(recall)))
print('F1-score of DT: '+(str(fscore)))

Accuracy of DT: 0.9841646872525732
Precision of DT: 0.9843185764647819
Recall of DT: 0.9841646872525732
F1-score of DT: 0.9838635546769349


In [5]:
# Simple paritioning and evaluation
def partition_data(X, y, num_partitions=5):
    # Split the data into 5 partitions
    X_partitions = []
    y_partitions = []

    for i in range(num_partitions):
        X_train_partition, X_test_partition, y_train_partition, y_test_partition = train_test_split(X, y, test_size=0.2, random_state=i)
        X_partitions.append((X_train_partition, X_test_partition))
        y_partitions.append((y_train_partition, y_test_partition))
    
    return X_partitions, y_partitions

def evaluate_ensemble(X_train, y_train, X_test, y_test):
    ensemble = VotingClassifier(estimators=[('dt{}'.format(i), DecisionTreeClassifier(random_state=0)) for i in range(5)], voting='hard')
    
    ensemble.fit(X_train, y_train)
    ensemble_score = ensemble.score(X_test, y_test)
    y_predict = ensemble.predict(X_test)
    y_true = y_test
    
    print('Accuracy of Ensemble: ' + str(ensemble_score))
    precision, recall, fscore, none = precision_recall_fscore_support(y_true, y_predict, average='weighted')
    print('Precision of Ensemble: ' + str(precision))
    print('Recall of Ensemble: ' + str(recall))
    print('F1-score of Ensemble: ' + str(fscore))

# Example usage
X_partitions, y_partitions = partition_data(X, y)
X_train, X_test = X_partitions[0]
y_train, y_test = y_partitions[0]
evaluate_ensemble(X_train, y_train, X_test, y_test)

Accuracy of Ensemble: 0.9890815671162492
Precision of Ensemble: 0.9887818547656925
Recall of Ensemble: 0.9890815671162492
F1-score of Ensemble: 0.9886774566843135


In [6]:
def partition_data_with_poision(X, y, poison_fraction=0.1, random_state=0, num_partitions=5):
    # Split the data into 5 partitions
    X_partitions = []
    y_partitions = []

    for i in range(num_partitions):
        X_train_partition, X_test_partition, y_train_partition, y_test_partition = train_test_split(X, y, test_size=0.2, random_state=i)
        X_partitions.append((X_train_partition, X_test_partition))
        y_partitions.append((y_train_partition, y_test_partition))
    
    # Introduce poison into the data
    for i in range(num_partitions):
        X_train_partition, _ = X_partitions[i]
        y_train_partition, _ = y_partitions[i]
        
        n_poison_samples = int(len(X_train_partition) * poison_fraction)
        np.random.seed(random_state)
        poison_indices = np.random.choice(len(X_train_partition), n_poison_samples, replace=False)
        
        # Change the labels of the poison samples
        y_train_partition[poison_indices] = np.random.randint(0, np.max(y) + 1, size=n_poison_samples)
        
        # Update the partition data
        X_partitions[i] = (X_train_partition, X_test_partition)
        y_partitions[i] = (y_train_partition, y_test_partition)
    
    return X_partitions, y_partitions

def introduce_poison_and_evaluate(X_train, y_train, X_test, y_test):
    ensemble = VotingClassifier(estimators=[('dt{}'.format(i), DecisionTreeClassifier(random_state=0)) for i in range(5)], voting='hard')
    
    ensemble.fit(X_train, y_train)
    ensemble_score = ensemble.score(X_test, y_test)
    y_predict = ensemble.predict(X_test)
    y_true = y_test
    
    print('Accuracy of Ensemble: ' + str(ensemble_score))
    precision, recall, fscore, none = precision_recall_fscore_support(y_true, y_predict, average='weighted')
    print('Precision of Ensemble: ' + str(precision))
    print('Recall of Ensemble: ' + str(recall))
    print('F1-score of Ensemble: ' + str(fscore))

# Example usage
X_partitions, y_partitions = partition_data_with_poision(X, y, poison_fraction=0.1, random_state=0)
X_train, X_test = X_partitions[0]
y_train, y_test = y_partitions[0]
introduce_poison_and_evaluate(X_train, y_train, X_test, y_test)

Accuracy of Ensemble: 0.9190751445086706
Precision of Ensemble: 0.9252356384176502
Recall of Ensemble: 0.9190751445086706
F1-score of Ensemble: 0.9206503672764206
