In [1]:
#loading the libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import tqdm

In [2]:
# loading the data
x_train = np.load('./data_federated/train_x.npy')
y_train = np.load('./data_federated/train_y.npy')
x_test = np.load('./data_federated/test_x.npy')
y_test = np.load('./data_federated/test_y.npy')

In [3]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(4474, 1472)
(4474, 2)
(3080, 1472)
(3080, 2)


#### Baseline model for comparision

In [4]:
clf = RandomForestClassifier(max_depth=5, random_state=0)
clf = clf.fit(x_train, y_train)

In [5]:
score = clf.score(x_test,y_test )
print("Accuracy on test is:", score)

Accuracy on test is: 0.8990259740259741


#### Federated Forest model (A version of the given implementation)

In [6]:
class Client:
    def __init__(self,name):
        self.name = name
        self.models = []
    
    '''creating a local model'''
    def initialize(self):
        self.clf = tree.DecisionTreeClassifier(max_depth=1) # maximum depth can be 2 at any decision time
    
    '''Initializing the data with which the local model will be trained'''
    def initialize_data(self,X,y,f):
        self.X = X
        self.y = y
        self.features = f
    
    '''Fitting the model on the data and returning the gini score which will be used for comparision'''
    def split(self):
        self.clf = self.clf.fit(self.X,self.y)
        #print(self.clf.predict(np.array([self.X[0]])))
        feature = np.argmax(self.clf.feature_importances_)
        gini = self.clf.tree_.impurity[0]
        
        return gini,feature,self.clf
    
    '''Saving the model locally if the gini score is good'''
    def save(self):
        self.models.append((self.clf,self.features))
    
    '''Predicting the accuracy using the local saved models'''
    def predict_values(self,X,y):
        scores = []
        for (model, features) in self.models:
            inp = X[:,features]
            score = model.score(inp,y)
            scores.append(score)
            
        return scores

In [7]:
class Master:
    def __init__(self,datax,datay,clients):
        self.X = datax 
        self.y = datay
        self.no_clients = clients
        #self.models = []
        
        self.clients = []
        for i in range(self.no_clients):
            self.clients.append(Client(str(i)))        
    
    '''Splitting the data into parts for different clients'''
    def divide(self):
        ids = [i for i in range(len(self.X))]
        np.random.shuffle(ids)
        l = ids[:1000] # choosing random ids for data
        datax = self.X[l]
        datay = self.y[l]
        
        all_data = []
        for i in range(self.no_clients):
            feature_ids = [i for i in range(self.X.shape[1])]
            np.random.shuffle(feature_ids)
            m = feature_ids[:500] # choosing random ids for features
            client_datax = datax[:,m]
            all_data.append((client_datax,datay,m))
        
        return all_data
    
    '''return the index where the score is maximum'''
    def decide(self,scores):
        return np.argmax(scores)
    
    '''Created the defined number of trees in the clients'''
    def create_trees(self,num_trees):
        for j in tqdm.tqdm (range(num_trees), desc="Preparing trees..."):
            data = self.divide()
            
            scores,clfs,feature_list = [],[],[]
            for i in range(self.no_clients):
                self.clients[i].initialize()
                self.clients[i].initialize_data(data[i][0],data[i][1],data[i][2])
                gini,feature,clf = self.clients[i].split()
                scores.append(gini)
                clfs.append(clf)
                feature_list.append(data[i][2])
            
            best = self.decide(scores)
            self.clients[best].save()
    
    '''Uses the client models to predict the accuracy of the resulting model'''
    def predict_and_score(self,X,y):
        count = 0
        predictions = []
        for i in range(self.no_clients):
            predictions.extend(self.clients[i].predict_values(X,y))
        pred = np.mean(predictions)
        
        print("The Accuracy is",pred)
            

In [8]:
a = Master(x_train,y_train,2)

In [9]:
a.create_trees(100)

Preparing trees...: 100%|██████████| 100/100 [00:10<00:00,  9.21it/s]


In [10]:
a.predict_and_score(x_test,y_test)

The Accuracy is 0.8902240259740261
