In [248]:
%matplotlib inline
from anytree import Node, RenderTree
from random import random
from anytree.exporter import DotExporter
from IPython.display import Image, display
import os
from datetime import datetime
import pandas as pd
import numpy as np

In [311]:
'''
class for generating a binary tree with the given rule:

The root is a single node that divides into two nodes, which in turn divide into 4, then 8, then 16 nodes.
Give values to all nodes (1 + 2 + 4 + 8 + 16 = 31 nodes). The root node's value should be 1.
The values in each layer should sum to 1. The values of two sibling nodes should sum to the value of the parent node from which they are produced.
'''
class binarytree():
    '''
    Complete binary tree whose node values randomly split the parent's value.

    The root is named "1"; every other node stores its value as a float in
    ``name``. Each parent's value is divided at a uniformly random point
    between its left and right child, so the two children always sum to the
    parent and every row of the tree sums to 1.

    Vertices are kept in a private list in breadth-first order (root first,
    then each row from left to right), which is what ``access`` relies on.

    Note from the original author: the networkx package would have been a
    better foundation for this class.
    '''
    def __init__(self, layers):
        '''
        Build the tree and randomly assign all values.

        layers: number of splitting steps applied below the root; the
                resulting tree has ``layers + 1`` rows and ``2**layers``
                leaves (``layers=0`` yields the root only).
        '''
        self.__root = Node("1")
        self.__vertices = [self.__root]
        for depth in range(layers):
            # In breadth-first storage the parents at this depth start at
            # index 2**depth - 1 and there are 2**depth of them. The slice is
            # a copy, so appending children while iterating is safe.
            first_parent = 2**depth - 1
            for parent in self.__vertices[first_parent:first_parent + 2**depth]:
                parent_value = float(parent.name)
                # A draw from a bimodal distribution could be used here to
                # make the split less even.
                left_value = parent_value * random()
                right_value = parent_value - left_value
                self.__vertices.append(Node(left_value, parent=parent))
                self.__vertices.append(Node(right_value, parent=parent))

    def access(self, row, number):
        '''
        Return the number-th vertex, counting from the left, of the row-th row.

        Both arguments are 1-based: ``access(1, 1)`` is the root and
        ``access(3, 4)`` is the right-most vertex of the third row.

        Raises IndexError when ``row`` is smaller than 1, when ``number`` is
        outside ``1 .. 2**(row-1)``, or when the row lies below the tree.
        '''
        if row < 1:
            raise IndexError("row must be >= 1, got %r" % (row,))
        row_width = 2**(row - 1)
        if not 1 <= number <= row_width:
            raise IndexError(
                "number must be in 1..%d for row %d, got %r"
                % (row_width, row, number)
            )
        # Rows 1 .. row-1 hold 1 + 2 + ... + 2**(row-2) = 2**(row-1) - 1
        # vertices, which is the breadth-first offset of this row.
        return self.__vertices[(row_width - 1) + (number - 1)]

    def root(self):
        '''
        return the unique root of the binary tree
        '''
        return self.__root

    def __repr__(self):
        '''
        Render the tree as text, one node per line, values shown with three
        decimals.
        '''
        return "\n".join(
            "%s%.3f" % (pre, float(node.name))
            for pre, _fill, node in RenderTree(self.root())
        )

    def visualize(self):
        '''
        Render the tree with graphviz and display the image inline.

        The picture is written to a timestamped PNG in the current working
        directory and removed again once it has been displayed.
        `graphviz` needs to be installed for this method to work.

        Every node is exported under an identifier derived from ``id(node)``
        and labelled with its value, so nodes that happen to share the same
        value are still drawn as separate vertices.
        '''
        stamp = str(datetime.now()).replace(" ", "_").replace(":", "_")
        filename = stamp + "_binarytree" + ".png"
        exporter = DotExporter(
            self.root(),
            nodenamefunc=lambda node: "node_%d" % id(node),
            nodeattrfunc=lambda node: 'label="%s"' % node.name,
        )
        try:
            exporter.to_picture(filename)
            display(Image(filename))
        finally:
            # Clean up even if rendering or display failed part-way.
            if os.path.exists(filename):
                os.remove(filename)

In [None]:
# Build 3000 independent random bifurcation trees. Four splitting layers
# below the root give every tree 16 leaf nodes.
btrees=[binarytree(4) for _ in range(3000)]
# Inspect the first tree: text rendering first, then the graphviz picture.
print(btrees[0])
btrees[0].visualize()

In [304]:
def initialization(num=4000, seed=None):
    '''
    Build a synthetic sample table with three binary features and a binary target.

    num:  total sample size N (default 4000).
    seed: optional seed for reproducible output. When None (the default) the
          global numpy random state is used, exactly as before.

    Returns a DataFrame with columns X1, X2, X3 and Y and a 1-based index.
    X1..X3 are independent draws from {0, 1}. Y is the target variable: a
    random permutation of floor(3*num/4) zeros and the remaining ones, i.e.
    3000 rows with Y=0 and 1000 rows with Y=1 for the default N = 4000.
    '''
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Draw the features first and the target last so that results under a
    # given global seed match the order of random draws used previously.
    features = [
        pd.Series(rng.randint(0, 2, size=num), name=column)
        for column in ('X1', 'X2', 'X3')
    ]

    n_negative = (3 * num) // 4
    labels = np.repeat([0, 1], [n_negative, num - n_negative])
    target = pd.Series(rng.permutation(labels), name='Y')

    # Create a pandas dataframe by concatenating all series column-wise.
    df = pd.concat(features + [target], axis=1)
    df.index += 1  # number the samples from 1 instead of 0
    return df

In [308]:
# Draw the full 4000-row sample table and preview its first 20 rows.
samples = initialization(4000)
samples.head(20)

Unnamed: 0,X1,X2,X3,Y
1,1,0,1,0
2,0,0,1,0
3,0,0,1,0
4,1,0,1,0
5,1,1,0,0
6,0,1,0,0
7,1,0,0,0
8,0,1,0,0
9,0,0,0,0
10,1,1,0,0
