#### This code snippet demonstrates the interaction between numpy and tensorflow

In [None]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"]="3"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [None]:
def sort_by(dataset, by):
    """Return *dataset* ordered by column *by* with a fresh 0..n-1 index.

    Args:
        dataset: DataFrame to sort; it is not modified in place.
        by: Name of the column to sort on (ascending).

    Returns:
        A new DataFrame whose rows are sorted and whose index is
        ``np.arange(number_of_rows)``.
    """
    ordered = dataset.sort_values(by=[by])
    row_count = ordered.shape[0]
    # Replace the shuffled original labels with consecutive integers.
    return ordered.set_index(np.arange(row_count))

def read_drop_duplicate_sort_by(path_to_file, file_name, by=''):
    """Load a CSV file and clean it, optionally sorting the result.

    Args:
        path_to_file: Directory prefix; it is concatenated directly with
            *file_name*, so it must already end with a path separator.
        file_name: Name of the CSV file to read.
        by: Column to sort on. The empty string (default) skips sorting and
            leaves the index as it is after cleaning.

    Returns:
        DataFrame without duplicate rows and without rows containing any
        missing value; sorted and re-indexed via ``sort_by`` when *by* is set.
    """
    cleaned = (
        pd.read_csv(path_to_file + file_name)
        .drop_duplicates()
        .dropna(axis=0, how='any')
    )

    if by == '':
        return cleaned

    return sort_by(cleaned, by)

### Read data from files 

In [None]:
# Directory holding the raw CSV files. The trailing separator matters because
# the reader joins directory and file name by plain string concatenation.
path_to_file = 'D:\\computer_science\\'

# Training nodes: cleaned, ordered by node id, then cached in the working directory.
nodes_train = read_drop_duplicate_sort_by(path_to_file, 'nodes_train.csv', by='Node')
nodes_train.to_csv('nodes_train.csv', index=False)

# Test nodes: cleaned and ordered the same way, then the first 96 rows are
# discarded and the index is renumbered from zero.
# NOTE(review): the reason for skipping exactly 96 rows is not visible here.
nodes_test = read_drop_duplicate_sort_by(path_to_file, 'nodes_test.csv', by='Node')
nodes_test = nodes_test.iloc[96:]
nodes_test = nodes_test.set_index(np.arange(nodes_test.shape[0]))
nodes_test.to_csv('nodes_test.csv', index=False)

# Edges: cleaned and ordered by the left endpoint in a single call.
edges = read_drop_duplicate_sort_by(path_to_file, 'edges_orig.csv', by='NodeLeft')
edges.to_csv('edges.csv', index=False)

### How many classes are in the dataset?

In [None]:
# Distinct values of the target column 'Y' among the training nodes.
class_labels = nodes_train['Y'].drop_duplicates()

# Report how many distinct labels exist and the range they span.
for label_message, label_value in (
        ('Count of class labels is %d', class_labels.shape[0]),
        ('Max class label is %d', class_labels.max()),
        ('Min class label is %d', class_labels.min()),
):
    print(label_message % label_value)

### Estimate some parameters

In [None]:
# Range and mean of the first edge feature.
feat1_column = edges['Feat1']
max_feat1, min_feat1, mean_feat1 = (
    feat1_column.max(),
    feat1_column.min(),
    feat1_column.mean(),
)

print(max_feat1, min_feat1, mean_feat1)

# Same summary for the second edge feature.
feat2_column = edges['Feat2']
max_feat2, min_feat2, mean_feat2 = (
    feat2_column.max(),
    feat2_column.min(),
    feat2_column.mean(),
)

print(max_feat2, min_feat2, mean_feat2)

### Create the training dataset

In [None]:
def make_dataset_train(edges, nodes, num_rows=None):
    """Join edge features with training-node features and save them as CSV.

    For every edge, all rows of *nodes* whose 'Node' equals the edge's
    'NodeLeft' are looked up, and one output row is produced per match with
    the columns 'Node', 'Feat1', 'Feat2' (from the edge) and 'Feat3',
    'Feat4', 'Y' (from the node). The result is stripped of rows with
    missing values, sorted by 'Node' and written to 'dataset_train.csv' in
    the current working directory. Progress is printed for every edge.

    Args:
        edges: DataFrame with columns 'NodeLeft', 'Feat1' and 'Feat2'.
        nodes: DataFrame with columns 'Node', 'Feat3', 'Feat4' and 'Y'.
            It must be sorted by 'Node', because matches are located with a
            binary search.
        num_rows: Ignored. It used to size a preallocated frame and is kept
            only so that existing calls passing it continue to work.

    Returns:
        None. The dataset is written to disk.
    """
    rows = edges.shape[0]
    columns = ['Node', 'Feat1', 'Feat2', 'Feat3', 'Feat4', 'Y']

    # Pull the columns out once as arrays: searchsorted returns positions,
    # so the matching features must be read by position, not by index label.
    node_ids = nodes['Node'].values
    feat3 = nodes['Feat3'].values
    feat4 = nodes['Feat4'].values
    labels = nodes['Y'].values

    records = []
    start = time.time()

    for row, (node_left, feat1, feat2) in enumerate(
            zip(edges.NodeLeft, edges.Feat1, edges.Feat2)):

        # Half-open range [left, right) of node rows equal to node_left.
        left = np.searchsorted(node_ids, node_left, side='left')
        right = np.searchsorted(node_ids, node_left, side='right')

        for ind in range(left, right):
            records.append(
                (node_left, feat1, feat2, feat3[ind], feat4[ind], labels[ind]))

        print('Search train node features %.1f %% ...' % (row/rows*100))

    end = time.time()

    print('Find %d train nodes.' % len(records))

    # Building the frame once avoids the per-cell writes of the removed
    # DataFrame.set_value API.
    dataset = pd.DataFrame(records, columns=columns)
    dataset = dataset.dropna(axis=0, how='any')
    dataset = sort_by(dataset, by='Node')

    file_name, path_to_save = 'dataset_train.csv', os.getcwd()

    print('%s is created by %d sec' % (file_name, end - start))

    # Join with the reported directory instead of a Windows-only '.\\' prefix.
    dataset.to_csv(os.path.join(path_to_save, file_name), index=False)

    print('%s is saved to %s' % (file_name, path_to_save))

In [None]:
# Build and save the training dataset. The guard below is commented out, so
# the dataset is regenerated on every run.
# NOTE(review): the guard tests for 'data_train.csv', whereas
# make_dataset_train writes 'dataset_train.csv' - align the names before
# re-enabling it.
# if not os.path.isfile('data_train.csv'):
make_dataset_train(edges, nodes_train) 

### Create the test dataset

In [None]:
def make_dataset_test(edges, nodes_train, nodes_test, num_rows=None):
    """Join edge features with test-node features and save them as CSV.

    For every edge, both endpoints ('NodeLeft' first, then 'NodeRight') are
    looked up in *nodes_test*. Each matching node row produces one output
    row with the columns 'Node', 'Feat1', 'Feat2' (from the edge) and
    'Feat4' (from the node). The result is stripped of rows with missing
    values, sorted by 'Node' and written to 'dataset_test.csv' in the
    current working directory. Progress is printed for every edge.

    Args:
        edges: DataFrame with columns 'NodeLeft', 'NodeRight', 'Feat1' and
            'Feat2'.
        nodes_train: Unused; kept so existing calls keep working.
        nodes_test: DataFrame with columns 'Node' and 'Feat4'. It must be
            sorted by 'Node', because matches are located with a binary
            search.
        num_rows: Ignored. It used to size a preallocated frame and is kept
            only so that existing calls passing it continue to work.

    Returns:
        None. The dataset is written to disk.
    """
    rows = edges.shape[0]
    columns = ['Node', 'Feat1', 'Feat2', 'Feat4']

    # Pull the columns out once as arrays: searchsorted returns positions,
    # so the matching feature must be read by position, not by index label.
    node_ids = nodes_test['Node'].values
    feat4 = nodes_test['Feat4'].values

    records = []
    start = time.time()

    for row, (node_left, node_right, feat1, feat2) in enumerate(
            zip(edges.NodeLeft, edges.NodeRight, edges.Feat1, edges.Feat2)):

        # Same lookup for both endpoints; left endpoint rows come first.
        for node in (node_left, node_right):
            # Half-open range [left, right) of test rows equal to node.
            left = np.searchsorted(node_ids, node, side='left')
            right = np.searchsorted(node_ids, node, side='right')

            for ind in range(left, right):
                records.append((node, feat1, feat2, feat4[ind]))

        print('Search test node features %.1f %% ...' % (row/rows*100))

    end = time.time()

    print('Find %d test nodes.' % len(records))

    # Building the frame once avoids the per-cell writes of the removed
    # DataFrame.set_value API.
    dataset = pd.DataFrame(records, columns=columns)
    dataset = dataset.dropna(axis=0, how='any')
    dataset = sort_by(dataset, by='Node')

    file_name, path_to_save = 'dataset_test.csv', os.getcwd()

    print('%s is created by %d sec' % (file_name, end - start))

    # Join with the reported directory instead of a Windows-only '.\\' prefix.
    dataset.to_csv(os.path.join(path_to_save, file_name), index=False)

    print('%s is saved to %s' % (file_name, path_to_save))

In [None]:
# Build the test dataset from the edges and the test nodes; the function
# saves it as 'dataset_test.csv' in the current working directory.
make_dataset_test(edges, nodes_train, nodes_test) 

### Check intersection between nodes

In [None]:
# Index views of the node-id columns, so set operations can be applied.
test_node_index = pd.Index(nodes_test['Node'])
train_node_index = pd.Index(nodes_train['Node'])

# Test nodes that occur as the left endpoint of some edge.
nodes_test_left = test_node_index.intersection(pd.Index(edges['NodeLeft']))
print(nodes_test_left.shape)

# Test nodes that occur as the right endpoint of some edge.
nodes_test_right = test_node_index.intersection(pd.Index(edges['NodeRight']))
print(nodes_test_right.shape)

# Of those, the ones that are also present among the training nodes.
nodes_test_left_train = train_node_index.intersection(nodes_test_left)
nodes_test_right_train = train_node_index.intersection(nodes_test_right)

print(nodes_test_left_train, nodes_test_right_train)